Merge "Cleanup interface initialization code to create fewer scopes"
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index f8be21a..b60eebf 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -521,7 +521,8 @@
       move_resolver_(graph->GetArena(), this),
       isa_features_(isa_features),
       method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-      relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+      relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -5669,6 +5670,51 @@
   }
 }
 
+void LocationsBuilderX86::VisitX86PackedSwitch(HX86PackedSwitch* switch_instr) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(switch_instr, LocationSummary::kNoCall);
+  locations->SetInAt(0, Location::RequiresRegister());
+
+  // Constant area pointer.
+  locations->SetInAt(1, Location::RequiresRegister());
+
+  // And the temporary we need.
+  locations->AddTemp(Location::RequiresRegister());
+}
+
+void InstructionCodeGeneratorX86::VisitX86PackedSwitch(HX86PackedSwitch* switch_instr) {
+  int32_t lower_bound = switch_instr->GetStartValue();
+  int32_t num_entries = switch_instr->GetNumEntries();
+  LocationSummary* locations = switch_instr->GetLocations();
+  Register value_reg = locations->InAt(0).AsRegister<Register>();
+  HBasicBlock* default_block = switch_instr->GetDefaultBlock();
+
+  // We will dispatch through a jump table held in the constant area.
+  Register temp_reg = locations->GetTemp(0).AsRegister<Register>();
+  Register constant_area = locations->InAt(1).AsRegister<Register>();
+
+  // Remove the bias, if needed.
+  if (lower_bound != 0) {
+    __ leal(temp_reg, Address(value_reg, -lower_bound));
+    value_reg = temp_reg;
+  }
+
+  // Is the value in range?
+  DCHECK_GE(num_entries, 1);
+  __ cmpl(value_reg, Immediate(num_entries - 1));
+  __ j(kAbove, codegen_->GetLabelOf(default_block));
+
+  // We are in the range of the table.
+  // Load (target-constant_area) from the jump table, indexing by the value.
+  __ movl(temp_reg, codegen_->LiteralCaseTable(switch_instr, constant_area, value_reg));
+
+  // Compute the actual target address by adding in constant_area.
+  __ addl(temp_reg, constant_area);
+
+  // And jump.
+  __ jmp(temp_reg);
+}
+
 void LocationsBuilderX86::VisitX86ComputeBaseMethodAddress(
     HX86ComputeBaseMethodAddress* insn) {
   LocationSummary* locations =
@@ -5752,28 +5798,18 @@
   }
 }
 
-void CodeGeneratorX86::Finalize(CodeAllocator* allocator) {
-  // Generate the constant area if needed.
-  X86Assembler* assembler = GetAssembler();
-  if (!assembler->IsConstantAreaEmpty()) {
-    // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8
-    // byte values.
-    assembler->Align(4, 0);
-    constant_area_start_ = assembler->CodeSize();
-    assembler->AddConstantArea();
-  }
-
-  // And finish up.
-  CodeGenerator::Finalize(allocator);
-}
-
 /**
  * Class to handle late fixup of offsets into constant area.
  */
 class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocCodeGenerator> {
  public:
-  RIPFixup(const CodeGeneratorX86& codegen, int offset)
-      : codegen_(codegen), offset_into_constant_area_(offset) {}
+  RIPFixup(CodeGeneratorX86& codegen, size_t offset)
+      : codegen_(&codegen), offset_into_constant_area_(offset) {}
+
+ protected:
+  void SetOffset(size_t offset) { offset_into_constant_area_ = offset; }
+
+  CodeGeneratorX86* codegen_;
 
  private:
   void Process(const MemoryRegion& region, int pos) OVERRIDE {
@@ -5781,19 +5817,77 @@
     // last 4 bytes of the instruction.
     // The value to patch is the distance from the offset in the constant area
     // from the address computed by the HX86ComputeBaseMethodAddress instruction.
-    int32_t constant_offset = codegen_.ConstantAreaStart() + offset_into_constant_area_;
-    int32_t relative_position = constant_offset - codegen_.GetMethodAddressOffset();;
+    int32_t constant_offset = codegen_->ConstantAreaStart() + offset_into_constant_area_;
+    int32_t relative_position = constant_offset - codegen_->GetMethodAddressOffset();
 
     // Patch in the right value.
     region.StoreUnaligned<int32_t>(pos - 4, relative_position);
   }
 
-  const CodeGeneratorX86& codegen_;
-
   // Location in constant area that the fixup refers to.
-  int offset_into_constant_area_;
+  int32_t offset_into_constant_area_;
 };
 
+/**
+ * Class to handle late fixup of offsets to a jump table that will be created in the
+ * constant area.
+ */
+class JumpTableRIPFixup : public RIPFixup {
+ public:
+  JumpTableRIPFixup(CodeGeneratorX86& codegen, HX86PackedSwitch* switch_instr)
+      : RIPFixup(codegen, static_cast<size_t>(-1)), switch_instr_(switch_instr) {}
+
+  void CreateJumpTable() {
+    X86Assembler* assembler = codegen_->GetAssembler();
+
+    // Ensure that the reference to the jump table has the correct offset.
+    const int32_t offset_in_constant_table = assembler->ConstantAreaSize();
+    SetOffset(offset_in_constant_table);
+
+    // The label values in the jump table are computed relative to the
+    // instruction addressing the constant area.
+    const int32_t relative_offset = codegen_->GetMethodAddressOffset();
+
+    // Populate the jump table with the correct values.
+    int32_t num_entries = switch_instr_->GetNumEntries();
+    HBasicBlock* block = switch_instr_->GetBlock();
+    const ArenaVector<HBasicBlock*>& successors = block->GetSuccessors();
+    // Each entry holds the target's offset relative to the method base address.
+    for (int32_t i = 0; i < num_entries; i++) {
+      HBasicBlock* b = successors[i];
+      Label* l = codegen_->GetLabelOf(b);
+      DCHECK(l->IsBound());
+      int32_t offset_to_block = l->Position() - relative_offset;
+      assembler->AppendInt32(offset_to_block);
+    }
+  }
+
+ private:
+  const HX86PackedSwitch* switch_instr_;
+};
+
+void CodeGeneratorX86::Finalize(CodeAllocator* allocator) {
+  // Generate the constant area if needed.
+  X86Assembler* assembler = GetAssembler();
+  if (!assembler->IsConstantAreaEmpty() || !fixups_to_jump_tables_.empty()) {
+    // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8
+    // byte values.
+    assembler->Align(4, 0);
+    constant_area_start_ = assembler->CodeSize();
+
+    // Populate any jump tables.
+    for (auto jump_table : fixups_to_jump_tables_) {
+      jump_table->CreateJumpTable();
+    }
+
+    // And now add the constant area to the generated code.
+    assembler->AddConstantArea();
+  }
+
+  // And finish up.
+  CodeGenerator::Finalize(allocator);
+}
+
 Address CodeGeneratorX86::LiteralDoubleAddress(double v, Register reg) {
   AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddDouble(v));
   return Address(reg, kDummy32BitOffset, fixup);
@@ -5814,6 +5908,20 @@
   return Address(reg, kDummy32BitOffset, fixup);
 }
 
+Address CodeGeneratorX86::LiteralCaseTable(HX86PackedSwitch* switch_instr,
+                                           Register reg,
+                                           Register value) {
+  // Create a fixup to be used to create and address the jump table.
+  JumpTableRIPFixup* table_fixup =
+      new (GetGraph()->GetArena()) JumpTableRIPFixup(*this, switch_instr);
+
+  // Remember the fixup so the jump table is populated during Finalize().
+  fixups_to_jump_tables_.push_back(table_fixup);
+
+  // We want a scaled address, as we are extracting the correct offset from the table.
+  return Address(reg, value, TIMES_4, kDummy32BitOffset, table_fixup);
+}
+
 /**
  * Finds instructions that need the constant area base as an input.
  */
@@ -5864,6 +5972,21 @@
     }
   }
 
+  void VisitPackedSwitch(HPackedSwitch* switch_insn) OVERRIDE {
+    // We need to replace the HPackedSwitch with an HX86PackedSwitch in order to
+    // address the constant area.
+    InitializeConstantAreaPointer(switch_insn);
+    HGraph* graph = GetGraph();
+    HBasicBlock* block = switch_insn->GetBlock();
+    HX86PackedSwitch* x86_switch = new (graph->GetArena()) HX86PackedSwitch(
+        switch_insn->GetStartValue(),
+        switch_insn->GetNumEntries(),
+        switch_insn->InputAt(0),
+        base_,
+        switch_insn->GetDexPc());
+    block->ReplaceAndRemoveInstructionWith(switch_insn, x86_switch);
+  }
+
   void InitializeConstantAreaPointer(HInstruction* user) {
     // Ensure we only initialize the pointer once.
     if (base_ != nullptr) {
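
Taken together, the x86 lowering above behaves like the following stand-alone
sketch (illustrative names only; in the generated code the table lives in the
constant area, entries are patched late by JumpTableRIPFixup, and the final
jump goes through a register):

    #include <cstdint>

    // Hypothetical model of the emitted dispatch sequence. 'table' stands in
    // for the jump table in the constant area (entries hold target - base)
    // and 'method_base' for the runtime address produced by
    // HX86ComputeBaseMethodAddress. Returns nullptr for the default block.
    const char* PackedSwitchTarget(int32_t value, int32_t lower_bound,
                                   int32_t num_entries, const int32_t* table,
                                   const char* method_base) {
      // leal(temp, Address(value, -lower_bound)): remove the bias.
      uint32_t index = static_cast<uint32_t>(value - lower_bound);
      // cmpl + j(kAbove): one unsigned compare covers both bounds checks.
      if (index > static_cast<uint32_t>(num_entries - 1)) {
        return nullptr;
      }
      // movl from the table (scaled by 4), addl the base, then jmp.
      return method_base + table[index];
    }
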
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index ae2d84f..fdfc5ab 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -245,6 +245,8 @@
   DISALLOW_COPY_AND_ASSIGN(InstructionCodeGeneratorX86);
 };
 
+class JumpTableRIPFixup;
+
 class CodeGeneratorX86 : public CodeGenerator {
  public:
   CodeGeneratorX86(HGraph* graph,
@@ -385,6 +387,8 @@
   Address LiteralInt32Address(int32_t v, Register reg);
   Address LiteralInt64Address(int64_t v, Register reg);
 
+  Address LiteralCaseTable(HX86PackedSwitch* switch_instr, Register reg, Register value);
+
   void Finalize(CodeAllocator* allocator) OVERRIDE;
 
  private:
@@ -405,6 +409,9 @@
   // Used for fixups to the constant area.
   int32_t constant_area_start_;
 
+  // Fixups for jump tables that need to be patched after the constant table is generated.
+  ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_;
+
   // If there is a HX86ComputeBaseMethodAddress instruction in the graph
   // (which shall be the sole instruction of this kind), subtracting this offset
   // from the value contained in the out register of this HX86ComputeBaseMethodAddress
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 21120a0..f0d9420 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -670,7 +670,8 @@
         constant_area_start_(0),
         method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+        pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -5322,31 +5323,43 @@
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(switch_instr, LocationSummary::kNoCall);
   locations->SetInAt(0, Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(Location::RequiresRegister());
 }
 
 void InstructionCodeGeneratorX86_64::VisitPackedSwitch(HPackedSwitch* switch_instr) {
   int32_t lower_bound = switch_instr->GetStartValue();
   int32_t num_entries = switch_instr->GetNumEntries();
   LocationSummary* locations = switch_instr->GetLocations();
-  CpuRegister value_reg = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister value_reg_in = locations->InAt(0).AsRegister<CpuRegister>();
+  CpuRegister temp_reg = locations->GetTemp(0).AsRegister<CpuRegister>();
+  CpuRegister base_reg = locations->GetTemp(1).AsRegister<CpuRegister>();
+
+  // Remove the bias, if needed.
+  Register value_reg_out = value_reg_in.AsRegister();
+  if (lower_bound != 0) {
+    __ leal(temp_reg, Address(value_reg_in, -lower_bound));
+    value_reg_out = temp_reg.AsRegister();
+  }
+  CpuRegister value_reg(value_reg_out);
+
+  // Is the value in range?
   HBasicBlock* default_block = switch_instr->GetDefaultBlock();
+  __ cmpl(value_reg, Immediate(num_entries - 1));
+  __ j(kAbove, codegen_->GetLabelOf(default_block));
 
-  // Create a series of compare/jumps.
-  const ArenaVector<HBasicBlock*>& successors = switch_instr->GetBlock()->GetSuccessors();
-  for (int i = 0; i < num_entries; i++) {
-    int32_t case_value = lower_bound + i;
-    if (case_value == 0) {
-      __ testl(value_reg, value_reg);
-    } else {
-      __ cmpl(value_reg, Immediate(case_value));
-    }
-    __ j(kEqual, codegen_->GetLabelOf(successors[i]));
-  }
+  // We are in the range of the table.
+  // Load the address of the jump table in the constant area.
+  __ leaq(base_reg, codegen_->LiteralCaseTable(switch_instr));
 
-  // And the default for any other value.
-  if (!codegen_->GoesToNextBlock(switch_instr->GetBlock(), default_block)) {
-      __ jmp(codegen_->GetLabelOf(default_block));
-  }
+  // Load the (signed) offset from the jump table.
+  __ movsxd(temp_reg, Address(base_reg, value_reg, TIMES_4, 0));
+
+  // Add the offset to the address of the table base.
+  __ addq(temp_reg, base_reg);
+
+  // And jump.
+  __ jmp(temp_reg);
 }
 
 void CodeGeneratorX86_64::Load64BitValue(CpuRegister dest, int64_t value) {
@@ -5372,15 +5385,85 @@
   }
 }
 
+/**
+ * Class to handle late fixup of offsets into constant area.
+ */
+class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocCodeGenerator> {
+ public:
+  RIPFixup(CodeGeneratorX86_64& codegen, size_t offset)
+      : codegen_(&codegen), offset_into_constant_area_(offset) {}
+
+ protected:
+  void SetOffset(size_t offset) { offset_into_constant_area_ = offset; }
+
+  CodeGeneratorX86_64* codegen_;
+
+ private:
+  void Process(const MemoryRegion& region, int pos) OVERRIDE {
+    // Patch the correct offset for the instruction.  We use the address of the
+    // 'next' instruction, which is 'pos' (patch the 4 bytes before).
+    int32_t constant_offset = codegen_->ConstantAreaStart() + offset_into_constant_area_;
+    int32_t relative_position = constant_offset - pos;
+
+    // Patch in the right value.
+    region.StoreUnaligned<int32_t>(pos - 4, relative_position);
+  }
+
+  // Location in constant area that the fixup refers to.
+  size_t offset_into_constant_area_;
+};
+
+/**
+ * Class to handle late fixup of offsets to a jump table that will be created in the
+ * constant area.
+ */
+class JumpTableRIPFixup : public RIPFixup {
+ public:
+  JumpTableRIPFixup(CodeGeneratorX86_64& codegen, HPackedSwitch* switch_instr)
+      : RIPFixup(codegen, static_cast<size_t>(-1)), switch_instr_(switch_instr) {}
+
+  void CreateJumpTable() {
+    X86_64Assembler* assembler = codegen_->GetAssembler();
+
+    // Ensure that the reference to the jump table has the correct offset.
+    const int32_t offset_in_constant_table = assembler->ConstantAreaSize();
+    SetOffset(offset_in_constant_table);
+
+    // Compute the offset from the start of the function to this jump table.
+    const int32_t current_table_offset = assembler->CodeSize() + offset_in_constant_table;
+
+    // Populate the jump table with the correct values.
+    int32_t num_entries = switch_instr_->GetNumEntries();
+    HBasicBlock* block = switch_instr_->GetBlock();
+    const ArenaVector<HBasicBlock*>& successors = block->GetSuccessors();
+    // The value that we want is the target offset - the position of the table.
+    for (int32_t i = 0; i < num_entries; i++) {
+      HBasicBlock* b = successors[i];
+      Label* l = codegen_->GetLabelOf(b);
+      DCHECK(l->IsBound());
+      int32_t offset_to_block = l->Position() - current_table_offset;
+      assembler->AppendInt32(offset_to_block);
+    }
+  }
+
+ private:
+  const HPackedSwitch* switch_instr_;
+};
+
 void CodeGeneratorX86_64::Finalize(CodeAllocator* allocator) {
   // Generate the constant area if needed.
   X86_64Assembler* assembler = GetAssembler();
-  if (!assembler->IsConstantAreaEmpty()) {
-    // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8
-    // byte values.  If used for vectors at a later time, this will need to be
-    // updated to 16 bytes with the appropriate offset.
+  if (!assembler->IsConstantAreaEmpty() || !fixups_to_jump_tables_.empty()) {
+    // Align to 4 byte boundary to reduce cache misses, as the data is 4 and 8 byte values.
     assembler->Align(4, 0);
     constant_area_start_ = assembler->CodeSize();
+
+    // Populate any jump tables.
+    for (auto jump_table : fixups_to_jump_tables_) {
+      jump_table->CreateJumpTable();
+    }
+
+    // And now add the constant area to the generated code.
     assembler->AddConstantArea();
   }
 
@@ -5388,31 +5471,6 @@
   CodeGenerator::Finalize(allocator);
 }
 
-/**
- * Class to handle late fixup of offsets into constant area.
- */
-class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocCodeGenerator> {
-  public:
-    RIPFixup(const CodeGeneratorX86_64& codegen, int offset)
-      : codegen_(codegen), offset_into_constant_area_(offset) {}
-
-  private:
-    void Process(const MemoryRegion& region, int pos) OVERRIDE {
-      // Patch the correct offset for the instruction.  We use the address of the
-      // 'next' instruction, which is 'pos' (patch the 4 bytes before).
-      int constant_offset = codegen_.ConstantAreaStart() + offset_into_constant_area_;
-      int relative_position = constant_offset - pos;
-
-      // Patch in the right value.
-      region.StoreUnaligned<int32_t>(pos - 4, relative_position);
-    }
-
-    const CodeGeneratorX86_64& codegen_;
-
-    // Location in constant area that the fixup refers to.
-    int offset_into_constant_area_;
-};
-
 Address CodeGeneratorX86_64::LiteralDoubleAddress(double v) {
   AssemblerFixup* fixup = new (GetGraph()->GetArena()) RIPFixup(*this, __ AddDouble(v));
   return Address::RIP(fixup);
@@ -5453,6 +5511,16 @@
   GetMoveResolver()->EmitNativeCode(&parallel_move);
 }
 
+Address CodeGeneratorX86_64::LiteralCaseTable(HPackedSwitch* switch_instr) {
+  // Create a fixup to be used to create and address the jump table.
+  JumpTableRIPFixup* table_fixup =
+      new (GetGraph()->GetArena()) JumpTableRIPFixup(*this, switch_instr);
+
+  // Remember the fixup so the jump table is populated during Finalize().
+  fixups_to_jump_tables_.push_back(table_fixup);
+  return Address::RIP(table_fixup);
+}
+
 #undef __
 
 }  // namespace x86_64
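
The x86-64 path above differs only in how the table is addressed: a minimal
sketch, assuming the bias and range check have already run. Entries are
rebased against the table's own address (obtained via a RIP-relative leaq)
rather than against a method-base register:

    #include <cstdint>

    // 'table' models the jump table emitted into the constant area; each
    // entry holds (target - table_start) as a signed 32-bit value.
    const char* PackedSwitchTarget64(uint32_t index, const int32_t* table) {
      // movsxd(temp, Address(base, value, TIMES_4, 0)): sign-extend the entry.
      int64_t offset = static_cast<int64_t>(table[index]);
      // addq(temp, base), then jmp(temp).
      return reinterpret_cast<const char*>(table) + offset;
    }
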
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index d6a6a7e..dc86a48 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -234,6 +234,9 @@
   DISALLOW_COPY_AND_ASSIGN(InstructionCodeGeneratorX86_64);
 };
 
+// Class for fixups to jump tables.
+class JumpTableRIPFixup;
+
 class CodeGeneratorX86_64 : public CodeGenerator {
  public:
   CodeGeneratorX86_64(HGraph* graph,
@@ -354,6 +357,7 @@
 
   // Load a 64 bit value into a register in the most efficient manner.
   void Load64BitValue(CpuRegister dest, int64_t value);
+  Address LiteralCaseTable(HPackedSwitch* switch_instr);
 
   // Store a 64 bit value into a DoubleStackSlot in the most efficient manner.
   void Store64BitValueToStack(Location dest, int64_t value);
@@ -391,6 +395,9 @@
   // We will fix this up in the linker later to have the right value.
   static constexpr int32_t kDummy32BitOffset = 256;
 
+  // Fixups for jump tables need to be handled specially.
+  ArenaVector<JumpTableRIPFixup*> fixups_to_jump_tables_;
+
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
 };
 
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 24a89bc..ed401b6 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -606,8 +606,23 @@
 void HBasicBlock::ReplaceAndRemoveInstructionWith(HInstruction* initial,
                                                   HInstruction* replacement) {
   DCHECK(initial->GetBlock() == this);
-  InsertInstructionBefore(replacement, initial);
-  initial->ReplaceWith(replacement);
+  if (initial->IsControlFlow()) {
+    // We can only replace a control flow instruction with another control flow instruction.
+    DCHECK(replacement->IsControlFlow());
+    DCHECK_EQ(replacement->GetId(), -1);
+    DCHECK_EQ(replacement->GetType(), Primitive::kPrimVoid);
+    DCHECK_EQ(initial->GetBlock(), this);
+    DCHECK_EQ(initial->GetType(), Primitive::kPrimVoid);
+    DCHECK(initial->GetUses().IsEmpty());
+    DCHECK(initial->GetEnvUses().IsEmpty());
+    replacement->SetBlock(this);
+    replacement->SetId(GetGraph()->GetNextInstructionId());
+    instructions_.InsertInstructionBefore(replacement, initial);
+    UpdateInputsUsers(replacement);
+  } else {
+    InsertInstructionBefore(replacement, initial);
+    initial->ReplaceWith(replacement);
+  }
   RemoveInstruction(initial);
 }
 
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 82909c4..0d668e8 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1074,7 +1074,8 @@
 
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86(M)                            \
   M(X86ComputeBaseMethodAddress, Instruction)                           \
-  M(X86LoadFromConstantTable, Instruction)
+  M(X86LoadFromConstantTable, Instruction)                              \
+  M(X86PackedSwitch, Instruction)
 
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86_64(M)
 
diff --git a/compiler/optimizing/nodes_x86.h b/compiler/optimizing/nodes_x86.h
index f7cc872..556217b 100644
--- a/compiler/optimizing/nodes_x86.h
+++ b/compiler/optimizing/nodes_x86.h
@@ -62,6 +62,45 @@
   DISALLOW_COPY_AND_ASSIGN(HX86LoadFromConstantTable);
 };
 
+// X86 version of HPackedSwitch that additionally takes the HX86ComputeBaseMethodAddress as an input.
+class HX86PackedSwitch : public HTemplateInstruction<2> {
+ public:
+  HX86PackedSwitch(int32_t start_value,
+                   int32_t num_entries,
+                   HInstruction* input,
+                   HX86ComputeBaseMethodAddress* method_base,
+                   uint32_t dex_pc)
+    : HTemplateInstruction(SideEffects::None(), dex_pc),
+      start_value_(start_value),
+      num_entries_(num_entries) {
+    SetRawInputAt(0, input);
+    SetRawInputAt(1, method_base);
+  }
+
+  bool IsControlFlow() const OVERRIDE { return true; }
+
+  int32_t GetStartValue() const { return start_value_; }
+
+  int32_t GetNumEntries() const { return num_entries_; }
+
+  HX86ComputeBaseMethodAddress* GetBaseMethodAddress() const {
+    return InputAt(1)->AsX86ComputeBaseMethodAddress();
+  }
+
+  HBasicBlock* GetDefaultBlock() const {
+    // Last entry is the default block.
+    return GetBlock()->GetSuccessors()[num_entries_];
+  }
+
+  DECLARE_INSTRUCTION(X86PackedSwitch);
+
+ private:
+  const int32_t start_value_;
+  const int32_t num_entries_;
+
+  DISALLOW_COPY_AND_ASSIGN(HX86PackedSwitch);
+};
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_X86_H_
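
GetDefaultBlock() relies on the successor convention inherited from
HPackedSwitch: successors[0..num_entries-1] are the case targets and
successors[num_entries] is the default. A hypothetical debugging helper
(assuming the surrounding ART types; not part of the change) makes the
layout explicit:

    // Sketch only.
    void DumpSwitchTargets(const HX86PackedSwitch* s) {
      const ArenaVector<HBasicBlock*>& successors = s->GetBlock()->GetSuccessors();
      for (int32_t i = 0; i < s->GetNumEntries(); ++i) {
        LOG(INFO) << "case " << (s->GetStartValue() + i)
                  << " -> block " << successors[i]->GetBlockId();
      }
      LOG(INFO) << "default -> block "
                << successors[s->GetNumEntries()]->GetBlockId();
    }
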
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 04e815a..5347bf0 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -2369,44 +2369,48 @@
   }
 }
 
-int ConstantArea::AddInt32(int32_t v) {
-  for (size_t i = 0, e = buffer_.size(); i < e; i++) {
-    if (v == buffer_[i]) {
-      return i * kEntrySize;
-    }
-  }
-
-  // Didn't match anything.
-  int result = buffer_.size() * kEntrySize;
+size_t ConstantArea::AppendInt32(int32_t v) {
+  size_t result = buffer_.size() * elem_size_;
   buffer_.push_back(v);
   return result;
 }
 
-int ConstantArea::AddInt64(int64_t v) {
+size_t ConstantArea::AddInt32(int32_t v) {
+  for (size_t i = 0, e = buffer_.size(); i < e; i++) {
+    if (v == buffer_[i]) {
+      return i * elem_size_;
+    }
+  }
+
+  // Didn't match anything.
+  return AppendInt32(v);
+}
+
+size_t ConstantArea::AddInt64(int64_t v) {
   int32_t v_low = Low32Bits(v);
   int32_t v_high = High32Bits(v);
   if (buffer_.size() > 1) {
     // Ensure we don't pass the end of the buffer.
     for (size_t i = 0, e = buffer_.size() - 1; i < e; i++) {
       if (v_low == buffer_[i] && v_high == buffer_[i + 1]) {
-        return i * kEntrySize;
+        return i * elem_size_;
       }
     }
   }
 
   // Didn't match anything.
-  int result = buffer_.size() * kEntrySize;
+  size_t result = buffer_.size() * elem_size_;
   buffer_.push_back(v_low);
   buffer_.push_back(v_high);
   return result;
 }
 
-int ConstantArea::AddDouble(double v) {
+size_t ConstantArea::AddDouble(double v) {
   // Treat the value as a 64-bit integer value.
   return AddInt64(bit_cast<int64_t, double>(v));
 }
 
-int ConstantArea::AddFloat(float v) {
+size_t ConstantArea::AddFloat(float v) {
   // Treat the value as a 32-bit integer value.
   return AddInt32(bit_cast<int32_t, float>(v));
 }
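
The split between AddInt32() and the new AppendInt32() matters for jump
tables: a table entry must never be deduplicated against an existing literal,
or consecutive entries would no longer be contiguous. A stand-alone model of
the two operations (hypothetical class name):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    class ConstantAreaModel {
     public:
      // Always appends; used for jump table entries.
      size_t AppendInt32(int32_t v) {
        size_t result = buffer_.size() * sizeof(int32_t);
        buffer_.push_back(v);
        return result;
      }
      // Reuses an existing literal when possible; used for constants.
      size_t AddInt32(int32_t v) {
        for (size_t i = 0; i < buffer_.size(); ++i) {
          if (buffer_[i] == v) return i * sizeof(int32_t);
        }
        return AppendInt32(v);
      }
     private:
      std::vector<int32_t> buffer_;
    };
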
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 93ecdf5..b50fda9 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -166,21 +166,6 @@
     Init(base_in, disp.Int32Value());
   }
 
-  void Init(Register base_in, int32_t disp) {
-    if (disp == 0 && base_in != EBP) {
-      SetModRM(0, base_in);
-      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, base_in);
-      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, base_in);
-      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
-      SetDisp32(disp);
-    }
-  }
-
   Address(Register index_in, ScaleFactor scale_in, int32_t disp) {
     CHECK_NE(index_in, ESP);  // Illegal addressing mode.
     SetModRM(0, ESP);
@@ -189,19 +174,15 @@
   }
 
   Address(Register base_in, Register index_in, ScaleFactor scale_in, int32_t disp) {
-    CHECK_NE(index_in, ESP);  // Illegal addressing mode.
-    if (disp == 0 && base_in != EBP) {
-      SetModRM(0, ESP);
-      SetSIB(scale_in, index_in, base_in);
-    } else if (disp >= -128 && disp <= 127) {
-      SetModRM(1, ESP);
-      SetSIB(scale_in, index_in, base_in);
-      SetDisp8(disp);
-    } else {
-      SetModRM(2, ESP);
-      SetSIB(scale_in, index_in, base_in);
-      SetDisp32(disp);
-    }
+    Init(base_in, index_in, scale_in, disp);
+  }
+
+  Address(Register base_in,
+          Register index_in,
+          ScaleFactor scale_in,
+          int32_t disp,
+          AssemblerFixup* fixup) {
+    Init(base_in, index_in, scale_in, disp);
+    SetFixup(fixup);
   }
 
   static Address Absolute(uintptr_t addr) {
@@ -217,6 +198,37 @@
 
  private:
   Address() {}
+
+  void Init(Register base_in, int32_t disp) {
+    if (disp == 0 && base_in != EBP) {
+      SetModRM(0, base_in);
+      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
+    } else if (disp >= -128 && disp <= 127) {
+      SetModRM(1, base_in);
+      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
+      SetDisp8(disp);
+    } else {
+      SetModRM(2, base_in);
+      if (base_in == ESP) SetSIB(TIMES_1, ESP, base_in);
+      SetDisp32(disp);
+    }
+  }
+
+  void Init(Register base_in, Register index_in, ScaleFactor scale_in, int32_t disp) {
+    CHECK_NE(index_in, ESP);  // Illegal addressing mode.
+    if (disp == 0 && base_in != EBP) {
+      SetModRM(0, ESP);
+      SetSIB(scale_in, index_in, base_in);
+    } else if (disp >= -128 && disp <= 127) {
+      SetModRM(1, ESP);
+      SetSIB(scale_in, index_in, base_in);
+      SetDisp8(disp);
+    } else {
+      SetModRM(2, ESP);
+      SetSIB(scale_in, index_in, base_in);
+      SetDisp32(disp);
+    }
+  }
 };
 
 
@@ -252,40 +264,39 @@
 
   // Add a double to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddDouble(double v);
+  size_t AddDouble(double v);
 
   // Add a float to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddFloat(float v);
+  size_t AddFloat(float v);
 
   // Add an int32_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt32(int32_t v);
+  size_t AddInt32(int32_t v);
+
+  // Add an int32_t to the end of the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AppendInt32(int32_t v);
 
   // Add an int64_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt64(int64_t v);
+  size_t AddInt64(int64_t v);
 
   bool IsEmpty() const {
     return buffer_.size() == 0;
   }
 
+  size_t GetSize() const {
+    return buffer_.size() * elem_size_;
+  }
+
   const std::vector<int32_t>& GetBuffer() const {
     return buffer_;
   }
 
-  void AddFixup(AssemblerFixup* fixup) {
-    fixups_.push_back(fixup);
-  }
-
-  const std::vector<AssemblerFixup*>& GetFixups() const {
-    return fixups_;
-  }
-
  private:
-  static constexpr size_t kEntrySize = sizeof(int32_t);
+  static constexpr size_t elem_size_ = sizeof(int32_t);
   std::vector<int32_t> buffer_;
-  std::vector<AssemblerFixup*> fixups_;
 };
 
 class X86Assembler FINAL : public Assembler {
@@ -740,26 +751,36 @@
 
   // Add a double to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddDouble(double v) { return constant_area_.AddDouble(v); }
+  size_t AddDouble(double v) { return constant_area_.AddDouble(v); }
 
   // Add a float to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddFloat(float v)   { return constant_area_.AddFloat(v); }
+  size_t AddFloat(float v)   { return constant_area_.AddFloat(v); }
 
   // Add an int32_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt32(int32_t v) { return constant_area_.AddInt32(v); }
+  size_t AddInt32(int32_t v) {
+    return constant_area_.AddInt32(v);
+  }
+
+  // Add an int32_t to the end of the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AppendInt32(int32_t v) {
+    return constant_area_.AppendInt32(v);
+  }
 
   // Add an int64_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt64(int64_t v) { return constant_area_.AddInt64(v); }
+  size_t AddInt64(int64_t v) { return constant_area_.AddInt64(v); }
 
   // Add the contents of the constant area to the assembler buffer.
   void AddConstantArea();
 
   // Is the constant area empty? Return true if there are no literals in the constant area.
   bool IsConstantAreaEmpty() const { return constant_area_.IsEmpty(); }
-  void AddConstantAreaFixup(AssemblerFixup* fixup) { constant_area_.AddFixup(fixup); }
+
+  // Return the current size of the constant area.
+  size_t ConstantAreaSize() const { return constant_area_.GetSize(); }
 
  private:
   inline void EmitUint8(uint8_t value);
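
Both Init() overloads moved above share one displacement-width rule, sketched
here in isolation (hypothetical enum; in ModRM encoding, mode 0 with EBP as
base means "disp32, no base", so EBP always takes an explicit displacement):

    #include <cstdint>

    enum class DispKind { kNone, kDisp8, kDisp32 };

    DispKind ChooseDisp(int32_t disp, bool base_is_ebp) {
      if (disp == 0 && !base_is_ebp) return DispKind::kNone;     // no disp byte
      if (disp >= -128 && disp <= 127) return DispKind::kDisp8;  // one byte
      return DispKind::kDisp32;                                  // four bytes
    }
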
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 6e7d74d..9eb5e67 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -3122,7 +3122,14 @@
   }
 }
 
-int ConstantArea::AddInt32(int32_t v) {
+size_t ConstantArea::AppendInt32(int32_t v) {
+  size_t result = buffer_.size() * elem_size_;
+  buffer_.push_back(v);
+  return result;
+}
+
+size_t ConstantArea::AddInt32(int32_t v) {
+  // Look for an existing match.
   for (size_t i = 0, e = buffer_.size(); i < e; i++) {
     if (v == buffer_[i]) {
       return i * elem_size_;
@@ -3130,12 +3137,10 @@
   }
 
   // Didn't match anything.
-  int result = buffer_.size() * elem_size_;
-  buffer_.push_back(v);
-  return result;
+  return AppendInt32(v);
 }
 
-int ConstantArea::AddInt64(int64_t v) {
+size_t ConstantArea::AddInt64(int64_t v) {
   int32_t v_low = v;
   int32_t v_high = v >> 32;
   if (buffer_.size() > 1) {
@@ -3148,18 +3153,18 @@
   }
 
   // Didn't match anything.
-  int result = buffer_.size() * elem_size_;
+  size_t result = buffer_.size() * elem_size_;
   buffer_.push_back(v_low);
   buffer_.push_back(v_high);
   return result;
 }
 
-int ConstantArea::AddDouble(double v) {
+size_t ConstantArea::AddDouble(double v) {
   // Treat the value as a 64-bit integer value.
   return AddInt64(bit_cast<int64_t, double>(v));
 }
 
-int ConstantArea::AddFloat(float v) {
+size_t ConstantArea::AddFloat(float v) {
   // Treat the value as a 32-bit integer value.
   return AddInt32(bit_cast<int32_t, float>(v));
 }
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 255f551..01d28e3 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -269,36 +269,40 @@
  * Class to handle constant area values.
  */
 class ConstantArea {
-  public:
-    ConstantArea() {}
+ public:
+  ConstantArea() {}
 
-    // Add a double to the constant area, returning the offset into
-    // the constant area where the literal resides.
-    int AddDouble(double v);
+  // Add a double to the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AddDouble(double v);
 
-    // Add a float to the constant area, returning the offset into
-    // the constant area where the literal resides.
-    int AddFloat(float v);
+  // Add a float to the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AddFloat(float v);
 
-    // Add an int32_t to the constant area, returning the offset into
-    // the constant area where the literal resides.
-    int AddInt32(int32_t v);
+  // Add an int32_t to the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AddInt32(int32_t v);
 
-    // Add an int64_t to the constant area, returning the offset into
-    // the constant area where the literal resides.
-    int AddInt64(int64_t v);
+  // Add an int32_t to the end of the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AppendInt32(int32_t v);
 
-    int GetSize() const {
-      return buffer_.size() * elem_size_;
-    }
+  // Add an int64_t to the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AddInt64(int64_t v);
 
-    const std::vector<int32_t>& GetBuffer() const {
-      return buffer_;
-    }
+  size_t GetSize() const {
+    return buffer_.size() * elem_size_;
+  }
 
-  private:
-    static constexpr size_t elem_size_ = sizeof(int32_t);
-    std::vector<int32_t> buffer_;
+  const std::vector<int32_t>& GetBuffer() const {
+    return buffer_;
+  }
+
+ private:
+  static constexpr size_t elem_size_ = sizeof(int32_t);
+  std::vector<int32_t> buffer_;
 };
 
 
@@ -806,19 +810,27 @@
 
   // Add a double to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddDouble(double v) { return constant_area_.AddDouble(v); }
+  size_t AddDouble(double v) { return constant_area_.AddDouble(v); }
 
   // Add a float to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddFloat(float v)   { return constant_area_.AddFloat(v); }
+  size_t AddFloat(float v)   { return constant_area_.AddFloat(v); }
 
   // Add an int32_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt32(int32_t v) { return constant_area_.AddInt32(v); }
+  size_t AddInt32(int32_t v) {
+    return constant_area_.AddInt32(v);
+  }
+
+  // Add an int32_t to the end of the constant area, returning the offset into
+  // the constant area where the literal resides.
+  size_t AppendInt32(int32_t v) {
+    return constant_area_.AppendInt32(v);
+  }
 
   // Add an int64_t to the constant area, returning the offset into
   // the constant area where the literal resides.
-  int AddInt64(int64_t v) { return constant_area_.AddInt64(v); }
+  size_t AddInt64(int64_t v) { return constant_area_.AddInt64(v); }
 
   // Add the contents of the constant area to the assembler buffer.
   void AddConstantArea();
@@ -826,6 +838,9 @@
   // Is the constant area empty? Return true if there are no literals in the constant area.
   bool IsConstantAreaEmpty() const { return constant_area_.GetSize() == 0; }
 
+  // Return the current size of the constant area.
+  size_t ConstantAreaSize() const { return constant_area_.GetSize(); }
+
   //
   // Heap poisoning.
   //
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index d09631b..930bb2c 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -891,7 +891,109 @@
 ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALL_ALLOC_ENTRYPOINTS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
+ENTRY art_quick_alloc_object_rosalloc
+    // Fast path rosalloc allocation.
+    // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
+    // r2, r3, r12: free.
+    ldr    r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32]    // Load dex cache resolved types array
+                                                              // Load the class (r2)
+    ldr    r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+    cbz    r2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
+                                                              // Check class status.
+    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
+    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Add a fake dependence from the
+                                                              // following access flag and size
+                                                              // loads to the status load.
+                                                              // This is to prevent those loads
+                                                              // from being reordered above the
+                                                              // status load and reading wrong
+                                                              // values (an alternative is to use
+                                                              // a load-acquire for the status).
+    eor    r3, r3, r3
+    add    r2, r2, r3
+                                                              // Check access flags has
+                                                              // kAccClassIsFinalizable
+    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+
+    ldr    r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]     // Check if the thread local
+                                                              // allocation stack has room.
+                                                              // TODO: consider using ldrd.
+    ldr    r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
+    cmp    r3, r12
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3)
+    cmp    r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
+                                                              // local allocation
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Compute the rosalloc bracket index
+                                                              // from the size.
+                                                              // Align up the size by the rosalloc
+                                                              // bracket quantum size and divide
+                                                              // by the quantum size and subtract
+                                                              // by 1. This code is a shorter but
+                                                              // equivalent version.
+    sub    r3, r3, #1
+    lsr    r3, r3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
+                                                              // Load the rosalloc run (r12)
+    add    r12, r9, r3, lsl #POINTER_SIZE_SHIFT
+    ldr    r12, [r12, #THREAD_ROSALLOC_RUNS_OFFSET]
+                                                              // Load the free list head (r3). This
+                                                              // will be the return val.
+    ldr    r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+    cbz    r3, .Lart_quick_alloc_object_rosalloc_slow_path
+    // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+    ldr    r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET]               // Load the next pointer of the head
+                                                              // and update the list head with the
+                                                              // next pointer.
+    str    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+                                                              // Store the class pointer in the
+                                                              // header. This also overwrites the
+                                                              // next pointer. The offsets are
+                                                              // asserted to match.
+#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
+#error "Class pointer needs to overwrite next pointer."
+#endif
+    str    r2, [r3, #MIRROR_OBJECT_CLASS_OFFSET]
+                                                              // Push the new object onto the thread
+                                                              // local allocation stack and
+                                                              // increment the thread local
+                                                              // allocation stack top.
+    ldr    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+    str    r3, [r1], #COMPRESSED_REFERENCE_SIZE               // (Increment r1 as a side effect.)
+    str    r1, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+                                                              // Decrement the size of the free list
+    ldr    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+    sub    r1, #1
+                                                              // TODO: consider combining this store
+                                                              // and the list head store above using
+                                                              // strd.
+    str    r1, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that the code after this allocation
+                                                              // site will see the right values in
+                                                              // the fields of the class.
+                                                              // (Alternatively we could use "ishst"
+                                                              // if we use load-acquire for the
+                                                              // class status load.)
+    dmb    ish
+    mov    r0, r3                                             // Set the return value and return.
+    bx     lr
+
+.Lart_quick_alloc_object_rosalloc_slow_path:
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  r2, r3  @ save callee saves in case of GC
+    mov    r2, r9                     @ pass Thread::Current
+    bl     artAllocObjectFromCodeRosAlloc     @ (uint32_t type_idx, Method* method, Thread*)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_rosalloc
 
     /*
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
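
As a rough cross-check of the assembly above, an approximate C++ model of the
fast path, using illustrative stand-in types (the real code also verifies the
class is resolved, initialized, and non-finalizable, checks room on the
thread-local allocation stack, pushes the new object there, and ends with a
dmb ish fence):

    #include <cstdint>

    struct Slot { Slot* next; };                     // models rosalloc's Slot
    struct Run { Slot* head; uint32_t free_size; };  // models Run's free list

    // Returns the new object, or nullptr when the slow path must be taken.
    void* RosAllocFastPath(Run* thread_runs[], uint32_t object_size,
                           void* klass, uint32_t max_bracket_size,
                           uint32_t quantum_shift) {
      if (object_size >= max_bracket_size) {
        return nullptr;  // not a thread-local size (the bhs above)
      }
      // Bracket index: align up by the quantum, divide, subtract one --
      // computed as (size - 1) >> shift, exactly as the assembly does.
      uint32_t bracket = (object_size - 1) >> quantum_shift;
      Slot* slot = thread_runs[bracket]->head;
      if (slot == nullptr) {
        return nullptr;  // empty free list (the cbz above)
      }
      thread_runs[bracket]->head = slot->next;  // pop the free-list head
      thread_runs[bracket]->free_size--;
      // Installing the class pointer also clears the next pointer; their
      // offsets are asserted equal in asm_support.h.
      *reinterpret_cast<void**>(slot) = klass;
      return slot;
    }
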
diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S
index ef5edbb..fbacdbc 100644
--- a/runtime/arch/quick_alloc_entrypoints.S
+++ b/runtime/arch/quick_alloc_entrypoints.S
@@ -113,7 +113,8 @@
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_dlmalloc_instrumented, DlMallocInstrumented)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_dlmalloc_instrumented, DlMallocInstrumented)
 
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
+// This is to be separately defined for each architecture to allow a hand-written assembly fast path.
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_rosalloc, RosAlloc)
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 4a106e4..2f485ae 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -788,6 +788,7 @@
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
 
 ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 5c413d2..95f0ccb 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -809,6 +809,7 @@
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc)
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
 DEFINE_FUNCTION art_quick_alloc_object_tlab
     // Fast path tlab allocation.
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index d98fc51..69f6fe9 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -19,6 +19,7 @@
 
 #if defined(__cplusplus)
 #include "art_method.h"
+#include "gc/allocator/rosalloc.h"
 #include "lock_word.h"
 #include "mirror/class.h"
 #include "mirror/string.h"
@@ -53,6 +54,14 @@
 #define ADD_TEST_EQ(x, y)
 #endif
 
+#if defined(__LP64__)
+#define POINTER_SIZE_SHIFT 3
+#else
+#define POINTER_SIZE_SHIFT 2
+#endif
+ADD_TEST_EQ(static_cast<size_t>(1U << POINTER_SIZE_SHIFT),
+            static_cast<size_t>(__SIZEOF_POINTER__))
+
 // Size of references to the heap on the stack.
 #define STACK_REFERENCE_SIZE 4
 ADD_TEST_EQ(static_cast<size_t>(STACK_REFERENCE_SIZE), sizeof(art::StackReference<art::mirror::Object>))
@@ -62,6 +71,10 @@
 ADD_TEST_EQ(static_cast<size_t>(COMPRESSED_REFERENCE_SIZE),
             sizeof(art::mirror::CompressedReference<art::mirror::Object>))
 
+#define COMPRESSED_REFERENCE_SIZE_SHIFT 2
+ADD_TEST_EQ(static_cast<size_t>(1U << COMPRESSED_REFERENCE_SIZE_SHIFT),
+            static_cast<size_t>(COMPRESSED_REFERENCE_SIZE))
+
 // Note: these callee save methods loads require read barriers.
 // Offset of field Runtime::callee_save_methods_[kSaveAll]
 #define RUNTIME_SAVE_ALL_CALLEE_SAVE_FRAME_OFFSET 0
@@ -120,6 +133,18 @@
 #define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
             art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.rosalloc_runs.
+#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_ROSALLOC_RUNS_OFFSET,
+            art::Thread::RosAllocRunsOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_top.
+#define THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 34 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET,
+            art::Thread::ThreadLocalAllocStackTopOffset<__SIZEOF_POINTER__>().Int32Value())
+// Offset of field Thread::tlsPtr_.thread_local_alloc_stack_end.
+#define THREAD_LOCAL_ALLOC_STACK_END_OFFSET (THREAD_ROSALLOC_RUNS_OFFSET + 35 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_ALLOC_STACK_END_OFFSET,
+            art::Thread::ThreadLocalAllocStackEndOffset<__SIZEOF_POINTER__>().Int32Value())
 
 // Offsets within java.lang.Object.
 #define MIRROR_OBJECT_CLASS_OFFSET 0
@@ -236,6 +261,44 @@
 ADD_TEST_EQ(static_cast<uint32_t>(OBJECT_ALIGNMENT_MASK_TOGGLED),
             ~static_cast<uint32_t>(art::kObjectAlignment - 1))
 
+#define ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE 128
+ADD_TEST_EQ(ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kMaxThreadLocalBracketSize))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT 4
+ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSizeShift))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK 15
+ADD_TEST_EQ(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32 0xfffffff0
+ADD_TEST_EQ(static_cast<uint32_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED32),
+            ~static_cast<uint32_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64 0xfffffffffffffff0
+ADD_TEST_EQ(static_cast<uint64_t>(ROSALLOC_BRACKET_QUANTUM_SIZE_MASK_TOGGLED64),
+            ~static_cast<uint64_t>(art::gc::allocator::RosAlloc::kBracketQuantumSize - 1))
+
+#define ROSALLOC_RUN_FREE_LIST_OFFSET 8
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListOffset()))
+
+#define ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET 0
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListHeadOffset()))
+
+#define ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET 16
+ADD_TEST_EQ(ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunFreeListSizeOffset()))
+
+#define ROSALLOC_SLOT_NEXT_OFFSET 0
+ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET,
+            static_cast<int32_t>(art::gc::allocator::RosAlloc::RunSlotNextOffset()))
+// Assert this so that we can avoid zeroing the next field by installing the class pointer.
+ADD_TEST_EQ(ROSALLOC_SLOT_NEXT_OFFSET, MIRROR_OBJECT_CLASS_OFFSET)
+
 #if defined(__cplusplus)
 }  // End of CheckAsmSupportOffsets.
 #endif
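
Each ADD_TEST_EQ pairing above ties an assembler-visible literal back to its
C++ definition. The POINTER_SIZE_SHIFT check, for instance, amounts to the
following sketch (the real macro expands to a CHECK inside
CheckAsmSupportOffsets so the same header still works from assembly):

    #if defined(__LP64__)
    static_assert((1U << 3) == sizeof(void*), "POINTER_SIZE_SHIFT mismatch");
    #else
    static_assert((1U << 2) == sizeof(void*), "POINTER_SIZE_SHIFT mismatch");
    #endif
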
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index 87f1392..3ce3d63 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -131,6 +131,7 @@
 
    private:
     Slot* next_;  // Next slot in the list.
+    friend class RosAlloc;
   };
 
   // We use the tail (kUseTail == true) for the bulk or thread-local free lists to avoid the need to
@@ -302,6 +303,7 @@
     // free without traversing the whole free list.
     uint32_t size_;
     uint32_t padding_ ATTRIBUTE_UNUSED;
+    friend class RosAlloc;
   };
 
   // Represents a run of memory slots of the same size.
@@ -482,7 +484,7 @@
   static constexpr uint8_t kMagicNumFree = 43;
   // The number of size brackets. Sync this with the length of Thread::rosalloc_runs_.
   static constexpr size_t kNumOfSizeBrackets = kNumRosAllocThreadLocalSizeBrackets;
-  // The number of smaller size brackets that are 16 bytes apart.
+  // The number of smaller size brackets that are the quantum size apart.
   static constexpr size_t kNumOfQuantumSizeBrackets = 32;
   // The sizes (the slot sizes, in bytes) of the size brackets.
   static size_t bracketSizes[kNumOfSizeBrackets];
@@ -520,9 +522,7 @@
   }
   // Returns true if the given allocation size is for a thread local allocation.
   static bool IsSizeForThreadLocal(size_t size) {
-    DCHECK_GT(kNumThreadLocalSizeBrackets, 0U);
-    size_t max_thread_local_bracket_idx = kNumThreadLocalSizeBrackets - 1;
-    bool is_size_for_thread_local = size <= bracketSizes[max_thread_local_bracket_idx];
+    bool is_size_for_thread_local = size <= kMaxThreadLocalBracketSize;
     DCHECK(size > kLargeSizeThreshold ||
            (is_size_for_thread_local == (SizeToIndex(size) < kNumThreadLocalSizeBrackets)));
     return is_size_for_thread_local;
@@ -634,6 +634,16 @@
   // are less than this index. We use shared (current) runs for the rest.
   static const size_t kNumThreadLocalSizeBrackets = 8;
 
+  // The size of the largest bracket we use thread-local runs for.
+  // This should be equal to bracketSizes[kNumThreadLocalSizeBrackets - 1].
+  static const size_t kMaxThreadLocalBracketSize = 128;
+
+  // The bracket size increment for the brackets of size <= 512 bytes.
+  static constexpr size_t kBracketQuantumSize = 16;
+
+  // Equal to Log2(kBracketQuantumSize).
+  static constexpr size_t kBracketQuantumSizeShift = 4;
+
  private:
   // The base address of the memory region that's managed by this allocator.
   uint8_t* base_;
@@ -770,6 +780,19 @@
            size_t page_release_size_threshold = kDefaultPageReleaseSizeThreshold);
   ~RosAlloc();
 
+  static size_t RunFreeListOffset() {
+    return OFFSETOF_MEMBER(Run, free_list_);
+  }
+  static size_t RunFreeListHeadOffset() {
+    return OFFSETOF_MEMBER(SlotFreeList<false>, head_);
+  }
+  static size_t RunFreeListSizeOffset() {
+    return OFFSETOF_MEMBER(SlotFreeList<false>, size_);
+  }
+  static size_t RunSlotNextOffset() {
+    return OFFSETOF_MEMBER(Slot, next_);
+  }
+
   // If kThreadUnsafe is true then the allocator may avoid acquiring some locks as an optimization.
   // If used, this may cause race conditions if multiple threads are allocating at the same time.
   template<bool kThreadSafe = true>
diff --git a/runtime/native/dalvik_system_DexFile.cc b/runtime/native/dalvik_system_DexFile.cc
index 1a6bead..e9ce02b 100644
--- a/runtime/native/dalvik_system_DexFile.cc
+++ b/runtime/native/dalvik_system_DexFile.cc
@@ -230,7 +230,8 @@
     }
   }
 
-  if (all_deleted) {
+  // oat_file can be null if we are running without dex2oat.
+  if (all_deleted && oat_file != nullptr) {
     // If all of the dex files are no longer in use we can unmap the corresponding oat file.
     VLOG(class_linker) << "Unregistering " << oat_file;
     runtime->GetOatFileManager().UnRegisterAndDeleteOatFile(oat_file);
diff --git a/runtime/thread.h b/runtime/thread.h
index 8cea10c..8f3461a 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -626,6 +626,24 @@
     return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values, thread_local_objects));
   }
 
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> RosAllocRunsOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                rosalloc_runs));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalAllocStackTopOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                thread_local_alloc_stack_top));
+  }
+
+  template<size_t pointer_size>
+  static ThreadOffset<pointer_size> ThreadLocalAllocStackEndOffset() {
+    return ThreadOffsetFromTlsPtr<pointer_size>(OFFSETOF_MEMBER(tls_ptr_sized_values,
+                                                                thread_local_alloc_stack_end));
+  }
+
   // Size of stack less any space reserved for stack overflow
   size_t GetStackSize() const {
     return tlsPtr_.stack_size - (tlsPtr_.stack_end - tlsPtr_.stack_begin);
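
The three new accessors expose tlsPtr_ offsets to the assembly fast path. The
relative layout that the asm_support.h constants above encode (34 pointer-sized
rosalloc runs, then the two allocation-stack pointers) can be modeled with a
stand-in struct; the real tls_ptr_sized_values has many members before
rosalloc_runs:

    #include <cstddef>

    struct TlsModel {
      void* rosalloc_runs[34];
      void* thread_local_alloc_stack_top;
      void* thread_local_alloc_stack_end;
    };

    static_assert(offsetof(TlsModel, thread_local_alloc_stack_top) ==
                      34 * sizeof(void*),
                  "matches THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET");
    static_assert(offsetof(TlsModel, thread_local_alloc_stack_end) ==
                      35 * sizeof(void*),
                  "matches THREAD_LOCAL_ALLOC_STACK_END_OFFSET");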