Merge "x86_64: Enable compilation"
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 4f9f312..6d2f5d1 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -84,6 +84,7 @@
 	optimizing/code_generator.cc \
 	optimizing/code_generator_arm.cc \
 	optimizing/code_generator_x86.cc \
+	optimizing/code_generator_x86_64.cc \
 	optimizing/graph_visualizer.cc \
 	optimizing/locations.cc \
 	optimizing/nodes.cc \
diff --git a/compiler/dex/frontend.cc b/compiler/dex/frontend.cc
index aa50b45..84e1a94 100644
--- a/compiler/dex/frontend.cc
+++ b/compiler/dex/frontend.cc
@@ -755,7 +755,7 @@
       support_list_size = arraysize(x86_64_support_list);
     }
 
-    for (int idx = 0; idx < cu.mir_graph->GetNumBlocks(); idx++) {
+    for (unsigned int idx = 0; idx < cu.mir_graph->GetNumBlocks(); idx++) {
       BasicBlock* bb = cu.mir_graph->GetBasicBlock(idx);
       if (bb == NULL) continue;
       if (bb->block_type == kDead) continue;
@@ -884,15 +884,13 @@
         (1 << kBBOpt) |
         (1 << kMatch) |
         (1 << kPromoteCompilerTemps));
-  }
-
-  if (cu.instruction_set == kArm64 || cu.instruction_set == kX86_64) {
-    // TODO(Arm64): enable optimizations once backend is mature enough.
+  } else if (cu.instruction_set == kX86_64) {
     // TODO(X86_64): enable optimizations once backend is mature enough.
     cu.disable_opt = ~(uint32_t)0;
-    if (cu.instruction_set == kArm64) {
-      cu.enable_debug |= (1 << kDebugCodegenDump);
-    }
+  } else if (cu.instruction_set == kArm64) {
+    // TODO(Arm64): enable optimizations once backend is mature enough.
+    cu.disable_opt = ~(uint32_t)0;
+    cu.enable_debug |= (1 << kDebugCodegenDump);
   }
 
   cu.StartTimingSplit("BuildMIRGraph");
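
Note (not part of the patch): the hunk above only reshuffles which instruction sets fall into the "disable everything" branch; the meaning of cu.disable_opt as a per-pass bitmask is unchanged. A minimal sketch of how that bitmask is read, with the helper name PassEnabled and the pass_bit parameter being hypothetical:

    #include <cstdint>

    // Each optimization pass owns one bit in disable_opt (e.g. kBBOpt, kMatch above);
    // ~(uint32_t)0 therefore disables every pass, as the x86_64 and arm64 branches do.
    bool PassEnabled(uint32_t disable_opt, int pass_bit) {  // hypothetical helper
      return (disable_opt & (UINT32_C(1) << pass_bit)) == 0;
    }
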
diff --git a/compiler/dex/mir_graph.cc b/compiler/dex/mir_graph.cc
index a2676c8..63a5570 100644
--- a/compiler/dex/mir_graph.cc
+++ b/compiler/dex/mir_graph.cc
@@ -586,7 +586,7 @@
   if (current_method_ == 0) {
     DCHECK(entry_block_ == NULL);
     DCHECK(exit_block_ == NULL);
-    DCHECK_EQ(num_blocks_, 0);
+    DCHECK_EQ(num_blocks_, 0U);
     // Use id 0 to represent a null block.
     BasicBlock* null_block = NewMemBB(kNullBlock, num_blocks_++);
     DCHECK_EQ(null_block->id, NullBasicBlockId);
diff --git a/compiler/dex/mir_graph.h b/compiler/dex/mir_graph.h
index b6cec66..27b8ca4 100644
--- a/compiler/dex/mir_graph.h
+++ b/compiler/dex/mir_graph.h
@@ -587,7 +587,7 @@
     return m_units_[m_unit_index]->GetCodeItem()->insns_;
   }
 
-  int GetNumBlocks() const {
+  unsigned int GetNumBlocks() const {
     return num_blocks_;
   }
 
@@ -607,7 +607,7 @@
     return exit_block_;
   }
 
-  BasicBlock* GetBasicBlock(int block_id) const {
+  BasicBlock* GetBasicBlock(unsigned int block_id) const {
     return (block_id == NullBasicBlockId) ? NULL : block_list_.Get(block_id);
   }
 
@@ -1149,7 +1149,7 @@
   ArenaBitVector* try_block_addr_;
   BasicBlock* entry_block_;
   BasicBlock* exit_block_;
-  int num_blocks_;
+  unsigned int num_blocks_;
   const DexFile::CodeItem* current_code_item_;
   GrowableArray<uint16_t> dex_pc_to_block_map_;  // FindBlock lookup cache.
   std::vector<DexCompilationUnit*> m_units_;     // List of methods included in this graph
diff --git a/compiler/dex/quick/arm/assemble_arm.cc b/compiler/dex/quick/arm/assemble_arm.cc
index a895e6e..5083bbc 100644
--- a/compiler/dex/quick/arm/assemble_arm.cc
+++ b/compiler/dex/quick/arm/assemble_arm.cc
@@ -1628,7 +1628,7 @@
   CreateNativeGcMap();
 }
 
-int ArmMir2Lir::GetInsnSize(LIR* lir) {
+size_t ArmMir2Lir::GetInsnSize(LIR* lir) {
   DCHECK(!IsPseudoLirOp(lir->opcode));
   return EncodingMap[lir->opcode].size;
 }
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 8db7d4e..95bcfbd 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -87,7 +87,7 @@
     std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr);
     ResourceMask GetPCUseDefEncoding() const OVERRIDE;
     uint64_t GetTargetInstFlags(int opcode);
-    int GetInsnSize(LIR* lir);
+    size_t GetInsnSize(LIR* lir) OVERRIDE;
     bool IsUnconditionalBranch(LIR* lir);
 
     // Check support for volatile load/store of a given size.
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 2c4f262..9362147 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -887,7 +887,7 @@
   CreateNativeGcMap();
 }
 
-int Arm64Mir2Lir::GetInsnSize(LIR* lir) {
+size_t Arm64Mir2Lir::GetInsnSize(LIR* lir) {
   ArmOpcode opcode = UNWIDE(lir->opcode);
   DCHECK(!IsPseudoLirOp(opcode));
   return EncodingMap[opcode].size;
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index bf09b86..9a80c69 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -86,7 +86,7 @@
     std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr);
     ResourceMask GetPCUseDefEncoding() const OVERRIDE;
     uint64_t GetTargetInstFlags(int opcode);
-    int GetInsnSize(LIR* lir);
+    size_t GetInsnSize(LIR* lir) OVERRIDE;
     bool IsUnconditionalBranch(LIR* lir);
 
     // Check support for volatile load/store of a given size.
diff --git a/compiler/dex/quick/gen_common.cc b/compiler/dex/quick/gen_common.cc
index 8f6d716..f9081ce 100644
--- a/compiler/dex/quick/gen_common.cc
+++ b/compiler/dex/quick/gen_common.cc
@@ -456,7 +456,8 @@
      * this is an uncommon operation and isn't especially performance
      * critical.
      */
-    RegStorage r_src = AllocTemp();
+    // This is addressing the stack, which may be out of the 4G area.
+    RegStorage r_src = cu_->target64 ? AllocTempWide() : AllocTemp();
     RegStorage r_dst = AllocTemp();
     RegStorage r_idx = AllocTemp();
     RegStorage r_val;
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index 2af847c..a90a06e 100644
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -155,7 +155,12 @@
   if (arg0.wide == 0) {
     LoadValueDirectFixed(arg0, TargetReg(kArg0));
   } else {
-    RegStorage r_tmp = RegStorage::MakeRegPair(TargetReg(kArg0), TargetReg(kArg1));
+    RegStorage r_tmp;
+    if (cu_->instruction_set == kX86_64) {
+      r_tmp = RegStorage::Solo64(TargetReg(kArg0).GetReg());
+    } else {
+      r_tmp = RegStorage::MakeRegPair(TargetReg(kArg0), TargetReg(kArg1));
+    }
     LoadValueDirectWideFixed(arg0, r_tmp);
   }
   ClobberCallerSave();
@@ -181,7 +186,12 @@
   if (arg1.wide == 0) {
     LoadValueDirectFixed(arg1, TargetReg(kArg1));
   } else {
-    RegStorage r_tmp = RegStorage::MakeRegPair(TargetReg(kArg1), TargetReg(kArg2));
+    RegStorage r_tmp;
+    if (cu_->instruction_set == kX86_64) {
+      r_tmp = RegStorage::Solo64(TargetReg(kArg1).GetReg());
+    } else {
+      r_tmp = RegStorage::MakeRegPair(TargetReg(kArg1), TargetReg(kArg2));
+    }
     LoadValueDirectWideFixed(arg1, r_tmp);
   }
   LoadConstant(TargetReg(kArg0), arg0);
@@ -279,6 +289,12 @@
         LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg2) : TargetReg(kArg1));
       } else if (cu_->instruction_set == kArm64) {
         LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg1));
+      } else if (cu_->instruction_set == kX86_64) {
+        if (arg0.fp) {
+          LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg1) : TargetReg(kArg0));
+        } else {
+          LoadValueDirectFixed(arg1, arg1.fp ? TargetReg(kFArg0) : TargetReg(kArg1));
+        }
       } else {
         LoadValueDirectFixed(arg1, TargetReg(kArg1));
       }
@@ -423,7 +439,12 @@
   if (arg2.wide == 0) {
     LoadValueDirectFixed(arg2, TargetReg(kArg2));
   } else {
-    RegStorage r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3));
+    RegStorage r_tmp;
+    if (cu_->instruction_set == kX86_64) {
+      r_tmp = RegStorage::Solo64(TargetReg(kArg2).GetReg());
+    } else {
+      r_tmp = RegStorage::MakeRegPair(TargetReg(kArg2), TargetReg(kArg3));
+    }
     LoadValueDirectWideFixed(arg2, r_tmp);
   }
   LoadConstant(TargetReg(kArg0), arg0);
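
Note (not part of the patch): the three hunks above repeat one pattern for wide call arguments. A hedged sketch of the common shape, using only the RegStorage calls that appear in the diff; the helper name WideArgReg is hypothetical, and the lo/hi arguments stand for the TargetReg(kArgN)/TargetReg(kArgN+1) values used above:

    // Hypothetical helper: pick the storage for a wide (64-bit) call argument.
    // On x86_64 the whole value fits one 64-bit GPR; 32-bit targets use a lo/hi pair.
    RegStorage WideArgReg(bool is_x86_64, RegStorage lo, RegStorage hi) {
      return is_x86_64 ? RegStorage::Solo64(lo.GetReg())
                       : RegStorage::MakeRegPair(lo, hi);
    }
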
diff --git a/compiler/dex/quick/mips/assemble_mips.cc b/compiler/dex/quick/mips/assemble_mips.cc
index b26ab57..c7e9190 100644
--- a/compiler/dex/quick/mips/assemble_mips.cc
+++ b/compiler/dex/quick/mips/assemble_mips.cc
@@ -709,7 +709,7 @@
   return res;
 }
 
-int MipsMir2Lir::GetInsnSize(LIR* lir) {
+size_t MipsMir2Lir::GetInsnSize(LIR* lir) {
   DCHECK(!IsPseudoLirOp(lir->opcode));
   return EncodingMap[lir->opcode].size;
 }
diff --git a/compiler/dex/quick/mips/codegen_mips.h b/compiler/dex/quick/mips/codegen_mips.h
index 62a7f24..571adac 100644
--- a/compiler/dex/quick/mips/codegen_mips.h
+++ b/compiler/dex/quick/mips/codegen_mips.h
@@ -85,7 +85,7 @@
     std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr);
     ResourceMask GetPCUseDefEncoding() const OVERRIDE;
     uint64_t GetTargetInstFlags(int opcode);
-    int GetInsnSize(LIR* lir);
+    size_t GetInsnSize(LIR* lir) OVERRIDE;
     bool IsUnconditionalBranch(LIR* lir);
 
     // Check support for volatile load/store of a given size.
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index ca4d0e4..9155677 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -1162,7 +1162,7 @@
     virtual std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr) = 0;
     virtual ResourceMask GetPCUseDefEncoding() const = 0;
     virtual uint64_t GetTargetInstFlags(int opcode) = 0;
-    virtual int GetInsnSize(LIR* lir) = 0;
+    virtual size_t GetInsnSize(LIR* lir) = 0;
     virtual bool IsUnconditionalBranch(LIR* lir) = 0;
 
     // Check support for volatile load/store of a given size.
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index d37ee67..c7e289d 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -506,9 +506,80 @@
   return low_reg;
 }
 
+static bool HasModrm(const X86EncodingMap* entry) {
+  switch (entry->kind) {
+    case kNullary: return false;
+    case kRegOpcode: return false;
+    default: return true;
+  }
+}
+
+static bool HasSib(const X86EncodingMap* entry) {
+  switch (entry->kind) {
+    case kArray: return true;
+    case kArrayReg: return true;
+    case kRegArray: return true;
+    case kArrayImm: return true;
+    case kRegArrayImm: return true;
+    case kShiftArrayImm: return true;
+    case kShiftArrayCl: return true;
+    case kArrayCond: return true;
+    case kCall:
+      switch (entry->opcode) {
+        case kX86CallA: return true;
+        default: return false;
+      }
+    case kPcRel:
+      switch (entry->opcode) {
+        case kX86PcRelLoadRA: return true;
+        default: return false;
+      }
+    default: return false;
+  }
+}
+
+static bool ModrmIsRegReg(const X86EncodingMap* entry) {
+  switch (entry->kind) {
+    // There is no modrm for this kind of instruction, therefore the reg doesn't form part of the
+    // modrm:
+    case kNullary: return true;
+    case kRegOpcode: return true;
+    case kMovRegImm: return true;
+    // Regular modrm value of 3 cases: when there is only one register, the other modrm
+    // field holds an opcode extension, so the base register is the special one.
+    case kReg: return true;
+    case kRegReg: return true;
+    case kRegRegStore: return true;
+    case kRegImm: return true;
+    case kRegRegImm: return true;
+    case kRegRegImmStore: return true;
+    case kShiftRegImm: return true;
+    case kShiftRegCl: return true;
+    case kRegCond: return true;
+    case kRegRegCond: return true;
+    case kJmp:
+      switch (entry->opcode) {
+        case kX86JmpR: return true;
+        default: return false;
+      }
+    case kCall:
+      switch (entry->opcode) {
+        case kX86CallR: return true;
+        default: return false;
+      }
+    default: return false;
+  }
+}
+
 size_t X86Mir2Lir::ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_index,
-                               int32_t raw_base, bool has_sib, bool r8_form, bool r8_reg_reg_form,
-                               int32_t displacement) {
+                               int32_t raw_base, int32_t displacement) {
+  bool has_modrm = HasModrm(entry);
+  bool has_sib = HasSib(entry);
+  bool r8_form = entry->skeleton.r8_form;
+  bool modrm_is_reg_reg = ModrmIsRegReg(entry);
+  if (has_sib) {
+    DCHECK(!modrm_is_reg_reg);
+  }
   size_t size = 0;
   if (entry->skeleton.prefix1 > 0) {
     ++size;
@@ -517,15 +588,19 @@
     }
   }
   if (Gen64Bit() || kIsDebugBuild) {
-    bool registers_need_rex_prefix =
-        NeedsRex(raw_reg) || NeedsRex(raw_index) || NeedsRex(raw_base) ||
-        (r8_form && RegStorage::RegNum(raw_reg) > 4) ||
-        (r8_reg_reg_form && RegStorage::RegNum(raw_base) > 4);
-    if (registers_need_rex_prefix &&
-        entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
-      DCHECK(Gen64Bit()) << "Attempt to use " << entry->name << " on a non-byte register "
-          << RegStorage::RegNum(raw_reg);
-      ++size;  // rex
+    bool registers_need_rex_prefix = NeedsRex(raw_reg) || NeedsRex(raw_index) || NeedsRex(raw_base);
+    if (r8_form) {
+      // Do we need an empty REX prefix to normalize byte registers?
+      registers_need_rex_prefix = registers_need_rex_prefix || (RegStorage::RegNum(raw_reg) >= 4);
+      registers_need_rex_prefix = registers_need_rex_prefix ||
+          (modrm_is_reg_reg && (RegStorage::RegNum(raw_base) >= 4));
+    }
+    if (registers_need_rex_prefix) {
+      DCHECK(Gen64Bit()) << "Attempt to use a 64-bit only addressable register "
+          << RegStorage::RegNum(raw_reg) << " with instruction " << entry->name;
+      if (entry->skeleton.prefix1 != REX_W && entry->skeleton.prefix2 != REX_W) {
+        ++size;  // rex
+      }
     }
   }
   ++size;  // opcode
@@ -535,89 +610,72 @@
       ++size;
     }
   }
-  ++size;  // modrm
-  if (has_sib || LowRegisterBits(raw_base) == rs_rX86_SP.GetRegNum()
-      || (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX)) {
-    // SP requires a SIB byte.
-    // GS access also needs a SIB byte for absolute adressing in 64-bit mode.
-    ++size;
+  if (has_modrm) {
+    ++size;  // modrm
   }
-  if (displacement != 0 || LowRegisterBits(raw_base) == rs_rBP.GetRegNum()) {
-    // BP requires an explicit displacement, even when it's 0.
-    if (entry->opcode != kX86Lea32RA && entry->opcode != kX86Lea64RA) {
-      DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), UINT64_C(0)) << entry->name;
+  if (!modrm_is_reg_reg) {
+    if (has_sib || LowRegisterBits(raw_base) == rs_rX86_SP.GetRegNum()
+        || (Gen64Bit() && entry->skeleton.prefix1 == THREAD_PREFIX)) {
+      // SP requires a SIB byte.
+      // GS access also needs a SIB byte for absolute addressing in 64-bit mode.
+      ++size;
     }
-    size += IS_SIMM8(displacement) ? 1 : 4;
+    if (displacement != 0 || LowRegisterBits(raw_base) == rs_rBP.GetRegNum()) {
+      // BP requires an explicit displacement, even when it's 0.
+      if (entry->opcode != kX86Lea32RA && entry->opcode != kX86Lea64RA) {
+        DCHECK_NE(entry->flags & (IS_LOAD | IS_STORE), UINT64_C(0)) << entry->name;
+      }
+      size += IS_SIMM8(displacement) ? 1 : 4;
+    }
   }
   size += entry->skeleton.immediate_bytes;
   return size;
 }
 
-int X86Mir2Lir::GetInsnSize(LIR* lir) {
+size_t X86Mir2Lir::GetInsnSize(LIR* lir) {
   DCHECK(!IsPseudoLirOp(lir->opcode));
   const X86EncodingMap* entry = &X86Mir2Lir::EncodingMap[lir->opcode];
   DCHECK_EQ(entry->opcode, lir->opcode) << entry->name;
+
   switch (entry->kind) {
     case kData:
       return 4;  // 4 bytes of data.
     case kNop:
       return lir->operands[0];  // Length of nop is sole operand.
     case kNullary:
-      // Substract 1 for modrm which isn't used.
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, NO_REG, NO_REG, false, false, false, 0) - 1;
+      return ComputeSize(entry, NO_REG, NO_REG, NO_REG, 0);
     case kRegOpcode:  // lir operands - 0: reg
-      // Substract 1 for modrm  which isn't used.
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      // Note: RegOpcode form passes reg as REX_R but encodes it as REX_B.
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, false, false, false, 0) - 1;
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], 0);
     case kReg:  // lir operands - 0: reg
-      // Note: Reg form passes reg as REX_R but encodes it as REX_B.
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG,
-                         false, entry->skeleton.r8_form, false, 0);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], 0);
     case kMem:  // lir operands - 0: base, 1: disp
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], false, false, false,
-                         lir->operands[1]);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]);
     case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], true, false, false,
-                         lir->operands[3]);
+      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], lir->operands[3]);
     case kMemReg:  // lir operands - 0: base, 1: disp, 2: reg
-      return ComputeSize(entry, lir->operands[2], NO_REG, lir->operands[0],
-                         false, entry->skeleton.r8_form, false, lir->operands[1]);
+      return ComputeSize(entry, lir->operands[2], NO_REG, lir->operands[0], lir->operands[1]);
     case kMemRegImm:  // lir operands - 0: base, 1: disp, 2: reg 3: immediate
-      return ComputeSize(entry, lir->operands[2], NO_REG, lir->operands[0],
-                         false, entry->skeleton.r8_form, false, lir->operands[1]);
+      return ComputeSize(entry, lir->operands[2], NO_REG, lir->operands[0], lir->operands[1]);
     case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
       return ComputeSize(entry, lir->operands[4], lir->operands[1], lir->operands[0],
-                         true, entry->skeleton.r8_form, false, lir->operands[3]);
+                         lir->operands[3]);
     case kThreadReg:  // lir operands - 0: disp, 1: reg
-      DCHECK_EQ(false, entry->skeleton.r8_form);
       // Thread displacement size is always 32bit.
-      return ComputeSize(entry, lir->operands[1], NO_REG, NO_REG, false, false, false,
-                         0x12345678);
+      return ComputeSize(entry, lir->operands[1], NO_REG, NO_REG, 0x12345678);
     case kRegReg:  // lir operands - 0: reg1, 1: reg2
-      // Note: RegReg form passes reg2 as index but encodes it using base.
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG,
-                         false, entry->skeleton.r8_form, entry->skeleton.r8_form, 0);
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], 0);
     case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
-      // Note: RegRegStore form passes reg1 as index but encodes it using base.
-      return ComputeSize(entry, lir->operands[1], lir->operands[0], NO_REG,
-                         false, entry->skeleton.r8_form, entry->skeleton.r8_form, 0);
+      return ComputeSize(entry, lir->operands[1], NO_REG, lir->operands[0], 0);
     case kRegMem:  // lir operands - 0: reg, 1: base, 2: disp
-      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1],
-                         false, entry->skeleton.r8_form, false, lir->operands[2]);
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], lir->operands[2]);
     case kRegArray:   // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
       return ComputeSize(entry, lir->operands[0], lir->operands[2], lir->operands[1],
-                         true, entry->skeleton.r8_form, false, lir->operands[4]);
+                         lir->operands[4]);
     case kRegThread:  // lir operands - 0: reg, 1: disp
       // Thread displacement size is always 32bit.
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, false, false, false,
-                         0x12345678);
+      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, 0x12345678);
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
-      size_t size = ComputeSize(entry, lir->operands[0], NO_REG, NO_REG,
-                         false, entry->skeleton.r8_form, false, 0);
+      size_t size = ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, 0);
       // AX opcodes don't require the modrm byte.
       if (entry->skeleton.ax_opcode == 0) {
         return size;
@@ -626,83 +684,62 @@
       }
     }
     case kMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0],
-                         false, false, false, lir->operands[1]);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]);
     case kArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp 4: immediate
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0],
-                         true, false, false, lir->operands[3]);
+      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], lir->operands[3]);
     case kThreadImm:  // lir operands - 0: disp, 1: imm
       // Thread displacement size is always 32bit.
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, NO_REG, NO_REG, false, false, false, 0x12345678);
+      return ComputeSize(entry, NO_REG, NO_REG, NO_REG, 0x12345678);
     case kRegRegImm:  // lir operands - 0: reg1, 1: reg2, 2: imm
       // Note: RegRegImm form passes reg2 as index but encodes it using base.
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG,
-                         false, entry->skeleton.r8_form, entry->skeleton.r8_form, 0);
+      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, 0);
     case kRegRegImmStore:  // lir operands - 0: reg2, 1: reg1, 2: imm
       // Note: RegRegImmStore form passes reg1 as index but encodes it using base.
-      return ComputeSize(entry, lir->operands[1], lir->operands[0], NO_REG,
-                         false, entry->skeleton.r8_form, entry->skeleton.r8_form, 0);
+      return ComputeSize(entry, lir->operands[1], lir->operands[0], NO_REG, 0);
     case kRegMemImm:  // lir operands - 0: reg, 1: base, 2: disp, 3: imm
-      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1],
-                         false, entry->skeleton.r8_form, false, lir->operands[2]);
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], lir->operands[2]);
     case kRegArrayImm:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp, 5: imm
       return ComputeSize(entry, lir->operands[0], lir->operands[2], lir->operands[1],
-                         true, entry->skeleton.r8_form, false, lir->operands[4]);
+                         lir->operands[4]);
     case kMovRegImm:  // lir operands - 0: reg, 1: immediate
       return ((entry->skeleton.prefix1 != 0 || NeedsRex(lir->operands[0])) ? 1 : 0) + 1 +
           entry->skeleton.immediate_bytes;
     case kShiftRegImm:  // lir operands - 0: reg, 1: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG,
-                         false, entry->skeleton.r8_form, false, 0) -
+      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, 0) -
           (lir->operands[1] == 1 ? 1 : 0);
     case kShiftMemImm:  // lir operands - 0: base, 1: disp, 2: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0],
-                         false, entry->skeleton.r8_form, false, lir->operands[1]) -
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]) -
           (lir->operands[2] == 1 ? 1 : 0);
     case kShiftArrayImm:  // lir operands - 0: base, 1: index, 2: scale, 3: disp 4: immediate
       // Shift by immediate one has a shorter opcode.
-      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0],
-                         true, entry->skeleton.r8_form, false, lir->operands[3]) -
+      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], lir->operands[3]) -
           (lir->operands[4] == 1 ? 1 : 0);
     case kShiftRegCl:  // lir operands - 0: reg, 1: cl
       DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(lir->operands[1]));
       // Note: ShiftRegCl form passes reg as reg but encodes it using base.
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG,
-                         false, entry->skeleton.r8_form, false, 0);
+      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG, 0);
     case kShiftMemCl:  // lir operands - 0: base, 1: disp, 2: cl
-      DCHECK_EQ(false, entry->skeleton.r8_form);
       DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(lir->operands[2]));
-      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0],
-                         false, false, false, lir->operands[1]);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]);
     case kShiftArrayCl:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: cl
-      DCHECK_EQ(false, entry->skeleton.r8_form);
       DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(lir->operands[4]));
       return ComputeSize(entry, lir->operands[4], lir->operands[1], lir->operands[0],
-                         true, false, false, lir->operands[3]);
+                         lir->operands[3]);
     case kRegCond:  // lir operands - 0: reg, 1: cond
-      return ComputeSize(entry, lir->operands[0], NO_REG, NO_REG,
-                         false, entry->skeleton.r8_form, false, 0);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], 0);
     case kMemCond:  // lir operands - 0: base, 1: disp, 2: cond
-      DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], false, false, false,
-                         lir->operands[1]);
+      return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]);
     case kArrayCond:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: cond
       DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], true, false, false,
-                         lir->operands[3]);
+      return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], lir->operands[3]);
     case kRegRegCond:  // lir operands - 0: reg1, 1: reg2, 2: cond
-      // Note: RegRegCond form passes reg2 as index but encodes it using base.
       DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, lir->operands[0], lir->operands[1], NO_REG, false, false, false, 0);
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], 0);
     case kRegMemCond:  // lir operands - 0: reg, 1: base, 2: disp, 3:cond
       DCHECK_EQ(false, entry->skeleton.r8_form);
-      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], false, false, false,
-                         lir->operands[2]);
+      return ComputeSize(entry, lir->operands[0], NO_REG, lir->operands[1], lir->operands[2]);
     case kJcc:
       if (lir->opcode == kX86Jcc8) {
         return 2;  // opcode + rel8
@@ -717,7 +754,7 @@
         return 5;  // opcode + rel32
       } else if (lir->opcode == kX86JmpT) {
         // Thread displacement size is always 32bit.
-        return ComputeSize(entry, NO_REG, NO_REG, NO_REG, false, false, false, 0x12345678);
+        return ComputeSize(entry, NO_REG, NO_REG, NO_REG, 0x12345678);
       } else {
         DCHECK(lir->opcode == kX86JmpR);
         if (NeedsRex(lir->operands[0])) {
@@ -731,14 +768,12 @@
         case kX86CallI: return 5;  // opcode 0:disp
         case kX86CallR: return 2;  // opcode modrm
         case kX86CallM:  // lir operands - 0: base, 1: disp
-          return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], false, false, false,
-                             lir->operands[1]);
+          return ComputeSize(entry, NO_REG, NO_REG, lir->operands[0], lir->operands[1]);
         case kX86CallA:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
-          return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], true, false, false,
-                             lir->operands[3]);
+          return ComputeSize(entry, NO_REG, lir->operands[1], lir->operands[0], lir->operands[3]);
         case kX86CallT:  // lir operands - 0: disp
           // Thread displacement size is always 32bit.
-          return ComputeSize(entry, NO_REG, NO_REG, NO_REG, false, false, false, 0x12345678);
+          return ComputeSize(entry, NO_REG, NO_REG, NO_REG, 0x12345678);
         default:
           break;
       }
@@ -748,7 +783,7 @@
         // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: table
         // Force the displacement size to 32bit, it will hold a computed offset later.
         return ComputeSize(entry, lir->operands[0], lir->operands[2], lir->operands[1],
-                           true, false, false, 0x12345678);
+                           0x12345678);
       } else {
         DCHECK_EQ(entry->opcode, kX86PcRelAdr);
         return 5;  // opcode with reg + 4 byte immediate
@@ -757,7 +792,7 @@
       DCHECK_EQ(lir->opcode, static_cast<int>(kX86StartOfMethod));
       return 5 /* call opcode + 4 byte displacement */ + 1 /* pop reg */ +
           ComputeSize(&X86Mir2Lir::EncodingMap[Gen64Bit() ? kX86Sub64RI : kX86Sub32RI],
-                      lir->operands[0], NO_REG, NO_REG, false, false, false, 0) -
+                      lir->operands[0], NO_REG, NO_REG, 0) -
               // Shorter ax encoding.
               (RegStorage::RegNum(lir->operands[0]) == rs_rAX.GetRegNum()  ? 1 : 0);
     case kUnimplemented:
@@ -801,8 +836,7 @@
 }
 
 void X86Mir2Lir::EmitPrefix(const X86EncodingMap* entry,
-                            int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b,
-                            bool r8_form) {
+                            int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b) {
   // REX.WRXB
   // W - 64-bit operand
   // R - MODRM.reg
@@ -812,9 +846,17 @@
   bool r = NeedsRex(raw_reg_r);
   bool x = NeedsRex(raw_reg_x);
   bool b = NeedsRex(raw_reg_b);
+  bool r8_form = entry->skeleton.r8_form;
+  bool modrm_is_reg_reg = ModrmIsRegReg(entry);
+
   uint8_t rex = 0;
-  if (r8_form && RegStorage::RegNum(raw_reg_r) > 4) {
-    rex |= 0x40;  // REX.0000
+  if (r8_form) {
+    // Do we need an empty REX prefix to normalize byte register addressing?
+    if (RegStorage::RegNum(raw_reg_r) >= 4) {
+      rex |= 0x40;  // REX.0000
+    } else if (modrm_is_reg_reg && RegStorage::RegNum(raw_reg_b) >= 4) {
+      rex |= 0x40;  // REX.0000
+    }
   }
   if (w) {
     rex |= 0x48;  // REX.W000
@@ -875,9 +917,8 @@
 }
 
 void X86Mir2Lir::EmitPrefixAndOpcode(const X86EncodingMap* entry,
-                                     int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b,
-                                     bool r8_form) {
-  EmitPrefix(entry, raw_reg_r, raw_reg_x, raw_reg_b, r8_form);
+                                     int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b) {
+  EmitPrefix(entry, raw_reg_r, raw_reg_x, raw_reg_b);
   EmitOpcode(entry);
 }
 
@@ -971,7 +1012,7 @@
 
 void X86Mir2Lir::EmitNullary(const X86EncodingMap* entry) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
   DCHECK_EQ(0, entry->skeleton.modrm_opcode);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -979,7 +1020,7 @@
 
 void X86Mir2Lir::EmitOpRegOpcode(const X86EncodingMap* entry, int32_t raw_reg) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_reg, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_reg);
   // There's no 3-byte instruction with +rd
   DCHECK(entry->skeleton.opcode != 0x0F ||
          (entry->skeleton.extra_opcode1 != 0x38 && entry->skeleton.extra_opcode1 != 0x3A));
@@ -992,7 +1033,7 @@
 
 void X86Mir2Lir::EmitOpReg(const X86EncodingMap* entry, int32_t raw_reg) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_reg, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_reg);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | low_reg;
   code_buffer_.push_back(modrm);
@@ -1002,7 +1043,7 @@
 
 void X86Mir2Lir::EmitOpMem(const X86EncodingMap* entry, int32_t raw_base, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefix(entry, NO_REG, NO_REG, raw_base, false);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1016,7 +1057,7 @@
 void X86Mir2Lir::EmitOpArray(const X86EncodingMap* entry, int32_t raw_base, int32_t raw_index,
                              int scale, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, raw_index, raw_base, false);
+  EmitPrefixAndOpcode(entry, NO_REG, raw_index, raw_base);
   uint8_t low_index = LowRegisterBits(raw_index);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, low_base, low_index, scale, disp);
@@ -1027,7 +1068,7 @@
 void X86Mir2Lir::EmitMemReg(const X86EncodingMap* entry, int32_t raw_base, int32_t disp,
                             int32_t raw_reg) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, raw_base, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, raw_base);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmDisp(low_reg, low_base, disp);
@@ -1045,7 +1086,7 @@
 void X86Mir2Lir::EmitRegArray(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_base,
                               int32_t raw_index, int scale, int32_t disp) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefixAndOpcode(entry, raw_reg, raw_index, raw_base, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, raw_reg, raw_index, raw_base);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   uint8_t low_index = LowRegisterBits(raw_index);
   uint8_t low_base = LowRegisterBits(raw_base);
@@ -1064,7 +1105,7 @@
 void X86Mir2Lir::EmitMemImm(const X86EncodingMap* entry, int32_t raw_base, int32_t disp,
                             int32_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_base, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_base);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, low_base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
@@ -1075,7 +1116,7 @@
                               int32_t raw_base, int32_t raw_index, int scale, int32_t disp,
                               int32_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, raw_index, raw_base, false);
+  EmitPrefixAndOpcode(entry, NO_REG, raw_index, raw_base);
   uint8_t low_index = LowRegisterBits(raw_index);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmSibDisp(entry->skeleton.modrm_opcode, low_base, low_index, scale, disp);
@@ -1086,7 +1127,7 @@
 void X86Mir2Lir::EmitRegThread(const X86EncodingMap* entry, int32_t raw_reg, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
   DCHECK_NE(entry->skeleton.prefix1, 0);
-  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, NO_REG, false);
+  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, NO_REG);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   EmitModrmThread(low_reg);
   code_buffer_.push_back(disp & 0xFF);
@@ -1101,7 +1142,7 @@
 void X86Mir2Lir::EmitRegReg(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2) {
   CheckValidByteRegister(entry, raw_reg1);
   CheckValidByteRegister(entry, raw_reg2);
-  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
   uint8_t low_reg1 = LowRegisterBits(raw_reg1);
   uint8_t low_reg2 = LowRegisterBits(raw_reg2);
   uint8_t modrm = (3 << 6) | (low_reg1 << 3) | low_reg2;
@@ -1114,7 +1155,7 @@
 void X86Mir2Lir::EmitRegRegImm(const X86EncodingMap* entry, int32_t raw_reg1, int32_t raw_reg2,
                                int32_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2, false);
+  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
   uint8_t low_reg1 = LowRegisterBits(raw_reg1);
   uint8_t low_reg2 = LowRegisterBits(raw_reg2);
   uint8_t modrm = (3 << 6) | (low_reg1 << 3) | low_reg2;
@@ -1128,7 +1169,7 @@
                                int32_t raw_reg, int32_t raw_base, int disp, int32_t imm) {
   DCHECK(!RegStorage::IsFloat(raw_reg));
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, raw_base, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, raw_reg, NO_REG, raw_base);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmDisp(low_reg, low_base, disp);
@@ -1145,7 +1186,7 @@
 
 void X86Mir2Lir::EmitRegImm(const X86EncodingMap* entry, int32_t raw_reg, int32_t imm) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefix(entry, NO_REG, NO_REG, raw_reg, entry->skeleton.r8_form);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_reg);
   if (RegStorage::RegNum(raw_reg) == rs_rAX.GetRegNum() && entry->skeleton.ax_opcode != 0) {
     code_buffer_.push_back(entry->skeleton.ax_opcode);
   } else {
@@ -1158,7 +1199,8 @@
 }
 
 void X86Mir2Lir::EmitThreadImm(const X86EncodingMap* entry, int32_t disp, int32_t imm) {
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG, false);
+  DCHECK_EQ(false, entry->skeleton.r8_form);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
   EmitModrmThread(entry->skeleton.modrm_opcode);
   code_buffer_.push_back(disp & 0xFF);
   code_buffer_.push_back((disp >> 8) & 0xFF);
@@ -1170,7 +1212,7 @@
 
 void X86Mir2Lir::EmitMovRegImm(const X86EncodingMap* entry, int32_t raw_reg, int64_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefix(entry, NO_REG, NO_REG, raw_reg, false);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_reg);
   uint8_t low_reg = LowRegisterBits(raw_reg);
   code_buffer_.push_back(0xB8 + low_reg);
   switch (entry->skeleton.immediate_bytes) {
@@ -1198,7 +1240,7 @@
 
 void X86Mir2Lir::EmitShiftRegImm(const X86EncodingMap* entry, int32_t raw_reg, int32_t imm) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefix(entry, NO_REG, NO_REG, raw_reg, entry->skeleton.r8_form);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_reg);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -1221,7 +1263,7 @@
 void X86Mir2Lir::EmitShiftRegCl(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_cl) {
   CheckValidByteRegister(entry, raw_reg);
   DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(raw_cl));
-  EmitPrefix(entry, NO_REG, NO_REG, raw_reg, entry->skeleton.r8_form);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_reg);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1237,7 +1279,7 @@
                                 int32_t displacement, int32_t raw_cl) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
   DCHECK_EQ(rs_rCX.GetRegNum(), RegStorage::RegNum(raw_cl));
-  EmitPrefix(entry, NO_REG, NO_REG, raw_base, false);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_base);
   code_buffer_.push_back(entry->skeleton.opcode);
   DCHECK_NE(0x0F, entry->skeleton.opcode);
   DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1251,7 +1293,7 @@
 void X86Mir2Lir::EmitShiftMemImm(const X86EncodingMap* entry, int32_t raw_base, int32_t disp,
                                  int32_t imm) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefix(entry, NO_REG, NO_REG, raw_base, false);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_base);
   if (imm != 1) {
     code_buffer_.push_back(entry->skeleton.opcode);
   } else {
@@ -1272,7 +1314,7 @@
 
 void X86Mir2Lir::EmitRegCond(const X86EncodingMap* entry, int32_t raw_reg, int32_t cc) {
   CheckValidByteRegister(entry, raw_reg);
-  EmitPrefix(entry, raw_reg, NO_REG, NO_REG, entry->skeleton.r8_form);
+  EmitPrefix(entry, NO_REG, NO_REG, raw_reg);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0x0F, entry->skeleton.opcode);
   code_buffer_.push_back(0x0F);
@@ -1315,7 +1357,7 @@
                                 int32_t cc) {
   // Generate prefix and opcode without the condition.
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2, false);
+  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_reg2);
 
   // Now add the condition. The last byte of opcode is the one that receives it.
   DCHECK_GE(cc, 0);
@@ -1341,7 +1383,7 @@
                                 int32_t disp, int32_t cc) {
   // Generate prefix and opcode without the condition.
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_base, false);
+  EmitPrefixAndOpcode(entry, raw_reg1, NO_REG, raw_base);
 
   // Now add the condition. The last byte of opcode is the one that receives it.
   DCHECK_GE(cc, 0);
@@ -1376,7 +1418,7 @@
   } else {
     DCHECK(entry->opcode == kX86JmpR);
     DCHECK_EQ(false, entry->skeleton.r8_form);
-    EmitPrefix(entry, NO_REG, NO_REG, rel, false);
+    EmitPrefix(entry, NO_REG, NO_REG, rel);
     code_buffer_.push_back(entry->skeleton.opcode);
     uint8_t low_reg = LowRegisterBits(rel);
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | low_reg;
@@ -1404,7 +1446,7 @@
 
 void X86Mir2Lir::EmitCallMem(const X86EncodingMap* entry, int32_t raw_base, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_base, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, raw_base);
   uint8_t low_base = LowRegisterBits(raw_base);
   EmitModrmDisp(entry->skeleton.modrm_opcode, low_base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
@@ -1413,7 +1455,7 @@
 
 void X86Mir2Lir::EmitCallImmediate(const X86EncodingMap* entry, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
   DCHECK_EQ(4, entry->skeleton.immediate_bytes);
   code_buffer_.push_back(disp & 0xFF);
   code_buffer_.push_back((disp >> 8) & 0xFF);
@@ -1425,7 +1467,7 @@
 void X86Mir2Lir::EmitCallThread(const X86EncodingMap* entry, int32_t disp) {
   DCHECK_EQ(false, entry->skeleton.r8_form);
   DCHECK_NE(entry->skeleton.prefix1, 0);
-  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG, false);
+  EmitPrefixAndOpcode(entry, NO_REG, NO_REG, NO_REG);
   EmitModrmThread(entry->skeleton.modrm_opcode);
   code_buffer_.push_back(disp & 0xFF);
   code_buffer_.push_back((disp >> 8) & 0xFF);
@@ -1450,7 +1492,7 @@
   }
   if (entry->opcode == kX86PcRelLoadRA) {
     DCHECK_EQ(false, entry->skeleton.r8_form);
-    EmitPrefix(entry, raw_reg, raw_index, raw_base_or_table, false);
+    EmitPrefix(entry, raw_reg, raw_index, raw_base_or_table);
     code_buffer_.push_back(entry->skeleton.opcode);
     DCHECK_NE(0x0F, entry->skeleton.opcode);
     DCHECK_EQ(0, entry->skeleton.extra_opcode1);
@@ -1479,7 +1521,7 @@
 void X86Mir2Lir::EmitMacro(const X86EncodingMap* entry, int32_t raw_reg, int32_t offset) {
   DCHECK_EQ(entry->opcode, kX86StartOfMethod) << entry->name;
   DCHECK_EQ(false, entry->skeleton.r8_form);
-  EmitPrefix(entry, raw_reg, NO_REG, NO_REG, false);
+  EmitPrefix(entry, raw_reg, NO_REG, NO_REG);
   code_buffer_.push_back(0xE8);  // call +0
   code_buffer_.push_back(0);
   code_buffer_.push_back(0);
@@ -1496,7 +1538,7 @@
 void X86Mir2Lir::EmitUnimplemented(const X86EncodingMap* entry, LIR* lir) {
   UNIMPLEMENTED(WARNING) << "encoding kind for " << entry->name << " "
                          << BuildInsnString(entry->fmt, lir, 0);
-  for (int i = 0; i < GetInsnSize(lir); ++i) {
+  for (size_t i = 0; i < GetInsnSize(lir); ++i) {
     code_buffer_.push_back(0xCC);  // push breakpoint instruction - int 3
   }
 }
@@ -1793,8 +1835,8 @@
         EmitUnimplemented(entry, lir);
         break;
     }
-    CHECK_EQ(static_cast<size_t>(GetInsnSize(lir)),
-             code_buffer_.size() - starting_cbuf_size)
+    DCHECK_EQ(lir->flags.size, GetInsnSize(lir));
+    CHECK_EQ(lir->flags.size, code_buffer_.size() - starting_cbuf_size)
         << "Instruction size mismatch for entry: " << X86Mir2Lir::EncodingMap[lir->opcode].name;
   }
   return res;
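
Note (not part of the patch): ComputeSize() now derives has_modrm, has_sib and modrm_is_reg_reg from the encoding kind instead of taking them as parameters, and its REX accounting mirrors EmitPrefix(). A self-contained sketch of the byte accounting, under simplified assumptions (REX counted separately from the mandatory prefixes, disp8 vs disp32 chosen by signed range); the function name ApproxInsnSize and its parameters are hypothetical:

    #include <cstddef>
    #include <cstdint>

    // Rough x86-64 instruction length: prefixes + optional REX (0x40-0x4F, with
    // W/R/X/B in bits 3..0) + opcode bytes + optional modrm + optional SIB +
    // displacement + immediate. As in the code above, rSP as a base forces a SIB
    // byte and rBP as a base forces an explicit displacement, even when it is zero.
    size_t ApproxInsnSize(size_t prefix_bytes, bool needs_rex, size_t opcode_bytes,
                          bool has_modrm, bool has_sib, bool base_is_bp,
                          int32_t disp, size_t imm_bytes) {
      size_t size = prefix_bytes + (needs_rex ? 1 : 0) + opcode_bytes;
      if (has_modrm) ++size;
      if (has_sib) ++size;
      if (disp != 0 || base_is_bp) {
        size += (disp >= -128 && disp <= 127) ? 1 : 4;  // disp8 vs disp32
      }
      return size + imm_bytes;
    }
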
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 6ae553d..3540843 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -121,7 +121,7 @@
   std::string BuildInsnString(const char* fmt, LIR* lir, unsigned char* base_addr);
   ResourceMask GetPCUseDefEncoding() const OVERRIDE;
   uint64_t GetTargetInstFlags(int opcode);
-  int GetInsnSize(LIR* lir);
+  size_t GetInsnSize(LIR* lir) OVERRIDE;
   bool IsUnconditionalBranch(LIR* lir);
 
   // Check support for volatile load/store of a given size.
@@ -392,15 +392,13 @@
 
  protected:
   size_t ComputeSize(const X86EncodingMap* entry, int32_t raw_reg, int32_t raw_index,
-                     int32_t raw_base, bool has_sib, bool r8_form, bool r8_reg_reg_form,
-                     int32_t displacement);
+                     int32_t raw_base, int32_t displacement);
   void CheckValidByteRegister(const X86EncodingMap* entry, int32_t raw_reg);
   void EmitPrefix(const X86EncodingMap* entry,
-                  int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b,
-                  bool r8_form);
+                  int32_t raw_reg_r, int32_t raw_reg_x, int32_t raw_reg_b);
   void EmitOpcode(const X86EncodingMap* entry);
   void EmitPrefixAndOpcode(const X86EncodingMap* entry,
-                           int32_t reg_r, int32_t reg_x, int32_t reg_b, bool r8_form);
+                           int32_t reg_r, int32_t reg_x, int32_t reg_b);
   void EmitDisp(uint8_t base, int32_t disp);
   void EmitModrmThread(uint8_t reg_or_opcode);
   void EmitModrmDisp(uint8_t reg_or_opcode, uint8_t base, int32_t disp);
@@ -464,6 +462,12 @@
   virtual RegStorage AllocateByteRegister();
 
   /*
+   * @brief Check if a register is byte addressable.
+   * @returns true if a register is byte addressable.
+   */
+  bool IsByteRegister(RegStorage reg);
+
+  /*
    * @brief generate inline code for fast case of Strng.indexOf.
    * @param info Call parameters
    * @param zero_based 'true' if the index into the string is 0.
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index ced6400..f6f0617 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -381,7 +381,7 @@
     branch = NewLIR2(kX86Jcc8, 0, kX86CondPE);
   }
   // If the result reg can't be byte accessed, use a jump and move instead of a set.
-  if (rl_result.reg.GetReg() >= rs_rX86_SP.GetReg()) {
+  if (!IsByteRegister(rl_result.reg)) {
     LIR* branch2 = NULL;
     if (unordered_gt) {
       branch2 = NewLIR2(kX86Jcc8, 0, kX86CondA);
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 4a77df2..05b5e43 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -882,10 +882,9 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage result_reg = rl_result.reg;
 
-  // SETcc only works with EAX..EDX.
-  if (result_reg.GetRegNum() >= rs_rX86_SP.GetRegNum()) {
+  // For 32-bit, SETcc only works with EAX..EDX.
+  if (!IsByteRegister(result_reg)) {
     result_reg = AllocateByteRegister();
-    DCHECK_LT(result_reg.GetRegNum(), rs_rX86_SP.GetRegNum());
   }
   NewLIR2(kX86Set8R, result_reg.GetReg(), kX86CondZ);
   NewLIR2(kX86Movzx8RR, rl_result.reg.GetReg(), result_reg.GetReg());
@@ -1386,9 +1385,9 @@
   if (!Gen64Bit()) {
     x86op = GetOpcode(op, rl_dest, rl_src, true);
     lir = NewLIR3(x86op, rl_dest.reg.GetHighReg(), r_base, displacement + HIWORD_OFFSET);
+    AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
+                            true /* is_load */, true /* is64bit */);
   }
-  AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
-                          true /* is_load */, true /* is64bit */);
 }
 
 void X86Mir2Lir::GenLongArith(RegLocation rl_dest, RegLocation rl_src, Instruction::Code op) {
@@ -1423,11 +1422,11 @@
   if (!Gen64Bit()) {
     x86op = GetOpcode(op, rl_dest, rl_src, true);
     lir = NewLIR3(x86op, r_base, displacement + HIWORD_OFFSET, rl_src.reg.GetHighReg());
+    AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
+                            true /* is_load */, true /* is64bit */);
+    AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
+                            false /* is_load */, true /* is64bit */);
   }
-  AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
-                          true /* is_load */, true /* is64bit */);
-  AnnotateDalvikRegAccess(lir, (displacement + HIWORD_OFFSET) >> 2,
-                          false /* is_load */, true /* is64bit */);
   FreeTemp(rl_src.reg);
 }
 
@@ -1760,8 +1759,7 @@
     rl_src = LoadValue(rl_src, reg_class);
   }
   // If the src reg can't be byte accessed, move it to a temp first.
-  if ((size == kSignedByte || size == kUnsignedByte) &&
-      rl_src.reg.GetRegNum() >= rs_rX86_SP.GetRegNum()) {
+  if ((size == kSignedByte || size == kUnsignedByte) && !IsByteRegister(rl_src.reg)) {
     RegStorage temp = AllocTemp();
     OpRegCopy(temp, rl_src.reg);
     StoreBaseIndexedDisp(rl_array.reg, rl_index.reg, scale, data_offset, temp, size);
@@ -2240,10 +2238,9 @@
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   RegStorage result_reg = rl_result.reg;
 
-  // SETcc only works with EAX..EDX.
-  if (result_reg == object.reg || result_reg.GetRegNum() >= rs_rX86_SP.GetRegNum()) {
+  // For 32-bit, SETcc only works with EAX..EDX.
+  if (result_reg == object.reg || !IsByteRegister(result_reg)) {
     result_reg = AllocateByteRegister();
-    DCHECK_LT(result_reg.GetRegNum(), rs_rX86_SP.GetRegNum());
   }
 
   // Assume that there is no match.
@@ -2355,7 +2352,7 @@
   /* kArg0 is ref, kArg2 is class. If ref==null, use directly as bool result. */
   RegLocation rl_result = GetReturn(kRefReg);
 
-  // SETcc only works with EAX..EDX.
+  // For 32-bit, SETcc only works with EAX..EDX.
   DCHECK_LT(rl_result.reg.GetRegNum(), 4);
 
   // Is the class NULL?
@@ -2655,6 +2652,7 @@
     Mir2Lir::GenIntToLong(rl_dest, rl_src);
     return;
   }
+  rl_src = UpdateLoc(rl_src);
   RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
   if (rl_src.location == kLocPhysReg) {
     NewLIR2(kX86MovsxdRR, rl_result.reg.GetReg(), rl_src.reg.GetReg());
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index d1ba239..483d8cf 100644
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -424,7 +424,15 @@
 }
 
 RegStorage X86Mir2Lir::AllocateByteRegister() {
-  return AllocTypedTemp(false, kCoreReg);
+  RegStorage reg = AllocTypedTemp(false, kCoreReg);
+  if (!Gen64Bit()) {
+    DCHECK_LT(reg.GetRegNum(), rs_rX86_SP.GetRegNum());
+  }
+  return reg;
+}
+
+bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
+  return Gen64Bit() || reg.GetRegNum() < rs_rX86_SP.GetRegNum();
 }
 
 /* Clobber all regs that might be used by an external C call */
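
Note (not part of the patch): IsByteRegister() above encodes a standard x86 rule. Without a REX prefix only the four low registers have byte forms; once any REX prefix is present (even the "empty" 0x40 the encoder adds), every GPR becomes byte-addressable, which is why the 64-bit target accepts any register. A sketch of the same predicate on raw encodings, with the hypothetical name ByteAddressable and the constant 4 standing in for rs_rX86_SP.GetRegNum():

    // Byte-addressable check on a raw register encoding (0 = AX/EAX/RAX, ...).
    // 32-bit: only AL, CL, DL, BL (encodings 0-3). 64-bit: everything, via REX.
    bool ByteAddressable(int reg_encoding, bool target_is_64bit) {
      return target_is_64bit || reg_encoding < 4;
    }
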
diff --git a/compiler/dex/quick/x86/utility_x86.cc b/compiler/dex/quick/x86/utility_x86.cc
index c72e8cd..b93e3e8 100644
--- a/compiler/dex/quick/x86/utility_x86.cc
+++ b/compiler/dex/quick/x86/utility_x86.cc
@@ -138,6 +138,7 @@
       case kOpLsl: opcode = kX86Sal64RI; break;
       case kOpLsr: opcode = kX86Shr64RI; break;
       case kOpAsr: opcode = kX86Sar64RI; break;
+      case kOpCmp: opcode = byte_imm ? kX86Cmp64RI8 : kX86Cmp64RI; break;
       default:
         LOG(FATAL) << "Bad case in OpRegImm (64-bit) " << op;
     }
@@ -505,7 +506,7 @@
       return NewLIR5(kX86Lea32RA, r_dest.GetReg(),  r5sib_no_base /* base */,
                      r_src.GetReg() /* index */, value /* scale */, 0 /* disp */);
     } else if (op == kOpAdd) {  // lea add special case
-      return NewLIR5(Gen64Bit() ? kX86Lea64RA : kX86Lea32RA, r_dest.GetReg(),
+      return NewLIR5(r_dest.Is64Bit() ? kX86Lea64RA : kX86Lea32RA, r_dest.GetReg(),
                      r_src.GetReg() /* base */, rs_rX86_SP.GetReg()/*r4sib_no_index*/ /* index */,
                      0 /* scale */, value /* disp */);
     }
diff --git a/compiler/dex/ssa_transformation.cc b/compiler/dex/ssa_transformation.cc
index 4324325..e26745a 100644
--- a/compiler/dex/ssa_transformation.cc
+++ b/compiler/dex/ssa_transformation.cc
@@ -117,6 +117,16 @@
   RecordDFSOrders(GetEntryBlock());
 
   num_reachable_blocks_ = dfs_order_->Size();
+
+  if (num_reachable_blocks_ != num_blocks_) {
+    // Hide all unreachable blocks.
+    AllNodesIterator iter(this);
+    for (BasicBlock* bb = iter.Next(); bb != NULL; bb = iter.Next()) {
+      if (!bb->visited) {
+        bb->Hide(cu_);
+      }
+    }
+  }
 }
 
 /*
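
Note (not part of the patch): the new loop above relies on RecordDFSOrders() having set bb->visited on every block reachable from the entry block, so anything still unvisited is dead and can be hidden. A self-contained sketch of that reachability rule; Node and HideUnreachable are hypothetical names:

    #include <vector>

    struct Node { std::vector<int> succ; bool visited = false; bool hidden = false; };

    // Mark everything reachable from entry, then hide the rest (mirrors bb->Hide(cu_)).
    void HideUnreachable(std::vector<Node>& graph, int entry) {
      std::vector<int> worklist{entry};
      while (!worklist.empty()) {
        int n = worklist.back();
        worklist.pop_back();
        if (graph[n].visited) continue;
        graph[n].visited = true;
        for (int s : graph[n].succ) worklist.push_back(s);
      }
      for (Node& node : graph) node.hidden = !node.visited;
    }
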
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 16c1e00..3e326f0 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1918,7 +1918,7 @@
     }
   }
   uint64_t duration_ns = NanoTime() - start_ns;
-  if (duration_ns > MsToNs(compiler_->GetMaximumCompilationTimeBeforeWarning())) {
+  if (duration_ns > MsToNs(compiler_->GetMaximumCompilationTimeBeforeWarning()) && !kIsDebugBuild) {
     LOG(WARNING) << "Compilation of " << PrettyMethod(method_idx, dex_file)
                  << " took " << PrettyDuration(duration_ns);
   }
diff --git a/compiler/driver/compiler_driver.h b/compiler/driver/compiler_driver.h
index fad6798..9903421 100644
--- a/compiler/driver/compiler_driver.h
+++ b/compiler/driver/compiler_driver.h
@@ -28,6 +28,7 @@
 #include "compiled_method.h"
 #include "compiler.h"
 #include "dex_file.h"
+#include "driver/compiler_options.h"
 #include "instruction_set.h"
 #include "invoke_type.h"
 #include "method_reference.h"
@@ -105,8 +106,7 @@
                           InstructionSetFeatures instruction_set_features,
                           bool image, DescriptorSet* image_classes,
                           size_t thread_count, bool dump_stats, bool dump_passes,
-                          CumulativeLogger* timer,
-                          std::string profile_file = "");
+                          CumulativeLogger* timer, std::string profile_file = "");
 
   ~CompilerDriver();
 
@@ -394,6 +394,10 @@
     return dump_passes_;
   }
 
+  bool DidIncludeDebugSymbols() const {
+    return compiler_options_->GetIncludeDebugSymbols();
+  }
+
   CumulativeLogger* GetTimingsLogger() const {
     return timings_logger_;
   }
diff --git a/compiler/driver/compiler_options.h b/compiler/driver/compiler_options.h
index 05a9ac7..5d1c5da 100644
--- a/compiler/driver/compiler_options.h
+++ b/compiler/driver/compiler_options.h
@@ -42,6 +42,7 @@
   static const size_t kDefaultTinyMethodThreshold = 20;
   static const size_t kDefaultNumDexMethodsThreshold = 900;
   static constexpr double kDefaultTopKProfileThreshold = 90.0;
+  static const bool kDefaultIncludeDebugSymbols = kIsDebugBuild;
 
   CompilerOptions() :
     compiler_filter_(kDefaultCompilerFilter),
@@ -51,7 +52,8 @@
     tiny_method_threshold_(kDefaultTinyMethodThreshold),
     num_dex_methods_threshold_(kDefaultNumDexMethodsThreshold),
     generate_gdb_information_(false),
-    top_k_profile_threshold_(kDefaultTopKProfileThreshold)
+    top_k_profile_threshold_(kDefaultTopKProfileThreshold),
+    include_debug_symbols_(kDefaultIncludeDebugSymbols)
 #ifdef ART_SEA_IR_MODE
     , sea_ir_mode_(false)
 #endif
@@ -64,7 +66,8 @@
                   size_t tiny_method_threshold,
                   size_t num_dex_methods_threshold,
                   bool generate_gdb_information,
-                  double top_k_profile_threshold
+                  double top_k_profile_threshold,
+                  bool include_debug_symbols
 #ifdef ART_SEA_IR_MODE
                   , bool sea_ir_mode
 #endif
@@ -76,7 +79,8 @@
     tiny_method_threshold_(tiny_method_threshold),
     num_dex_methods_threshold_(num_dex_methods_threshold),
     generate_gdb_information_(generate_gdb_information),
-    top_k_profile_threshold_(top_k_profile_threshold)
+    top_k_profile_threshold_(top_k_profile_threshold),
+    include_debug_symbols_(include_debug_symbols)
 #ifdef ART_SEA_IR_MODE
     , sea_ir_mode_(sea_ir_mode)
 #endif
@@ -139,6 +143,10 @@
     return top_k_profile_threshold_;
   }
 
+  bool GetIncludeDebugSymbols() const {
+    return include_debug_symbols_;
+  }
+
 #ifdef ART_SEA_IR_MODE
   bool GetSeaIrMode();
 #endif
@@ -157,6 +165,7 @@
   bool generate_gdb_information_;
   // When using a profile file only the top K% of the profiled samples will be compiled.
   double top_k_profile_threshold_;
+  bool include_debug_symbols_;
 #ifdef ART_SEA_IR_MODE
   bool sea_ir_mode_;
 #endif
diff --git a/compiler/elf_writer_quick.cc b/compiler/elf_writer_quick.cc
index cb66e48..78757ec 100644
--- a/compiler/elf_writer_quick.cc
+++ b/compiler/elf_writer_quick.cc
@@ -807,12 +807,17 @@
                            const std::string& android_root_unused,
                            bool is_host_unused) {
   const bool debug = false;
+  const bool add_symbols = oat_writer->DidAddSymbols();
   const OatHeader& oat_header = oat_writer->GetOatHeader();
   Elf32_Word oat_data_size = oat_header.GetExecutableOffset();
   uint32_t oat_exec_size = oat_writer->GetSize() - oat_data_size;
 
   ElfBuilder builder(oat_writer, elf_file_, compiler_driver_->GetInstructionSet(), 0,
-                     oat_data_size, oat_data_size, oat_exec_size, false, debug);
+                     oat_data_size, oat_data_size, oat_exec_size, add_symbols, debug);
+
+  if (add_symbols) {
+    AddDebugSymbols(builder, oat_writer, debug);
+  }
 
   bool generateDebugInformation = compiler_driver_->GetCallFrameInformation() != nullptr;
   if (generateDebugInformation) {
@@ -833,6 +838,15 @@
   return builder.Write();
 }
 
+void ElfWriterQuick::AddDebugSymbols(ElfBuilder& builder, OatWriter* oat_writer, bool debug) {
+  const std::vector<OatWriter::DebugInfo>& method_info = oat_writer->GetCFIMethodInfo();
+  ElfSymtabBuilder* symtab = &builder.symtab_builder_;
+  for (auto it = method_info.begin(); it != method_info.end(); ++it) {
+    symtab->AddSymbol(it->method_name_, &builder.text_builder_, it->low_pc_, true,
+                      it->high_pc_ - it->low_pc_, STB_GLOBAL, STT_FUNC);
+  }
+}
+
 static void UpdateWord(std::vector<uint8_t>*buf, int offset, int data) {
   (*buf)[offset+0] = data;
   (*buf)[offset+1] = data >> 8;
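
Aside, not part of the patch: the new symbol path is driven by the CompilerOptions flag added below (default kIsDebugBuild). CompilerDriver::DidIncludeDebugSymbols() exposes it, OatWriter::DidAddSymbols() forwards it, and when set the ElfBuilder is constructed with add_symbols=true and AddDebugSymbols() turns each recorded DebugInfo entry into one global function symbol. A rough sketch of what each emitted .symtab entry ends up holding — only the DebugInfo field names come from the patch, everything else is an assumption:

    #include <elf.h>
    #include <cstring>
    #include <string>

    // One method-level symbol, filled from an OatWriter::DebugInfo-style record.
    struct MethodDebugInfo {
      std::string method_name_;  // PrettyMethod(...) name
      uint32_t low_pc_;          // method start, relative to .text
      uint32_t high_pc_;         // method end, relative to .text
    };

    Elf32_Sym MakeMethodSymbol(const MethodDebugInfo& info, Elf32_Word name_offset) {
      Elf32_Sym sym;
      std::memset(&sym, 0, sizeof(sym));
      sym.st_name = name_offset;                    // index of method_name_ in .strtab
      sym.st_value = info.low_pc_;                  // symbol address within .text
      sym.st_size = info.high_pc_ - info.low_pc_;   // method size in bytes
      sym.st_info = ELF32_ST_INFO(STB_GLOBAL, STT_FUNC);
      return sym;
    }
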
diff --git a/compiler/elf_writer_quick.h b/compiler/elf_writer_quick.h
index f687d2e..dbdccfc 100644
--- a/compiler/elf_writer_quick.h
+++ b/compiler/elf_writer_quick.h
@@ -48,6 +48,10 @@
   ~ElfWriterQuick() {}
 
   class ElfBuilder;
+  void AddDebugSymbols(ElfBuilder& builder,
+                       OatWriter* oat_writer,
+                       bool debug);
+
   class ElfSectionBuilder {
    public:
     ElfSectionBuilder(const std::string& sec_name, Elf32_Word type, Elf32_Word flags,
@@ -235,7 +239,6 @@
     ~ElfBuilder() {}
 
     bool Write();
-    ElfSymtabBuilder* GetDefaultDynsymBuilder() { return &dynsym_builder_; }
 
     // Adds the given raw section to the builder. This will copy it. The caller
     // is responsible for deallocating their copy.
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 5d532ab..c6b9161 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -350,31 +350,14 @@
         uint32_t thumb_offset = compiled_method->CodeDelta();
         quick_code_offset = offset_ + sizeof(OatQuickMethodHeader) + thumb_offset;
 
-        std::vector<uint8_t>* cfi_info = writer_->compiler_driver_->GetCallFrameInformation();
-        if (cfi_info != nullptr) {
-          // Copy in the FDE, if present
-          const std::vector<uint8_t>* fde = compiled_method->GetCFIInfo();
-          if (fde != nullptr) {
-            // Copy the information into cfi_info and then fix the address in the new copy.
-            int cur_offset = cfi_info->size();
-            cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
-
-            // Set the 'initial_location' field to address the start of the method.
-            uint32_t new_value = quick_code_offset - writer_->oat_header_->GetExecutableOffset();
-            uint32_t offset_to_update = cur_offset + 2*sizeof(uint32_t);
-            (*cfi_info)[offset_to_update+0] = new_value;
-            (*cfi_info)[offset_to_update+1] = new_value >> 8;
-            (*cfi_info)[offset_to_update+2] = new_value >> 16;
-            (*cfi_info)[offset_to_update+3] = new_value >> 24;
-            std::string name = PrettyMethod(it.GetMemberIndex(), *dex_file_, false);
-            writer_->method_info_.push_back(DebugInfo(name, new_value, new_value + code_size));
-          }
-        }
+        bool force_debug_capture = false;
+        bool deduped = false;
 
         // Deduplicate code arrays.
         auto code_iter = dedupe_map_.find(compiled_method);
         if (code_iter != dedupe_map_.end()) {
           quick_code_offset = code_iter->second;
+          deduped = true;
         } else {
           dedupe_map_.Put(compiled_method, quick_code_offset);
         }
@@ -409,6 +392,41 @@
           writer_->oat_header_->UpdateChecksum(&(*quick_code)[0], code_size);
           offset_ += code_size;
         }
+
+        uint32_t quick_code_start = quick_code_offset - writer_->oat_header_->GetExecutableOffset();
+        std::vector<uint8_t>* cfi_info = writer_->compiler_driver_->GetCallFrameInformation();
+        if (cfi_info != nullptr) {
+          // Copy in the FDE, if present
+          const std::vector<uint8_t>* fde = compiled_method->GetCFIInfo();
+          if (fde != nullptr) {
+            // Copy the information into cfi_info and then fix the address in the new copy.
+            int cur_offset = cfi_info->size();
+            cfi_info->insert(cfi_info->end(), fde->begin(), fde->end());
+
+            // Set the 'initial_location' field to address the start of the method.
+            uint32_t offset_to_update = cur_offset + 2*sizeof(uint32_t);
+            (*cfi_info)[offset_to_update+0] = quick_code_start;
+            (*cfi_info)[offset_to_update+1] = quick_code_start >> 8;
+            (*cfi_info)[offset_to_update+2] = quick_code_start >> 16;
+            (*cfi_info)[offset_to_update+3] = quick_code_start >> 24;
+            force_debug_capture = true;
+          }
+        }
+
+
+        if (writer_->compiler_driver_->DidIncludeDebugSymbols() || force_debug_capture) {
+          // Record debug information for this function if we are doing that or
+          // we have CFI and so need it.
+          std::string name = PrettyMethod(it.GetMemberIndex(), *dex_file_, true);
+          if (deduped) {
+            // TODO We should place the DEDUPED tag on the first instance of a
+            // deduplicated symbol so that it will show up in a debuggerd crash
+            // report.
+            name += " [ DEDUPED ]";
+          }
+          writer_->method_info_.push_back(DebugInfo(name, quick_code_start,
+                                                    quick_code_start + code_size));
+        }
       }
 
       if (kIsDebugBuild) {
@@ -517,7 +535,7 @@
                                                       NullHandle<mirror::ClassLoader>(),
                                                       NullHandle<mirror::ArtMethod>(),
                                                       invoke_type);
-    CHECK(method != NULL);
+    CHECK(method != NULL) << PrettyMethod(it.GetMemberIndex(), *dex_file_, true);
     // Portable code offsets are set by ElfWriterMclinker::FixupCompiledCodeOffset after linking.
     method->SetQuickOatCodeOffset(offsets.code_offset_);
     method->SetOatNativeGcMapOffset(offsets.gc_map_offset_);
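
Aside, not part of the patch: the relocated CFI hunk above appends each method's FDE to the accumulated cfi_info blob and then rewrites the FDE's 'initial_location' in place. The offset cur_offset + 2*sizeof(uint32_t) assumes a 32-bit DWARF FDE that starts with a 4-byte length and a 4-byte CIE pointer, so the PC-start field sits 8 bytes in and is stored little-endian. A standalone sketch of that fix-up, under those assumptions:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Append a method's FDE to the growing CFI buffer and point its
    // 'initial_location' at the method's offset from the executable section.
    void AppendAndPatchFde(std::vector<uint8_t>* cfi_info,
                           const std::vector<uint8_t>& fde,
                           uint32_t quick_code_start) {
      size_t cur_offset = cfi_info->size();
      cfi_info->insert(cfi_info->end(), fde.begin(), fde.end());
      size_t offset_to_update = cur_offset + 2 * sizeof(uint32_t);  // skip length + CIE pointer
      (*cfi_info)[offset_to_update + 0] = static_cast<uint8_t>(quick_code_start);
      (*cfi_info)[offset_to_update + 1] = static_cast<uint8_t>(quick_code_start >> 8);
      (*cfi_info)[offset_to_update + 2] = static_cast<uint8_t>(quick_code_start >> 16);
      (*cfi_info)[offset_to_update + 3] = static_cast<uint8_t>(quick_code_start >> 24);
    }
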
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 8c20aa8..dbecb95 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -108,6 +108,10 @@
     return method_info_;
   }
 
+  bool DidAddSymbols() const {
+    return compiler_driver_->DidIncludeDebugSymbols();
+  }
+
  private:
   // The DataAccess classes are helper classes that provide access to members related to
   // a given map, i.e. GC map, mapping table or vmap table. By abstracting these away
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index f05cb66..b8332ad 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -18,6 +18,7 @@
 
 #include "code_generator_arm.h"
 #include "code_generator_x86.h"
+#include "code_generator_x86_64.h"
 #include "dex/verified_method.h"
 #include "driver/dex_compilation_unit.h"
 #include "gc_map_builder.h"
@@ -221,7 +222,7 @@
       return new (allocator) x86::CodeGeneratorX86(graph);
     }
     case kX86_64: {
-      return new (allocator) x86::CodeGeneratorX86(graph);
+      return new (allocator) x86_64::CodeGeneratorX86_64(graph);
     }
     default:
       return nullptr;
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 82fa639..83621e0 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -67,8 +67,7 @@
     // Note that this follows the current calling convention.
     return GetFrameSize()
         + kVRegSize  // Art method
-        + (parameter->GetIndex() - graph_->GetNumberOfVRegs() + graph_->GetNumberOfInVRegs())
-          * kVRegSize;
+        + parameter->GetIndex() * kVRegSize;
   }
 
   virtual void GenerateFrameEntry() = 0;
@@ -158,10 +157,10 @@
     return registers_[index];
   }
 
-  uint8_t GetStackOffsetOf(size_t index, size_t word_size) const {
+  uint8_t GetStackOffsetOf(size_t index) const {
     // We still reserve the space for parameters passed by registers.
-    // Add word_size for the method pointer.
-    return index * kVRegSize + word_size;
+    // Add one for the method pointer.
+    return (index + 1) * kVRegSize;
   }
 
  private:
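
Aside, not part of the patch: GetStackOffsetOf() now encodes the layout directly — stack-passed arguments still reserve slots for the register-passed ones, and slot 0 of the outgoing area holds the ArtMethod*, so argument index i lives at (i + 1) * kVRegSize. A tiny worked example, assuming kVRegSize == 4 as elsewhere in ART:

    #include <cstddef>

    constexpr size_t kVRegSize = 4;  // assumption: ART's vreg slot size
    constexpr size_t StackOffsetOf(size_t index) { return (index + 1) * kVRegSize; }

    static_assert(StackOffsetOf(0) == 4, "slot 0 is the ArtMethod*, so arg 0 starts at +4");
    static_assert(StackOffsetOf(2) == 12, "each argument adds one vreg-sized slot");
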
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d61df36..212a6dc 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -15,14 +15,14 @@
  */
 
 #include "code_generator_arm.h"
-#include "utils/assembler.h"
-#include "utils/arm/assembler_arm.h"
-#include "utils/arm/managed_register_arm.h"
 
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
 #include "mirror/art_method.h"
 #include "thread.h"
+#include "utils/assembler.h"
+#include "utils/arm/assembler_arm.h"
+#include "utils/arm/managed_register_arm.h"
 
 #define __ reinterpret_cast<ArmAssembler*>(GetAssembler())->
 
@@ -48,7 +48,8 @@
 CodeGeneratorARM::CodeGeneratorARM(HGraph* graph)
     : CodeGenerator(graph, kNumberOfRegIds),
       location_builder_(graph, this),
-      instruction_visitor_(graph, this) {}
+      instruction_visitor_(graph, this),
+      move_resolver_(graph->GetArena(), this) {}
 
 static bool* GetBlockedRegisterPairs(bool* blocked_registers) {
   return blocked_registers + kNumberOfAllocIds;
@@ -106,6 +107,9 @@
   // Reserve thread register.
   blocked_registers[TR] = true;
 
+  // Reserve temp register.
+  blocked_registers[IP] = true;
+
   // TODO: We currently don't use Quick's callee saved registers.
   blocked_registers[R5] = true;
   blocked_registers[R6] = true;
@@ -161,7 +165,7 @@
   uint16_t number_of_in_vregs = GetGraph()->GetNumberOfInVRegs();
   if (reg_number >= number_of_vregs - number_of_in_vregs) {
     // Local is a parameter of the method. It is stored in the caller's frame.
-    return GetFrameSize() + kArmWordSize  // ART method
+    return GetFrameSize() + kVRegSize  // ART method
                           + (reg_number - number_of_vregs + number_of_in_vregs) * kVRegSize;
   } else {
     // Local is a temporary in this method. It is stored in this method's frame.
@@ -210,7 +214,7 @@
       if (index < calling_convention.GetNumberOfRegisters()) {
         return ArmCoreLocation(calling_convention.GetRegisterAt(index));
       } else {
-        return Location::StackSlot(calling_convention.GetStackOffsetOf(index, kArmWordSize));
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(index));
       }
     }
 
@@ -223,7 +227,7 @@
       } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
         return Location::QuickParameter(index);
       } else {
-        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index, kArmWordSize));
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index));
       }
     }
 
@@ -254,8 +258,8 @@
     if (source.IsRegister()) {
       __ str(source.AsArm().AsCoreRegister(), Address(SP, destination.GetStackIndex()));
     } else {
-      __ ldr(R0, Address(SP, source.GetStackIndex()));
-      __ str(R0, Address(SP, destination.GetStackIndex()));
+      __ ldr(IP, Address(SP, source.GetStackIndex()));
+      __ str(IP, Address(SP, destination.GetStackIndex()));
     }
   }
 }
@@ -274,7 +278,7 @@
       __ Mov(destination.AsArm().AsRegisterPairLow(),
              calling_convention.GetRegisterAt(argument_index));
       __ ldr(destination.AsArm().AsRegisterPairHigh(),
-             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize) + GetFrameSize()));
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       if (destination.AsArm().AsRegisterPair() == R1_R2) {
@@ -291,12 +295,12 @@
     if (source.IsRegister()) {
       __ Mov(calling_convention.GetRegisterAt(argument_index), source.AsArm().AsRegisterPairLow());
       __ str(source.AsArm().AsRegisterPairHigh(),
-             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1)));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ ldr(calling_convention.GetRegisterAt(argument_index), Address(SP, source.GetStackIndex()));
       __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
-      __ str(R0, Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize)));
+      __ str(R0, Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1)));
     }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
@@ -314,14 +318,14 @@
       __ str(calling_convention.GetRegisterAt(argument_index),
              Address(SP, destination.GetStackIndex()));
       __ ldr(R0,
-             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1, kArmWordSize) + GetFrameSize()));
+             Address(SP, calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
       __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
     } else {
       DCHECK(source.IsDoubleStackSlot());
-      __ ldr(R0, Address(SP, source.GetStackIndex()));
-      __ str(R0, Address(SP, destination.GetStackIndex()));
-      __ ldr(R0, Address(SP, source.GetHighStackIndex(kArmWordSize)));
-      __ str(R0, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
+      __ ldr(IP, Address(SP, source.GetStackIndex()));
+      __ str(IP, Address(SP, destination.GetStackIndex()));
+      __ ldr(IP, Address(SP, source.GetHighStackIndex(kArmWordSize)));
+      __ str(IP, Address(SP, destination.GetHighStackIndex(kArmWordSize)));
     }
   }
 }
@@ -332,8 +336,8 @@
     if (location.IsRegister()) {
       __ LoadImmediate(location.AsArm().AsCoreRegister(), value);
     } else {
-      __ LoadImmediate(R0, value);
-      __ str(R0, Address(SP, location.GetStackIndex()));
+      __ LoadImmediate(IP, value);
+      __ str(IP, Address(SP, location.GetStackIndex()));
     }
   } else if (instruction->AsLongConstant() != nullptr) {
     int64_t value = instruction->AsLongConstant()->GetValue();
@@ -341,10 +345,10 @@
       __ LoadImmediate(location.AsArm().AsRegisterPairLow(), Low32Bits(value));
       __ LoadImmediate(location.AsArm().AsRegisterPairHigh(), High32Bits(value));
     } else {
-      __ LoadImmediate(R0, Low32Bits(value));
-      __ str(R0, Address(SP, location.GetStackIndex()));
-      __ LoadImmediate(R0, High32Bits(value));
-      __ str(R0, Address(SP, location.GetHighStackIndex(kArmWordSize)));
+      __ LoadImmediate(IP, Low32Bits(value));
+      __ str(IP, Address(SP, location.GetStackIndex()));
+      __ LoadImmediate(IP, High32Bits(value));
+      __ str(IP, Address(SP, location.GetHighStackIndex(kArmWordSize)));
     }
   } else if (instruction->AsLoadLocal() != nullptr) {
     uint32_t stack_slot = GetStackSlot(instruction->AsLoadLocal()->GetLocal());
@@ -493,7 +497,7 @@
 }
 
 void InstructionCodeGeneratorARM::VisitIntConstant(HIntConstant* constant) {
-  // Will be generated at use site.
+  codegen_->Move(constant, constant->GetLocations()->Out(), nullptr);
 }
 
 void LocationsBuilderARM::VisitLongConstant(HLongConstant* constant) {
@@ -564,7 +568,7 @@
 
 void LocationsBuilderARM::VisitInvokeStatic(HInvokeStatic* invoke) {
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(invoke);
-  locations->AddTemp(Location::RequiresRegister());
+  locations->AddTemp(ArmCoreLocation(R0));
 
   InvokeDexCallingConventionVisitor calling_convention_visitor;
   for (size_t i = 0; i < invoke->InputCount(); i++) {
@@ -811,15 +815,93 @@
 }
 
 void InstructionCodeGeneratorARM::VisitPhi(HPhi* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  LOG(FATAL) << "Unreachable";
 }
 
 void LocationsBuilderARM::VisitParallelMove(HParallelMove* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  LOG(FATAL) << "Unreachable";
 }
 
 void InstructionCodeGeneratorARM::VisitParallelMove(HParallelMove* instruction) {
-  LOG(FATAL) << "Unimplemented";
+  codegen_->GetMoveResolver()->EmitNativeCode(instruction);
+}
+
+ArmAssembler* ParallelMoveResolverARM::GetAssembler() const {
+  return codegen_->GetAssembler();
+}
+
+void ParallelMoveResolverARM::EmitMove(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister()) {
+    if (destination.IsRegister()) {
+      __ Mov(destination.AsArm().AsCoreRegister(), source.AsArm().AsCoreRegister());
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ StoreToOffset(kStoreWord, source.AsArm().AsCoreRegister(),
+                       SP, destination.GetStackIndex());
+    }
+  } else if (source.IsStackSlot()) {
+    if (destination.IsRegister()) {
+      __ LoadFromOffset(kLoadWord, destination.AsArm().AsCoreRegister(),
+                        SP, source.GetStackIndex());
+    } else {
+      DCHECK(destination.IsStackSlot());
+      __ LoadFromOffset(kLoadWord, IP, SP, source.GetStackIndex());
+      __ StoreToOffset(kStoreWord, IP, SP, destination.GetStackIndex());
+    }
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+void ParallelMoveResolverARM::Exchange(Register reg, int mem) {
+  __ Mov(IP, reg);
+  __ LoadFromOffset(kLoadWord, reg, SP, mem);
+  __ StoreToOffset(kStoreWord, IP, SP, mem);
+}
+
+void ParallelMoveResolverARM::Exchange(int mem1, int mem2) {
+  ScratchRegisterScope ensure_scratch(this, IP, R0, codegen_->GetNumberOfCoreRegisters());
+  int stack_offset = ensure_scratch.IsSpilled() ? kArmWordSize : 0;
+  __ LoadFromOffset(kLoadWord, static_cast<Register>(ensure_scratch.GetRegister()),
+                    SP, mem1 + stack_offset);
+  __ LoadFromOffset(kLoadWord, IP, SP, mem2 + stack_offset);
+  __ StoreToOffset(kStoreWord, static_cast<Register>(ensure_scratch.GetRegister()),
+                   SP, mem2 + stack_offset);
+  __ StoreToOffset(kStoreWord, IP, SP, mem1 + stack_offset);
+}
+
+void ParallelMoveResolverARM::EmitSwap(size_t index) {
+  MoveOperands* move = moves_.Get(index);
+  Location source = move->GetSource();
+  Location destination = move->GetDestination();
+
+  if (source.IsRegister() && destination.IsRegister()) {
+    DCHECK_NE(source.AsArm().AsCoreRegister(), IP);
+    DCHECK_NE(destination.AsArm().AsCoreRegister(), IP);
+    __ Mov(IP, source.AsArm().AsCoreRegister());
+    __ Mov(source.AsArm().AsCoreRegister(), destination.AsArm().AsCoreRegister());
+    __ Mov(destination.AsArm().AsCoreRegister(), IP);
+  } else if (source.IsRegister() && destination.IsStackSlot()) {
+    Exchange(source.AsArm().AsCoreRegister(), destination.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsRegister()) {
+    Exchange(destination.AsArm().AsCoreRegister(), source.GetStackIndex());
+  } else if (source.IsStackSlot() && destination.IsStackSlot()) {
+    Exchange(source.GetStackIndex(), destination.GetStackIndex());
+  } else {
+    LOG(FATAL) << "Unimplemented";
+  }
+}
+
+void ParallelMoveResolverARM::SpillScratch(int reg) {
+  __ Push(static_cast<Register>(reg));
+}
+
+void ParallelMoveResolverARM::RestoreScratch(int reg) {
+  __ Pop(static_cast<Register>(reg));
 }
 
 }  // namespace arm
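
Aside, not part of the patch: the ARM resolver keeps IP free as its scratch register (which is why IP is now blocked during allocation and replaces R0 in the spill paths above). A memory-to-memory swap also needs a second scratch from ScratchRegisterScope; if none was free and one had to be pushed, SP has moved down by one word, so both SP-relative operands are rebased by kArmWordSize — that is what the stack_offset term does. A small illustration of the rebasing, assuming a 4-byte ARM word:

    #include <cassert>
    #include <cstddef>

    constexpr size_t kArmWordSize = 4;  // assumption: matches ART's ARM word size

    // Offsets captured before a push must be shifted once SP has moved.
    size_t RebasedOffset(size_t sp_offset, bool scratch_was_spilled) {
      return sp_offset + (scratch_was_spilled ? kArmWordSize : 0);
    }

    int main() {
      assert(RebasedOffset(8, /*scratch_was_spilled=*/true) == 12);   // SP+8 becomes SP+12
      assert(RebasedOffset(8, /*scratch_was_spilled=*/false) == 8);   // untouched otherwise
      return 0;
    }
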
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index ac5ef21..712a24c 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -19,6 +19,7 @@
 
 #include "code_generator.h"
 #include "nodes.h"
+#include "parallel_move_resolver.h"
 #include "utils/arm/assembler_arm32.h"
 
 namespace art {
@@ -59,6 +60,27 @@
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
 
+class ParallelMoveResolverARM : public ParallelMoveResolver {
+ public:
+  ParallelMoveResolverARM(ArenaAllocator* allocator, CodeGeneratorARM* codegen)
+      : ParallelMoveResolver(allocator), codegen_(codegen) {}
+
+  virtual void EmitMove(size_t index) OVERRIDE;
+  virtual void EmitSwap(size_t index) OVERRIDE;
+  virtual void SpillScratch(int reg) OVERRIDE;
+  virtual void RestoreScratch(int reg) OVERRIDE;
+
+  ArmAssembler* GetAssembler() const;
+
+ private:
+  void Exchange(Register reg, int mem);
+  void Exchange(int mem1, int mem2);
+
+  CodeGeneratorARM* const codegen_;
+
+  DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolverARM);
+};
+
 class LocationsBuilderARM : public HGraphVisitor {
  public:
   explicit LocationsBuilderARM(HGraph* graph, CodeGeneratorARM* codegen)
@@ -145,6 +167,10 @@
   virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
   virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
 
+  ParallelMoveResolverARM* GetMoveResolver() {
+    return &move_resolver_;
+  }
+
  private:
   // Helper method to move a 32bits value between two locations.
   void Move32(Location destination, Location source);
@@ -153,6 +179,7 @@
 
   LocationsBuilderARM location_builder_;
   InstructionCodeGeneratorARM instruction_visitor_;
+  ParallelMoveResolverARM move_resolver_;
   Arm32Assembler assembler_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c7dca86..342a191 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -157,7 +157,7 @@
   uint16_t number_of_in_vregs = GetGraph()->GetNumberOfInVRegs();
   if (reg_number >= number_of_vregs - number_of_in_vregs) {
     // Local is a parameter of the method. It is stored in the caller's frame.
-    return GetFrameSize() + kX86WordSize  // ART method
+    return GetFrameSize() + kVRegSize  // ART method
                           + (reg_number - number_of_vregs + number_of_in_vregs) * kVRegSize;
   } else {
     // Local is a temporary in this method. It is stored in this method's frame.
@@ -221,7 +221,7 @@
       if (index < calling_convention.GetNumberOfRegisters()) {
         return X86CpuLocation(calling_convention.GetRegisterAt(index));
       } else {
-        return Location::StackSlot(calling_convention.GetStackOffsetOf(index, kX86WordSize));
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(index));
       }
     }
 
@@ -234,7 +234,7 @@
       } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
         return Location::QuickParameter(index);
       } else {
-        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index, kX86WordSize));
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index));
       }
     }
 
@@ -286,7 +286,7 @@
       __ movl(destination.AsX86().AsRegisterPairLow(),
               calling_convention.GetRegisterAt(argument_index));
       __ movl(destination.AsX86().AsRegisterPairHigh(), Address(ESP,
-          calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize) + GetFrameSize()));
+          calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movl(destination.AsX86().AsRegisterPairLow(), Address(ESP, source.GetStackIndex()));
@@ -298,14 +298,14 @@
     uint32_t argument_index = destination.GetQuickParameterIndex();
     if (source.IsRegister()) {
       __ movl(calling_convention.GetRegisterAt(argument_index), source.AsX86().AsRegisterPairLow());
-      __ movl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize)),
+      __ movl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1)),
               source.AsX86().AsRegisterPairHigh());
     } else {
       DCHECK(source.IsDoubleStackSlot());
       __ movl(calling_convention.GetRegisterAt(argument_index),
               Address(ESP, source.GetStackIndex()));
       __ pushl(Address(ESP, source.GetHighStackIndex(kX86WordSize)));
-      __ popl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize)));
+      __ popl(Address(ESP, calling_convention.GetStackOffsetOf(argument_index + 1)));
     }
   } else {
     if (source.IsRegister()) {
@@ -318,7 +318,7 @@
       __ movl(Address(ESP, destination.GetStackIndex()),
               calling_convention.GetRegisterAt(argument_index));
       __ pushl(Address(ESP,
-          calling_convention.GetStackOffsetOf(argument_index + 1, kX86WordSize) + GetFrameSize()));
+          calling_convention.GetStackOffsetOf(argument_index + 1) + GetFrameSize()));
       __ popl(Address(ESP, destination.GetHighStackIndex(kX86WordSize)));
     } else {
       DCHECK(source.IsDoubleStackSlot());
@@ -847,7 +847,7 @@
 
 void ParallelMoveResolverX86::MoveMemoryToMemory(int dst, int src) {
   ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
   int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, src + stack_offset));
   __ movl(Address(ESP, dst + stack_offset), static_cast<Register>(ensure_scratch.GetRegister()));
@@ -879,7 +879,10 @@
 }
 
 void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
-  ScratchRegisterScope ensure_scratch(this, reg, codegen_->GetNumberOfCoreRegisters());
+  Register suggested_scratch = reg == EAX ? EBX : EAX;
+  ScratchRegisterScope ensure_scratch(
+      this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
   int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset));
   __ movl(Address(ESP, mem + stack_offset), reg);
@@ -889,9 +892,12 @@
 
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch1(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+
+  Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX;
   ScratchRegisterScope ensure_scratch2(
-      this, ensure_scratch1.GetRegister(), codegen_->GetNumberOfCoreRegisters());
+      this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
   int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
   stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
   __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
new file mode 100644
index 0000000..ef17ca7
--- /dev/null
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -0,0 +1,708 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "code_generator_x86_64.h"
+
+#include "entrypoints/quick/quick_entrypoints.h"
+#include "mirror/array.h"
+#include "mirror/art_method.h"
+#include "mirror/object_reference.h"
+#include "thread.h"
+#include "utils/assembler.h"
+#include "utils/x86_64/assembler_x86_64.h"
+#include "utils/x86_64/managed_register_x86_64.h"
+
+#define __ reinterpret_cast<X86_64Assembler*>(GetAssembler())->
+
+namespace art {
+
+x86_64::X86_64ManagedRegister Location::AsX86_64() const {
+  return reg().AsX86_64();
+}
+
+namespace x86_64 {
+
+static constexpr int kNumberOfPushedRegistersAtEntry = 1;
+static constexpr int kCurrentMethodStackOffset = 0;
+
+void CodeGeneratorX86_64::DumpCoreRegister(std::ostream& stream, int reg) const {
+  stream << X86_64ManagedRegister::FromCpuRegister(Register(reg));
+}
+
+void CodeGeneratorX86_64::DumpFloatingPointRegister(std::ostream& stream, int reg) const {
+  stream << X86_64ManagedRegister::FromXmmRegister(FloatRegister(reg));
+}
+
+static Location X86_64CpuLocation(Register reg) {
+  return Location::RegisterLocation(X86_64ManagedRegister::FromCpuRegister(reg));
+}
+
+CodeGeneratorX86_64::CodeGeneratorX86_64(HGraph* graph)
+      : CodeGenerator(graph, kNumberOfRegIds),
+        location_builder_(graph, this),
+        instruction_visitor_(graph, this) {}
+
+InstructionCodeGeneratorX86_64::InstructionCodeGeneratorX86_64(HGraph* graph, CodeGeneratorX86_64* codegen)
+      : HGraphVisitor(graph),
+        assembler_(codegen->GetAssembler()),
+        codegen_(codegen) {}
+
+ManagedRegister CodeGeneratorX86_64::AllocateFreeRegister(Primitive::Type type,
+                                                          bool* blocked_registers) const {
+  switch (type) {
+    case Primitive::kPrimLong:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot: {
+      size_t reg = AllocateFreeRegisterInternal(blocked_registers, kNumberOfCpuRegisters);
+      return X86_64ManagedRegister::FromCpuRegister(static_cast<Register>(reg));
+    }
+
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      LOG(FATAL) << "Unimplemented register type " << type;
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Unreachable type " << type;
+  }
+
+  return ManagedRegister::NoRegister();
+}
+
+void CodeGeneratorX86_64::SetupBlockedRegisters(bool* blocked_registers) const {
+  // Stack register is always reserved.
+  blocked_registers[RSP] = true;
+
+  // TODO: We currently don't use Quick's callee saved registers.
+  blocked_registers[RBX] = true;
+  blocked_registers[RBP] = true;
+  blocked_registers[R12] = true;
+  blocked_registers[R13] = true;
+  blocked_registers[R14] = true;
+  blocked_registers[R15] = true;
+}
+
+void CodeGeneratorX86_64::ComputeFrameSize(size_t number_of_spill_slots) {
+  // Add the current ART method to the frame size, the return PC, and the filler.
+  SetFrameSize(RoundUp(
+      number_of_spill_slots * kVRegSize
+      + kVRegSize  // filler
+      + kVRegSize  // Art method
+      + kNumberOfPushedRegistersAtEntry * kX86_64WordSize,
+      kStackAlignment));
+}
+
+void CodeGeneratorX86_64::GenerateFrameEntry() {
+  // Create a fake register to mimic Quick.
+  static const int kFakeReturnRegister = 16;
+  core_spill_mask_ |= (1 << kFakeReturnRegister);
+
+  // The return PC has already been pushed on the stack.
+  __ subq(CpuRegister(RSP), Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+  __ movl(Address(CpuRegister(RSP), kCurrentMethodStackOffset), CpuRegister(RDI));
+}
+
+void CodeGeneratorX86_64::GenerateFrameExit() {
+  __ addq(CpuRegister(RSP),
+          Immediate(GetFrameSize() - kNumberOfPushedRegistersAtEntry * kX86_64WordSize));
+}
+
+void CodeGeneratorX86_64::Bind(Label* label) {
+  __ Bind(label);
+}
+
+void InstructionCodeGeneratorX86_64::LoadCurrentMethod(CpuRegister reg) {
+  __ movl(reg, Address(CpuRegister(RSP), kCurrentMethodStackOffset));
+}
+
+int32_t CodeGeneratorX86_64::GetStackSlot(HLocal* local) const {
+  uint16_t reg_number = local->GetRegNumber();
+  uint16_t number_of_vregs = GetGraph()->GetNumberOfVRegs();
+  uint16_t number_of_in_vregs = GetGraph()->GetNumberOfInVRegs();
+  if (reg_number >= number_of_vregs - number_of_in_vregs) {
+    // Local is a parameter of the method. It is stored in the caller's frame.
+    return GetFrameSize() + kVRegSize  // ART method
+                          + (reg_number - number_of_vregs + number_of_in_vregs) * kVRegSize;
+  } else {
+    // Local is a temporary in this method. It is stored in this method's frame.
+    return GetFrameSize() - (kNumberOfPushedRegistersAtEntry * kX86_64WordSize)
+                          - kVRegSize
+                          - (number_of_vregs * kVRegSize)
+                          + (reg_number * kVRegSize);
+  }
+}
+
+Location CodeGeneratorX86_64::GetStackLocation(HLoadLocal* load) const {
+  switch (load->GetType()) {
+    case Primitive::kPrimLong:
+      return Location::DoubleStackSlot(GetStackSlot(load->GetLocal()));
+      break;
+
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      return Location::StackSlot(GetStackSlot(load->GetLocal()));
+
+    case Primitive::kPrimFloat:
+    case Primitive::kPrimDouble:
+      LOG(FATAL) << "Unimplemented type " << load->GetType();
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Unexpected type " << load->GetType();
+  }
+
+  LOG(FATAL) << "Unreachable";
+  return Location();
+}
+
+void CodeGeneratorX86_64::Move(Location destination, Location source) {
+  if (source.Equals(destination)) {
+    return;
+  }
+  if (destination.IsRegister()) {
+    if (source.IsRegister()) {
+      __ movq(destination.AsX86_64().AsCpuRegister(), source.AsX86_64().AsCpuRegister());
+    } else if (source.IsStackSlot()) {
+      __ movl(destination.AsX86_64().AsCpuRegister(), Address(CpuRegister(RSP), source.GetStackIndex()));
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movq(destination.AsX86_64().AsCpuRegister(), Address(CpuRegister(RSP), source.GetStackIndex()));
+    }
+  } else if (destination.IsStackSlot()) {
+    if (source.IsRegister()) {
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
+    } else {
+      DCHECK(source.IsStackSlot());
+      __ movl(CpuRegister(RAX), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movl(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(RAX));
+    }
+  } else {
+    DCHECK(destination.IsDoubleStackSlot());
+    if (source.IsRegister()) {
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), source.AsX86_64().AsCpuRegister());
+    } else {
+      DCHECK(source.IsDoubleStackSlot());
+      __ movq(CpuRegister(RAX), Address(CpuRegister(RSP), source.GetStackIndex()));
+      __ movq(Address(CpuRegister(RSP), destination.GetStackIndex()), CpuRegister(RAX));
+    }
+  }
+}
+
+void CodeGeneratorX86_64::Move(HInstruction* instruction, Location location, HInstruction* move_for) {
+  if (instruction->AsIntConstant() != nullptr) {
+    Immediate imm(instruction->AsIntConstant()->GetValue());
+    if (location.IsRegister()) {
+      __ movq(location.AsX86_64().AsCpuRegister(), imm);
+    } else {
+      __ movl(Address(CpuRegister(RSP), location.GetStackIndex()), imm);
+    }
+  } else if (instruction->AsLongConstant() != nullptr) {
+    int64_t value = instruction->AsLongConstant()->GetValue();
+    if (location.IsRegister()) {
+      __ movq(location.AsX86_64().AsCpuRegister(), Immediate(value));
+    } else {
+      __ movq(CpuRegister(RAX), Immediate(value));
+      __ movq(Address(CpuRegister(RSP), location.GetStackIndex()), CpuRegister(RAX));
+    }
+  } else if (instruction->AsLoadLocal() != nullptr) {
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+        Move(location, Location::StackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+        break;
+
+      case Primitive::kPrimLong:
+        Move(location, Location::DoubleStackSlot(GetStackSlot(instruction->AsLoadLocal()->GetLocal())));
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented local type " << instruction->GetType();
+    }
+  } else {
+    // This can currently only happen when the instruction that requests the move
+    // is the next to be compiled.
+    DCHECK_EQ(instruction->GetNext(), move_for);
+    switch (instruction->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+      case Primitive::kPrimLong:
+        Move(location, instruction->GetLocations()->Out());
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented type " << instruction->GetType();
+    }
+  }
+}
+
+void LocationsBuilderX86_64::VisitGoto(HGoto* got) {
+  got->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitGoto(HGoto* got) {
+  HBasicBlock* successor = got->GetSuccessor();
+  if (GetGraph()->GetExitBlock() == successor) {
+    codegen_->GenerateFrameExit();
+  } else if (!codegen_->GoesToNextBlock(got->GetBlock(), successor)) {
+    __ jmp(codegen_->GetLabelOf(successor));
+  }
+}
+
+void LocationsBuilderX86_64::VisitExit(HExit* exit) {
+  exit->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitExit(HExit* exit) {
+  if (kIsDebugBuild) {
+    __ Comment("Unreachable");
+    __ int3();
+  }
+}
+
+void LocationsBuilderX86_64::VisitIf(HIf* if_instr) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(if_instr);
+  locations->SetInAt(0, X86_64CpuLocation(RAX));
+  if_instr->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitIf(HIf* if_instr) {
+  // TODO: Generate the input as a condition, instead of materializing in a register.
+  __ cmpl(if_instr->GetLocations()->InAt(0).AsX86_64().AsCpuRegister(), Immediate(0));
+  __ j(kEqual, codegen_->GetLabelOf(if_instr->IfFalseSuccessor()));
+  if (!codegen_->GoesToNextBlock(if_instr->GetBlock(), if_instr->IfTrueSuccessor())) {
+    __ jmp(codegen_->GetLabelOf(if_instr->IfTrueSuccessor()));
+  }
+}
+
+void LocationsBuilderX86_64::VisitLocal(HLocal* local) {
+  local->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitLocal(HLocal* local) {
+  DCHECK_EQ(local->GetBlock(), GetGraph()->GetEntryBlock());
+}
+
+void LocationsBuilderX86_64::VisitLoadLocal(HLoadLocal* local) {
+  local->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitLoadLocal(HLoadLocal* load) {
+  // Nothing to do, this is driven by the code generator.
+}
+
+void LocationsBuilderX86_64::VisitStoreLocal(HStoreLocal* store) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(store);
+  switch (store->InputAt(1)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+      locations->SetInAt(1, Location::StackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    case Primitive::kPrimLong:
+      locations->SetInAt(1, Location::DoubleStackSlot(codegen_->GetStackSlot(store->GetLocal())));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented local type " << store->InputAt(1)->GetType();
+  }
+  store->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitStoreLocal(HStoreLocal* store) {
+}
+
+void LocationsBuilderX86_64::VisitEqual(HEqual* equal) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(equal);
+  locations->SetInAt(0, X86_64CpuLocation(RAX));
+  locations->SetInAt(1, X86_64CpuLocation(RCX));
+  locations->SetOut(X86_64CpuLocation(RAX));
+  equal->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitEqual(HEqual* equal) {
+  __ cmpq(equal->GetLocations()->InAt(0).AsX86_64().AsCpuRegister(),
+          equal->GetLocations()->InAt(1).AsX86_64().AsCpuRegister());
+  __ setcc(kEqual, equal->GetLocations()->Out().AsX86_64().AsCpuRegister());
+}
+
+void LocationsBuilderX86_64::VisitIntConstant(HIntConstant* constant) {
+  // TODO: Support constant locations.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(constant);
+  locations->SetOut(Location::RequiresRegister());
+  constant->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitIntConstant(HIntConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderX86_64::VisitLongConstant(HLongConstant* constant) {
+  // TODO: Support constant locations.
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(constant);
+  locations->SetOut(Location::RequiresRegister());
+  constant->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitLongConstant(HLongConstant* constant) {
+  // Will be generated at use site.
+}
+
+void LocationsBuilderX86_64::VisitReturnVoid(HReturnVoid* ret) {
+  ret->SetLocations(nullptr);
+}
+
+void InstructionCodeGeneratorX86_64::VisitReturnVoid(HReturnVoid* ret) {
+  codegen_->GenerateFrameExit();
+  __ ret();
+}
+
+void LocationsBuilderX86_64::VisitReturn(HReturn* ret) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(ret);
+  switch (ret->InputAt(0)->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+    case Primitive::kPrimLong:
+      locations->SetInAt(0, X86_64CpuLocation(RAX));
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+  }
+  ret->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitReturn(HReturn* ret) {
+  if (kIsDebugBuild) {
+    switch (ret->InputAt(0)->GetType()) {
+      case Primitive::kPrimBoolean:
+      case Primitive::kPrimByte:
+      case Primitive::kPrimChar:
+      case Primitive::kPrimShort:
+      case Primitive::kPrimInt:
+      case Primitive::kPrimNot:
+      case Primitive::kPrimLong:
+        DCHECK_EQ(ret->GetLocations()->InAt(0).AsX86_64().AsCpuRegister().AsRegister(), RAX);
+        break;
+
+      default:
+        LOG(FATAL) << "Unimplemented return type " << ret->InputAt(0)->GetType();
+    }
+  }
+  codegen_->GenerateFrameExit();
+  __ ret();
+}
+
+static constexpr Register kRuntimeParameterCoreRegisters[] = { RDI, RSI, RDX };
+static constexpr size_t kRuntimeParameterCoreRegistersLength =
+    arraysize(kRuntimeParameterCoreRegisters);
+
+class InvokeRuntimeCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeRuntimeCallingConvention()
+      : CallingConvention(kRuntimeParameterCoreRegisters,
+                          kRuntimeParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeRuntimeCallingConvention);
+};
+
+Location InvokeDexCallingConventionVisitor::GetNextLocation(Primitive::Type type) {
+  switch (type) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot: {
+      uint32_t index = gp_index_++;
+      stack_index_++;
+      if (index < calling_convention.GetNumberOfRegisters()) {
+        return X86_64CpuLocation(calling_convention.GetRegisterAt(index));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
+      }
+    }
+
+    case Primitive::kPrimLong: {
+      uint32_t index = gp_index_;
+      stack_index_ += 2;
+      if (index < calling_convention.GetNumberOfRegisters()) {
+        gp_index_ += 1;
+        return X86_64CpuLocation(calling_convention.GetRegisterAt(index));
+      } else {
+        gp_index_ += 2;
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
+      }
+    }
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented parameter type " << type;
+      break;
+
+    case Primitive::kPrimVoid:
+      LOG(FATAL) << "Unexpected parameter type " << type;
+      break;
+  }
+  return Location();
+}
+
+void LocationsBuilderX86_64::VisitInvokeStatic(HInvokeStatic* invoke) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(invoke);
+  locations->AddTemp(X86_64CpuLocation(RDI));
+
+  InvokeDexCallingConventionVisitor calling_convention_visitor;
+  for (size_t i = 0; i < invoke->InputCount(); ++i) {
+    HInstruction* input = invoke->InputAt(i);
+    locations->SetInAt(i, calling_convention_visitor.GetNextLocation(input->GetType()));
+  }
+
+  switch (invoke->GetType()) {
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+    case Primitive::kPrimNot:
+    case Primitive::kPrimLong:
+      locations->SetOut(X86_64CpuLocation(RAX));
+      break;
+
+    case Primitive::kPrimVoid:
+      break;
+
+    case Primitive::kPrimDouble:
+    case Primitive::kPrimFloat:
+      LOG(FATAL) << "Unimplemented return type " << invoke->GetType();
+      break;
+  }
+
+  invoke->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitInvokeStatic(HInvokeStatic* invoke) {
+  CpuRegister temp = invoke->GetLocations()->GetTemp(0).AsX86_64().AsCpuRegister();
+  uint32_t heap_reference_size = sizeof(mirror::HeapReference<mirror::Object>);
+  size_t index_in_cache = mirror::Array::DataOffset(heap_reference_size).SizeValue() +
+      invoke->GetIndexInDexCache() * heap_reference_size;
+
+  // TODO: Implement all kinds of calls:
+  // 1) boot -> boot
+  // 2) app -> boot
+  // 3) app -> app
+  //
+  // Currently we implement the app -> app logic, which looks up in the resolve cache.
+
+  // temp = method;
+  LoadCurrentMethod(temp);
+  // temp = temp->dex_cache_resolved_methods_;
+  __ movl(temp, Address(temp, mirror::ArtMethod::DexCacheResolvedMethodsOffset().SizeValue()));
+  // temp = temp[index_in_cache]
+  __ movl(temp, Address(temp, index_in_cache));
+  // (temp + offset_of_quick_compiled_code)()
+  __ call(Address(temp, mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().SizeValue()));
+
+  codegen_->RecordPcInfo(invoke->GetDexPc());
+}
+
+void LocationsBuilderX86_64::VisitAdd(HAdd* add) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(add);
+  switch (add->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, X86_64CpuLocation(RAX));
+      locations->SetInAt(1, X86_64CpuLocation(RCX));
+      locations->SetOut(X86_64CpuLocation(RAX));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+  }
+  add->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitAdd(HAdd* add) {
+  LocationSummary* locations = add->GetLocations();
+  switch (add->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
+                locations->Out().AsX86_64().AsCpuRegister().AsRegister());
+      __ addq(locations->InAt(0).AsX86_64().AsCpuRegister(),
+              locations->InAt(1).AsX86_64().AsCpuRegister());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected add type " << add->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented add type " << add->GetResultType();
+  }
+}
+
+void LocationsBuilderX86_64::VisitSub(HSub* sub) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(sub);
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong: {
+      locations->SetInAt(0, X86_64CpuLocation(RAX));
+      locations->SetInAt(1, X86_64CpuLocation(RCX));
+      locations->SetOut(X86_64CpuLocation(RAX));
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+  sub->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitSub(HSub* sub) {
+  LocationSummary* locations = sub->GetLocations();
+  switch (sub->GetResultType()) {
+    case Primitive::kPrimInt:
+    case Primitive::kPrimLong: {
+      DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
+                locations->Out().AsX86_64().AsCpuRegister().AsRegister());
+      __ subq(locations->InAt(0).AsX86_64().AsCpuRegister(),
+              locations->InAt(1).AsX86_64().AsCpuRegister());
+      break;
+    }
+
+    case Primitive::kPrimBoolean:
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      LOG(FATAL) << "Unexpected sub type " << sub->GetResultType();
+      break;
+
+    default:
+      LOG(FATAL) << "Unimplemented sub type " << sub->GetResultType();
+  }
+}
+
+void LocationsBuilderX86_64::VisitNewInstance(HNewInstance* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetOut(X86_64CpuLocation(RAX));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitNewInstance(HNewInstance* instruction) {
+  InvokeRuntimeCallingConvention calling_convention;
+  LoadCurrentMethod(CpuRegister(calling_convention.GetRegisterAt(1)));
+  __ movq(CpuRegister(calling_convention.GetRegisterAt(0)), Immediate(instruction->GetTypeIndex()));
+
+  __ gs()->call(Address::Absolute(
+      QUICK_ENTRYPOINT_OFFSET(kX86_64WordSize, pAllocObjectWithAccessCheck), true));
+
+  codegen_->RecordPcInfo(instruction->GetDexPc());
+}
+
+void LocationsBuilderX86_64::VisitParameterValue(HParameterValue* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  Location location = parameter_visitor_.GetNextLocation(instruction->GetType());
+  if (location.IsStackSlot()) {
+    location = Location::StackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  } else if (location.IsDoubleStackSlot()) {
+    location = Location::DoubleStackSlot(location.GetStackIndex() + codegen_->GetFrameSize());
+  }
+  locations->SetOut(location);
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitParameterValue(HParameterValue* instruction) {
+  // Nothing to do, the parameter is already at its location.
+}
+
+void LocationsBuilderX86_64::VisitNot(HNot* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  locations->SetInAt(0, X86_64CpuLocation(RAX));
+  locations->SetOut(X86_64CpuLocation(RAX));
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitNot(HNot* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  DCHECK_EQ(locations->InAt(0).AsX86_64().AsCpuRegister().AsRegister(),
+            locations->Out().AsX86_64().AsCpuRegister().AsRegister());
+  __ xorq(locations->Out().AsX86_64().AsCpuRegister(), Immediate(1));
+}
+
+void LocationsBuilderX86_64::VisitPhi(HPhi* instruction) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instruction);
+  for (size_t i = 0, e = instruction->InputCount(); i < e; ++i) {
+    locations->SetInAt(i, Location::Any());
+  }
+  locations->SetOut(Location::Any());
+  instruction->SetLocations(locations);
+}
+
+void InstructionCodeGeneratorX86_64::VisitPhi(HPhi* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
+void LocationsBuilderX86_64::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
+void InstructionCodeGeneratorX86_64::VisitParallelMove(HParallelMove* instruction) {
+  LOG(FATAL) << "Unimplemented";
+}
+
+}  // namespace x86_64
+}  // namespace art
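
Aside, not part of the patch: the x86-64 frame built by ComputeFrameSize() and GenerateFrameEntry() consists of the return PC already pushed by the call (the one register "pushed at entry"), followed by a subq making room for the ArtMethod* slot, the filler slot, and the vreg spill area, with the total rounded up to the stack alignment. A worked example of the size computation, assuming kVRegSize == 4 and kStackAlignment == 16 (ART's usual values):

    #include <cstddef>

    constexpr size_t RoundUp(size_t x, size_t n) { return ((x + n - 1) / n) * n; }

    constexpr size_t kVRegSize = 4;         // assumption
    constexpr size_t kStackAlignment = 16;  // assumption
    constexpr size_t kX86_64WordSize = 8;
    constexpr size_t kNumberOfPushedRegistersAtEntry = 1;

    constexpr size_t FrameSize(size_t spill_slots) {
      return RoundUp(spill_slots * kVRegSize
                         + kVRegSize  // filler
                         + kVRegSize  // ArtMethod*
                         + kNumberOfPushedRegistersAtEntry * kX86_64WordSize,
                     kStackAlignment);
    }

    // 3 spill slots: 3*4 + 4 + 4 + 8 = 28, rounded up to 32.
    static_assert(FrameSize(3) == 32, "frame rounds up to the 16-byte stack alignment");
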
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
new file mode 100644
index 0000000..ac7ee9f
--- /dev/null
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_
+#define ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_
+
+#include "code_generator.h"
+#include "nodes.h"
+#include "utils/x86_64/assembler_x86_64.h"
+
+namespace art {
+namespace x86_64 {
+
+static constexpr size_t kX86_64WordSize = 8;
+
+static constexpr Register kParameterCoreRegisters[] = { RSI, RDX, RCX, R8, R9 };
+
+static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
+
+class InvokeDexCallingConvention : public CallingConvention<Register> {
+ public:
+  InvokeDexCallingConvention()
+      : CallingConvention(kParameterCoreRegisters, kParameterCoreRegistersLength) {}
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConvention);
+};
+
+class InvokeDexCallingConventionVisitor {
+ public:
+  InvokeDexCallingConventionVisitor() : gp_index_(0), stack_index_(0) {}
+
+  Location GetNextLocation(Primitive::Type type);
+
+ private:
+  InvokeDexCallingConvention calling_convention;
+  uint32_t gp_index_;
+  uint32_t stack_index_;
+
+  DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
+};
+
+class CodeGeneratorX86_64;
+
+class LocationsBuilderX86_64 : public HGraphVisitor {
+ public:
+  LocationsBuilderX86_64(HGraph* graph, CodeGeneratorX86_64* codegen)
+      : HGraphVisitor(graph), codegen_(codegen) {}
+
+#define DECLARE_VISIT_INSTRUCTION(name)     \
+  virtual void Visit##name(H##name* instr);
+
+  FOR_EACH_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+ private:
+  CodeGeneratorX86_64* const codegen_;
+  InvokeDexCallingConventionVisitor parameter_visitor_;
+
+  DISALLOW_COPY_AND_ASSIGN(LocationsBuilderX86_64);
+};
+
+class InstructionCodeGeneratorX86_64 : public HGraphVisitor {
+ public:
+  InstructionCodeGeneratorX86_64(HGraph* graph, CodeGeneratorX86_64* codegen);
+
+#define DECLARE_VISIT_INSTRUCTION(name)     \
+  virtual void Visit##name(H##name* instr);
+
+  FOR_EACH_INSTRUCTION(DECLARE_VISIT_INSTRUCTION)
+
+#undef DECLARE_VISIT_INSTRUCTION
+
+  void LoadCurrentMethod(CpuRegister reg);
+
+  X86_64Assembler* GetAssembler() const { return assembler_; }
+
+ private:
+  X86_64Assembler* const assembler_;
+  CodeGeneratorX86_64* const codegen_;
+
+  DISALLOW_COPY_AND_ASSIGN(InstructionCodeGeneratorX86_64);
+};
+
+class CodeGeneratorX86_64 : public CodeGenerator {
+ public:
+  explicit CodeGeneratorX86_64(HGraph* graph);
+  virtual ~CodeGeneratorX86_64() {}
+
+  virtual void ComputeFrameSize(size_t number_of_spill_slots) OVERRIDE;
+  virtual void GenerateFrameEntry() OVERRIDE;
+  virtual void GenerateFrameExit() OVERRIDE;
+  virtual void Bind(Label* label) OVERRIDE;
+  virtual void Move(HInstruction* instruction, Location location, HInstruction* move_for) OVERRIDE;
+
+  virtual size_t GetWordSize() const OVERRIDE {
+    return kX86_64WordSize;
+  }
+
+  virtual HGraphVisitor* GetLocationBuilder() OVERRIDE {
+    return &location_builder_;
+  }
+
+  virtual HGraphVisitor* GetInstructionVisitor() OVERRIDE {
+    return &instruction_visitor_;
+  }
+
+  virtual X86_64Assembler* GetAssembler() OVERRIDE {
+    return &assembler_;
+  }
+
+  int32_t GetStackSlot(HLocal* local) const;
+  virtual Location GetStackLocation(HLoadLocal* load) const OVERRIDE;
+
+  virtual size_t GetNumberOfRegisters() const OVERRIDE {
+    return kNumberOfRegIds;
+  }
+
+  virtual size_t GetNumberOfCoreRegisters() const OVERRIDE {
+    return kNumberOfCpuRegisters;
+  }
+
+  virtual size_t GetNumberOfFloatingPointRegisters() const OVERRIDE {
+    return kNumberOfFloatRegisters;
+  }
+
+  virtual void SetupBlockedRegisters(bool* blocked_registers) const OVERRIDE;
+  virtual ManagedRegister AllocateFreeRegister(
+      Primitive::Type type, bool* blocked_registers) const OVERRIDE;
+  virtual void DumpCoreRegister(std::ostream& stream, int reg) const OVERRIDE;
+  virtual void DumpFloatingPointRegister(std::ostream& stream, int reg) const OVERRIDE;
+
+ private:
+  // Helper method to move a value between two locations.
+  void Move(Location destination, Location source);
+
+  LocationsBuilderX86_64 location_builder_;
+  InstructionCodeGeneratorX86_64 instruction_visitor_;
+  X86_64Assembler assembler_;
+
+  DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
+};
+
+}  // namespace x86_64
+}  // namespace art
+
+#endif  // ART_COMPILER_OPTIMIZING_CODE_GENERATOR_X86_64_H_
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 8ee775c..c3baf1a 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -47,6 +47,17 @@
   DISALLOW_COPY_AND_ASSIGN(InternalCodeAllocator);
 };
 
+#if defined(__i386__) || defined(__arm__) || defined(__x86_64__)
+static void Run(const InternalCodeAllocator& allocator, bool has_result, int32_t expected) {
+  typedef int32_t (*fptr)();
+  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
+  int32_t result = reinterpret_cast<fptr>(allocator.GetMemory())();
+  if (has_result) {
+    CHECK_EQ(result, expected);
+  }
+}
+#endif
+
 static void TestCode(const uint16_t* data, bool has_result = false, int32_t expected = 0) {
   ArenaPool pool;
   ArenaAllocator arena(&pool);
@@ -55,24 +66,23 @@
   HGraph* graph = builder.BuildGraph(*item);
   ASSERT_NE(graph, nullptr);
   InternalCodeAllocator allocator;
+
   CodeGenerator* codegen = CodeGenerator::Create(&arena, graph, kX86);
   codegen->CompileBaseline(&allocator);
-  typedef int32_t (*fptr)();
 #if defined(__i386__)
-  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
-  int32_t result = reinterpret_cast<fptr>(allocator.GetMemory())();
-  if (has_result) {
-    CHECK_EQ(result, expected);
-  }
+  Run(allocator, has_result, expected);
 #endif
+
   codegen = CodeGenerator::Create(&arena, graph, kArm);
   codegen->CompileBaseline(&allocator);
 #if defined(__arm__)
-  CommonCompilerTest::MakeExecutable(allocator.GetMemory(), allocator.GetSize());
-  int32_t result = reinterpret_cast<fptr>(allocator.GetMemory())();
-  if (has_result) {
-    CHECK_EQ(result, expected);
-  }
+  Run(allocator, has_result, expected);
+#endif
+
+  codegen = CodeGenerator::Create(&arena, graph, kX86_64);
+  codegen->CompileBaseline(&allocator);
+#if defined(__x86_64__)
+  Run(allocator, has_result, expected);
 #endif
 }
 
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 3c60d3c..40a39ad 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -150,6 +150,7 @@
 
   arm::ArmManagedRegister AsArm() const;
   x86::X86ManagedRegister AsX86() const;
+  x86_64::X86_64ManagedRegister AsX86_64() const;
 
   Kind GetKind() const {
     return KindField::Decode(value_);
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index 4a1b6ce..cadd3c5 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -163,7 +163,11 @@
   return false;
 }
 
-int ParallelMoveResolver::AllocateScratchRegister(int blocked, int register_count, bool* spilled) {
+int ParallelMoveResolver::AllocateScratchRegister(int blocked,
+                                                  int register_count,
+                                                  int if_scratch,
+                                                  bool* spilled) {
+  DCHECK_NE(blocked, if_scratch);
   int scratch = -1;
   for (int reg = 0; reg < register_count; ++reg) {
     if ((blocked != reg) &&
@@ -175,11 +179,7 @@
 
   if (scratch == -1) {
     *spilled = true;
-    for (int reg = 0; reg < register_count; ++reg) {
-      if (blocked != reg) {
-        scratch = reg;
-      }
-    }
+    scratch = if_scratch;
   } else {
     *spilled = false;
   }
@@ -189,11 +189,11 @@
 
 
 ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
-    ParallelMoveResolver* resolver, int blocked, int number_of_registers)
+    ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
       reg_(kNoRegister),
       spilled_(false) {
-  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers, &spilled_);
+  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers, if_scratch, &spilled_);
 
   if (spilled_) {
     resolver->SpillScratch(reg_);
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index e1189d8..fcc1de6 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -42,7 +42,10 @@
  protected:
   class ScratchRegisterScope : public ValueObject {
    public:
-    ScratchRegisterScope(ParallelMoveResolver* resolver, int blocked, int number_of_registers);
+    ScratchRegisterScope(ParallelMoveResolver* resolver,
+                         int blocked,
+                         int if_scratch,
+                         int number_of_registers);
     ~ScratchRegisterScope();
 
     int GetRegister() const { return reg_; }
@@ -55,7 +58,7 @@
   };
 
   bool IsScratchLocation(Location loc);
-  int AllocateScratchRegister(int blocked, int register_count, bool* spilled);
+  int AllocateScratchRegister(int blocked, int register_count, int if_scratch, bool* spilled);
 
   // Emit a move.
   virtual void EmitMove(size_t index) = 0;
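For illustration only (not part of the patch): a minimal standalone sketch of the scratch-register fallback introduced above. The `is_candidate` predicate is a stand-in for the resolver's IsScratchLocation() check on pending move destinations; the rest mirrors the new logic, where the caller proposes the fallback register instead of the resolver picking the last unblocked one.

#include <cassert>
#include <functional>

// Sketch of the fallback behavior: if no unblocked candidate register is
// found, fall back to the caller-provided `if_scratch` register and report
// through *spilled that it must be spilled and restored around the move.
int AllocateScratchRegisterSketch(int blocked,
                                  int register_count,
                                  int if_scratch,
                                  const std::function<bool(int)>& is_candidate,
                                  bool* spilled) {
  assert(blocked != if_scratch);
  int scratch = -1;
  for (int reg = 0; reg < register_count; ++reg) {
    if (reg != blocked && is_candidate(reg)) {
      scratch = reg;
    }
  }
  if (scratch == -1) {
    *spilled = true;
    scratch = if_scratch;
  } else {
    *spilled = false;
  }
  return scratch;
}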
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index c2a4769..348e9d4 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -651,7 +651,9 @@
     // Move must happen after the instruction.
     DCHECK(!at->IsControlFlow());
     move = at->GetNext()->AsParallelMove();
-    if (move == nullptr || IsInputMove(move)) {
+    // This is a parallel move for connecting siblings in the same block. We need to
+    // differentiate it from moves for connecting blocks, and from input moves.
+    if (move == nullptr || move->GetLifetimePosition() != position) {
       move = new (allocator_) HParallelMove(allocator_);
       move->SetLifetimePosition(position);
       at->GetBlock()->InsertInstructionBefore(move, at->GetNext());
@@ -660,7 +662,9 @@
     // Move must happen before the instruction.
     HInstruction* previous = at->GetPrevious();
     if (previous != nullptr && previous->AsParallelMove() != nullptr) {
-      if (IsInputMove(previous)) {
+      // This is a parallel move for connecting siblings in the same block. We need to
+      // differentiate it from moves for connecting blocks, and from input moves.
+      if (previous->GetLifetimePosition() != position) {
         previous = previous->GetPrevious();
       }
     }
@@ -684,8 +688,12 @@
   HInstruction* last = block->GetLastInstruction();
   HInstruction* previous = last->GetPrevious();
   HParallelMove* move;
-  if (previous == nullptr || previous->AsParallelMove() == nullptr) {
+  // This is a parallel move for connecting blocks. We need to differentiate it
+  // from moves for connecting siblings in the same block, and from output moves.
+  if (previous == nullptr || previous->AsParallelMove() == nullptr
+      || previous->AsParallelMove()->GetLifetimePosition() != block->GetLifetimeEnd()) {
     move = new (allocator_) HParallelMove(allocator_);
+    move->SetLifetimePosition(block->GetLifetimeEnd());
     block->InsertInstructionBefore(move, last);
   } else {
     move = previous->AsParallelMove();
@@ -700,7 +708,9 @@
 
   HInstruction* first = block->GetFirstInstruction();
   HParallelMove* move = first->AsParallelMove();
-  if (move == nullptr || IsInputMove(move)) {
+  // This is a parallel move for connecting blocks. We need to differentiate it
+  // from moves for connecting siblings in the same block, and from input moves.
+  if (move == nullptr || move->GetLifetimePosition() != block->GetLifetimeStart()) {
     move = new (allocator_) HParallelMove(allocator_);
     move->SetLifetimePosition(block->GetLifetimeStart());
     block->InsertInstructionBefore(move, first);
@@ -718,9 +728,14 @@
     return;
   }
 
+  size_t position = instruction->GetLifetimePosition() + 1;
   HParallelMove* move = instruction->GetNext()->AsParallelMove();
-  if (move == nullptr || IsInputMove(move)) {
+  // This is a parallel move for moving the output of an instruction. We need to
+  // differentiate it from input moves, moves for connecting siblings in the same
+  // block, and moves for connecting blocks.
+  if (move == nullptr || move->GetLifetimePosition() != position) {
     move = new (allocator_) HParallelMove(allocator_);
+    move->SetLifetimePosition(position);
     instruction->GetBlock()->InsertInstructionBefore(move, instruction->GetNext());
   }
   move->AddMove(new (allocator_) MoveOperands(source, destination));
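A compressed sketch of the reuse-or-create rule the hunks above converge on (standalone, hypothetical types; the real code works on HParallelMove nodes allocated in the arena): an existing parallel move at the insertion point is reused only when it was created for the same lifetime position, otherwise a fresh move tagged with that position is inserted.

#include <cstddef>

struct ParallelMoveSketch {
  explicit ParallelMoveSketch(size_t position) : lifetime_position(position) {}
  size_t lifetime_position;
};

// Reuse `existing` only if it was created for `position`; otherwise create a
// new move tagged with that position (stand-in for SetLifetimePosition()).
ParallelMoveSketch* GetOrCreateMoveAt(ParallelMoveSketch* existing, size_t position) {
  if (existing == nullptr || existing->lifetime_position != position) {
    return new ParallelMoveSketch(position);
  }
  return existing;
}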
diff --git a/compiler/optimizing/register_allocator.h b/compiler/optimizing/register_allocator.h
index 1b5585f..8b7c4f1 100644
--- a/compiler/optimizing/register_allocator.h
+++ b/compiler/optimizing/register_allocator.h
@@ -65,7 +65,7 @@
 
   static bool CanAllocateRegistersFor(const HGraph& graph, InstructionSet instruction_set);
   static bool Supports(InstructionSet instruction_set) {
-    return instruction_set == kX86;
+    return instruction_set == kX86 || instruction_set == kArm;
   }
 
   size_t GetNumberOfSpillSlots() const {
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 7e0a106..67ac729 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -203,6 +203,10 @@
   UsageError("");
   UsageError("  --dump-timing: display a breakdown of where time was spent");
   UsageError("");
+  UsageError("  --include-debug-symbols: Include ELF symbols in this oat file");
+  UsageError("");
+  UsageError("  --no-include-debug-symbols: Do not include ELF symbols in this oat file");
+  UsageError("");
   UsageError("  --runtime-arg <argument>: used to specify various arguments for the runtime,");
   UsageError("      such as initial heap size, maximum heap size, and verbose output.");
   UsageError("      Use a separate --runtime-arg switch for each argument.");
@@ -816,6 +820,7 @@
   bool dump_stats = false;
   bool dump_timing = false;
   bool dump_passes = false;
+  bool include_debug_symbols = kIsDebugBuild;
   bool dump_slow_timing = kIsDebugBuild;
   bool watch_dog_enabled = !kIsTargetBuild;
   bool generate_gdb_information = kIsDebugBuild;
@@ -969,6 +974,10 @@
       dump_passes = true;
     } else if (option == "--dump-stats") {
       dump_stats = true;
+    } else if (option == "--include-debug-symbols" || option == "--no-strip-symbols") {
+      include_debug_symbols = true;
+    } else if (option == "--no-include-debug-symbols" || option == "--strip-symbols") {
+      include_debug_symbols = false;
     } else if (option.starts_with("--profile-file=")) {
       profile_file = option.substr(strlen("--profile-file=")).data();
       VLOG(compiler) << "dex2oat: profile file is " << profile_file;
@@ -1122,7 +1131,8 @@
                                    tiny_method_threshold,
                                    num_dex_methods_threshold,
                                    generate_gdb_information,
-                                   top_k_profile_threshold
+                                   top_k_profile_threshold,
+                                   include_debug_symbols
 #ifdef ART_SEA_IR_MODE
                                    , compiler_options.sea_ir_ = true;
 #endif
@@ -1409,16 +1419,20 @@
   }
 
 #if ART_USE_PORTABLE_COMPILER  // We currently only generate symbols on Portable
-  timings.NewSplit("dex2oat ElfStripper");
-  // Strip unneeded sections for target
-  off_t seek_actual = lseek(oat_file->Fd(), 0, SEEK_SET);
-  CHECK_EQ(0, seek_actual);
-  std::string error_msg;
-  CHECK(ElfStripper::Strip(oat_file.get(), &error_msg)) << error_msg;
+  if (!compiler_options.GetIncludeDebugSymbols()) {
+    timings.NewSplit("dex2oat ElfStripper");
+    // Strip unneeded sections for target
+    off_t seek_actual = lseek(oat_file->Fd(), 0, SEEK_SET);
+    CHECK_EQ(0, seek_actual);
+    std::string error_msg;
+    CHECK(ElfStripper::Strip(oat_file.get(), &error_msg)) << error_msg;
 
 
-  // We wrote the oat file successfully, and want to keep it.
-  VLOG(compiler) << "Oat file written successfully (stripped): " << oat_location;
+    // We wrote the oat file successfully, and want to keep it.
+    VLOG(compiler) << "Oat file written successfully (stripped): " << oat_location;
+  } else {
+    VLOG(compiler) << "Oat file written successfully without stripping: " << oat_location;
+  }
 #endif  // ART_USE_PORTABLE_COMPILER
 
   timings.EndSplit();
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index dd8e221..d704788 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1288,8 +1288,8 @@
      */
 TWO_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO
 
-UNIMPLEMENTED art_quick_initialize_type
-UNIMPLEMENTED art_quick_initialize_type_and_verify_access
+TWO_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO
+TWO_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO
 
 ONE_ARG_REF_DOWNCALL art_quick_get32_static, artGet32StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
 ONE_ARG_REF_DOWNCALL art_quick_get64_static, artGet64StaticFromCode, RETURN_OR_DELIVER_PENDING_EXCEPTION_X1
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 59311bc..7785bc3 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1784,9 +1784,87 @@
   ASSERT_FALSE(self->IsExceptionPending());
   EXPECT_EQ(static_cast<size_t>(JNI_TRUE), result);
 #else
-  LOG(INFO) << "Skipping memcpy as I don't know how to do that on " << kRuntimeISA;
+  LOG(INFO) << "Skipping imt as I don't know how to do that on " << kRuntimeISA;
   // Force-print to std::cout so it's also outside the logcat.
-  std::cout << "Skipping memcpy as I don't know how to do that on " << kRuntimeISA << std::endl;
+  std::cout << "Skipping imt as I don't know how to do that on " << kRuntimeISA << std::endl;
+#endif
+}
+
+#if defined(__arm__) || defined(__aarch64__)
+extern "C" void art_quick_indexof(void);
+#endif
+
+TEST_F(StubTest, StringIndexOf) {
+#if defined(__arm__) || defined(__aarch64__)
+  Thread* self = Thread::Current();
+  ScopedObjectAccess soa(self);
+  // garbage is created during ClassLinker::Init
+
+  // Create some strings
+  // Use an array so we can index into it and use a matrix for expected results.
+  // TODO: Also test strings with a non-zero offset (shared backing arrays).
+  static constexpr size_t kStringCount = 7;
+  const char* c_str[kStringCount] = { "", "a", "ba", "cba", "dcba", "edcba", "asdfghjkl" };
+  static constexpr size_t kCharCount = 5;
+  const char c_char[kCharCount] = { 'a', 'b', 'c', 'd', 'e' };
+
+  StackHandleScope<kStringCount> hs(self);
+  Handle<mirror::String> s[kStringCount];
+
+  for (size_t i = 0; i < kStringCount; ++i) {
+    s[i] = hs.NewHandle(mirror::String::AllocFromModifiedUtf8(soa.Self(), c_str[i]));
+  }
+
+  // Matrix of expectations, indexed by string, character and start index (offset by one).
+  // As we test a range of start indices, we compute the expected results up front and
+  // rely on String::FastIndexOf being correct.
+  static constexpr size_t kMaxLen = 9;
+  DCHECK_LE(strlen(c_str[kStringCount-1]), kMaxLen) << "Please fix the indexof test.";
+
+  // Last dimension: start, offset by 1.
+  int32_t expected[kStringCount][kCharCount][kMaxLen + 3];
+  for (size_t x = 0; x < kStringCount; ++x) {
+    for (size_t y = 0; y < kCharCount; ++y) {
+      for (size_t z = 0; z <= kMaxLen + 2; ++z) {
+        expected[x][y][z] = s[x]->FastIndexOf(c_char[y], static_cast<int32_t>(z) - 1);
+      }
+    }
+  }
+
+  // Play with it...
+
+  for (size_t x = 0; x < kStringCount; ++x) {
+    for (size_t y = 0; y < kCharCount; ++y) {
+      for (size_t z = 0; z <= kMaxLen + 2; ++z) {
+        int32_t start = static_cast<int32_t>(z) - 1;
+
+        // Test indexof with string x, character y, start z
+        size_t result = Invoke3(reinterpret_cast<size_t>(s[x].Get()), c_char[y], start,
+                                reinterpret_cast<uintptr_t>(&art_quick_indexof), self);
+
+        EXPECT_FALSE(self->IsExceptionPending());
+
+        // The result is a 32b signed integer
+        union {
+          size_t r;
+          int32_t i;
+        } conv;
+        conv.r = result;
+
+        EXPECT_EQ(expected[x][y][z], conv.i) << "Wrong result for " << c_str[x] << " / " <<
+            c_char[y] << " @ " << start;
+      }
+    }
+  }
+
+  // TODO: Deallocate things.
+
+  // Tests done.
+#else
+  LOG(INFO) << "Skipping indexof as I don't know how to do that on " << kRuntimeISA;
+  // Force-print to std::cout so it's also outside the logcat.
+  std::cout << "Skipping indexof as I don't know how to do that on " << kRuntimeISA << std::endl;
 #endif
 }
 
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 349700a..50e9624 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -903,7 +903,7 @@
                                       std::vector<uint32_t>& stack_depths) {
   struct OwnedMonitorVisitor : public StackVisitor {
     OwnedMonitorVisitor(Thread* thread, Context* context,
-                        std::vector<mirror::Object*>* monitor_vector,
+                        std::vector<JDWP::ObjectId>* monitor_vector,
                         std::vector<uint32_t>* stack_depth_vector)
         SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
       : StackVisitor(thread, context), current_stack_depth(0),
@@ -919,23 +919,22 @@
       return true;
     }
 
-    static void AppendOwnedMonitors(mirror::Object* owned_monitor, void* arg) {
+    static void AppendOwnedMonitors(mirror::Object* owned_monitor, void* arg)
+        SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
       OwnedMonitorVisitor* visitor = reinterpret_cast<OwnedMonitorVisitor*>(arg);
-      visitor->monitors->push_back(owned_monitor);
+      visitor->monitors->push_back(gRegistry->Add(owned_monitor));
       visitor->stack_depths->push_back(visitor->current_stack_depth);
     }
 
     size_t current_stack_depth;
-    std::vector<mirror::Object*>* monitors;
+    std::vector<JDWP::ObjectId>* monitors;
     std::vector<uint32_t>* stack_depths;
   };
 
-  std::vector<mirror::Object*> monitor_vector;
-  std::vector<uint32_t> stack_depth_vector;
   ScopedObjectAccessUnchecked soa(Thread::Current());
+  Thread* thread;
   {
     MutexLock mu(soa.Self(), *Locks::thread_list_lock_);
-    Thread* thread;
     JDWP::JdwpError error = DecodeThread(soa, thread_id, thread);
     if (error != JDWP::ERR_NONE) {
       return error;
@@ -943,18 +942,10 @@
     if (!IsSuspendedForDebugger(soa, thread)) {
       return JDWP::ERR_THREAD_NOT_SUSPENDED;
     }
-    std::unique_ptr<Context> context(Context::Create());
-    OwnedMonitorVisitor visitor(thread, context.get(), &monitor_vector, &stack_depth_vector);
-    visitor.WalkStack();
   }
-
-  // Add() requires the thread_list_lock_ not held to avoid the lock
-  // level violation.
-  for (size_t i = 0; i < monitor_vector.size(); ++i) {
-    monitors.push_back(gRegistry->Add(monitor_vector[i]));
-    stack_depths.push_back(stack_depth_vector[i]);
-  }
-
+  std::unique_ptr<Context> context(Context::Create());
+  OwnedMonitorVisitor visitor(thread, context.get(), &monitors, &stack_depths);
+  visitor.WalkStack();
   return JDWP::ERR_NONE;
 }
 
diff --git a/runtime/dex_file_verifier.cc b/runtime/dex_file_verifier.cc
index 61ea870..291e2d0 100644
--- a/runtime/dex_file_verifier.cc
+++ b/runtime/dex_file_verifier.cc
@@ -111,17 +111,17 @@
   }
 
 // Helper macro to load method id. Return last parameter on error.
-#define LOAD_METHOD(var, idx, error_string, error_val)                  \
+#define LOAD_METHOD(var, idx, error_string, error_stmt)                 \
   const DexFile::MethodId* var  = CheckLoadMethodId(idx, error_string); \
   if (UNLIKELY(var == nullptr)) {                                       \
-    return error_val;                                                   \
+    error_stmt;                                                         \
   }
 
 // Helper macro to load method id. Return last parameter on error.
-#define LOAD_FIELD(var, idx, fmt, error_val)                \
+#define LOAD_FIELD(var, idx, fmt, error_stmt)               \
   const DexFile::FieldId* var = CheckLoadFieldId(idx, fmt); \
   if (UNLIKELY(var == nullptr)) {                           \
-    return error_val;                                       \
+    error_stmt;                                             \
   }
 
 bool DexFileVerifier::Verify(const DexFile* dex_file, const byte* begin, size_t size,
@@ -1378,42 +1378,48 @@
   return true;
 }
 
-uint32_t DexFileVerifier::FindFirstClassDataDefiner(const byte* ptr) {
+uint16_t DexFileVerifier::FindFirstClassDataDefiner(const byte* ptr, bool* success) {
   ClassDataItemIterator it(*dex_file_, ptr);
+  *success = true;
 
   if (it.HasNextStaticField() || it.HasNextInstanceField()) {
-    LOAD_FIELD(field, it.GetMemberIndex(), "first_class_data_definer field_id", 0x10000U)
+    LOAD_FIELD(field, it.GetMemberIndex(), "first_class_data_definer field_id",
+               *success = false; return DexFile::kDexNoIndex16)
     return field->class_idx_;
   }
 
   if (it.HasNextDirectMethod() || it.HasNextVirtualMethod()) {
-    LOAD_METHOD(method, it.GetMemberIndex(), "first_class_data_definer method_id", 0x10000U)
+    LOAD_METHOD(method, it.GetMemberIndex(), "first_class_data_definer method_id",
+                *success = false; return DexFile::kDexNoIndex16)
     return method->class_idx_;
   }
 
   return DexFile::kDexNoIndex16;
 }
 
-uint32_t DexFileVerifier::FindFirstAnnotationsDirectoryDefiner(const byte* ptr) {
+uint16_t DexFileVerifier::FindFirstAnnotationsDirectoryDefiner(const byte* ptr, bool* success) {
   const DexFile::AnnotationsDirectoryItem* item =
       reinterpret_cast<const DexFile::AnnotationsDirectoryItem*>(ptr);
+  *success = true;
+
   if (item->fields_size_ != 0) {
     DexFile::FieldAnnotationsItem* field_items = (DexFile::FieldAnnotationsItem*) (item + 1);
-    LOAD_FIELD(field, field_items[0].field_idx_, "first_annotations_dir_definer field_id", 0x10000U)
+    LOAD_FIELD(field, field_items[0].field_idx_, "first_annotations_dir_definer field_id",
+               *success = false; return DexFile::kDexNoIndex16)
     return field->class_idx_;
   }
 
   if (item->methods_size_ != 0) {
     DexFile::MethodAnnotationsItem* method_items = (DexFile::MethodAnnotationsItem*) (item + 1);
     LOAD_METHOD(method, method_items[0].method_idx_, "first_annotations_dir_definer method id",
-                0x10000U)
+                *success = false; return DexFile::kDexNoIndex16)
     return method->class_idx_;
   }
 
   if (item->parameters_size_ != 0) {
     DexFile::ParameterAnnotationsItem* parameter_items = (DexFile::ParameterAnnotationsItem*) (item + 1);
     LOAD_METHOD(method, parameter_items[0].method_idx_, "first_annotations_dir_definer method id",
-                0x10000U)
+                *success = false; return DexFile::kDexNoIndex16)
     return method->class_idx_;
   }
 
@@ -1487,6 +1493,10 @@
 
   DexFileParameterIterator it(*dex_file_, *item);
   while (it.HasNext() && *shorty != '\0') {
+    if (!CheckIndex(it.GetTypeIdx(), dex_file_->NumTypeIds(),
+                    "inter_proto_id_item shorty type_idx")) {
+      return false;
+    }
     const char* descriptor = it.GetDescriptor();
     if (!CheckShortyDescriptorMatch(*shorty, descriptor, false)) {
       return false;
@@ -1699,8 +1709,9 @@
   // Check that references in class_data_item are to the right class.
   if (item->class_data_off_ != 0) {
     const byte* data = begin_ + item->class_data_off_;
-    uint32_t data_definer = FindFirstClassDataDefiner(data);
-    if (data_definer >= 0x10000U) {
+    bool success;
+    uint16_t data_definer = FindFirstClassDataDefiner(data, &success);
+    if (!success) {
       return false;
     }
     if (UNLIKELY((data_definer != item->class_idx_) && (data_definer != DexFile::kDexNoIndex16))) {
@@ -1712,8 +1723,9 @@
   // Check that references in annotations_directory_item are to right class.
   if (item->annotations_off_ != 0) {
     const byte* data = begin_ + item->annotations_off_;
-    uint32_t annotations_definer = FindFirstAnnotationsDirectoryDefiner(data);
-    if (annotations_definer >= 0x10000U) {
+    bool success;
+    uint16_t annotations_definer = FindFirstAnnotationsDirectoryDefiner(data, &success);
+    if (!success) {
       return false;
     }
     if (UNLIKELY((annotations_definer != item->class_idx_) &&
@@ -1777,13 +1789,14 @@
 
 bool DexFileVerifier::CheckInterClassDataItem() {
   ClassDataItemIterator it(*dex_file_, ptr_);
-  uint32_t defining_class = FindFirstClassDataDefiner(ptr_);
-  if (defining_class >= 0x10000U) {
+  bool success;
+  uint16_t defining_class = FindFirstClassDataDefiner(ptr_, &success);
+  if (!success) {
     return false;
   }
 
   for (; it.HasNextStaticField() || it.HasNextInstanceField(); it.Next()) {
-    LOAD_FIELD(field, it.GetMemberIndex(), "inter_class_data_item field_id", false)
+    LOAD_FIELD(field, it.GetMemberIndex(), "inter_class_data_item field_id", return false)
     if (UNLIKELY(field->class_idx_ != defining_class)) {
       ErrorStringPrintf("Mismatched defining class for class_data_item field");
       return false;
@@ -1794,7 +1807,7 @@
     if (code_off != 0 && !CheckOffsetToTypeMap(code_off, DexFile::kDexTypeCodeItem)) {
       return false;
     }
-    LOAD_METHOD(method, it.GetMemberIndex(), "inter_class_data_item method_id", false)
+    LOAD_METHOD(method, it.GetMemberIndex(), "inter_class_data_item method_id", return false)
     if (UNLIKELY(method->class_idx_ != defining_class)) {
       ErrorStringPrintf("Mismatched defining class for class_data_item method");
       return false;
@@ -1808,8 +1821,9 @@
 bool DexFileVerifier::CheckInterAnnotationsDirectoryItem() {
   const DexFile::AnnotationsDirectoryItem* item =
       reinterpret_cast<const DexFile::AnnotationsDirectoryItem*>(ptr_);
-  uint32_t defining_class = FindFirstAnnotationsDirectoryDefiner(ptr_);
-  if (defining_class >= 0x10000U) {
+  bool success;
+  uint16_t defining_class = FindFirstAnnotationsDirectoryDefiner(ptr_, &success);
+  if (!success) {
     return false;
   }
 
@@ -1823,7 +1837,8 @@
       reinterpret_cast<const DexFile::FieldAnnotationsItem*>(item + 1);
   uint32_t field_count = item->fields_size_;
   for (uint32_t i = 0; i < field_count; i++) {
-    LOAD_FIELD(field, field_item->field_idx_, "inter_annotations_directory_item field_id", false)
+    LOAD_FIELD(field, field_item->field_idx_, "inter_annotations_directory_item field_id",
+               return false)
     if (UNLIKELY(field->class_idx_ != defining_class)) {
       ErrorStringPrintf("Mismatched defining class for field_annotation");
       return false;
@@ -1840,7 +1855,7 @@
   uint32_t method_count = item->methods_size_;
   for (uint32_t i = 0; i < method_count; i++) {
     LOAD_METHOD(method, method_item->method_idx_, "inter_annotations_directory_item method_id",
-                false)
+                return false)
     if (UNLIKELY(method->class_idx_ != defining_class)) {
       ErrorStringPrintf("Mismatched defining class for method_annotation");
       return false;
@@ -1857,7 +1872,7 @@
   uint32_t parameter_count = item->parameters_size_;
   for (uint32_t i = 0; i < parameter_count; i++) {
     LOAD_METHOD(parameter_method, parameter_item->method_idx_,
-                "inter_annotations_directory_item parameter method_id", false)
+                "inter_annotations_directory_item parameter method_id", return false)
     if (UNLIKELY(parameter_method->class_idx_ != defining_class)) {
       ErrorStringPrintf("Mismatched defining class for parameter_annotation");
       return false;
diff --git a/runtime/dex_file_verifier.h b/runtime/dex_file_verifier.h
index 7489dcd..f845993 100644
--- a/runtime/dex_file_verifier.h
+++ b/runtime/dex_file_verifier.h
@@ -72,10 +72,10 @@
 
   bool CheckOffsetToTypeMap(size_t offset, uint16_t type);
 
-  // Note: the result type of the following methods is wider than that of the underlying index
-  // (16b vs 32b). This is so that we can define an error value (anything >= 2^16).
-  uint32_t FindFirstClassDataDefiner(const byte* ptr);
-  uint32_t FindFirstAnnotationsDirectoryDefiner(const byte* ptr);
+  // Note: since kDexNoIndex16 (0xFFFF) can be a valid return value, an additional out
+  // parameter is needed to signal errors when loading an index.
+  uint16_t FindFirstClassDataDefiner(const byte* ptr, bool* success);
+  uint16_t FindFirstAnnotationsDirectoryDefiner(const byte* ptr, bool* success);
 
   bool CheckInterStringIdItem();
   bool CheckInterTypeIdItem();
diff --git a/runtime/dex_instruction.h b/runtime/dex_instruction.h
index 1ff5c19..edba502 100644
--- a/runtime/dex_instruction.h
+++ b/runtime/dex_instruction.h
@@ -145,27 +145,28 @@
   };
 
   enum VerifyFlag {
-    kVerifyNone            = 0x00000,
-    kVerifyRegA            = 0x00001,
-    kVerifyRegAWide        = 0x00002,
-    kVerifyRegB            = 0x00004,
-    kVerifyRegBField       = 0x00008,
-    kVerifyRegBMethod      = 0x00010,
-    kVerifyRegBNewInstance = 0x00020,
-    kVerifyRegBString      = 0x00040,
-    kVerifyRegBType        = 0x00080,
-    kVerifyRegBWide        = 0x00100,
-    kVerifyRegC            = 0x00200,
-    kVerifyRegCField       = 0x00400,
-    kVerifyRegCNewArray    = 0x00800,
-    kVerifyRegCType        = 0x01000,
-    kVerifyRegCWide        = 0x02000,
-    kVerifyArrayData       = 0x04000,
-    kVerifyBranchTarget    = 0x08000,
-    kVerifySwitchTargets   = 0x10000,
-    kVerifyVarArg          = 0x20000,
-    kVerifyVarArgRange     = 0x40000,
-    kVerifyError           = 0x80000,
+    kVerifyNone            = 0x000000,
+    kVerifyRegA            = 0x000001,
+    kVerifyRegAWide        = 0x000002,
+    kVerifyRegB            = 0x000004,
+    kVerifyRegBField       = 0x000008,
+    kVerifyRegBMethod      = 0x000010,
+    kVerifyRegBNewInstance = 0x000020,
+    kVerifyRegBString      = 0x000040,
+    kVerifyRegBType        = 0x000080,
+    kVerifyRegBWide        = 0x000100,
+    kVerifyRegC            = 0x000200,
+    kVerifyRegCField       = 0x000400,
+    kVerifyRegCNewArray    = 0x000800,
+    kVerifyRegCType        = 0x001000,
+    kVerifyRegCWide        = 0x002000,
+    kVerifyArrayData       = 0x004000,
+    kVerifyBranchTarget    = 0x008000,
+    kVerifySwitchTargets   = 0x010000,
+    kVerifyVarArg          = 0x020000,
+    kVerifyVarArgRange     = 0x040000,
+    kVerifyRuntimeOnly     = 0x080000,
+    kVerifyError           = 0x100000,
   };
 
   static constexpr uint32_t kMaxVarArgRegs = 5;
@@ -493,18 +494,23 @@
   }
 
   int GetVerifyTypeArgumentB() const {
-    return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegB | kVerifyRegBField | kVerifyRegBMethod |
-             kVerifyRegBNewInstance | kVerifyRegBString | kVerifyRegBType | kVerifyRegBWide));
+    return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegB | kVerifyRegBField |
+        kVerifyRegBMethod | kVerifyRegBNewInstance | kVerifyRegBString | kVerifyRegBType |
+        kVerifyRegBWide));
   }
 
   int GetVerifyTypeArgumentC() const {
     return (kInstructionVerifyFlags[Opcode()] & (kVerifyRegC | kVerifyRegCField |
-             kVerifyRegCNewArray | kVerifyRegCType | kVerifyRegCWide));
+        kVerifyRegCNewArray | kVerifyRegCType | kVerifyRegCWide));
   }
 
   int GetVerifyExtraFlags() const {
     return (kInstructionVerifyFlags[Opcode()] & (kVerifyArrayData | kVerifyBranchTarget |
-             kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgRange | kVerifyError));
+        kVerifySwitchTargets | kVerifyVarArg | kVerifyVarArgRange | kVerifyError));
+  }
+
+  bool GetVerifyIsRuntimeOnly() const {
+    return (kInstructionVerifyFlags[Opcode()] & kVerifyRuntimeOnly) != 0;
   }
 
   // Get the dex PC of this instruction as a offset in code units from the beginning of insns.
diff --git a/runtime/dex_instruction_list.h b/runtime/dex_instruction_list.h
index f43e42f..4cda58b 100644
--- a/runtime/dex_instruction_list.h
+++ b/runtime/dex_instruction_list.h
@@ -245,14 +245,14 @@
   V(0xE0, SHL_INT_LIT8, "shl-int/lit8", k22b, true, kNone, kContinue | kShl | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
   V(0xE1, SHR_INT_LIT8, "shr-int/lit8", k22b, true, kNone, kContinue | kShr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
   V(0xE2, USHR_INT_LIT8, "ushr-int/lit8", k22b, true, kNone, kContinue | kUshr | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE3, IGET_QUICK, "iget-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB) \
-  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE6, IPUT_QUICK, "iput-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB) \
-  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB) \
-  V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArg) \
-  V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArgRange) \
+  V(0xE3, IGET_QUICK, "iget-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE4, IGET_WIDE_QUICK, "iget-wide-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE5, IGET_OBJECT_QUICK, "iget-object-quick", k22c, true, kFieldRef, kContinue | kThrow | kLoad | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE6, IPUT_QUICK, "iput-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE7, IPUT_WIDE_QUICK, "iput-wide-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegAWide | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE8, IPUT_OBJECT_QUICK, "iput-object-quick", k22c, false, kFieldRef, kContinue | kThrow | kStore | kRegCFieldOrConstant, kVerifyRegA | kVerifyRegB | kVerifyRuntimeOnly) \
+  V(0xE9, INVOKE_VIRTUAL_QUICK, "invoke-virtual-quick", k35c, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArg | kVerifyRuntimeOnly) \
+  V(0xEA, INVOKE_VIRTUAL_RANGE_QUICK, "invoke-virtual/range-quick", k3rc, false, kMethodRef, kContinue | kThrow | kInvoke, kVerifyVarArgRange | kVerifyRuntimeOnly) \
   V(0xEB, UNUSED_EB, "unused-eb", k10x, false, kUnknown, 0, kVerifyError) \
   V(0xEC, UNUSED_EC, "unused-ec", k10x, false, kUnknown, 0, kVerifyError) \
   V(0xED, UNUSED_ED, "unused-ed", k10x, false, kUnknown, 0, kVerifyError) \
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index ccf478c..717381c 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -574,7 +574,7 @@
       break;
   }
 
-  if (implicit_checks_supported &&
+  if (!options->interpreter_only_ && implicit_checks_supported &&
       (options->explicit_checks_ != (ParsedOptions::kExplicitSuspendCheck |
           ParsedOptions::kExplicitNullCheck |
           ParsedOptions::kExplicitStackOverflowCheck) || kEnableJavaStackTraceHandler)) {
diff --git a/runtime/verifier/method_verifier.cc b/runtime/verifier/method_verifier.cc
index 63a1fe5..2f4e805 100644
--- a/runtime/verifier/method_verifier.cc
+++ b/runtime/verifier/method_verifier.cc
@@ -274,7 +274,7 @@
     result = kHardFailure;
   }
   uint64_t duration_ns = NanoTime() - start_ns;
-  if (duration_ns > MsToNs(100)) {
+  if (duration_ns > MsToNs(100) && !kIsDebugBuild) {
     LOG(WARNING) << "Verification of " << PrettyMethod(method_idx, *dex_file)
                  << " took " << PrettyDuration(duration_ns);
   }
@@ -731,6 +731,10 @@
       result = false;
       break;
   }
+  if (inst->GetVerifyIsRuntimeOnly() && Runtime::Current()->IsCompiler()) {
+    Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "opcode only expected at runtime " << inst->Name();
+    result = false;
+  }
   return result;
 }
 
@@ -1334,31 +1338,6 @@
     insn_flags_[insn_idx].ClearChanged();
   }
 
-  // When we're in compiler mode, do not accept quickened instructions.
-  // We explicitly iterate over *all* instructions to check code that may be unreachable and
-  // missed by the loop above.
-  if (Runtime::Current() != nullptr && Runtime::Current()->IsCompiler()) {
-    uint32_t insn_idx = 0;
-    for (; insn_idx < insns_size; insn_idx += insn_flags_[insn_idx].GetLengthInCodeUnits()) {
-      const Instruction* inst = Instruction::At(insns + insn_idx);
-      switch (inst->Opcode()) {
-        case Instruction::IGET_QUICK:
-        case Instruction::IGET_WIDE_QUICK:
-        case Instruction::IGET_OBJECT_QUICK:
-        case Instruction::IPUT_QUICK:
-        case Instruction::IPUT_WIDE_QUICK:
-        case Instruction::IPUT_OBJECT_QUICK:
-        case Instruction::INVOKE_VIRTUAL_QUICK:
-        case Instruction::INVOKE_VIRTUAL_RANGE_QUICK:
-          Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "Quickened instructions not allowed. ";
-          return false;
-
-        default:
-          break;
-      }
-    }
-  }
-
   if (gDebugVerify) {
     /*
      * Scan for dead code. There's nothing "evil" about dead code
@@ -2156,15 +2135,12 @@
     case Instruction::INVOKE_VIRTUAL_RANGE:
     case Instruction::INVOKE_SUPER:
     case Instruction::INVOKE_SUPER_RANGE: {
-      if (inst->VRegA() == 0) {
-        Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "invoke_virtual/super needs at least receiver";
-      }
       bool is_range = (inst->Opcode() == Instruction::INVOKE_VIRTUAL_RANGE ||
                        inst->Opcode() == Instruction::INVOKE_SUPER_RANGE);
       bool is_super = (inst->Opcode() == Instruction::INVOKE_SUPER ||
                        inst->Opcode() == Instruction::INVOKE_SUPER_RANGE);
-      mirror::ArtMethod* called_method = VerifyInvocationArgs(inst, METHOD_VIRTUAL,
-                                                                   is_range, is_super);
+      mirror::ArtMethod* called_method = VerifyInvocationArgs(inst, METHOD_VIRTUAL, is_range,
+                                                              is_super);
       const RegType* return_type = nullptr;
       if (called_method != nullptr) {
         Thread* self = Thread::Current();
@@ -3037,6 +3013,26 @@
   // Resolve the method. This could be an abstract or concrete method depending on what sort of call
   // we're making.
   const uint32_t method_idx = (is_range) ? inst->VRegB_3rc() : inst->VRegB_35c();
+
+  // As the method may not have been resolved, check the argument count statically against the
+  // signature declared in the dex file.
+  const DexFile::MethodId& method_id = dex_file_->GetMethodId(method_idx);
+  uint32_t shorty_idx = dex_file_->GetProtoId(method_id.proto_idx_).shorty_idx_;
+  uint32_t shorty_len;
+  const char* descriptor = dex_file_->StringDataAndUtf16LengthByIdx(shorty_idx, &shorty_len);
+  int32_t sig_registers = method_type == METHOD_STATIC ? 0 : 1;
+  for (size_t i = 1; i < shorty_len; i++) {
+    if (descriptor[i] == 'J' || descriptor[i] == 'D') {
+      sig_registers += 2;
+    } else {
+      sig_registers++;
+    }
+  }
+  if (inst->VRegA() != sig_registers) {
+    Fail(VERIFY_ERROR_BAD_CLASS_HARD) << "Rejecting invocation, expected " << inst->VRegA() <<
+        " arguments, found " << sig_registers;
+    return nullptr;
+  }
+
   mirror::ArtMethod* res_method = ResolveMethodAndCheckAccess(method_idx, method_type);
   if (res_method == NULL) {  // error or class is unresolved
     return NULL;
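To make the new check above concrete, a small standalone sketch of the register count derived from a method shorty (assumed helper name; the shorty's first character is the return type and is skipped, non-static calls add one register for the receiver, and long/double each occupy two registers):

#include <cstring>

// Count the argument registers an invoke instruction must supply for `shorty`.
int CountSigRegisters(const char* shorty, bool is_static) {
  int sig_registers = is_static ? 0 : 1;  // Receiver for non-static calls.
  for (size_t i = 1; i < strlen(shorty); ++i) {  // Skip the return type at index 0.
    sig_registers += (shorty[i] == 'J' || shorty[i] == 'D') ? 2 : 1;  // Wide types take two.
  }
  return sig_registers;
}
// Example: CountSigRegisters("VJI", /*is_static=*/ false) == 4
// (1 receiver + 2 for the long + 1 for the int), so an invoke-virtual whose
// vA is not 4 would now fail verification.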
diff --git a/test/404-optimizing-allocator/src/Main.java b/test/404-optimizing-allocator/src/Main.java
index 60477f9..7b31820 100644
--- a/test/404-optimizing-allocator/src/Main.java
+++ b/test/404-optimizing-allocator/src/Main.java
@@ -27,7 +27,8 @@
     expectEquals(10, $opt$reg$TestMultipleLive());
     expectEquals(1, $opt$reg$TestWithBreakAndContinue());
     expectEquals(-15, $opt$reg$testSpillInIf(5, 6, 7));
-    expectEquals(-567, $opt$reg$TestAgressiveLive(1, 2, 3, 4, 5, 6, 7));
+    expectEquals(-567, $opt$reg$TestAgressiveLive1(1, 2, 3, 4, 5, 6, 7));
+    expectEquals(-77, $opt$reg$TestAgressiveLive2(1, 2, 3, 4, 5, 6, 7));
   }
 
   public static int $opt$reg$TestLostCopy() {
@@ -125,7 +126,7 @@
     return a - b - c - d - e;
   }
 
-  public static int $opt$reg$TestAgressiveLive(int a, int b, int c, int d, int e, int f, int g) {
+  public static int $opt$reg$TestAgressiveLive1(int a, int b, int c, int d, int e, int f, int g) {
     int h = a - b;
     int i = c - d;
     int j = e - f;
@@ -146,6 +147,17 @@
     return a - b - c - d - e - f - g - h - i - j - k;
   }
 
+  public static int $opt$reg$TestAgressiveLive2(int a, int b, int c, int d, int e, int f, int g) {
+    int h = a - b;
+    int i = c - d;
+    int j = e - f;
+    int k = 42 + g - a;
+    do {
+      h++;
+    } while (h != 5);
+    return a - b - c - d - e - f - g - h - i - j - k;
+  }
+
   public static void expectEquals(int expected, int value) {
     if (expected != value) {
       throw new Error("Expected: " + expected + ", got: " + value);