x86_64: Enable fp-reg promotion

This patch makes four registers, XMM12-XMM15, available for promotion
of fp virtual registers.

Change-Id: I3f89ad07fc8ae98b70f550eada09be7b693ffb67
Signed-off-by: Serguei Katkov <serguei.i.katkov@intel.com>
Signed-off-by: Chao-ying Fu <chao-ying.fu@intel.com>
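
For context, here is a minimal arithmetic sketch (not part of the patch; PopCount
and kPromotableXmmMask are illustrative names) of the per-frame cost of treating
XMM12-15 as callee saves, i.e. the "4*8" bytes that appear throughout the
frame-size changes below:

    #include <cstdint>

    constexpr int PopCount(uint32_t x) {
      int n = 0;
      for (; x != 0; x &= x - 1) ++n;
      return n;
    }

    // XMM12-15 become callee save for ART-compiled code.
    constexpr uint32_t kPromotableXmmMask =
        (1u << 12) | (1u << 13) | (1u << 14) | (1u << 15);

    // Each double-precision spill slot is 8 bytes, hence the "+ 4*8"
    // added to every callee-save frame size in this patch.
    static_assert(PopCount(kPromotableXmmMask) * sizeof(double) == 4 * 8,
                  "four callee-save XMMs add 32 bytes of spill area");
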
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 5870d22..048aca3 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1046,9 +1046,19 @@
     }
     // Push a marker to take place of lr.
     vmap_encoder.PushBackUnsigned(VmapTable::kAdjustedFpMarker);
-    // fp regs already sorted.
-    for (uint32_t i = 0; i < fp_vmap_table_.size(); i++) {
-      vmap_encoder.PushBackUnsigned(fp_vmap_table_[i] + VmapTable::kEntryAdjustment);
+    if (cu_->instruction_set == kThumb2) {
+      // fp regs already sorted.
+      for (uint32_t i = 0; i < fp_vmap_table_.size(); i++) {
+        vmap_encoder.PushBackUnsigned(fp_vmap_table_[i] + VmapTable::kEntryAdjustment);
+      }
+    } else {
+      // For other platforms regs may have been inserted out of order - sort first.
+      std::sort(fp_vmap_table_.begin(), fp_vmap_table_.end());
+      for (size_t i = 0 ; i < fp_vmap_table_.size(); ++i) {
+        // Copy, stripping out the phys register sort key.
+        vmap_encoder.PushBackUnsigned(
+            ~(-1 << VREG_NUM_WIDTH) & (fp_vmap_table_[i] + VmapTable::kEntryAdjustment));
+      }
     }
   } else {
     DCHECK_EQ(POPCOUNT(core_spill_mask_), 0);
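
To illustrate the masking above, a small sketch of the entry packing it assumes
(the width below is an assumption; the real value comes from VREG_NUM_WIDTH in the
compiler headers):

    #include <cstdint>

    constexpr int kVregNumWidth = 16;                        // stand-in for VREG_NUM_WIDTH
    constexpr uint32_t kVregMask = ~(~0u << kVregNumWidth);  // plays the role of ~(-1 << VREG_NUM_WIDTH)

    // The physical register sits in the high bits purely as a sort key;
    // only the low bits (the Dalvik vreg number) are emitted into the vmap.
    constexpr uint32_t PackEntry(uint32_t phys_reg, uint32_t vreg) {
      return (phys_reg << kVregNumWidth) | vreg;
    }

    static_assert((PackEntry(/*phys_reg=*/12, /*vreg=*/5) & kVregMask) == 5,
                  "masking strips the sort key and leaves the vreg number");
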
diff --git a/compiler/dex/quick/x86/call_x86.cc b/compiler/dex/quick/x86/call_x86.cc
index 9000514..8e2a1e3 100644
--- a/compiler/dex/quick/x86/call_x86.cc
+++ b/compiler/dex/quick/x86/call_x86.cc
@@ -234,8 +234,7 @@
   NewLIR0(kPseudoMethodEntry);
   /* Spill core callee saves */
   SpillCoreRegs();
-  /* NOTE: promotion of FP regs currently unsupported, thus no FP spill */
-  DCHECK_EQ(num_fp_spills_, 0);
+  SpillFPRegs();
   if (!skip_overflow_check) {
     class StackOverflowSlowPath : public LIRSlowPath {
      public:
@@ -309,6 +308,7 @@
 
   NewLIR0(kPseudoMethodExit);
   UnSpillCoreRegs();
+  UnSpillFPRegs();
   /* Remove frame except for return address */
   stack_increment_ = OpRegImm(kOpAdd, rs_rX86_SP, frame_size_ - GetInstructionSetPointerSize(cu_->instruction_set));
   NewLIR0(kX86Ret);
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index ff7b30e..b0c54e8 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -319,6 +319,8 @@
   void OpRegThreadMem(OpKind op, RegStorage r_dest, ThreadOffset<8> thread_offset);
   void SpillCoreRegs();
   void UnSpillCoreRegs();
+  void UnSpillFPRegs();
+  void SpillFPRegs();
   static const X86EncodingMap EncodingMap[kX86Last];
   bool InexpensiveConstantInt(int32_t value);
   bool InexpensiveConstantFloat(int32_t value);
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index e81f505..1ebbbbd 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -52,6 +52,13 @@
     rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7,
     rs_dr8, rs_dr9, rs_dr10, rs_dr11, rs_dr12, rs_dr13, rs_dr14, rs_dr15
 };
+static constexpr RegStorage xp_regs_arr_32[] = {
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+};
+static constexpr RegStorage xp_regs_arr_64[] = {
+    rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
+    rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15
+};
 static constexpr RegStorage reserved_regs_arr_32[] = {rs_rX86_SP_32};
 static constexpr RegStorage reserved_regs_arr_64[] = {rs_rX86_SP_32};
 static constexpr RegStorage reserved_regs_arr_64q[] = {rs_rX86_SP_64};
@@ -60,6 +67,24 @@
     rs_rAX, rs_rCX, rs_rDX, rs_rSI, rs_rDI,
     rs_r8, rs_r9, rs_r10, rs_r11
 };
+
+// How to make a register available for promotion:
+// 1) Remove the register from the array defining temps
+// 2) Update ClobberCallerSave
+// 3) Update the JNI compiler ABI:
+// 3.1) Add the reg in the JniCallingConvention method
+// 3.2) Update CoreSpillMask/FpSpillMask
+// 4) Update the entrypoints:
+// 4.1) Update constants in asm_support_x86_64.h for the new frame size
+// 4.2) Remove the entry in SmashCallerSaves
+// 4.3) Update jni_entrypoints to spill/unspill the new callee-save reg
+// 4.4) Update quick_entrypoints to spill/unspill the new callee-save reg
+// 5) Update the runtime ABI:
+// 5.1) Update quick_method_frame_info with the new required spills
+// 5.2) Update QuickArgumentVisitor with the new offsets to GPRs and XMMs
+// Note that you cannot use a register corresponding to incoming args
+// per the ABI, and QCG needs one additional XMM temp for the bulk copy
+// done in preparation for a call.
 static constexpr RegStorage core_temps_arr_64q[] = {
     rs_r0q, rs_r1q, rs_r2q, rs_r6q, rs_r7q,
     rs_r8q, rs_r9q, rs_r10q, rs_r11q
@@ -69,14 +94,14 @@
 };
 static constexpr RegStorage sp_temps_arr_64[] = {
     rs_fr0, rs_fr1, rs_fr2, rs_fr3, rs_fr4, rs_fr5, rs_fr6, rs_fr7,
-    rs_fr8, rs_fr9, rs_fr10, rs_fr11, rs_fr12, rs_fr13, rs_fr14, rs_fr15
+    rs_fr8, rs_fr9, rs_fr10, rs_fr11
 };
 static constexpr RegStorage dp_temps_arr_32[] = {
     rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7,
 };
 static constexpr RegStorage dp_temps_arr_64[] = {
     rs_dr0, rs_dr1, rs_dr2, rs_dr3, rs_dr4, rs_dr5, rs_dr6, rs_dr7,
-    rs_dr8, rs_dr9, rs_dr10, rs_dr11, rs_dr12, rs_dr13, rs_dr14, rs_dr15
+    rs_dr8, rs_dr9, rs_dr10, rs_dr11
 };
 
 static constexpr RegStorage xp_temps_arr_32[] = {
@@ -84,7 +109,7 @@
 };
 static constexpr RegStorage xp_temps_arr_64[] = {
     rs_xr0, rs_xr1, rs_xr2, rs_xr3, rs_xr4, rs_xr5, rs_xr6, rs_xr7,
-    rs_xr8, rs_xr9, rs_xr10, rs_xr11, rs_xr12, rs_xr13, rs_xr14, rs_xr15
+    rs_xr8, rs_xr9, rs_xr10, rs_xr11
 };
 
 static constexpr ArrayRef<const RegStorage> empty_pool;
@@ -95,6 +120,8 @@
 static constexpr ArrayRef<const RegStorage> sp_regs_64(sp_regs_arr_64);
 static constexpr ArrayRef<const RegStorage> dp_regs_32(dp_regs_arr_32);
 static constexpr ArrayRef<const RegStorage> dp_regs_64(dp_regs_arr_64);
+static constexpr ArrayRef<const RegStorage> xp_regs_32(xp_regs_arr_32);
+static constexpr ArrayRef<const RegStorage> xp_regs_64(xp_regs_arr_64);
 static constexpr ArrayRef<const RegStorage> reserved_regs_32(reserved_regs_arr_32);
 static constexpr ArrayRef<const RegStorage> reserved_regs_64(reserved_regs_arr_64);
 static constexpr ArrayRef<const RegStorage> reserved_regs_64q(reserved_regs_arr_64q);
@@ -437,21 +464,13 @@
 
 /* Clobber all regs that might be used by an external C call */
 void X86Mir2Lir::ClobberCallerSave() {
-  Clobber(rs_rAX);
-  Clobber(rs_rCX);
-  Clobber(rs_rDX);
-  Clobber(rs_rBX);
-
-  Clobber(rs_fr0);
-  Clobber(rs_fr1);
-  Clobber(rs_fr2);
-  Clobber(rs_fr3);
-  Clobber(rs_fr4);
-  Clobber(rs_fr5);
-  Clobber(rs_fr6);
-  Clobber(rs_fr7);
-
   if (cu_->target64) {
+    Clobber(rs_rAX);
+    Clobber(rs_rCX);
+    Clobber(rs_rDX);
+    Clobber(rs_rSI);
+    Clobber(rs_rDI);
+
     Clobber(rs_r8);
     Clobber(rs_r9);
     Clobber(rs_r10);
@@ -461,11 +480,21 @@
     Clobber(rs_fr9);
     Clobber(rs_fr10);
     Clobber(rs_fr11);
-    Clobber(rs_fr12);
-    Clobber(rs_fr13);
-    Clobber(rs_fr14);
-    Clobber(rs_fr15);
+  } else {
+    Clobber(rs_rAX);
+    Clobber(rs_rCX);
+    Clobber(rs_rDX);
+    Clobber(rs_rBX);
   }
+
+  Clobber(rs_fr0);
+  Clobber(rs_fr1);
+  Clobber(rs_fr2);
+  Clobber(rs_fr3);
+  Clobber(rs_fr4);
+  Clobber(rs_fr5);
+  Clobber(rs_fr6);
+  Clobber(rs_fr7);
 }
 
 RegLocation X86Mir2Lir::GetReturnWideAlt() {
@@ -599,11 +628,15 @@
   // Target-specific adjustments.
 
   // Add in XMM registers.
-  const ArrayRef<const RegStorage> *xp_temps = cu_->target64 ? &xp_temps_64 : &xp_temps_32;
-  for (RegStorage reg : *xp_temps) {
+  const ArrayRef<const RegStorage> *xp_regs = cu_->target64 ? &xp_regs_64 : &xp_regs_32;
+  for (RegStorage reg : *xp_regs) {
     RegisterInfo* info = new (arena_) RegisterInfo(reg, GetRegMaskCommon(reg));
     reginfo_map_.Put(reg.GetReg(), info);
-    info->SetIsTemp(true);
+  }
+  const ArrayRef<const RegStorage> *xp_temps = cu_->target64 ? &xp_temps_64 : &xp_temps_32;
+  for (RegStorage reg : *xp_temps) {
+    RegisterInfo* xp_reg_info = GetRegInfo(reg);
+    xp_reg_info->SetIsTemp(true);
   }
 
   // Alias single precision xmm to double xmms.
@@ -665,9 +698,11 @@
   // Spill mask not including fake return address register
   uint32_t mask = core_spill_mask_ & ~(1 << rs_rRET.GetRegNum());
   int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * num_core_spills_);
+  OpSize size = cu_->target64 ? k64 : k32;
   for (int reg = 0; mask; mask >>= 1, reg++) {
     if (mask & 0x1) {
-      StoreWordDisp(rs_rX86_SP, offset, RegStorage::Solo32(reg));
+      StoreBaseDisp(rs_rX86_SP, offset, cu_->target64 ? RegStorage::Solo64(reg) : RegStorage::Solo32(reg),
+                   size, kNotVolatile);
       offset += GetInstructionSetPointerSize(cu_->instruction_set);
     }
   }
@@ -680,14 +715,46 @@
   // Spill mask not including fake return address register
   uint32_t mask = core_spill_mask_ & ~(1 << rs_rRET.GetRegNum());
   int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * num_core_spills_);
+  OpSize size = cu_->target64 ? k64 : k32;
   for (int reg = 0; mask; mask >>= 1, reg++) {
     if (mask & 0x1) {
-      LoadWordDisp(rs_rX86_SP, offset, RegStorage::Solo32(reg));
+      LoadBaseDisp(rs_rX86_SP, offset, cu_->target64 ? RegStorage::Solo64(reg) : RegStorage::Solo32(reg),
+                   size, kNotVolatile);
       offset += GetInstructionSetPointerSize(cu_->instruction_set);
     }
   }
 }
 
+void X86Mir2Lir::SpillFPRegs() {
+  if (num_fp_spills_ == 0) {
+    return;
+  }
+  uint32_t mask = fp_spill_mask_;
+  int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * (num_fp_spills_ + num_core_spills_));
+  for (int reg = 0; mask; mask >>= 1, reg++) {
+    if (mask & 0x1) {
+      StoreBaseDisp(rs_rX86_SP, offset, RegStorage::FloatSolo64(reg),
+                   k64, kNotVolatile);
+      offset += sizeof(double);
+    }
+  }
+}
+void X86Mir2Lir::UnSpillFPRegs() {
+  if (num_fp_spills_ == 0) {
+    return;
+  }
+  uint32_t mask = fp_spill_mask_;
+  int offset = frame_size_ - (GetInstructionSetPointerSize(cu_->instruction_set) * (num_fp_spills_ + num_core_spills_));
+  for (int reg = 0; mask; mask >>= 1, reg++) {
+    if (mask & 0x1) {
+      LoadBaseDisp(rs_rX86_SP, offset, RegStorage::FloatSolo64(reg),
+                   k64, kNotVolatile);
+      offset += sizeof(double);
+    }
+  }
+}
+
+
 bool X86Mir2Lir::IsUnconditionalBranch(LIR* lir) {
   return (lir->opcode == kX86Jmp8 || lir->opcode == kX86Jmp32);
 }
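
A worked example of the offset computation in SpillFPRegs/UnSpillFPRegs above
(all numbers are hypothetical; on x86_64 GetInstructionSetPointerSize is 8, so the
promoted FP spills sit directly below the core spill area):

    constexpr int kPtrSize = 8;           // x86_64 pointer size
    constexpr int frame_size = 96;        // illustrative frame_size_
    constexpr int num_core_spills = 3;    // illustrative num_core_spills_
    constexpr int num_fp_spills = 2;      // illustrative num_fp_spills_

    constexpr int core_spill_base = frame_size - kPtrSize * num_core_spills;                  // 72
    constexpr int fp_spill_base = frame_size - kPtrSize * (num_fp_spills + num_core_spills);  // 56

    static_assert(fp_spill_base + num_fp_spills * 8 == core_spill_base,
                  "the 8-byte FP spill slots end exactly where the core spill area begins");
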
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 2789923..5657381 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -66,7 +66,9 @@
  *  XMM6: caller | caller, arg7 | caller, scratch            | caller, arg7, scratch
  *  XMM7: caller | caller, arg8 | caller, scratch            | caller, arg8, scratch
  *  ---  x86-64/x32 registers
- *  XMM8 .. 15: caller save available as scratch registers for ART.
+ *  XMM8 .. 11: caller save available as scratch registers for ART.
+ *  XMM12 .. 15: callee save available as promoted registers for ART.
+ *  The XMM12..15 change applies to QCG-compiled code only; elsewhere they remain caller save.
  *
  * X87 is a necessary evil outside of ART code for x86:
  *  ST0:  x86 float/double native return value, caller save
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
index 5febed2..525f05c 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.cc
@@ -130,6 +130,10 @@
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R13));
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R14));
   callee_save_regs_.push_back(X86_64ManagedRegister::FromCpuRegister(R15));
+  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM12));
+  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM13));
+  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM14));
+  callee_save_regs_.push_back(X86_64ManagedRegister::FromXmmRegister(XMM15));
 }
 
 uint32_t X86_64JniCallingConvention::CoreSpillMask() const {
@@ -137,6 +141,10 @@
       1 << kNumberOfCpuRegisters;
 }
 
+uint32_t X86_64JniCallingConvention::FpSpillMask() const {
+  return 1 << XMM12 | 1 << XMM13 | 1 << XMM14 | 1 << XMM15;
+}
+
 size_t X86_64JniCallingConvention::FrameSize() {
   // Method*, return address and callee save area size, local reference segment state
   size_t frame_data_size = sizeof(StackReference<mirror::ArtMethod>) +
diff --git a/compiler/jni/quick/x86_64/calling_convention_x86_64.h b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
index 1ba5353..7a90c6e 100644
--- a/compiler/jni/quick/x86_64/calling_convention_x86_64.h
+++ b/compiler/jni/quick/x86_64/calling_convention_x86_64.h
@@ -61,9 +61,7 @@
   }
   ManagedRegister ReturnScratchRegister() const OVERRIDE;
   uint32_t CoreSpillMask() const OVERRIDE;
-  uint32_t FpSpillMask() const OVERRIDE {
-    return 0;
-  }
+  uint32_t FpSpillMask() const OVERRIDE;
   bool IsCurrentParamInRegister() OVERRIDE;
   bool IsCurrentParamOnStack() OVERRIDE;
   ManagedRegister CurrentParamRegister() OVERRIDE;
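
A small consistency sketch for the FpSpillMask() added above (kJniFpSpillMask is a
stand-in name, and the check assumes the XMM12..XMM15 enumerators have the values
12..15):

    #include <cstdint>

    constexpr uint32_t kJniFpSpillMask = (1u << 12) | (1u << 13) | (1u << 14) | (1u << 15);

    static_assert(kJniFpSpillMask == 0xF000u,
                  "one bit per XMM register pushed into callee_save_regs_ above");
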
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index 4d5d613..78738d8 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1671,16 +1671,31 @@
                                  const std::vector<ManagedRegister>& spill_regs,
                                  const ManagedRegisterEntrySpills& entry_spills) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    pushq(spill_regs.at(i).AsX86_64().AsCpuRegister());
+    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    if (spill.IsCpuRegister()) {
+      pushq(spill.AsCpuRegister());
+      gpr_count++;
+    }
   }
   // return address then method on stack
-  addq(CpuRegister(RSP), Immediate(-static_cast<int64_t>(frame_size) + (spill_regs.size() * kFramePointerSize) +
-                                   sizeof(StackReference<mirror::ArtMethod>) /*method*/ +
-                                   kFramePointerSize /*return address*/));
+  int64_t rest_of_frame = static_cast<int64_t>(frame_size)
+                          - (gpr_count * kFramePointerSize)
+                          - kFramePointerSize /*return address*/;
+  subq(CpuRegister(RSP), Immediate(rest_of_frame));
+  // spill xmms
+  int64_t offset = rest_of_frame;
+  for (int i = spill_regs.size() - 1; i >= 0; --i) {
+    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    if (spill.IsXmmRegister()) {
+      offset -= sizeof(double);
+      movsd(Address(CpuRegister(RSP), offset), spill.AsXmmRegister());
+    }
+  }
 
   DCHECK_EQ(4U, sizeof(StackReference<mirror::ArtMethod>));
-  subq(CpuRegister(RSP), Immediate(4));
+
   movl(Address(CpuRegister(RSP), 0), method_reg.AsX86_64().AsCpuRegister());
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
@@ -1707,9 +1722,24 @@
 void X86_64Assembler::RemoveFrame(size_t frame_size,
                             const std::vector<ManagedRegister>& spill_regs) {
   CHECK_ALIGNED(frame_size, kStackAlignment);
-  addq(CpuRegister(RSP), Immediate(static_cast<int64_t>(frame_size) - (spill_regs.size() * kFramePointerSize) - kFramePointerSize));
+  int gpr_count = 0;
+  // unspill xmms
+  int64_t offset = static_cast<int64_t>(frame_size) - (spill_regs.size() * kFramePointerSize) - 2 * kFramePointerSize;
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    popq(spill_regs.at(i).AsX86_64().AsCpuRegister());
+    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    if (spill.IsXmmRegister()) {
+      offset += sizeof(double);
+      movsd(spill.AsXmmRegister(), Address(CpuRegister(RSP), offset));
+    } else {
+      gpr_count++;
+    }
+  }
+  addq(CpuRegister(RSP), Immediate(static_cast<int64_t>(frame_size) - (gpr_count * kFramePointerSize) - kFramePointerSize));
+  for (size_t i = 0; i < spill_regs.size(); ++i) {
+    x86_64::X86_64ManagedRegister spill = spill_regs.at(i).AsX86_64();
+    if (spill.IsCpuRegister()) {
+      popq(spill.AsCpuRegister());
+    }
   }
   ret();
 }
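
A worked example of the BuildFrame/RemoveFrame arithmetic above (all quantities are
hypothetical; kFramePointerSize is 8 on x86_64):

    constexpr int kFPS = 8;          // kFramePointerSize
    constexpr int frame_size = 96;   // illustrative, must be stack aligned
    constexpr int gpr_count = 2;     // e.g. two GPR spills, pushed first
    constexpr int xmm_count = 2;     // e.g. XMM12 and XMM13

    // BuildFrame: stack left to allocate after the GPR pushes and the return address.
    constexpr int rest_of_frame = frame_size - gpr_count * kFPS - kFPS;   // 72
    constexpr int lowest_xmm_slot = rest_of_frame - xmm_count * kFPS;     // 56

    // RemoveFrame: starting offset before the first "offset += sizeof(double)".
    constexpr int remove_start = frame_size - (gpr_count + xmm_count) * kFPS - 2 * kFPS;  // 48

    static_assert(remove_start + kFPS == lowest_xmm_slot,
                  "RemoveFrame reloads the XMMs from the slots BuildFrame stored them in");
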
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index f7bad8b..dc1758f 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -246,11 +246,9 @@
   str << "pushq %rsi\n";
   str << "pushq %r10\n";
   // 2) Move down the stack pointer.
-  ssize_t displacement = -static_cast<ssize_t>(frame_size) + spill_regs.size() * 8 +
-      sizeof(StackReference<mirror::ArtMethod>) + 8;
-  str << "addq $" << displacement << ", %rsp\n";
-  // 3) Make space for method reference, and store it.
-  str << "subq $4, %rsp\n";
+  ssize_t displacement = static_cast<ssize_t>(frame_size) - (spill_regs.size() * 8 + 8);
+  str << "subq $" << displacement << ", %rsp\n";
+  // 3) Store method reference.
   str << "movl %edi, (%rsp)\n";
   // 4) Entry spills.
   str << "movq %rax, " << frame_size + 0 << "(%rsp)\n";
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index bff8501..05d0ef8 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -35,9 +35,9 @@
 // Offset of field Thread::thin_lock_thread_id_ verified in InitCpu
 #define THREAD_ID_OFFSET 12
 
-#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 64
-#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 64
-#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 176
+#define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 64 + 4*8
+#define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 64 + 4*8
+#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 176 + 4*8
 
 // Expected size of a heap reference
 #define HEAP_REFERENCE_SIZE 4
diff --git a/runtime/arch/x86_64/context_x86_64.cc b/runtime/arch/x86_64/context_x86_64.cc
index e1f47ee..7699eaf 100644
--- a/runtime/arch/x86_64/context_x86_64.cc
+++ b/runtime/arch/x86_64/context_x86_64.cc
@@ -78,6 +78,18 @@
   gprs_[R9] = nullptr;
   gprs_[R10] = nullptr;
   gprs_[R11] = nullptr;
+  fprs_[XMM0] = nullptr;
+  fprs_[XMM1] = nullptr;
+  fprs_[XMM2] = nullptr;
+  fprs_[XMM3] = nullptr;
+  fprs_[XMM4] = nullptr;
+  fprs_[XMM5] = nullptr;
+  fprs_[XMM6] = nullptr;
+  fprs_[XMM7] = nullptr;
+  fprs_[XMM8] = nullptr;
+  fprs_[XMM9] = nullptr;
+  fprs_[XMM10] = nullptr;
+  fprs_[XMM11] = nullptr;
 }
 
 bool X86_64Context::SetGPR(uint32_t reg, uintptr_t value) {
@@ -102,41 +114,26 @@
   }
 }
 
+extern "C" void art_quick_do_long_jump(uintptr_t*, uintptr_t*);
+
 void X86_64Context::DoLongJump() {
 #if defined(__x86_64__)
-  // Array of GPR values, filled from the context backward for the long jump pop. We add a slot at
-  // the top for the stack pointer that doesn't get popped in a pop-all.
-  volatile uintptr_t gprs[kNumberOfCpuRegisters + 1];
+  uintptr_t gprs[kNumberOfCpuRegisters + 1];
+  uintptr_t fprs[kNumberOfFloatRegisters];
+
   for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
     gprs[kNumberOfCpuRegisters - i - 1] = gprs_[i] != nullptr ? *gprs_[i] : X86_64Context::kBadGprBase + i;
   }
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : X86_64Context::kBadFprBase + i;
+  }
+
   // We want to load the stack pointer one slot below so that the ret will pop eip.
   uintptr_t rsp = gprs[kNumberOfCpuRegisters - RSP - 1] - kWordSize;
   gprs[kNumberOfCpuRegisters] = rsp;
   *(reinterpret_cast<uintptr_t*>(rsp)) = rip_;
-  __asm__ __volatile__(
-      "movq %0, %%rsp\n\t"  // RSP points to gprs.
-      "popq %%r15\n\t"       // Load all registers except RSP and RIP with values in gprs.
-      "popq %%r14\n\t"
-      "popq %%r13\n\t"
-      "popq %%r12\n\t"
-      "popq %%r11\n\t"
-      "popq %%r10\n\t"
-      "popq %%r9\n\t"
-      "popq %%r8\n\t"
-      "popq %%rdi\n\t"
-      "popq %%rsi\n\t"
-      "popq %%rbp\n\t"
-      "addq $8, %%rsp\n\t"
-      "popq %%rbx\n\t"
-      "popq %%rdx\n\t"
-      "popq %%rcx\n\t"
-      "popq %%rax\n\t"
-      "popq %%rsp\n\t"      // Load stack pointer.
-      "ret\n\t"             // From higher in the stack pop rip.
-      :  // output.
-      : "g"(&gprs[0])  // input.
-      :);  // clobber.
+
+  art_quick_do_long_jump(gprs, fprs);
 #else
   UNIMPLEMENTED(FATAL);
 #endif
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index 609d1c6..204d52c 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -35,7 +35,7 @@
 extern "C" void art_portable_to_interpreter_bridge(mirror::ArtMethod*);
 
 // Cast entrypoints.
-extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass,
+extern "C" uint32_t art_quick_assignable_from_code(const mirror::Class* klass,
                                             const mirror::Class* ref_class);
 extern "C" void art_quick_check_cast(void*, void*);
 
@@ -129,7 +129,7 @@
   ResetQuickAllocEntryPoints(qpoints);
 
   // Cast
-  qpoints->pInstanceofNonTrivial = artIsAssignableFromCode;
+  qpoints->pInstanceofNonTrivial = art_quick_assignable_from_code;
   qpoints->pCheckCast = art_quick_check_cast;
 
   // DexCache
diff --git a/runtime/arch/x86_64/jni_entrypoints_x86_64.S b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
index d668797..f6736df 100644
--- a/runtime/arch/x86_64/jni_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/jni_entrypoints_x86_64.S
@@ -28,8 +28,8 @@
     PUSH rdx  // Arg.
     PUSH rcx  // Arg.
     // Create space for FPR args, plus padding for alignment
-    subq LITERAL(72), %rsp
-    CFI_ADJUST_CFA_OFFSET(72)
+    subq LITERAL(72 + 4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(72 + 4 * 8)
     // Save FPRs.
     movq %xmm0, 0(%rsp)
     movq %xmm1, 8(%rsp)
@@ -39,6 +39,10 @@
     movq %xmm5, 40(%rsp)
     movq %xmm6, 48(%rsp)
     movq %xmm7, 56(%rsp)
+    movq %xmm12, 64(%rsp)
+    movq %xmm13, 72(%rsp)
+    movq %xmm14, 80(%rsp)
+    movq %xmm15, 88(%rsp)
     // prepare call
     movq %gs:THREAD_SELF_OFFSET, %rdi      // RDI := Thread::Current()
     // call
@@ -52,8 +56,12 @@
     movq 40(%rsp), %xmm5
     movq 48(%rsp), %xmm6
     movq 56(%rsp), %xmm7
-    addq LITERAL(72), %rsp
-    CFI_ADJUST_CFA_OFFSET(-72)
+    movq 64(%rsp), %xmm12
+    movq 72(%rsp), %xmm13
+    movq 80(%rsp), %xmm14
+    movq 88(%rsp), %xmm15
+    addq LITERAL(72 + 4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-72 - 4 * 8)
     POP rcx  // Arg.
     POP rdx  // Arg.
     POP rsi  // Arg.
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 8fa947c..7f7226c 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -16,6 +16,26 @@
 
 #include "asm_support_x86_64.S"
 
+MACRO0(SETUP_FP_CALLEE_SAVE_FRAME)
+    // Create space for ART FP callee-saved registers
+    subq LITERAL(4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    movq %xmm12, 0(%rsp)
+    movq %xmm13, 8(%rsp)
+    movq %xmm14, 16(%rsp)
+    movq %xmm15, 24(%rsp)
+END_MACRO
+
+MACRO0(RESTORE_FP_CALLEE_SAVE_FRAME)
+    // Restore ART FP callee-saved registers
+    movq 0(%rsp), %xmm12
+    movq 8(%rsp), %xmm13
+    movq 16(%rsp), %xmm14
+    movq 24(%rsp), %xmm15
+    addq LITERAL(4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(- 4 * 8)
+END_MACRO
+
 // For x86, the CFA is esp+4, the address above the pushed return address on the stack.
 
     /*
@@ -37,6 +57,14 @@
     PUSH r12  // Callee save.
     PUSH rbp  // Callee save.
     PUSH rbx  // Callee save.
+    // Create space for the FP callee saves (XMM12-15).
+    subq LITERAL(4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(4 * 8)
+    // Save FPRs.
+    movq %xmm12, 0(%rsp)
+    movq %xmm13, 8(%rsp)
+    movq %xmm14, 16(%rsp)
+    movq %xmm15, 24(%rsp)
     subq MACRO_LITERAL(8), %rsp  // Space for Method* (also aligns the frame).
     CFI_ADJUST_CFA_OFFSET(8)
     // R10 := ArtMethod* for save all callee save frame method.
@@ -46,7 +74,7 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +8: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 6*8 + 8 + 8)
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 6*8 + 4*8 + 8 + 8)
 #error "SAVE_ALL_CALLEE_SAVE_FRAME(X86_64) size not as expected."
 #endif
 #endif  // __APPLE__
@@ -71,8 +99,14 @@
     PUSH r12  // Callee save.
     PUSH rbp  // Callee save.
     PUSH rbx  // Callee save.
-    subq MACRO_LITERAL(8), %rsp  // Space for Method* (also aligns the frame).
-    CFI_ADJUST_CFA_OFFSET(8)
+    // Create space for the Method* slot and the FP callee saves (XMM12-15).
+    subq LITERAL(8 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(8 + 4*8)
+    // Save FPRs.
+    movq %xmm12, 8(%rsp)
+    movq %xmm13, 16(%rsp)
+    movq %xmm14, 24(%rsp)
+    movq %xmm15, 32(%rsp)
     // R10 := ArtMethod* for refs only callee save frame method.
     movq RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Store ArtMethod* to bottom of stack.
@@ -80,15 +114,19 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +8: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 6*8 + 8 + 8)
+#if (FRAME_SIZE_REFS_ONLY_CALLEE_SAVE != 6*8 + 4*8 + 8 + 8)
 #error "REFS_ONLY_CALLEE_SAVE_FRAME(X86_64) size not as expected."
 #endif
 #endif  // __APPLE__
 END_MACRO
 
 MACRO0(RESTORE_REF_ONLY_CALLEE_SAVE_FRAME)
-    addq MACRO_LITERAL(8), %rsp
-    CFI_ADJUST_CFA_OFFSET(-8)
+    movq 8(%rsp), %xmm12
+    movq 16(%rsp), %xmm13
+    movq 24(%rsp), %xmm14
+    movq 32(%rsp), %xmm15
+    addq LITERAL(8 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-8 - 4*8)
     // TODO: optimize by not restoring callee-saves restored by the ABI
     POP rbx
     POP rbp
@@ -123,8 +161,8 @@
     PUSH rdx  // Quick arg 2.
     PUSH rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*.
-    subq MACRO_LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(80)
+    subq MACRO_LITERAL(80 + 4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(80 + 4 * 8)
     // R10 := ArtMethod* for ref and args callee save frame method.
     movq RUNTIME_REF_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Save FPRs.
@@ -136,12 +174,16 @@
     movq %xmm5, 56(%rsp)
     movq %xmm6, 64(%rsp)
     movq %xmm7, 72(%rsp)
+    movq %xmm12, 80(%rsp)
+    movq %xmm13, 88(%rsp)
+    movq %xmm14, 96(%rsp)
+    movq %xmm15, 104(%rsp)
     // Store ArtMethod* to bottom of stack.
     movq %r10, 0(%rsp)
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +8: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11*8 + 80 + 8)
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11*8 + 4*8 + 80 + 8)
 #error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(X86_64) size not as expected."
 #endif
 #endif  // __APPLE__
@@ -157,8 +199,12 @@
     movq 56(%rsp), %xmm5
     movq 64(%rsp), %xmm6
     movq 72(%rsp), %xmm7
-    addq MACRO_LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(-80)
+    movq 80(%rsp), %xmm12
+    movq 88(%rsp), %xmm13
+    movq 96(%rsp), %xmm14
+    movq 104(%rsp), %xmm15
+    addq MACRO_LITERAL(80 + 4 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(80 + 4 * 8))
     // Restore callee and GPR args, mixed together to agree with core spills bitmap.
     POP rcx
     POP rdx
@@ -536,6 +582,58 @@
 #endif  // __APPLE__
 END_FUNCTION art_quick_invoke_static_stub
 
+    /*
+     * Long jump stub.
+     * On entry:
+     *   rdi = gprs
+     *   rsi = fprs
+     */
+DEFINE_FUNCTION art_quick_do_long_jump
+#if defined(__APPLE__)
+    int3
+    int3
+#else
+    // Restore FPRs.
+    movq 0(%rsi), %xmm0
+    movq 8(%rsi), %xmm1
+    movq 16(%rsi), %xmm2
+    movq 24(%rsi), %xmm3
+    movq 32(%rsi), %xmm4
+    movq 40(%rsi), %xmm5
+    movq 48(%rsi), %xmm6
+    movq 56(%rsi), %xmm7
+    movq 64(%rsi), %xmm8
+    movq 72(%rsi), %xmm9
+    movq 80(%rsi), %xmm10
+    movq 88(%rsi), %xmm11
+    movq 96(%rsi), %xmm12
+    movq 104(%rsi), %xmm13
+    movq 112(%rsi), %xmm14
+    movq 120(%rsi), %xmm15
+    // Restore GPRs.
+    movq %rdi, %rsp   // RSP points to gprs.
+    // Load all registers except RSP and RIP with values in gprs.
+    popq %r15
+    popq %r14
+    popq %r13
+    popq %r12
+    popq %r11
+    popq %r10
+    popq %r9
+    popq %r8
+    popq %rdi
+    popq %rsi
+    popq %rbp
+    addq LITERAL(8), %rsp   // Skip rsp
+    popq %rbx
+    popq %rdx
+    popq %rcx
+    popq %rax
+    popq %rsp      // Load stack pointer.
+    ret            // From higher in the stack pop rip.
+#endif  // __APPLE__
+END_FUNCTION art_quick_do_long_jump
+
 MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name, 0)
     SETUP_REF_ONLY_CALLEE_SAVE_FRAME  // save ref containing registers for GC
@@ -820,13 +918,17 @@
 DEFINE_FUNCTION art_quick_check_cast
     PUSH rdi                          // Save args for exc
     PUSH rsi
+    SETUP_FP_CALLEE_SAVE_FRAME
     call PLT_SYMBOL(artIsAssignableFromCode)  // (Class* klass, Class* ref_klass)
     testq %rax, %rax
     jz 1f                             // jump forward if not assignable
+    RESTORE_FP_CALLEE_SAVE_FRAME
     addq LITERAL(16), %rsp            // pop arguments
     CFI_ADJUST_CFA_OFFSET(-16)
+
     ret
 1:
+    RESTORE_FP_CALLEE_SAVE_FRAME
     POP rsi                           // Pop arguments
     POP rdi
     SETUP_SAVE_ALL_CALLEE_SAVE_FRAME  // save all registers as basis for long jump context
@@ -907,6 +1009,7 @@
     PUSH rdx
     subq LITERAL(8), %rsp        // Alignment padding.
     CFI_ADJUST_CFA_OFFSET(8)
+    SETUP_FP_CALLEE_SAVE_FRAME
 
                                   // "Uncompress" = do nothing, as already zero-extended on load.
     movl CLASS_OFFSET(%edx), %esi // Pass arg2 = value's class.
@@ -918,6 +1021,7 @@
     testq %rax, %rax
     jz   .Lthrow_array_store_exception
 
+    RESTORE_FP_CALLEE_SAVE_FRAME
     // Restore arguments.
     addq LITERAL(8), %rsp
     CFI_ADJUST_CFA_OFFSET(-8)
@@ -934,6 +1038,7 @@
 //  movb %dl, (%rdx, %rdi)
     ret
 .Lthrow_array_store_exception:
+    RESTORE_FP_CALLEE_SAVE_FRAME
     // Restore arguments.
     addq LITERAL(8), %rsp
     CFI_ADJUST_CFA_OFFSET(-8)
@@ -1012,8 +1117,8 @@
     PUSH rdx  // Quick arg 2.
     PUSH rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*.
-    subq LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(80)
+    subq LITERAL(80 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(80 + 4*8)
     // Save FPRs.
     movq %xmm0, 16(%rsp)
     movq %xmm1, 24(%rsp)
@@ -1023,14 +1128,18 @@
     movq %xmm5, 56(%rsp)
     movq %xmm6, 64(%rsp)
     movq %xmm7, 72(%rsp)
+    movq %xmm12, 80(%rsp)
+    movq %xmm13, 88(%rsp)
+    movq %xmm14, 96(%rsp)
+    movq %xmm15, 104(%rsp)
     // Store proxy method to bottom of stack.
     movq %rdi, 0(%rsp)
     movq %gs:THREAD_SELF_OFFSET, %rdx  // Pass Thread::Current().
     movq %rsp, %rcx                    // Pass SP.
     call PLT_SYMBOL(artQuickProxyInvokeHandler) // (proxy method, receiver, Thread*, SP)
     movq %rax, %xmm0                   // Copy return value in case of float returns.
-    addq LITERAL(168), %rsp            // Pop arguments.
-    CFI_ADJUST_CFA_OFFSET(-168)
+    addq LITERAL(168 + 4*8), %rsp            // Pop arguments.
+    CFI_ADJUST_CFA_OFFSET(-168 - 4*8)
     RETURN_OR_DELIVER_PENDING_EXCEPTION
 END_FUNCTION art_quick_proxy_invoke_handler
 
@@ -1156,8 +1265,8 @@
     PUSH rdx  // Quick arg 2.
     PUSH rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots, 1 of padding and 1 for the ArtMethod*.
-    subq LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(80)
+    subq LITERAL(80 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(80 + 4*8)
     // Save FPRs.
     movq %xmm0, 16(%rsp)
     movq %xmm1, 24(%rsp)
@@ -1167,6 +1276,10 @@
     movq %xmm5, 56(%rsp)
     movq %xmm6, 64(%rsp)
     movq %xmm7, 72(%rsp)
+    movq %xmm12, 80(%rsp)
+    movq %xmm13, 88(%rsp)
+    movq %xmm14, 96(%rsp)
+    movq %xmm15, 104(%rsp)
     movq %rdi, 0(%rsp)              // Store native ArtMethod* to bottom of stack.
     movq %rsp, %rbp                 // save SP at (old) callee-save frame
     CFI_DEF_CFA_REGISTER(rbp)
@@ -1260,9 +1373,13 @@
     movq 56(%rsp), %xmm5
     movq 64(%rsp), %xmm6
     movq 72(%rsp), %xmm7
+    movq 80(%rsp), %xmm12
+    movq 88(%rsp), %xmm13
+    movq 96(%rsp), %xmm14
+    movq 104(%rsp), %xmm15
     // was 80 bytes
-    addq LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(-80)
+    addq LITERAL(80 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-80 - 4*8)
     // Save callee and GPR args, mixed together to agree with core spills bitmap.
     POP rcx  // Arg.
     POP rdx  // Arg.
@@ -1292,9 +1409,13 @@
     movq 56(%rsp), %xmm5
     movq 64(%rsp), %xmm6
     movq 72(%rsp), %xmm7
-    // was 80 bytes
-    addq LITERAL(80), %rsp
-    CFI_ADJUST_CFA_OFFSET(-80)
+    movq 80(%rsp), %xmm12
+    movq 88(%rsp), %xmm13
+    movq 96(%rsp), %xmm14
+    movq 104(%rsp), %xmm15
+    // was 80 + 32 bytes
+    addq LITERAL(80 + 4*8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-80 - 4*8)
     // Save callee and GPR args, mixed together to agree with core spills bitmap.
     POP rcx  // Arg.
     POP rdx  // Arg.
@@ -1450,3 +1571,10 @@
 END_FUNCTION art_quick_string_compareto
 
 UNIMPLEMENTED art_quick_memcmp16
+
+DEFINE_FUNCTION art_quick_assignable_from_code
+    SETUP_FP_CALLEE_SAVE_FRAME
+    call PLT_SYMBOL(artIsAssignableFromCode)       // (const mirror::Class*, const mirror::Class*)
+    RESTORE_FP_CALLEE_SAVE_FRAME
+    ret
+END_FUNCTION art_quick_assignable_from_code
diff --git a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
index 6183909..53aa212 100644
--- a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
+++ b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
@@ -34,6 +34,9 @@
     (1 << art::x86_64::XMM0) | (1 << art::x86_64::XMM1) | (1 << art::x86_64::XMM2) |
     (1 << art::x86_64::XMM3) | (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) |
     (1 << art::x86_64::XMM6) | (1 << art::x86_64::XMM7);
+static constexpr uint32_t kX86_64CalleeSaveFpSpills =
+    (1 << art::x86_64::XMM12) | (1 << art::x86_64::XMM13) |
+    (1 << art::x86_64::XMM14) | (1 << art::x86_64::XMM15);
 
 constexpr uint32_t X86_64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kX86_64CalleeSaveRefSpills |
@@ -42,7 +45,8 @@
 }
 
 constexpr uint32_t X86_64CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
-  return (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0);
+  return kX86_64CalleeSaveFpSpills |
+      (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0);
 }
 
 constexpr uint32_t X86_64CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
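
A sketch of how the new FP spills feed the callee-save frame sizes (the rounding
formula is an assumption about X86_64CalleeSaveFrameSize; the expected totals are
the updated constants from asm_support_x86_64.h above):

    constexpr int kRefsOnlyCoreSpills = 6 + 1;        // RBX, RBP, R12-R15 + fake return-address reg
    constexpr int kRefsAndArgsCoreSpills = 6 + 5 + 1; // + RSI, RDX, RCX, R8, R9 arg regs
    constexpr int kFpPromotedSpills = 4;              // XMM12-XMM15 (this patch)
    constexpr int kFpArgSpills = 8;                   // XMM0-XMM7 (kRefsAndArgs only)

    constexpr int RoundUp16(int v) { return (v + 15) & ~15; }

    static_assert(RoundUp16((kRefsOnlyCoreSpills + kFpPromotedSpills + 1 /*Method**/) * 8) == 64 + 4*8,
                  "matches FRAME_SIZE_REFS_ONLY_CALLEE_SAVE");
    static_assert(RoundUp16((kRefsAndArgsCoreSpills + kFpPromotedSpills + kFpArgSpills + 1) * 8) == 176 + 4*8,
                  "matches FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE");
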
diff --git a/runtime/arch/x86_64/registers_x86_64.cc b/runtime/arch/x86_64/registers_x86_64.cc
index 38f3494..f29c426 100644
--- a/runtime/arch/x86_64/registers_x86_64.cc
+++ b/runtime/arch/x86_64/registers_x86_64.cc
@@ -34,5 +34,14 @@
   return os;
 }
 
+std::ostream& operator<<(std::ostream& os, const FloatRegister& rhs) {
+  if (rhs >= XMM0 && rhs <= XMM15) {
+    os << "xmm" << static_cast<int>(rhs);
+  } else {
+    os << "Register[" << static_cast<int>(rhs) << "]";
+  }
+  return os;
+}
+
 }  // namespace x86_64
 }  // namespace art
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 95cb85e..2a66f2f 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -175,8 +175,8 @@
   static constexpr size_t kNumQuickGprArgs = 5;  // 5 arguments passed in GPRs.
   static constexpr size_t kNumQuickFprArgs = 8;  // 8 arguments passed in FPRs.
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80 + 4*8;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 168 + 4*8;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     switch (gpr_index) {
       case 0: return (4 * GetBytesPerGprSpillLocation(kRuntimeISA));
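
A consistency sketch relating the adjusted offsets above to the REFS_AND_ARGS frame
laid out in quick_entrypoints_x86_64.S (values in bytes from the stack pointer; the
breakdown below is the assumption being illustrated):

    constexpr int kMethodAndPadding = 2 * 8;   // Method* slot + alignment slot
    constexpr int kFprArgArea = 8 * 8;         // XMM0-XMM7
    constexpr int kPromotedFprArea = 4 * 8;    // XMM12-XMM15 (this patch)
    constexpr int kPushedGprs = 11 * 8;        // GPR args and callee saves pushed above the FPR area

    static_assert(kMethodAndPadding + kFprArgArea + kPromotedFprArea == 80 + 4*8,
                  "matches kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset");
    static_assert(80 + 4*8 + kPushedGprs == 168 + 4*8,
                  "matches kQuickCalleeSaveFrame_RefAndArgs_LrOffset");
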