Revert "Revert "ART: Implement X86 hard float (Quick/JNI/Baseline)""

This reverts commit 949c91fb91f40a4a80b2b492913cf8541008975e.

This time, don't clobber EBX before saving it.

Redo some of the macros to make register usage explicit.

Change-Id: I8db8662877cd006816e16a28f42444ab7c36bfef
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 3815722..811d4f5 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -41,22 +41,15 @@
     }
    protected:
     Mir2Lir* m2l_;
-   private:
     size_t cur_core_reg_;
     size_t cur_fp_reg_;
   };
 
-  class InToRegStorageX86Mapper : public InToRegStorageMapper {
+  class InToRegStorageX86Mapper : public InToRegStorageX86_64Mapper {
    public:
-    explicit InToRegStorageX86Mapper(Mir2Lir* m2l) : m2l_(m2l), cur_core_reg_(0) {}
+    explicit InToRegStorageX86Mapper(Mir2Lir* m2l)
+        : InToRegStorageX86_64Mapper(m2l) { }
     virtual RegStorage GetNextReg(ShortyArg arg);
-    virtual void Reset() OVERRIDE {
-      cur_core_reg_ = 0;
-    }
-   protected:
-    Mir2Lir* m2l_;
-   private:
-    size_t cur_core_reg_;
   };
 
   InToRegStorageX86_64Mapper in_to_reg_storage_x86_64_mapper_;
@@ -120,9 +113,12 @@
       if (cu_->target64) {
         return As64BitReg(TargetReg32(symbolic_reg));
       } else {
+        if (symbolic_reg >= kFArg0 && symbolic_reg <= kFArg3) {
+          // We want an XMM, not a pair.
+          return As64BitReg(TargetReg32(symbolic_reg));
+        }
         // x86: construct a pair.
         DCHECK((kArg0 <= symbolic_reg && symbolic_reg < kArg3) ||
-               (kFArg0 <= symbolic_reg && symbolic_reg < kFArg3) ||
                (kRet0 == symbolic_reg));
         return RegStorage::MakeRegPair(TargetReg32(symbolic_reg),
                                  TargetReg32(static_cast<SpecialTargetRegister>(symbolic_reg + 1)));
diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc
index bc64aad..0337096 100755
--- a/compiler/dex/quick/x86/target_x86.cc
+++ b/compiler/dex/quick/x86/target_x86.cc
@@ -180,10 +180,10 @@
   RegStorage::InvalidReg(),  // kArg5
   RegStorage::InvalidReg(),  // kArg6
   RegStorage::InvalidReg(),  // kArg7
-  rs_rAX,                    // kFArg0
-  rs_rCX,                    // kFArg1
-  rs_rDX,                    // kFArg2
-  rs_rBX,                    // kFArg3
+  rs_fr0,                    // kFArg0
+  rs_fr1,                    // kFArg1
+  rs_fr2,                    // kFArg2
+  rs_fr3,                    // kFArg3
   RegStorage::InvalidReg(),  // kFArg4
   RegStorage::InvalidReg(),  // kFArg5
   RegStorage::InvalidReg(),  // kFArg6
@@ -200,7 +200,7 @@
   rs_rDX,                    // kRet1
   rs_rAX,                    // kInvokeTgt
   rs_rAX,                    // kHiddenArg - used to hold the method index before copying to fr0.
-  rs_fr0,                    // kHiddenFpArg
+  rs_fr7,                    // kHiddenFpArg
   rs_rCX,                    // kCount
 };
 
@@ -545,13 +545,13 @@
   LockTemp(TargetReg32(kArg1));
   LockTemp(TargetReg32(kArg2));
   LockTemp(TargetReg32(kArg3));
+  LockTemp(TargetReg32(kFArg0));
+  LockTemp(TargetReg32(kFArg1));
+  LockTemp(TargetReg32(kFArg2));
+  LockTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     LockTemp(TargetReg32(kArg4));
     LockTemp(TargetReg32(kArg5));
-    LockTemp(TargetReg32(kFArg0));
-    LockTemp(TargetReg32(kFArg1));
-    LockTemp(TargetReg32(kFArg2));
-    LockTemp(TargetReg32(kFArg3));
     LockTemp(TargetReg32(kFArg4));
     LockTemp(TargetReg32(kFArg5));
     LockTemp(TargetReg32(kFArg6));
@@ -566,13 +566,13 @@
   FreeTemp(TargetReg32(kArg2));
   FreeTemp(TargetReg32(kArg3));
   FreeTemp(TargetReg32(kHiddenArg));
+  FreeTemp(TargetReg32(kFArg0));
+  FreeTemp(TargetReg32(kFArg1));
+  FreeTemp(TargetReg32(kFArg2));
+  FreeTemp(TargetReg32(kFArg3));
   if (cu_->target64) {
     FreeTemp(TargetReg32(kArg4));
     FreeTemp(TargetReg32(kArg5));
-    FreeTemp(TargetReg32(kFArg0));
-    FreeTemp(TargetReg32(kFArg1));
-    FreeTemp(TargetReg32(kFArg2));
-    FreeTemp(TargetReg32(kFArg3));
     FreeTemp(TargetReg32(kFArg4));
     FreeTemp(TargetReg32(kFArg5));
     FreeTemp(TargetReg32(kFArg6));
@@ -2460,14 +2460,23 @@
 RegStorage X86Mir2Lir::InToRegStorageX86Mapper::GetNextReg(ShortyArg arg) {
   const SpecialTargetRegister coreArgMappingToPhysicalReg[] = {kArg1, kArg2, kArg3};
   const size_t coreArgMappingToPhysicalRegSize = arraysize(coreArgMappingToPhysicalReg);
+  const SpecialTargetRegister fpArgMappingToPhysicalReg[] = {kFArg0, kFArg1, kFArg2, kFArg3};
+  const size_t fpArgMappingToPhysicalRegSize = arraysize(fpArgMappingToPhysicalReg);
 
   RegStorage result = RegStorage::InvalidReg();
-  if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-    result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
-                          arg.IsRef() ? kRef : kNotWide);
-    if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
-      result = RegStorage::MakeRegPair(
-          result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+  if (arg.IsFP()) {
+    if (cur_fp_reg_ < fpArgMappingToPhysicalRegSize) {
+      return m2l_->TargetReg(fpArgMappingToPhysicalReg[cur_fp_reg_++],
+                             arg.IsWide() ? kWide : kNotWide);
+    }
+  } else {
+    if (cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+      result = m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++],
+                               arg.IsRef() ? kRef : kNotWide);
+      if (arg.IsWide() && cur_core_reg_ < coreArgMappingToPhysicalRegSize) {
+        result = RegStorage::MakeRegPair(
+            result, m2l_->TargetReg(coreArgMappingToPhysicalReg[cur_core_reg_++], kNotWide));
+      }
     }
   }
   return result;
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index bc4cb5a..7dea09a 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -57,15 +57,15 @@
  * x86-64/x32 gs: holds it.
  *
  * For floating point we don't support CPUs without SSE2 support (ie newer than PIII):
- *  Native: x86  | x86-64 / x32 | ART x86                    | ART x86-64
- *  XMM0: caller | caller, arg1 | caller, float return value | caller, arg1, float return value
- *  XMM1: caller | caller, arg2 | caller, scratch            | caller, arg2, scratch
- *  XMM2: caller | caller, arg3 | caller, scratch            | caller, arg3, scratch
- *  XMM3: caller | caller, arg4 | caller, scratch            | caller, arg4, scratch
- *  XMM4: caller | caller, arg5 | caller, scratch            | caller, arg5, scratch
- *  XMM5: caller | caller, arg6 | caller, scratch            | caller, arg6, scratch
- *  XMM6: caller | caller, arg7 | caller, scratch            | caller, arg7, scratch
- *  XMM7: caller | caller, arg8 | caller, scratch            | caller, arg8, scratch
+ *  Native: x86  | x86-64 / x32 | ART x86                          | ART x86-64
+ *  XMM0: caller | caller, arg1 | caller, arg1, float return value | caller, arg1, float return value
+ *  XMM1: caller | caller, arg2 | caller, arg2, scratch            | caller, arg2, scratch
+ *  XMM2: caller | caller, arg3 | caller, arg3, scratch            | caller, arg3, scratch
+ *  XMM3: caller | caller, arg4 | caller, arg4, scratch            | caller, arg4, scratch
+ *  XMM4: caller | caller, arg5 | caller, scratch                  | caller, arg5, scratch
+ *  XMM5: caller | caller, arg6 | caller, scratch                  | caller, arg6, scratch
+ *  XMM6: caller | caller, arg7 | caller, scratch                  | caller, arg7, scratch
+ *  XMM7: caller | caller, arg8 | caller, scratch                  | caller, arg8, scratch
  *  ---  x86-64/x32 registers
  *  XMM8 .. 11: caller save available as scratch registers for ART.
  *  XMM12 .. 15: callee save available as promoted registers for ART.
diff --git a/compiler/jni/quick/x86/calling_convention_x86.cc b/compiler/jni/quick/x86/calling_convention_x86.cc
index a5686e1..fc72e88 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.cc
+++ b/compiler/jni/quick/x86/calling_convention_x86.cc
@@ -77,12 +77,34 @@
 }
 
 bool X86ManagedRuntimeCallingConvention::IsCurrentParamOnStack() {
-  return true;  // Everything is passed by stack
+  // We assume all parameters are on the stack; args arriving in registers are spilled as entry_spills.
+  return true;
 }
 
 ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamRegister() {
-  LOG(FATAL) << "Should not reach here";
-  return ManagedRegister::NoRegister();
+  ManagedRegister res = ManagedRegister::NoRegister();
+  if (!IsCurrentParamAFloatOrDouble()) {
+    switch (gpr_arg_count_) {
+      case 0: res = X86ManagedRegister::FromCpuRegister(ECX); break;
+      case 1: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+      case 2: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+    }
+  } else if (itr_float_and_doubles_ < 4) {
+    // First four float parameters are passed via XMM0..XMM3
+    res = X86ManagedRegister::FromXmmRegister(
+                                 static_cast<XmmRegister>(XMM0 + itr_float_and_doubles_));
+  }
+  return res;
+}
+
+ManagedRegister X86ManagedRuntimeCallingConvention::CurrentParamHighLongRegister() {
+  ManagedRegister res = ManagedRegister::NoRegister();
+  DCHECK(IsCurrentParamALong());
+  switch (gpr_arg_count_) {
+    case 0: res = X86ManagedRegister::FromCpuRegister(EDX); break;
+    case 1: res = X86ManagedRegister::FromCpuRegister(EBX); break;
+  }
+  return res;
 }
 
 FrameOffset X86ManagedRuntimeCallingConvention::CurrentParamStackOffset() {
@@ -95,15 +117,32 @@
   // We spill the argument registers on X86 to free them up for scratch use, we then assume
   // all arguments are on the stack.
   if (entry_spills_.size() == 0) {
-    size_t num_spills = NumArgs() + NumLongOrDoubleArgs();
-    if (num_spills > 0) {
-      entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(ECX));
-      if (num_spills > 1) {
-        entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EDX));
-        if (num_spills > 2) {
-          entry_spills_.push_back(X86ManagedRegister::FromCpuRegister(EBX));
+    ResetIterator(FrameOffset(0));
+    while (HasNext()) {
+      ManagedRegister in_reg = CurrentParamRegister();
+      if (!in_reg.IsNoRegister()) {
+        int32_t size = IsParamADouble(itr_args_) ? 8 : 4;
+        int32_t spill_offset = CurrentParamStackOffset().Uint32Value();
+        ManagedRegisterSpill spill(in_reg, size, spill_offset);
+        entry_spills_.push_back(spill);
+        if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
+          // Special case: we may need a second register here.
+          in_reg = CurrentParamHighLongRegister();
+          if (!in_reg.IsNoRegister()) {
+            // We have to spill the second half of the long.
+            ManagedRegisterSpill spill2(in_reg, size, spill_offset + 4);
+            entry_spills_.push_back(spill2);
+            // Long was allocated in 2 registers.
+            gpr_arg_count_++;
+          }
+        }
+
+        // Keep track of the number of GPRs allocated.
+        if (!IsCurrentParamAFloatOrDouble()) {
+          gpr_arg_count_++;
         }
       }
+      Next();
     }
   }
   return entry_spills_;
diff --git a/compiler/jni/quick/x86/calling_convention_x86.h b/compiler/jni/quick/x86/calling_convention_x86.h
index 025eb6d..b1b3598 100644
--- a/compiler/jni/quick/x86/calling_convention_x86.h
+++ b/compiler/jni/quick/x86/calling_convention_x86.h
@@ -28,7 +28,8 @@
  public:
   explicit X86ManagedRuntimeCallingConvention(bool is_static, bool is_synchronized,
                                               const char* shorty)
-      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize) {}
+      : ManagedRuntimeCallingConvention(is_static, is_synchronized, shorty, kFramePointerSize),
+        gpr_arg_count_(0) {}
   ~X86ManagedRuntimeCallingConvention() OVERRIDE {}
   // Calling convention
   ManagedRegister ReturnRegister() OVERRIDE;
@@ -40,7 +41,10 @@
   ManagedRegister CurrentParamRegister() OVERRIDE;
   FrameOffset CurrentParamStackOffset() OVERRIDE;
   const ManagedRegisterEntrySpills& EntrySpills() OVERRIDE;
+
  private:
+  int gpr_arg_count_;
+  ManagedRegister CurrentParamHighLongRegister();
   ManagedRegisterEntrySpills entry_spills_;
   DISALLOW_COPY_AND_ASSIGN(X86ManagedRuntimeCallingConvention);
 };
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c0fdcaa..66f1d5e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -36,8 +36,9 @@
 static constexpr Register kRuntimeParameterCoreRegisters[] = { EAX, ECX, EDX, EBX };
 static constexpr size_t kRuntimeParameterCoreRegistersLength =
     arraysize(kRuntimeParameterCoreRegisters);
-static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { };
-static constexpr size_t kRuntimeParameterFpuRegistersLength = 0;
+static constexpr XmmRegister kRuntimeParameterFpuRegisters[] = { XMM0, XMM1, XMM2, XMM3 };
+static constexpr size_t kRuntimeParameterFpuRegistersLength =
+    arraysize(kRuntimeParameterFpuRegisters);
 
 static constexpr int kC2ConditionMask = 0x400;
 
@@ -504,30 +505,49 @@
     case Primitive::kPrimChar:
     case Primitive::kPrimShort:
     case Primitive::kPrimInt:
-    case Primitive::kPrimFloat:
     case Primitive::kPrimNot: {
       uint32_t index = gp_index_++;
+      stack_index_++;
       if (index < calling_convention.GetNumberOfRegisters()) {
         return Location::RegisterLocation(calling_convention.GetRegisterAt(index));
       } else {
-        return Location::StackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
       }
     }
 
-    case Primitive::kPrimLong:
-    case Primitive::kPrimDouble: {
+    case Primitive::kPrimLong: {
       uint32_t index = gp_index_;
       gp_index_ += 2;
+      stack_index_ += 2;
       if (index + 1 < calling_convention.GetNumberOfRegisters()) {
         X86ManagedRegister pair = X86ManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
       } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        // On X86, the register index and stack index of a quick parameter is the same, since
-        // we are passing floating pointer values in core registers.
-        return Location::QuickParameter(index, index);
+        // stack_index_ is the right offset for the memory.
+        return Location::QuickParameter(index, stack_index_ - 2);
       } else {
-        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(index));
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
+      }
+    }
+
+    case Primitive::kPrimFloat: {
+      uint32_t index = fp_index_++;
+      stack_index_++;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
+      } else {
+        return Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 1));
+      }
+    }
+
+    case Primitive::kPrimDouble: {
+      uint32_t index = fp_index_++;
+      stack_index_ += 2;
+      if (index < calling_convention.GetNumberOfFpuRegisters()) {
+        return Location::FpuRegisterLocation(calling_convention.GetFpuRegisterAt(index));
+      } else {
+        return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index_ - 2));
       }
     }
 
@@ -1186,7 +1206,7 @@
 void LocationsBuilderX86::VisitInvokeInterface(HInvokeInterface* invoke) {
   HandleInvoke(invoke);
   // Add the hidden argument.
-  invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM0));
+  invoke->GetLocations()->AddTemp(Location::FpuRegisterLocation(XMM7));
 }
 
 void InstructionCodeGeneratorX86::VisitInvokeInterface(HInvokeInterface* invoke) {
@@ -1388,31 +1408,17 @@
           locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
           break;
 
-        case Primitive::kPrimFloat: {
-          // Processing a Dex `float-to-long' instruction.
-          InvokeRuntimeCallingConvention calling_convention;
-          // Note that on x86 floating-point parameters are passed
-          // through core registers (here, EAX).
-          locations->SetInAt(0, Location::RegisterLocation(
-              calling_convention.GetRegisterAt(0)));
-          // The runtime helper puts the result in EAX, EDX.
-          locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
-          break;
-        }
-
+        case Primitive::kPrimFloat:
         case Primitive::kPrimDouble: {
-          // Processing a Dex `double-to-long' instruction.
+          // Processing a Dex `float-to-long' or 'double-to-long' instruction.
           InvokeRuntimeCallingConvention calling_convention;
-          // Note that on x86 floating-point parameters are passed
-          // through core registers (here, EAX and ECX).
-          locations->SetInAt(0, Location::RegisterPairLocation(
-              calling_convention.GetRegisterAt(0),
-              calling_convention.GetRegisterAt(1)));
+          XmmRegister parameter = calling_convention.GetFpuRegisterAt(0);
+          locations->SetInAt(0, Location::FpuRegisterLocation(parameter));
+
           // The runtime helper puts the result in EAX, EDX.
           locations->SetOut(Location::RegisterPairLocation(EAX, EDX));
-          break;
         }
-          break;
+        break;
 
         default:
           LOG(FATAL) << "Unexpected type conversion from " << input_type
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 73b647c..55d71e3 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -36,8 +36,8 @@
 static constexpr Register kParameterCoreRegisters[] = { ECX, EDX, EBX };
 static constexpr RegisterPair kParameterCorePairRegisters[] = { ECX_EDX, EDX_EBX };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
-static constexpr XmmRegister kParameterFpuRegisters[] = { };
-static constexpr size_t kParameterFpuRegistersLength = 0;
+static constexpr XmmRegister kParameterFpuRegisters[] = { XMM0, XMM1, XMM2, XMM3 };
+static constexpr size_t kParameterFpuRegistersLength = arraysize(kParameterFpuRegisters);
 
 class InvokeDexCallingConvention : public CallingConvention<Register, XmmRegister> {
  public:
@@ -58,13 +58,18 @@
 
 class InvokeDexCallingConventionVisitor {
  public:
-  InvokeDexCallingConventionVisitor() : gp_index_(0) {}
+  InvokeDexCallingConventionVisitor() : gp_index_(0), fp_index_(0), stack_index_(0) {}
 
   Location GetNextLocation(Primitive::Type type);
 
  private:
   InvokeDexCallingConvention calling_convention;
+  // The current index for cpu registers.
   uint32_t gp_index_;
+  // The current index for fpu registers.
+  uint32_t fp_index_;
+  // The current stack index.
+  uint32_t stack_index_;
 
   DISALLOW_COPY_AND_ASSIGN(InvokeDexCallingConventionVisitor);
 };
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index 3f266fe..1f0dba5 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1537,8 +1537,12 @@
 
   uint32_t reg_offset = 1;
   CHECK_ALIGNED(frame_size, kStackAlignment);
+  int gpr_count = 0;
   for (int i = spill_regs.size() - 1; i >= 0; --i) {
-    pushl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    pushl(spill.AsCpuRegister());
+    gpr_count++;
 
     // DW_CFA_advance_loc
     DW_CFA_advance_loc(&cfi_info_, buffer_.Size() - cfi_pc_);
@@ -1552,7 +1556,7 @@
   }
 
   // return address then method on stack
-  int32_t adjust = frame_size - (spill_regs.size() * kFramePointerSize) -
+  int32_t adjust = frame_size - (gpr_count * kFramePointerSize) -
                    sizeof(StackReference<mirror::ArtMethod>) /*method*/ -
                    kFramePointerSize /*return address*/;
   addl(ESP, Immediate(-adjust));
@@ -1572,9 +1576,18 @@
   DW_CFA_def_cfa_offset(&cfi_info_, cfi_cfa_offset_);
 
   for (size_t i = 0; i < entry_spills.size(); ++i) {
-    movl(Address(ESP, frame_size + sizeof(StackReference<mirror::ArtMethod>) +
-                 (i * kFramePointerSize)),
-         entry_spills.at(i).AsX86().AsCpuRegister());
+    ManagedRegisterSpill spill = entry_spills.at(i);
+    if (spill.AsX86().IsCpuRegister()) {
+      movl(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsCpuRegister());
+    } else {
+      DCHECK(spill.AsX86().IsXmmRegister());
+      if (spill.getSize() == 8) {
+        movsd(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      } else {
+        CHECK_EQ(spill.getSize(), 4);
+        movss(Address(ESP, frame_size + spill.getSpillOffset()), spill.AsX86().AsXmmRegister());
+      }
+    }
   }
 }
 
@@ -1584,7 +1597,9 @@
   addl(ESP, Immediate(frame_size - (spill_regs.size() * kFramePointerSize) -
                       sizeof(StackReference<mirror::ArtMethod>)));
   for (size_t i = 0; i < spill_regs.size(); ++i) {
-    popl(spill_regs.at(i).AsX86().AsCpuRegister());
+    x86::X86ManagedRegister spill = spill_regs.at(i).AsX86();
+    DCHECK(spill.IsCpuRegister());
+    popl(spill.AsCpuRegister());
   }
   ret();
 }