Revert "Revert "ART: Implement X86 hard float (Quick/JNI/Baseline)""

This reverts commit 949c91fb91f40a4a80b2b492913cf8541008975e.

This time, don't clobber EBX before saving it.

Redo some of the macros to make register usage explicit.

Change-Id: I8db8662877cd006816e16a28f42444ab7c36bfef
diff --git a/runtime/arch/x86/context_x86.cc b/runtime/arch/x86/context_x86.cc
index 2a6ff14..4ea4684 100644
--- a/runtime/arch/x86/context_x86.cc
+++ b/runtime/arch/x86/context_x86.cc
@@ -30,6 +30,9 @@
   for (size_t  i = 0; i < kNumberOfCpuRegisters; i++) {
     gprs_[i] = nullptr;
   }
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs_[i] = nullptr;
+  }
   gprs_[ESP] = &esp_;
   // Initialize registers with easy to spot debug values.
   esp_ = X86Context::kBadGprBase + ESP;
@@ -40,7 +43,7 @@
   mirror::ArtMethod* method = fr.GetMethod();
   const QuickMethodFrameInfo frame_info = method->GetQuickFrameInfo();
   size_t spill_count = POPCOUNT(frame_info.CoreSpillMask());
-  DCHECK_EQ(frame_info.FpSpillMask(), 0u);
+  size_t fp_spill_count = POPCOUNT(frame_info.FpSpillMask());
   if (spill_count > 0) {
     // Lowest number spill is farthest away, walk registers and fill into context.
     int j = 2;  // Offset j to skip return address spill.
@@ -51,6 +54,24 @@
       }
     }
   }
+  if (fp_spill_count > 0) {
+    // Lowest number spill is farthest away, walk registers and fill into context.
+    size_t j = 2;  // Offset j to skip return address spill.
+    size_t fp_spill_size_in_words = fp_spill_count * 2;
+    for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+      if (((frame_info.FpSpillMask() >> i) & 1) != 0) {
+        // There are 2 pieces to each XMM register, to match VR size.
+        fprs_[2*i] = reinterpret_cast<uint32_t*>(
+            fr.CalleeSaveAddress(spill_count + fp_spill_size_in_words - j,
+                                 frame_info.FrameSizeInBytes()));
+        fprs_[2*i+1] = reinterpret_cast<uint32_t*>(
+            fr.CalleeSaveAddress(spill_count + fp_spill_size_in_words - j - 1,
+                                 frame_info.FrameSizeInBytes()));
+        // Two void* per XMM register.
+        j += 2;
+      }
+    }
+  }
 }
 
 void X86Context::SmashCallerSaves() {
@@ -59,6 +80,7 @@
   gprs_[EDX] = const_cast<uintptr_t*>(&gZero);
   gprs_[ECX] = nullptr;
   gprs_[EBX] = nullptr;
+  memset(&fprs_[0], '\0', sizeof(fprs_));
 }
 
 void X86Context::SetGPR(uint32_t reg, uintptr_t value) {
@@ -68,14 +90,11 @@
   *gprs_[reg] = value;
 }
 
-uintptr_t X86Context::GetFPR(uint32_t reg ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Floating-point registers are all caller save in X86";
-  UNREACHABLE();
-}
-
-void X86Context::SetFPR(uint32_t reg ATTRIBUTE_UNUSED, uintptr_t value ATTRIBUTE_UNUSED) {
-  LOG(FATAL) << "Floating-point registers are all caller save in X86";
-  UNREACHABLE();
+void X86Context::SetFPR(uint32_t reg, uintptr_t value) {
+  CHECK_LT(reg, static_cast<uint32_t>(kNumberOfFloatRegisters));
+  DCHECK(IsAccessibleFPR(reg));
+  CHECK_NE(fprs_[reg], reinterpret_cast<const uint32_t*>(&gZero));
+  *fprs_[reg] = value;
 }
 
 void X86Context::DoLongJump() {
@@ -86,17 +105,30 @@
   for (size_t i = 0; i < kNumberOfCpuRegisters; ++i) {
     gprs[kNumberOfCpuRegisters - i - 1] = gprs_[i] != nullptr ? *gprs_[i] : X86Context::kBadGprBase + i;
   }
+  uint32_t fprs[kNumberOfFloatRegisters];
+  for (size_t i = 0; i < kNumberOfFloatRegisters; ++i) {
+    fprs[i] = fprs_[i] != nullptr ? *fprs_[i] : X86Context::kBadFprBase + i;
+  }
   // We want to load the stack pointer one slot below so that the ret will pop eip.
   uintptr_t esp = gprs[kNumberOfCpuRegisters - ESP - 1] - sizeof(intptr_t);
   gprs[kNumberOfCpuRegisters] = esp;
   *(reinterpret_cast<uintptr_t*>(esp)) = eip_;
   __asm__ __volatile__(
+      "movl %1, %%ebx\n\t"          // Address base of FPRs.
+      "movsd 0(%%ebx), %%xmm0\n\t"  // Load up XMM0-XMM7.
+      "movsd 8(%%ebx), %%xmm1\n\t"
+      "movsd 16(%%ebx), %%xmm2\n\t"
+      "movsd 24(%%ebx), %%xmm3\n\t"
+      "movsd 32(%%ebx), %%xmm4\n\t"
+      "movsd 40(%%ebx), %%xmm5\n\t"
+      "movsd 48(%%ebx), %%xmm6\n\t"
+      "movsd 56(%%ebx), %%xmm7\n\t"
       "movl %0, %%esp\n\t"  // ESP points to gprs.
       "popal\n\t"           // Load all registers except ESP and EIP with values in gprs.
       "popl %%esp\n\t"      // Load stack pointer.
       "ret\n\t"             // From higher in the stack pop eip.
       :  // output.
-      : "g"(&gprs[0])  // input.
+      : "g"(&gprs[0]), "g"(&fprs[0]) // input.
       :);  // clobber.
 #else
   UNIMPLEMENTED(FATAL);