Fixes to enable TrackLiveTemps optimization on x86.

- Created new kRegRegStore instruction class for Movdrx. It follows the
  store-style reg-reg ModRM encoding: the LIR operands are swapped when
  emitted, so the source is encoded first and the destination second.

- Reverted neg_float and neg_double implementation to prevent confusion
  of register types when optimizations are performed.

- Swapped order of loads for wide values (high word is loaded first) when
  the base pointer equals the low destination reg, to prevent the base
  pointer from being clobbered before the second load.

- Implemented opRegCopyWide for general purpose reg source to floating
  point reg destination and vice versa.

- Added more opcode coverage to x86 disassembler.

Change-Id: I4e58eec91742cc51333003fa5a678ba5b23eb575
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index 7bd5c52..63e4cc3 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -280,10 +280,13 @@
   EXT_0F_ENCODING_MAP(Divsd,     0xF2, 0x5E, REG_DEF0),
   EXT_0F_ENCODING_MAP(Divss,     0xF3, 0x5E, REG_DEF0),
 
+  { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
 
   EXT_0F_ENCODING_MAP(Movdxr,    0x66, 0x6E, REG_DEF0),
-  EXT_0F_ENCODING_MAP(Movdrx,    0x66, 0x7E, REG_DEF0),
+  { kX86MovdrxRR, kRegRegStore, IS_BINARY_OP | REG_DEF0   | REG_USE01,  { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxRR", "!0r,!1r" },
+  { kX86MovdrxMR, kMemReg,      IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxMR", "[!0r+!1d],!2r" },
+  { kX86MovdrxAR, kArrayReg,    IS_STORE | IS_QUIN_OP     | REG_USE014, { 0x66, 0, 0x0F, 0x7E, 0, 0, 0, 0 }, "MovdrxAR", "[!0r+!1r<<!2d+!3d],!4r" },
 
   { kX86Set8R, kRegCond,              IS_BINARY_OP   | REG_DEF0  | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8R", "!1c !0r" },
   { kX86Set8M, kMemCond,   IS_STORE | IS_TERTIARY_OP | REG_USE0  | USES_CCODES, { 0, 0, 0x0F, 0x90, 0, 0, 0, 0 }, "Set8M", "!2c [!0r+!1d]" },
@@ -375,6 +378,8 @@
       return computeSize(entry, lir->operands[0], false);
     case kRegReg:
       return computeSize(entry, 0, false);
+    case kRegRegStore:
+      return computeSize(entry, 0, false);
     case kRegMem: { // lir operands - 0: reg, 1: base, 2: disp
       int base = lir->operands[1];
       return computeSize(entry, lir->operands[2], false) + (base == rSP ? 1 : 0);
@@ -800,6 +805,9 @@
       DCHECK_EQ(0, entry->skeleton.extra_opcode1);
       DCHECK_EQ(0, entry->skeleton.extra_opcode2);
     }
+    if (FPREG(reg)) {
+      reg = reg & FP_REG_MASK;
+    }
     uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
     cUnit->codeBuffer.push_back(modrm);
   }
@@ -1307,6 +1315,9 @@
       case kRegReg:  // lir operands - 0: reg1, 1: reg2
         emitRegReg(cUnit, entry, lir->operands[0], lir->operands[1]);
         break;
+      case kRegRegStore:  // lir operands - 0: reg2, 1: reg1
+        emitRegReg(cUnit, entry, lir->operands[1], lir->operands[0]);
+        break;
       case kRegRegImm:
         emitRegRegImm(cUnit, entry, lir->operands[0], lir->operands[1], lir->operands[2]);
         break;
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index c00b5fc..f2488d0 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -21,6 +21,7 @@
                             RegLocation rlSrc2) {
   X86OpCode op = kX86Nop;
   RegLocation rlResult;
+  int tempReg;
 
   /*
    * Don't attempt to optimize register usage since these opcodes call out to
@@ -44,12 +45,13 @@
       op = kX86MulssRR;
       break;
     case Instruction::NEG_FLOAT:
-      // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
-      // the 0 case, and using FCHS is difficult with register promotion. This
-      // code treats the value as a CoreReg to make it easy to manipulate.
-      rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
-      rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-      opRegRegImm(cUnit, kOpAdd, rlResult.lowReg, rlSrc1.lowReg, 0x80000000);
+      // TODO: Make this an XorpsRM where the memory location holds 0x80000000
+      rlSrc1 = loadValue(cUnit, rlSrc1, kFPReg);
+      rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+      tempReg = oatAllocTemp(cUnit);
+      loadConstant(cUnit, tempReg, 0x80000000);
+      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
       storeValue(cUnit, rlDest, rlResult);
       return false;
     case Instruction::REM_FLOAT_2ADDR:
@@ -81,6 +83,7 @@
                              RegLocation rlSrc2) {
   X86OpCode op = kX86Nop;
   RegLocation rlResult;
+  int tempReg;
 
   switch (opcode) {
     case Instruction::ADD_DOUBLE_2ADDR:
@@ -100,13 +103,14 @@
       op = kX86MulsdRR;
       break;
     case Instruction::NEG_DOUBLE:
-      // TODO: Make this nicer. Subtracting the source from 0 doesn't work in
-      // the 0 case, and using FCHS is difficult with register promotion. This
-      // code treats the value as a CoreReg to make it easy to manipulate.
-      rlSrc1 = loadValueWide(cUnit, rlSrc1, kCoreReg);
-      rlResult = oatEvalLoc(cUnit, rlDest, kCoreReg, true);
-      opRegRegImm(cUnit, kOpAdd, rlResult.highReg, rlSrc1.highReg, 0x80000000);
-      opRegCopy(cUnit, rlResult.lowReg, rlSrc1.lowReg);
+      // TODO: Make this an XorpdRM where the memory location holds 0x8000000000000000
+      rlSrc1 = loadValueWide(cUnit, rlSrc1, kFPReg);
+      rlResult = oatEvalLoc(cUnit, rlDest, kFPReg, true);
+      tempReg = oatAllocTemp(cUnit);
+      loadConstant(cUnit, tempReg, 0x80000000);
+      newLIR2(cUnit, kX86MovdxrRR, rlResult.lowReg, tempReg);
+      newLIR2(cUnit, kX86PsllqRI, rlResult.lowReg, 32);
+      newLIR2(cUnit, kX86XorpsRR, rlResult.lowReg, rlSrc1.lowReg);
       storeValueWide(cUnit, rlDest, rlResult);
       return false;
     case Instruction::REM_DOUBLE_2ADDR:
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index d60d9de..9721038 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -475,9 +475,15 @@
     if (!pair) {
       load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
     } else {
-      load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
-      load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
-                      displacement + HIWORD_OFFSET);
+      if (rBase == rDest) {
+        load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+                        displacement + HIWORD_OFFSET);
+        load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+      } else {
+        load = newLIR3(cUnit, opcode, rDest, rBase, displacement + LOWORD_OFFSET);
+        load2 = newLIR3(cUnit, opcode, rDestHi, rBase,
+                        displacement + HIWORD_OFFSET);
+      }
     }
     if (rBase == rSP) {
       annotateDalvikRegAccess(load, (displacement + (pair ? LOWORD_OFFSET : 0))
@@ -492,10 +498,17 @@
       load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
                      displacement + LOWORD_OFFSET);
     } else {
-      load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
-                     displacement + LOWORD_OFFSET);
-      load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
-                      displacement + HIWORD_OFFSET);
+      if (rBase == rDest) {
+        load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+                        displacement + HIWORD_OFFSET);
+        load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+                       displacement + LOWORD_OFFSET);
+      } else {
+        load = newLIR5(cUnit, opcode, rDest, rBase, rIndex, scale,
+                       displacement + LOWORD_OFFSET);
+        load2 = newLIR5(cUnit, opcode, rDestHi, rBase, rIndex, scale,
+                        displacement + HIWORD_OFFSET);
+      }
     }
   }
 
diff --git a/src/compiler/codegen/x86/X86/Gen.cc b/src/compiler/codegen/x86/X86/Gen.cc
index b0b6ba8..adad05b 100644
--- a/src/compiler/codegen/x86/X86/Gen.cc
+++ b/src/compiler/codegen/x86/X86/Gen.cc
@@ -352,13 +352,18 @@
     if (srcFP) {
       opRegCopy(cUnit, S2D(destLo, destHi), S2D(srcLo, srcHi));
     } else {
-      UNIMPLEMENTED(WARNING);
-      newLIR0(cUnit, kX86Bkpt);
+      // TODO: Prevent this from happening in the code. The result is often
+      // unused or could have been loaded more easily from memory.
+      newLIR2(cUnit, kX86MovdxrRR, destLo, srcLo);
+      newLIR2(cUnit, kX86MovdxrRR, destHi, srcHi);
+      newLIR2(cUnit, kX86PsllqRI, destHi, 32);
+      newLIR2(cUnit, kX86OrpsRR, destLo, destHi);
     }
   } else {
     if (srcFP) {
-      UNIMPLEMENTED(WARNING);
-      newLIR0(cUnit, kX86Bkpt);
+      newLIR2(cUnit, kX86MovdrxRR, destLo, srcLo);
+      newLIR2(cUnit, kX86PsrlqRI, srcLo, 32);
+      newLIR2(cUnit, kX86MovdrxRR, destHi, srcLo);
     } else {
       // Handle overlap
       if (srcHi == destLo) {
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index 4c44118..c229844 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -110,7 +110,7 @@
 /* Offset to distingish FP regs */
 #define FP_REG_OFFSET 32
 /* Offset to distinguish DP FP regs */
-#define FP_DOUBLE (FP_REG_OFFSET + 32)
+#define FP_DOUBLE (FP_REG_OFFSET + 16)
 /* Offset to distingish the extra regs */
 #define EXTRA_REG_OFFSET (FP_DOUBLE + 16)
 /* Reg types */
@@ -433,9 +433,10 @@
   Binary0fOpCode(kX86Subss),    // float subtract
   Binary0fOpCode(kX86Divsd),    // double divide
   Binary0fOpCode(kX86Divss),    // float divide
-  kX86PsllqRI,                  // shift of floating point registers
+  kX86PsrlqRI,                  // right shift of floating point registers
+  kX86PsllqRI,                  // left shift of floating point registers
   Binary0fOpCode(kX86Movdxr),   // move into xmm from gpr
-  Binary0fOpCode(kX86Movdrx),   // move into reg from xmm
+  kX86MovdrxRR, kX86MovdrxMR, kX86MovdrxAR,// move into reg from xmm
   kX86Set8R, kX86Set8M, kX86Set8A,// set byte depending on condition operand
   kX86Mfence,                   // memory barrier
   Binary0fOpCode(kX86Imul16),   // 16bit multiply
@@ -470,6 +471,7 @@
   kReg, kMem, kArray,                      // R, M and A instruction kinds.
   kMemReg, kArrayReg, kThreadReg,          // MR, AR and TR instruction kinds.
   kRegReg, kRegMem, kRegArray, kRegThread, // RR, RM, RA and RT instruction kinds.
+  kRegRegStore,                            // RR following the store modrm reg-reg encoding rather than the load.
   kRegImm, kMemImm, kArrayImm, kThreadImm, // RI, MI, AI and TI instruction kinds.
   kRegRegImm, kRegMemImm, kRegArrayImm,    // RRI, RMI and RAI instruction kinds.
   kMovRegImm,                              // Shorter form move RI.