Numerous fixes to enable PromoteRegs, though it's still broken.

- Fixed the ThrowArrayBoundsFromCode launchpad to load the array length
  directly into the necessary arg reg without clobbering the array
  pointer, since that value may be live afterwards.

- Made genArrayPut use a temporary reg for bytes if the source reg is >= 4,
  since x86 can't express this.

- Fixed the order that core regs are spilled and unspilled.

- Correctly emit instructions when base == rBP and disp == 0.

- Added checks to the compiler to ensure that byte opcodes aren't used
  on registers that can't be byte accessed.

- Fixed generation of a number of ops which use byte opcodes, including
  floating point comparison, int-to-byte, and and-int/lit16.

- Added rBP, rSI, and rDI to spill registers for the x86 jni compiler.

- Various fixes and additions to the x86 disassembler.

Change-Id: I365fe7dec5cc64d181248fd58e90789f100b45e7
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index b4b0f6a..baa4b48 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -898,22 +898,33 @@
         funcOffset = ENTRYPOINT_OFFSET(pThrowNullPointerFromCode);
         break;
       case kThrowArrayBounds:
-#if defined (TARGET_X86)
-        // x86 leaves the array pointer in v2, so load the array length that the handler expects
-        opRegMem(cUnit, kOpMov, v2, v2, Array::LengthOffset().Int32Value());
-#endif
         // Move v1 (array index) to rARG0 and v2 (array length) to rARG1
         if (v2 != rARG0) {
           opRegCopy(cUnit, rARG0, v1);
+#if defined (TARGET_X86)
+          // x86 leaves the array pointer in v2, so load the array length that the handler expects
+          opRegMem(cUnit, kOpMov, rARG1, v2, Array::LengthOffset().Int32Value());
+#else
           opRegCopy(cUnit, rARG1, v2);
+#endif
         } else {
           if (v1 == rARG1) {
             // Swap v1 and v2, using rARG2 as a temp
             opRegCopy(cUnit, rARG2, v1);
+#if defined (TARGET_X86)
+            // x86 leaves the array pointer in v2, so load the array length that the handler expects
+            opRegMem(cUnit, kOpMov, rARG1, v2, Array::LengthOffset().Int32Value());
+#else
             opRegCopy(cUnit, rARG1, v2);
+#endif
             opRegCopy(cUnit, rARG0, rARG2);
           } else {
+#if defined (TARGET_X86)
+            // x86 leaves the array pointer in v2, so load the array length that the handler expects
+            opRegMem(cUnit, kOpMov, rARG1, v2, Array::LengthOffset().Int32Value());
+#else
             opRegCopy(cUnit, rARG1, v2);
+#endif
             opRegCopy(cUnit, rARG0, v1);
           }
         }
@@ -1598,9 +1609,18 @@
   } else {
     rlSrc = loadValue(cUnit, rlSrc, regClass);
   }
-  storeBaseIndexedDisp(cUnit, rlArray.lowReg, rlIndex.lowReg, scale,
-                       dataOffset, rlSrc.lowReg, rlSrc.highReg, size,
-                       INVALID_SREG);
+  // If the src reg can't be byte accessed, move it to a temp first.
+  if ((size == kSignedByte || size == kUnsignedByte) && rlSrc.lowReg >= 4) {
+    int temp = oatAllocTemp(cUnit);
+    opRegCopy(cUnit, temp, rlSrc.lowReg);
+    storeBaseIndexedDisp(cUnit, rlArray.lowReg, rlIndex.lowReg, scale,
+                         dataOffset, temp, INVALID_REG, size,
+                         INVALID_SREG);
+  } else {
+    storeBaseIndexedDisp(cUnit, rlArray.lowReg, rlIndex.lowReg, scale,
+                         dataOffset, rlSrc.lowReg, rlSrc.highReg, size,
+                         INVALID_SREG);
+  }
 #else
   bool needsRangeCheck = (!(optFlags & MIR_IGNORE_RANGE_CHECK));
   int regLen = INVALID_REG;
diff --git a/src/compiler/codegen/x86/ArchFactory.cc b/src/compiler/codegen/x86/ArchFactory.cc
index 1620044..001a93d 100644
--- a/src/compiler/codegen/x86/ArchFactory.cc
+++ b/src/compiler/codegen/x86/ArchFactory.cc
@@ -128,11 +128,11 @@
   }
   // Spill mask not including fake return address register
   uint32_t mask = cUnit->coreSpillMask & ~(1 << rRET);
-  int offset = cUnit->frameSize - 4;
+  int offset = cUnit->frameSize - (4 * cUnit->numCoreSpills);
   for (int reg = 0; mask; mask >>= 1, reg++) {
     if (mask & 0x1) {
-      offset -= 4;
       storeWordDisp(cUnit, rSP, offset, reg);
+      offset += 4;
     }
   }
 }
@@ -143,11 +143,11 @@
   }
   // Spill mask not including fake return address register
   uint32_t mask = cUnit->coreSpillMask & ~(1 << rRET);
-  int offset = cUnit->frameSize - 4;
+  int offset = cUnit->frameSize - (4 * cUnit->numCoreSpills);
   for (int reg = 0; mask; mask >>= 1, reg++) {
     if (mask & 0x1) {
-      offset -= 4;
       loadWordDisp(cUnit, rSP, offset, reg);
+      offset += 4;
     }
   }
 }
diff --git a/src/compiler/codegen/x86/Assemble.cc b/src/compiler/codegen/x86/Assemble.cc
index 63e4cc3..a245660 100644
--- a/src/compiler/codegen/x86/Assemble.cc
+++ b/src/compiler/codegen/x86/Assemble.cc
@@ -362,15 +362,19 @@
       return computeSize(entry, 0, false);
     case kMem: { // lir operands - 0: base, 1: disp
       int base = lir->operands[0];
-      // SP requires a special extra SIB byte
-      return computeSize(entry, lir->operands[1], false) + (base == rSP ? 1 : 0);
+      int disp = lir->operands[1];
+      // SP requires a special extra SIB byte. BP requires explicit disp,
+      // so add a byte for disp 0 which would normally be omitted.
+      return computeSize(entry, disp, false) + ((base == rSP) || (base == rBP && disp == 0) ? 1 : 0);
     }
     case kArray:  // lir operands - 0: base, 1: index, 2: scale, 3: disp
       return computeSize(entry, lir->operands[3], true);
     case kMemReg: { // lir operands - 0: base, 1: disp, 2: reg
       int base = lir->operands[0];
-      // SP requires a special extra SIB byte
-      return computeSize(entry, lir->operands[1], false) + (base == rSP ? 1 : 0);
+      int disp = lir->operands[1];
+      // SP requires a special extra SIB byte. BP requires explicit disp,
+      // so add a byte for disp 0 which would normally be omitted.
+      return computeSize(entry, disp, false) + ((base == rSP) || (base == rBP && disp == 0) ? 1 : 0);
     }
     case kArrayReg:  // lir operands - 0: base, 1: index, 2: scale, 3: disp, 4: reg
       return computeSize(entry, lir->operands[3], true);
@@ -382,10 +386,17 @@
       return computeSize(entry, 0, false);
     case kRegMem: { // lir operands - 0: reg, 1: base, 2: disp
       int base = lir->operands[1];
-      return computeSize(entry, lir->operands[2], false) + (base == rSP ? 1 : 0);
+      int disp = lir->operands[2];
+      // SP requires a special extra SIB byte. BP requires explicit disp,
+      // so add a byte for disp 0 which would normally be omitted.
+      return computeSize(entry, disp, false) + ((base == rSP) || (base == rBP && disp == 0) ? 1 : 0);
     }
-    case kRegArray:  // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
-      return computeSize(entry, lir->operands[4], true);
+    case kRegArray:  { // lir operands - 0: reg, 1: base, 2: index, 3: scale, 4: disp
+      int base = lir->operands[1];
+      int disp = lir->operands[4];
+      // BP requires explicit disp, so add a byte for disp 0 which would normally be omitted.
+      return computeSize(entry, disp, true) + ((base == rBP && disp == 0) ? 1 : 0);
+    }
     case kRegThread:  // lir operands - 0: reg, 1: disp
       return computeSize(entry, 0x12345678, false);  // displacement size is always 32bit
     case kRegImm: {  // lir operands - 0: reg, 1: immediate
@@ -487,8 +498,9 @@
   return 0;
 }
 
-static uint8_t modrmForDisp(int disp) {
-  if (disp == 0) {
+static uint8_t modrmForDisp(int base, int disp) {
+  // BP requires an explicit disp, so do not omit it in the 0 case
+  if (disp == 0 && base != rBP) {
     return 0;
   } else if (IS_SIMM8(disp)) {
     return 1;
@@ -497,8 +509,9 @@
   }
 }
 
-static void emitDisp(CompilationUnit* cUnit, int disp) {
-  if (disp == 0) {
+static void emitDisp(CompilationUnit* cUnit, int base, int disp) {
+  // BP requires an explicit disp, so do not omit it in the 0 case
+  if (disp == 0 && base != rBP) {
     return;
   } else if (IS_SIMM8(disp)) {
     cUnit->codeBuffer.push_back(disp & 0xFF);
@@ -534,6 +547,10 @@
   if (FPREG(reg)) {
     reg = reg & FP_REG_MASK;
   }
+  if (reg >= 4) {
+    DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " " << (int) reg
+        << " in " << PrettyMethod(cUnit->method_idx, *cUnit->dex_file);
+  }
   DCHECK_LT(reg, 8);
   uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
   cUnit->codeBuffer.push_back(modrm);
@@ -555,9 +572,9 @@
   DCHECK_EQ(0, entry->skeleton.extra_opcode2);
   DCHECK_LT(entry->skeleton.modrm_opcode, 8);
   DCHECK_LT(base, 8);
-  uint8_t modrm = (modrmForDisp(disp) << 6) | (entry->skeleton.modrm_opcode << 3) | base;
+  uint8_t modrm = (modrmForDisp(base, disp) << 6) | (entry->skeleton.modrm_opcode << 3) | base;
   cUnit->codeBuffer.push_back(modrm);
-  emitDisp(cUnit, disp);
+  emitDisp(cUnit, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
@@ -587,15 +604,19 @@
   if (FPREG(reg)) {
     reg = reg & FP_REG_MASK;
   }
+  if (reg >= 4) {
+    DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " " << (int) reg
+        << " in " << PrettyMethod(cUnit->method_idx, *cUnit->dex_file);
+  }
   DCHECK_LT(reg, 8);
   DCHECK_LT(base, 8);
-  uint8_t modrm = (modrmForDisp(disp) << 6) | (reg << 3) | base;
+  uint8_t modrm = (modrmForDisp(base, disp) << 6) | (reg << 3) | base;
   cUnit->codeBuffer.push_back(modrm);
   if (base == rSP) {
     // Special SIB for SP base
     cUnit->codeBuffer.push_back(0 << 6 | (rSP << 3) | rSP);
   }
-  emitDisp(cUnit, disp);
+  emitDisp(cUnit, base, disp);
   DCHECK_EQ(0, entry->skeleton.modrm_opcode);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -633,14 +654,14 @@
     reg = reg & FP_REG_MASK;
   }
   DCHECK_LT(reg, 8);
-  uint8_t modrm = (modrmForDisp(disp) << 6) | (reg << 3) | rSP;
+  uint8_t modrm = (modrmForDisp(base, disp) << 6) | (reg << 3) | rSP;
   cUnit->codeBuffer.push_back(modrm);
   DCHECK_LT(scale, 4);
   DCHECK_LT(index, 8);
   DCHECK_LT(base, 8);
   uint8_t sib = (scale << 6) | (index << 3) | base;
   cUnit->codeBuffer.push_back(sib);
-  emitDisp(cUnit, disp);
+  emitDisp(cUnit, base, disp);
   DCHECK_EQ(0, entry->skeleton.modrm_opcode);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
@@ -674,6 +695,10 @@
   if (FPREG(reg)) {
     reg = reg & FP_REG_MASK;
   }
+  if (reg >= 4) {
+    DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " " << (int) reg
+        << " in " << PrettyMethod(cUnit->method_idx, *cUnit->dex_file);
+  }
   DCHECK_LT(reg, 8);
   uint8_t modrm = (0 << 6) | (reg << 3) | rBP;
   cUnit->codeBuffer.push_back(modrm);
@@ -923,6 +948,10 @@
     DCHECK_EQ(0, entry->skeleton.extra_opcode1);
     DCHECK_EQ(0, entry->skeleton.extra_opcode2);
   }
+  if (reg >= 4) {
+    DCHECK(strchr(entry->name, '8') == NULL) << entry->name << " " << (int) reg
+        << " in " << PrettyMethod(cUnit->method_idx, *cUnit->dex_file);
+  }
   DCHECK_LT(reg, 8);
   uint8_t modrm = (3 << 6) | (entry->skeleton.modrm_opcode << 3) | reg;
   cUnit->codeBuffer.push_back(modrm);
@@ -1037,13 +1066,13 @@
     DCHECK_EQ(0, entry->skeleton.extra_opcode1);
     DCHECK_EQ(0, entry->skeleton.extra_opcode2);
   }
-  uint8_t modrm = (modrmForDisp(disp) << 6) | (entry->skeleton.modrm_opcode << 3) | base;
+  uint8_t modrm = (modrmForDisp(base, disp) << 6) | (entry->skeleton.modrm_opcode << 3) | base;
   cUnit->codeBuffer.push_back(modrm);
   if (base == rSP) {
     // Special SIB for SP base
     cUnit->codeBuffer.push_back(0 << 6 | (rSP << 3) | rSP);
   }
-  emitDisp(cUnit, disp);
+  emitDisp(cUnit, base, disp);
   DCHECK_EQ(0, entry->skeleton.ax_opcode);
   DCHECK_EQ(0, entry->skeleton.immediate_bytes);
 }
diff --git a/src/compiler/codegen/x86/FP/X86FP.cc b/src/compiler/codegen/x86/FP/X86FP.cc
index f2488d0..8cd32b4 100644
--- a/src/compiler/codegen/x86/FP/X86FP.cc
+++ b/src/compiler/codegen/x86/FP/X86FP.cc
@@ -264,7 +264,20 @@
   if (unorderedGt) {
     branch = newLIR2(cUnit, kX86Jcc8, 0, kX86CondPE);
   }
-  newLIR2(cUnit, kX86Set8R, rlResult.lowReg, kX86CondA /* above - unsigned > */);
+  // If the result reg can't be byte accessed, use a jump and move instead of a set.
+  if (rlResult.lowReg >= 4) {
+    LIR* branch2 = NULL;
+    if (unorderedGt) {
+      branch2 = newLIR2(cUnit, kX86Jcc8, 0, kX86CondA);
+      newLIR2(cUnit, kX86Mov32RI, rlResult.lowReg, 0x0);
+    } else {
+      branch2 = newLIR2(cUnit, kX86Jcc8, 0, kX86CondBe);
+      newLIR2(cUnit, kX86Mov32RI, rlResult.lowReg, 0x1);
+    }
+    branch2->target = newLIR0(cUnit, kPseudoTargetLabel);
+  } else {
+    newLIR2(cUnit, kX86Set8R, rlResult.lowReg, kX86CondA /* above - unsigned > */);
+  }
   newLIR2(cUnit, kX86Sbb32RI, rlResult.lowReg, 0);
   if (unorderedGt) {
     branch->target = newLIR0(cUnit, kPseudoTargetLabel);
diff --git a/src/compiler/codegen/x86/X86/Factory.cc b/src/compiler/codegen/x86/X86/Factory.cc
index 9721038..f77a793 100644
--- a/src/compiler/codegen/x86/X86/Factory.cc
+++ b/src/compiler/codegen/x86/X86/Factory.cc
@@ -198,7 +198,16 @@
       case kOpAnd: opcode = kX86And32RR; break;
       case kOpOr:  opcode = kX86Or32RR; break;
       case kOpXor: opcode = kX86Xor32RR; break;
-      case kOp2Byte: opcode = kX86Movsx8RR; break;
+      case kOp2Byte:
+        // Use shifts instead of a byte operand if the source can't be byte accessed.
+        if (rSrc2 >= 4) {
+          newLIR2(cUnit, kX86Mov32RR, rDestSrc1, rSrc2);
+          newLIR2(cUnit, kX86Sal32RI, rDestSrc1, 24);
+          return newLIR2(cUnit, kX86Sar32RI, rDestSrc1, 24);
+        } else {
+          opcode = kX86Movsx8RR;
+        }
+        break;
       case kOp2Short: opcode = kX86Movsx16RR; break;
       case kOp2Char: opcode = kX86Movzx16RR; break;
       case kOpMul: opcode = kX86Imul32RR; break;
@@ -228,7 +237,7 @@
     case kOp2Char: opcode = kX86Movzx16RM; break;
     case kOpMul:
     default:
-      LOG(FATAL) << "Bad case in opRegReg " << op;
+      LOG(FATAL) << "Bad case in opRegMem " << op;
       break;
   }
   return newLIR3(cUnit, opcode, rDest, rBase, offset);
@@ -290,7 +299,7 @@
     X86OpCode opcode = IS_SIMM8(value) ? kX86Imul32RRI8 : kX86Imul32RRI;
     return newLIR3(cUnit, opcode, rDest, rSrc, value);
   } else if (op == kOpAnd) {
-    if (value == 0xFF) {
+    if (value == 0xFF && rDest < 4) {
       return newLIR2(cUnit, kX86Movzx8RR, rDest, rSrc);
     } else if (value == 0xFFFF) {
       return newLIR2(cUnit, kX86Movzx16RR, rDest, rSrc);
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index c229844..5bf4dd9 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -199,6 +199,9 @@
   rSI    = r6,
   r7     = 7,
   rDI    = r7,
+#ifndef TARGET_REX_SUPPORT
+  rRET   = 8,  // fake return address register for core spill mask
+#else
   r8     = 8,
   r9     = 9,
   r10    = 10,
@@ -208,6 +211,7 @@
   r14    = 14,
   r15    = 15,
   rRET   = 16,  // fake return address register for core spill mask
+#endif
   fr0  =  0 + FP_REG_OFFSET,
   fr1  =  1 + FP_REG_OFFSET,
   fr2  =  2 + FP_REG_OFFSET,