Compiler constant handling rework

In preparation for de-optimization, reworked the constant
handling mechanism.  Also took advantage of knowledge of
constant operands (particularly for long operations).
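
As a rough illustration of the constant-operand idea (a minimal
sketch only; ClassifyAndHalf and the enum below are made up, not
the compiler's API): when one operand of a 64-bit AND is a known
constant, each 32-bit half can be classified separately, and a
half whose constant is all zeros or all ones needs no AND
instruction at all.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical classification, not the real codegen: decide what a
    // 32-bit half of "reg & constant" costs when the constant is known.
    enum class HalfOp { kLoadZero, kCopySrc, kAndImm };

    static HalfOp ClassifyAndHalf(uint32_t const_half) {
      if (const_half == 0u)          return HalfOp::kLoadZero;  // result is 0
      if (const_half == 0xFFFFFFFFu) return HalfOp::kCopySrc;   // result is the source reg
      return HalfOp::kAndImm;                                   // needs a real AND
    }

    int main() {
      const int64_t k = 0x00000000FFFFFFFFLL;  // e.g. "x & 0xFFFFFFFFL"
      uint32_t lo = static_cast<uint32_t>(k);
      uint32_t hi = static_cast<uint32_t>(static_cast<uint64_t>(k) >> 32);
      // Low half is a plain register copy, high half is just a zero.
      std::printf("low=%d high=%d\n",
                  static_cast<int>(ClassifyAndHalf(lo)),
                  static_cast<int>(ClassifyAndHalf(hi)));
      return 0;
    }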

Significant performance improvements for Mandelbrot
(~60 seconds to ~34 seconds).  Minor improvements in other
benchmarks.

The new constant handling breaks two of the existing
optimization passes: "Skip Large Method" and "Load/Store
Elimination."

I don't intend to update the large method optimization
because it will be superseded by the upcoming interpreter/
fingerprinting mechanism.  The code is left in place for
now in order to compare compile-time improvements with
fingerprinting/interpretation.  All related code will be
deleted when that is complete.

The load/store elimination pass needs some rework to handle
the use of multiple-register loads and stores.  It will be
updated and restored in a future CL.

Change-Id: Ia979abaf51b8ae81bbb0428031cbcea854625fac
diff --git a/src/compiler/codegen/arm/assemble_arm.cc b/src/compiler/codegen/arm/assemble_arm.cc
index 91f25d6..455ea67 100644
--- a/src/compiler/codegen/arm/assemble_arm.cc
+++ b/src/compiler/codegen/arm/assemble_arm.cc
@@ -646,7 +646,7 @@
                  kFmtUnused, -1, -1,
                  IS_UNARY_OP | REG_DEF_SP | REG_USE_SP | REG_USE_LIST0
                  | IS_STORE | NEEDS_FIXUP, "push", "<!0R>", 4),
-    ENCODING_MAP(kThumb2CmpRI8, 0xf1b00f00,
+    ENCODING_MAP(kThumb2CmpRI12, 0xf1b00f00,
                  kFmtBitBlt, 19, 16, kFmtModImm, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
                  IS_BINARY_OP | REG_USE0 | SETS_CCODES,
@@ -917,8 +917,8 @@
                  "b", "!0t", 4),
     ENCODING_MAP(kThumb2MovImm16H,       0xf2c00000,
                  kFmtBitBlt, 11, 8, kFmtImm16, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0,
-                 "movh", "!0C, #!1M", 4),
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | REG_USE0,
+                 "movt", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2AddPCR,      0x4487,
                  kFmtBitBlt, 6, 3, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
@@ -936,8 +936,8 @@
                  "mov", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2MovImm16HST,     0xf2c00000,
                  kFmtBitBlt, 11, 8, kFmtImm16, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | NEEDS_FIXUP,
-                 "movh", "!0C, #!1M", 4),
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | REG_USE0 | NEEDS_FIXUP,
+                 "movt", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2LdmiaWB,         0xe8b00000,
                  kFmtBitBlt, 19, 16, kFmtBitBlt, 15, 0, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
@@ -972,7 +972,21 @@
                  kFmtBitBlt, 3, 0,
                  IS_QUAD_OP | REG_DEF0 | REG_DEF1 | REG_USE2 | REG_USE3,
                  "smull", "!0C, !1C, !2C, !3C", 4),
-
+    ENCODING_MAP(kThumb2LdrdPcRel8,  0xe9df0000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 7, 0,
+                 kFmtUnused, -1, -1,
+                 IS_TERTIARY_OP | REG_DEF0 | REG_DEF1 | REG_USE_PC | IS_LOAD | NEEDS_FIXUP,
+                 "ldrd", "!0C, !1C, [pc, #!2E]", 4),
+    ENCODING_MAP(kThumb2LdrdI8, 0xe9d00000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
+                 kFmtBitBlt, 7, 0,
+                 IS_QUAD_OP | REG_DEF0 | REG_DEF1 | REG_USE2 | IS_LOAD,
+                 "ldrd", "!0C, !1C, [!2C, #!3E]", 4),
+    ENCODING_MAP(kThumb2StrdI8, 0xe9c00000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
+                 kFmtBitBlt, 7, 0,
+                 IS_QUAD_OP | REG_USE0 | REG_USE1 | REG_USE2 | IS_STORE,
+                 "strd", "!0C, !1C, [!2C, #!3E]", 4),
 };
 
 /*
@@ -1023,13 +1037,14 @@
       if (lir->opcode == kThumbLdrPcRel ||
           lir->opcode == kThumb2LdrPcRel12 ||
           lir->opcode == kThumbAddPcRel ||
+          lir->opcode == kThumb2LdrdPcRel8 ||
           ((lir->opcode == kThumb2Vldrd) && (lir->operands[1] == r15pc)) ||
           ((lir->opcode == kThumb2Vldrs) && (lir->operands[1] == r15pc))) {
         /*
          * PC-relative loads are mostly used to load immediates
          * that are too large to materialize directly in one shot.
          * However, if the load displacement exceeds the limit,
-         * we revert to a 2-instruction materialization sequence.
+         * we revert to a multiple-instruction materialization sequence.
          */
         LIR *lir_target = lir->target;
         uintptr_t pc = (lir->offset + 4) & ~3;
@@ -1044,8 +1059,9 @@
           // Shouldn't happen in current codegen.
           LOG(FATAL) << "Unexpected pc-rel offset " << delta;
         }
-        // Now, check for the two difficult cases
+        // Now, check for the difficult cases
         if (((lir->opcode == kThumb2LdrPcRel12) && (delta > 4091)) ||
+            ((lir->opcode == kThumb2LdrdPcRel8) && (delta > 1020)) ||
             ((lir->opcode == kThumb2Vldrs) && (delta > 1020)) ||
             ((lir->opcode == kThumb2Vldrd) && (delta > 1020))) {
           /*
@@ -1053,26 +1069,34 @@
            * vldrs/vldrd we include REG_DEF_LR in the resource
            * masks for these instructions.
            */
-          int base_reg = (lir->opcode == kThumb2LdrPcRel12) ?
-            lir->operands[0] : rARM_LR;
+          int base_reg = ((lir->opcode == kThumb2LdrdPcRel8) || (lir->opcode == kThumb2LdrPcRel12))
+              ?  lir->operands[0] : rARM_LR;
 
-          // Add new Adr to generate the address
+          // Add new Adr to generate the address.
           LIR* new_adr = RawLIR(cu, lir->dalvik_offset, kThumb2Adr,
                      base_reg, 0, 0, 0, 0, lir->target);
           InsertLIRBefore(lir, new_adr);
 
-          // Convert to normal load
+          // Convert to normal load.
           if (lir->opcode == kThumb2LdrPcRel12) {
             lir->opcode = kThumb2LdrRRI12;
+          } else if (lir->opcode == kThumb2LdrdPcRel8) {
+            lir->opcode = kThumb2LdrdI8;
           }
-          // Change the load to be relative to the new Adr base
-          lir->operands[1] = base_reg;
-          lir->operands[2] = 0;
+          // Change the load to be relative to the new Adr base.
+          if (lir->opcode == kThumb2LdrdI8) {
+            lir->operands[3] = 0;
+            lir->operands[2] = base_reg;
+          } else {
+            lir->operands[2] = 0;
+            lir->operands[1] = base_reg;
+          }
           SetupResourceMasks(cu, lir);
           res = kRetryAll;
         } else {
           if ((lir->opcode == kThumb2Vldrs) ||
-              (lir->opcode == kThumb2Vldrd)) {
+              (lir->opcode == kThumb2Vldrd) ||
+              (lir->opcode == kThumb2LdrdPcRel8)) {
             lir->operands[2] = delta >> 2;
           } else {
             lir->operands[1] = (lir->opcode == kThumb2LdrPcRel12) ?  delta :