AArch64: Add tbz/tbnz and tst.

Since the branch offset supported by tbz/tbnz is quite small (about
-32KiB to +32KiB), the tbz/tbnz is replaced by a tst plus beq/bne pair
in the fix-up stage if the branch offset turns out to be too large.
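
For example (an illustrative sketch only; the register, bit number and
label are arbitrary), an out-of-range

    tbz  w0, #3, far_label

is rewritten during fix-up as

    tst  w0, #0x8        // ands wzr, w0, #(1 << 3)
    b.eq far_label

and tbnz is rewritten the same way with b.ne instead of b.eq.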

Change-Id: I4cace06bec6425e0f2e1f5f7c471eec08d06bca6
diff --git a/compiler/dex/compiler_enums.h b/compiler/dex/compiler_enums.h
index dcc67c3..63f3e64 100644
--- a/compiler/dex/compiler_enums.h
+++ b/compiler/dex/compiler_enums.h
@@ -528,6 +528,7 @@
   kFixupLoad,        // Mostly for immediates.
   kFixupVLoad,       // FP load which *may* be pc-relative.
   kFixupCBxZ,        // Cbz, Cbnz.
+  kFixupTBxZ,        // Tbz, Tbnz.
   kFixupPushPop,     // Not really pc relative, but changes size based on args.
   kFixupCondBranch,  // Conditional branch
   kFixupT1Branch,    // Thumb1 Unconditional branch
diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h
index a449cbd..d001dd6 100644
--- a/compiler/dex/quick/arm64/arm64_lir.h
+++ b/compiler/dex/quick/arm64/arm64_lir.h
@@ -116,6 +116,7 @@
 #define IS_SIGNED_IMM7(value) IS_SIGNED_IMM(7, value)
 #define IS_SIGNED_IMM9(value) IS_SIGNED_IMM(9, value)
 #define IS_SIGNED_IMM12(value) IS_SIGNED_IMM(12, value)
+#define IS_SIGNED_IMM14(value) IS_SIGNED_IMM(14, value)
 #define IS_SIGNED_IMM19(value) IS_SIGNED_IMM(19, value)
 #define IS_SIGNED_IMM21(value) IS_SIGNED_IMM(21, value)
 
@@ -355,7 +356,10 @@
   kA64Sub4rrro,      // sub [s1001011000] rm[20-16] imm_6[15-10] rn[9-5] rd[4-0].
   kA64Sub4RRre,      // sub [s1001011001] rm[20-16] option[15-13] imm_3[12-10] rn[9-5] rd[4-0].
   kA64Subs3rRd,      // subs[s111000100] imm_12[21-10] rn[9-5] rd[4-0].
+  kA64Tst2rl,        // tst alias of "ands rzr, rn, #imm".
   kA64Tst3rro,       // tst alias of "ands rzr, arg1, arg2, arg3".
+  kA64Tbnz3rht,      // tbnz imm_6_b5[31] [0110111] imm_6_b40[23-19] imm_14[18-5] rt[4-0].
+  kA64Tbz3rht,       // tbz imm_6_b5[31] [0110110] imm_6_b40[23-19] imm_14[18-5] rt[4-0].
   kA64Ubfm4rrdd,     // ubfm[s10100110] N[22] imm_r[21-16] imm_s[15-10] rn[9-5] rd[4-0].
   kA64Last,
   kA64NotWide = 0,   // Flag used to select the first instruction variant.
@@ -400,23 +404,24 @@
 enum ArmEncodingKind {
   // All the formats below are encoded in the same way (as a kFmtBitBlt).
   // These are grouped together, for fast handling (e.g. "if (LIKELY(fmt <= kFmtBitBlt)) ...").
-  kFmtRegW = 0,  // Word register (w) or wzr.
-  kFmtRegX,      // Extended word register (x) or xzr.
-  kFmtRegR,      // Register with same width as the instruction or zr.
-  kFmtRegWOrSp,  // Word register (w) or wsp.
-  kFmtRegXOrSp,  // Extended word register (x) or sp.
-  kFmtRegROrSp,  // Register with same width as the instruction or sp.
-  kFmtRegS,      // Single FP reg.
-  kFmtRegD,      // Double FP reg.
-  kFmtRegF,      // Single/double FP reg depending on the instruction width.
-  kFmtBitBlt,    // Bit string using end/start.
+  kFmtRegW = 0,   // Word register (w) or wzr.
+  kFmtRegX,       // Extended word register (x) or xzr.
+  kFmtRegR,       // Register with same width as the instruction or zr.
+  kFmtRegWOrSp,   // Word register (w) or wsp.
+  kFmtRegXOrSp,   // Extended word register (x) or sp.
+  kFmtRegROrSp,   // Register with same width as the instruction or sp.
+  kFmtRegS,       // Single FP reg.
+  kFmtRegD,       // Double FP reg.
+  kFmtRegF,       // Single/double FP reg depending on the instruction width.
+  kFmtBitBlt,     // Bit string using end/start.
 
   // Less likely formats.
-  kFmtUnused,    // Unused field and marks end of formats.
-  kFmtImm21,     // Sign-extended immediate using [23..5,30..29].
-  kFmtShift,     // Register shift, 9-bit at [23..21, 15..10]..
-  kFmtExtend,    // Register extend, 9-bit at [23..21, 15..10].
-  kFmtSkip,      // Unused field, but continue to next.
+  kFmtUnused,     // Unused field and marks end of formats.
+  kFmtImm6Shift,  // Shift immediate, 6-bit at [31, 23..19].
+  kFmtImm21,      // Sign-extended immediate using [23..5,30..29].
+  kFmtShift,      // Register shift, 9-bit at [23..21, 15..10].
+  kFmtExtend,     // Register extend, 9-bit at [23..21, 15..10].
+  kFmtSkip,       // Unused field, but continue to next.
 };
 
 // Struct used to define the snippet positions for each A64 opcode.
diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc
index 15c89f2..5115246 100644
--- a/compiler/dex/quick/arm64/assemble_arm64.cc
+++ b/compiler/dex/quick/arm64/assemble_arm64.cc
@@ -89,6 +89,7 @@
  *     M -> 16-bit shift expression ("" or ", lsl #16" or ", lsl #32"...)
  *     B -> dmb option string (sy, st, ish, ishst, nsh, hshst)
  *     H -> operand shift
+ *     h -> 6-bit shift immediate
  *     T -> register shift (either ", lsl #0" or ", lsl #12")
  *     e -> register extend (e.g. uxtb #1)
  *     o -> register shift (e.g. lsl #1) for Word registers
@@ -614,10 +615,24 @@
                  kFmtRegR, 4, 0, kFmtRegROrSp, 9, 5, kFmtBitBlt, 21, 10,
                  kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_DEF0_USE1 | SETS_CCODES,
                  "subs", "!0r, !1R, #!2d", kFixupNone),
-    ENCODING_MAP(WIDE(kA64Tst3rro), SF_VARIANTS(0x6a000000),
+    ENCODING_MAP(WIDE(kA64Tst2rl), SF_VARIANTS(0x7200001f),
+                 kFmtRegR, 9, 5, kFmtBitBlt, 22, 10, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_USE0 | SETS_CCODES,
+                 "tst", "!0r, #!1l", kFixupNone),
+    ENCODING_MAP(WIDE(kA64Tst3rro), SF_VARIANTS(0x6a00001f),
                  kFmtRegR, 9, 5, kFmtRegR, 20, 16, kFmtShift, -1, -1,
-                 kFmtUnused, -1, -1, IS_QUAD_OP | REG_USE01 | SETS_CCODES,
+                 kFmtUnused, -1, -1, IS_TERTIARY_OP | REG_USE01 | SETS_CCODES,
                  "tst", "!0r, !1r!2o", kFixupNone),
+    // NOTE: Tbz/Tbnz itself does not require SETS_CCODES, but it may be replaced in the fix-up
+    // stage by other LIRs which do require SETS_CCODES.
+    ENCODING_MAP(WIDE(kA64Tbnz3rht), CUSTOM_VARIANTS(0x37000000, 0x37000000),
+                 kFmtRegR, 4, 0, kFmtImm6Shift, -1, -1, kFmtBitBlt, 18, 5, kFmtUnused, -1, -1,
+                 IS_TERTIARY_OP | REG_USE0 | IS_BRANCH | NEEDS_FIXUP | SETS_CCODES,
+                 "tbnz", "!0r, #!1h, !2t", kFixupTBxZ),
+    ENCODING_MAP(WIDE(kA64Tbz3rht), CUSTOM_VARIANTS(0x36000000, 0x36000000),
+                 kFmtRegR, 4, 0, kFmtImm6Shift, -1, -1, kFmtBitBlt, 18, 5, kFmtUnused, -1, -1,
+                 IS_TERTIARY_OP | REG_USE0 | IS_BRANCH | NEEDS_FIXUP | SETS_CCODES,
+                 "tbz", "!0r, #!1h, !2t", kFixupTBxZ),
     ENCODING_MAP(WIDE(kA64Ubfm4rrdd), SF_N_VARIANTS(0x53000000),
                  kFmtRegR, 4, 0, kFmtRegR, 9, 5, kFmtBitBlt, 21, 16,
                  kFmtBitBlt, 15, 10, IS_QUAD_OP | REG_DEF0_USE1,
@@ -787,6 +802,11 @@
               value |= ((operand & 0x1ffffc) >> 2) << 5;
               bits |= value;
               break;
+            case kFmtImm6Shift:
+              value = (operand & 0x1f) << 19;
+              value |= ((operand & 0x20) >> 5) << 31;
+              bits |= value;
+              break;
             default:
               LOG(FATAL) << "Bad fmt for arg. " << i << " in " << encoder->name
                          << " (" << kind << ")";
@@ -827,11 +847,6 @@
    */
   int generation = 0;
   while (true) {
-    // TODO(Arm64): check whether passes and offset adjustments are really necessary.
-    //   Currently they aren't, as - in the fixups below - LIR are never inserted.
-    //   Things can be different if jump ranges above 1 MB need to be supported.
-    //   If they are not, then we can get rid of the assembler retry logic.
-
     offset_adjustment = 0;
     AssemblerStatus res = kSuccess;  // Assume success
     generation ^= 1;
@@ -839,13 +854,9 @@
     lir = first_fixup_;
     prev_lir = NULL;
     while (lir != NULL) {
-      /*
-       * NOTE: the lir being considered here will be encoded following the switch (so long as
-       * we're not in a retry situation).  However, any new non-pc_rel instructions inserted
-       * due to retry must be explicitly encoded at the time of insertion.  Note that
-       * inserted instructions don't need use/def flags, but do need size and pc-rel status
-       * properly updated.
-       */
+      // NOTE: Any new non-pc_rel instructions inserted due to retry must be explicitly encoded at
+      // the time of insertion.  Note that inserted instructions don't need use/def flags, but do
+      // need size and pc-rel status properly updated.
       lir->offset += offset_adjustment;
       // During pass, allows us to tell whether a node has been updated with offset_adjustment yet.
       lir->flags.generation = generation;
@@ -861,7 +872,8 @@
           CodeOffset target = target_lir->offset +
               ((target_lir->flags.generation == lir->flags.generation) ? 0 : offset_adjustment);
           int32_t delta = target - pc;
-          if (!((delta & 0x3) == 0 && IS_SIGNED_IMM19(delta >> 2))) {
+          DCHECK_EQ(delta & 0x3, 0);
+          if (!IS_SIGNED_IMM19(delta >> 2)) {
             LOG(FATAL) << "Invalid jump range in kFixupT1Branch";
           }
           lir->operands[0] = delta >> 2;
@@ -876,12 +888,75 @@
           CodeOffset target = target_lir->offset +
             ((target_lir->flags.generation == lir->flags.generation) ? 0 : offset_adjustment);
           int32_t delta = target - pc;
-          if (!((delta & 0x3) == 0 && IS_SIGNED_IMM19(delta >> 2))) {
+          DCHECK_EQ(delta & 0x3, 0);
+          if (!IS_SIGNED_IMM19(delta >> 2)) {
             LOG(FATAL) << "Invalid jump range in kFixupLoad";
           }
           lir->operands[1] = delta >> 2;
           break;
         }
+        case kFixupTBxZ: {
+          int16_t opcode = lir->opcode;
+          RegStorage reg(lir->operands[0] | RegStorage::kValid);
+          int32_t imm = lir->operands[1];
+          DCHECK_EQ(IS_WIDE(opcode), reg.Is64Bit());
+          DCHECK_LT(imm, 64);
+          if (imm >= 32) {
+            DCHECK(IS_WIDE(opcode));
+          } else if (kIsDebugBuild && IS_WIDE(opcode)) {
+            // "tbz/tbnz x0, #imm(<32)" is the same as "tbz/tbnz w0, #imm(<32)", but GCC/oatdump
+            // will disassemble it as "tbz/tbnz w0, #imm(<32)". So unwide the LIR to make the
+            // compiler log match those disassemblers in debug builds.
+            // This also affects the tst instruction if it needs to be replaced, but there is no
+            // performance difference between "tst Xt" and "tst Wt".
+            lir->opcode = UNWIDE(opcode);
+            lir->operands[0] = As32BitReg(reg).GetReg();
+          }
+
+          // Fix-up branch offset.
+          LIR *target_lir = lir->target;
+          DCHECK(target_lir);
+          CodeOffset pc = lir->offset;
+          CodeOffset target = target_lir->offset +
+              ((target_lir->flags.generation == lir->flags.generation) ? 0 : offset_adjustment);
+          int32_t delta = target - pc;
+          DCHECK_EQ(delta & 0x3, 0);
+          // Check if branch offset can be encoded in tbz/tbnz.
+          if (!IS_SIGNED_IMM14(delta >> 2)) {
+            DexOffset dalvik_offset = lir->dalvik_offset;
+            int16_t opcode = lir->opcode;
+            LIR* target = lir->target;
+            // "tbz/tbnz Rt, #imm, label" -> "tst Rt, #(1<<imm)".
+            offset_adjustment -= lir->flags.size;
+            int32_t imm = EncodeLogicalImmediate(IS_WIDE(opcode), UINT64_C(1) << lir->operands[1]);
+            DCHECK_NE(imm, -1);
+            lir->opcode = IS_WIDE(opcode) ? WIDE(kA64Tst2rl) : kA64Tst2rl;
+            lir->operands[1] = imm;
+            lir->target = nullptr;
+            lir->flags.fixup = EncodingMap[kA64Tst2rl].fixup;
+            lir->flags.size = EncodingMap[kA64Tst2rl].size;
+            offset_adjustment += lir->flags.size;
+            // Insert "beq/bne label".
+            opcode = UNWIDE(opcode);
+            DCHECK(opcode == kA64Tbz3rht || opcode == kA64Tbnz3rht);
+            LIR* new_lir = RawLIR(dalvik_offset, kA64B2ct,
+                opcode == kA64Tbz3rht ? kArmCondEq : kArmCondNe, 0, 0, 0, 0, target);
+            InsertLIRAfter(lir, new_lir);
+            new_lir->offset = lir->offset + lir->flags.size;
+            new_lir->flags.generation = generation;
+            new_lir->flags.fixup = EncodingMap[kA64B2ct].fixup;
+            new_lir->flags.size = EncodingMap[kA64B2ct].size;
+            offset_adjustment += new_lir->flags.size;
+            // lir no longer pcrel, unlink and link in new_lir.
+            ReplaceFixup(prev_lir, lir, new_lir);
+            prev_lir = new_lir;  // Continue with the new instruction.
+            lir = new_lir->u.a.pcrel_next;
+            res = kRetryAll;
+            continue;
+          }
+          lir->operands[2] = delta >> 2;
+          break;
+        }
         case kFixupAdr: {
           LIR* target_lir = lir->target;
           int32_t delta;
@@ -910,6 +985,7 @@
     }
 
     if (res == kSuccess) {
+      DCHECK_EQ(offset_adjustment, 0);
       break;
     } else {
       assembler_retries++;
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index d00c57d..d1b9c81 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -271,8 +271,12 @@
       ArmOpcode opcode = kA64Cbz2rt;
       ArmOpcode wide = reg.Is64Bit() ? WIDE(0) : UNWIDE(0);
       branch = NewLIR2(opcode | wide, reg.GetReg(), 0);
+    } else if (arm_cond == kArmCondLt || arm_cond == kArmCondGe) {
+      ArmOpcode opcode = (arm_cond == kArmCondLt) ? kA64Tbnz3rht : kA64Tbz3rht;
+      ArmOpcode wide = reg.Is64Bit() ? WIDE(0) : UNWIDE(0);
+      int value = reg.Is64Bit() ? 63 : 31;
+      branch = NewLIR3(opcode | wide, reg.GetReg(), value, 0);
     }
-    // TODO: Use tbz/tbnz for < 0 or >= 0.
   }
 
   if (branch == nullptr) {
@@ -856,16 +860,14 @@
   OpRegRegImm(kOpLsl, rs_length, rs_length, 1);
 
   // Copy one element.
-  OpRegRegImm(kOpAnd, rs_tmp, As32BitReg(rs_length), 2);
-  LIR* jmp_to_copy_two = OpCmpImmBranch(kCondEq, rs_tmp, 0, nullptr);
+  LIR* jmp_to_copy_two = NewLIR3(WIDE(kA64Tbz3rht), rs_length.GetReg(), 1, 0);
   OpRegImm(kOpSub, rs_length, 2);
   LoadBaseIndexed(rs_src, rs_length, rs_tmp, 0, kSignedHalf);
   StoreBaseIndexed(rs_dst, rs_length, rs_tmp, 0, kSignedHalf);
 
   // Copy two elements.
   LIR *copy_two = NewLIR0(kPseudoTargetLabel);
-  OpRegRegImm(kOpAnd, rs_tmp, As32BitReg(rs_length), 4);
-  LIR* jmp_to_copy_four = OpCmpImmBranch(kCondEq, rs_tmp, 0, nullptr);
+  LIR* jmp_to_copy_four = NewLIR3(WIDE(kA64Tbz3rht), rs_length.GetReg(), 2, 0);
   OpRegImm(kOpSub, rs_length, 4);
   LoadBaseIndexed(rs_src, rs_length, rs_tmp, 0, k32);
   StoreBaseIndexed(rs_dst, rs_length, rs_tmp, 0, k32);
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 9b4546a..685f8d5 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -504,6 +504,9 @@
              else
                strcpy(tbuf, ", DecodeError3");
              break;
+           case 'h':
+             snprintf(tbuf, arraysize(tbuf), "%d", operand);
+             break;
            default:
              strcpy(tbuf, "DecodeError1");
              break;