ART: Rework ARM64 entry sequence

Try to fold one sub of SP into the ARM64 entry sequence. When the
frame size is small, generate a single sub for the full frame size
and adjust the spill offsets accordingly. If the frame size is too
large for that, use a pre-indexed store to move SP down by the
spill area and fill upwards from there.
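
A rough illustration (register choices and offsets are purely
demonstrative):

  Small frame:                     Large frame:
    sub sp, sp, #framesize           stp d8, d9, [sp, #-32]!
    stp d8, d9, [sp, #off]           stp x21, x22, [sp, #16]
    stp x21, x22, [sp, #off+16]      sub sp, sp, #(framesize - 32)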

Change-Id: I1c15ac6276fb62b8164372de02fd92437f605938
diff --git a/compiler/dex/quick/arm64/int_arm64.cc b/compiler/dex/quick/arm64/int_arm64.cc
index f9f85f4..d8df30f 100644
--- a/compiler/dex/quick/arm64/int_arm64.cc
+++ b/compiler/dex/quick/arm64/int_arm64.cc
@@ -22,6 +22,7 @@
 #include "dex/reg_storage_eq.h"
 #include "entrypoints/quick/quick_entrypoints.h"
 #include "mirror/array.h"
+#include "utils.h"
 
 namespace art {
 
@@ -1237,6 +1238,14 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
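+// Pop the lowest set bit off reg_mask and return the remaining mask. The register number
+// is reported through *reg, relative to its previous value (the mask is consumed by
+// shifting), so callers seed *reg with -1 to make bit 0 correspond to register 0.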
+static uint32_t ExtractReg(uint32_t reg_mask, int* reg) {
+  // Find first register.
+  int first_bit_set = CTZ(reg_mask) + 1;
+  *reg = *reg + first_bit_set;
+  reg_mask >>= first_bit_set;
+  return reg_mask;
+}
+
 /**
  * @brief Split a register list into pairs or single registers.
  *
@@ -1253,15 +1262,15 @@
  *   }
  * @endcode
  */
-uint32_t Arm64Mir2Lir::GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
+static uint32_t GenPairWise(uint32_t reg_mask, int* reg1, int* reg2) {
   // Find first register.
-  int first_bit_set = __builtin_ctz(reg_mask) + 1;
+  int first_bit_set = CTZ(reg_mask) + 1;
   int reg = *reg1 + first_bit_set;
   reg_mask >>= first_bit_set;
 
   if (LIKELY(reg_mask)) {
     // Save the first register, find the second and use the pair opcode.
-    int second_bit_set = __builtin_ctz(reg_mask) + 1;
+    int second_bit_set = CTZ(reg_mask) + 1;
     *reg2 = reg;
     reg_mask >>= second_bit_set;
     *reg1 = reg + second_bit_set;
@@ -1274,68 +1283,274 @@
   return reg_mask;
 }
 
-void Arm64Mir2Lir::UnSpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
-  int reg1 = -1, reg2 = -1;
-  const int reg_log2_size = 3;
-
-  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
-    if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
-    } else {
-      DCHECK_LE(offset, 63);
-      NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
-    }
-  }
-}
-
-void Arm64Mir2Lir::SpillCoreRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
-              RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
-    }
-  }
-}
-
-void Arm64Mir2Lir::UnSpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
-  int reg1 = -1, reg2 = -1;
-  const int reg_log2_size = 3;
-
-  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
-     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
-    if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
-    } else {
-      NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
 // TODO(Arm64): consider using ld1 and st1?
-void Arm64Mir2Lir::SpillFPRegs(RegStorage base, int offset, uint32_t reg_mask) {
+static void SpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
   int reg1 = -1, reg2 = -1;
   const int reg_log2_size = 3;
 
   for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
     reg_mask = GenPairWise(reg_mask, & reg1, & reg2);
     if (UNLIKELY(reg2 < 0)) {
-      NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
     } else {
-      NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
-              RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+      m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
     }
   }
 }
 
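+// Spill strategy for frames small enough that every spill offset fits into the scaled
+// 7-bit immediate of str/stp: drop the whole frame with a single sub, then store the
+// spill area at the top of the frame.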
+static int SpillRegsPreSub(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                           uint32_t fp_reg_mask, int frame_size) {
+  m2l->OpRegRegImm(kOpSub, rs_sp, rs_sp, frame_size);
+
+  int core_count = POPCOUNT(core_reg_mask);
+
+  if (fp_reg_mask != 0) {
+    // Spill FP regs.
+    int fp_count = POPCOUNT(fp_reg_mask);
+    int spill_offset = frame_size - (core_count + fp_count) * kArm64PointerSize;
+    SpillFPRegs(m2l, rs_sp, spill_offset, fp_reg_mask);
+  }
+
+  if (core_reg_mask != 0) {
+    // Spill core regs.
+    int spill_offset = frame_size - (core_count * kArm64PointerSize);
+    SpillCoreRegs(m2l, rs_sp, spill_offset, core_reg_mask);
+  }
+
+  return frame_size;
+}
+
+static int SpillRegsPreIndexed(Arm64Mir2Lir* m2l, RegStorage base, uint32_t core_reg_mask,
+                               uint32_t fp_reg_mask, int frame_size) {
+  // Spill both core and fp regs at the same time.
+  // The very first instruction will be an stp with a pre-indexed address, moving the stack
+  // pointer down. From then on, we fill upwards. This generates overall the same number of
+  // instructions as SpillRegsPreSub above in most cases (the exception being an odd number
+  // of core spills combined with an even, non-zero number of fp spills), but is more
+  // flexible, as the offsets are guaranteed small.
+  //
+  // Some demonstrative fill cases : (c) = core, (f) = fp
+  // cc    44   cc    44   cc    22   cc    33   fc => 1[1/2]
+  // fc => 23   fc => 23   ff => 11   ff => 22
+  // ff    11    f    11               f    11
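+  // Each row above is one 16-byte slot pair (bottom pair last, lower half first); the
+  // digits give, per half, the position in emission order of the instruction that writes
+  // it. "1[1/2]" means the pre-indexed stp writes both halves and instruction 2 then
+  // overwrites the upper one with the core reg.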
+  //
+  int reg1 = -1, reg2 = -1;
+  int core_count = POPCOUNT(core_reg_mask);
+  int fp_count = POPCOUNT(fp_reg_mask);
+
+  int combined = fp_count + core_count;
+  int all_offset = RoundUp(combined, 2);  // Needs to be 16B = 2-reg aligned.
+
+  int cur_offset = 2;  // What's the starting offset after the first stp? We expect the base slot
+                       // to be filled.
+
+  // First figure out whether the bottom is FP or core.
+  if (fp_count > 0) {
+    // Some FP spills.
+    //
+    // Four cases: (d0 is dummy to fill up stp)
+    // 1) Single FP, even number of core -> stp d0, fp_reg
+    // 2) Single FP, odd number of core -> stp fp_reg, d0
+    // 3) More FP, even number combined -> stp fp_reg1, fp_reg2
+    // 4) More FP, odd number combined -> stp d0, fp_reg
+    if (fp_count == 1) {
+      fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+      DCHECK_EQ(fp_reg_mask, 0U);
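+      // Both cases below store reg1 into each half of the stp, letting it act as its own
+      // d0 filler; the filler half is never reloaded, so its contents do not matter.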
+      if (core_count % 2 == 0) {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      } else {
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+        cur_offset = 0;  // That core reg needs to go into the upper half.
+      }
+    } else {
+      if (combined % 2 == 0) {
+        fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), -all_offset);
+      } else {
+        fp_reg_mask = ExtractReg(fp_reg_mask, &reg1);
+        m2l->NewLIR4(WIDE(kA64StpPre4ffXD), rs_d0.GetReg(), RegStorage::FloatSolo64(reg1).GetReg(),
+                     base.GetReg(), -all_offset);
+      }
+    }
+  } else {
+    // No FP spills.
+    //
+    // Two cases:
+    // 1) Even number of core -> stp core1, core2
+    // 2) Odd number of core -> stp xzr, core1
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), rs_xzr.GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    } else {
+      core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+      m2l->NewLIR4(WIDE(kA64StpPre4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), -all_offset);
+    }
+  }
+
+  if (fp_count != 0) {
+    while (fp_reg_mask != 0) {
+      // Have some FP regs to do.
+      fp_reg_mask = GenPairWise(fp_reg_mask, &reg1, &reg2);
+      if (UNLIKELY(reg2 < 0)) {
+        m2l->NewLIR3(FWIDE(kA64Str3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                     cur_offset);
+        // Do not increment offset here, as the second half will be filled by a core reg.
+      } else {
+        m2l->NewLIR4(WIDE(kA64Stp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                     RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), cur_offset);
+        cur_offset += 2;
+      }
+    }
+
+    // Reset counting.
+    reg1 = -1;
+
+    // If there is an odd number of core registers, store the lowest one now, into the
+    // upper half of the current pair.
+    if (core_count % 2 == 1) {
+      core_reg_mask = ExtractReg(core_reg_mask, &reg1);
+      m2l->NewLIR3(WIDE(kA64Str3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(),
+                   cur_offset + 1);
+      cur_offset += 2;  // The upper half completed this pair; move past it.
+    }
+  }
+
+  // Spill the rest of the core regs. They are guaranteed to be even.
+  DCHECK_EQ(POPCOUNT(core_reg_mask) % 2, 0);
+  for (; core_reg_mask != 0; cur_offset += 2) {
+    core_reg_mask = GenPairWise(core_reg_mask, &reg1, &reg2);
+    m2l->NewLIR4(WIDE(kA64Stp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                 RegStorage::Solo64(reg1).GetReg(), base.GetReg(), cur_offset);
+  }
+
+  DCHECK_EQ(cur_offset, all_offset);
+
+  return all_offset * 8;
+}
+
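+// Returns the number of bytes already subtracted from sp by the spill code; the caller
+// is expected to drop the remainder of the frame.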
+int Arm64Mir2Lir::SpillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                            int frame_size) {
+  // If the frame size is small enough that all offsets would fit into the immediates, use that
+  // setup, as it decrements sp early (kind of instruction scheduling), and is not worse
+  // instruction-count wise than the pre-indexed code in SpillRegsPreIndexed.
+  //
+  // This case is also optimal when we have an odd number of core spills, and an even (non-zero)
+  // number of fp spills.
+  if (RoundUp(frame_size, 8) / 8 <= 63) {
+    return SpillRegsPreSub(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  } else {
+    return SpillRegsPreIndexed(this, base, core_reg_mask, fp_reg_mask, frame_size);
+  }
+}
+
+static void UnSpillCoreRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
+  int reg1 = -1, reg2 = -1;
+  const int reg_log2_size = 3;
+
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+    reg_mask = GenPairWise(reg_mask, &reg1, &reg2);
+    if (UNLIKELY(reg2 < 0)) {
+      m2l->NewLIR3(WIDE(kA64Ldr3rXD), RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+    } else {
+      DCHECK_LE(offset, 63);
+      m2l->NewLIR4(WIDE(kA64Ldp4rrXD), RegStorage::Solo64(reg2).GetReg(),
+                   RegStorage::Solo64(reg1).GetReg(), base.GetReg(), offset);
+    }
+  }
+}
+
+static void UnSpillFPRegs(Arm64Mir2Lir* m2l, RegStorage base, int offset, uint32_t reg_mask) {
+  int reg1 = -1, reg2 = -1;
+  const int reg_log2_size = 3;
+
+  for (offset = (offset >> reg_log2_size); reg_mask; offset += 2) {
+    reg_mask = GenPairWise(reg_mask, &reg1, &reg2);
+    if (UNLIKELY(reg2 < 0)) {
+      m2l->NewLIR3(FWIDE(kA64Ldr3fXD), RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(),
+                   offset);
+    } else {
+      m2l->NewLIR4(WIDE(kA64Ldp4ffXD), RegStorage::FloatSolo64(reg2).GetReg(),
+                   RegStorage::FloatSolo64(reg1).GetReg(), base.GetReg(), offset);
+    }
+  }
+}
+
+void Arm64Mir2Lir::UnspillRegs(RegStorage base, uint32_t core_reg_mask, uint32_t fp_reg_mask,
+                               int frame_size) {
+  // Restore saves and drop stack frame.
+  // 2 versions:
+  //
+  // 1. (Original): Try to address directly, then drop the whole frame.
+  //                Limitation: ldp takes only a signed 7-bit immediate (scaled by 8).
+  //
+  // 2. (New): Drop the non-save-part. Then do similar to original, which is now guaranteed to be
+  //           in range. Then drop the rest.
+  //
+  // TODO: In methods with few spills but huge frame, it would be better to do non-immediate loads
+  //       in variant 1.
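+  //
+  // Rough sketch of variant 2 (registers and offsets demonstrative only):
+  //   add sp, sp, #early_drop                  // drop the non-save part (16B aligned)
+  //   ldp d8, d9, [sp, #off]                   // reloads now fit the 7-bit immediate
+  //   ldp x21, x22, [sp, #off+16]
+  //   add sp, sp, #(frame_size - early_drop)   // drop the rest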
+
+  // "Magic" constant, 63 (max signed 7b) * 8.
+  static constexpr int kMaxFramesizeForOffset = 63 * kArm64PointerSize;
+
+  const int num_core_spills = POPCOUNT(core_reg_mask);
+  const int num_fp_spills = POPCOUNT(fp_reg_mask);
+
+  int early_drop = 0;
+
+  if (frame_size > kMaxFramesizeForOffset) {
+    // Second variant. Drop the frame part.
+
+    // TODO: Always use the first formula, as num_fp_spills would be zero?
+    if (fp_reg_mask != 0) {
+      early_drop = frame_size - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    } else {
+      early_drop = frame_size - kArm64PointerSize * num_core_spills;
+    }
+
+    // Drop needs to be 16B aligned, so that SP keeps aligned.
+    early_drop = RoundDown(early_drop, 16);
+
+    OpRegImm64(kOpAdd, rs_sp, early_drop);
+  }
+
+  // Unspill.
+  if (fp_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * (num_fp_spills + num_core_spills);
+    UnSpillFPRegs(this, rs_sp, offset, fp_reg_mask);
+  }
+  if (core_reg_mask != 0) {
+    int offset = frame_size - early_drop - kArm64PointerSize * num_core_spills;
+    UnSpillCoreRegs(this, rs_sp, offset, core_reg_mask);
+  }
+
+  // Drop the (rest of) the frame.
+  OpRegImm64(kOpAdd, rs_sp, frame_size - early_drop);
+}
+
 bool Arm64Mir2Lir::GenInlinedReverseBits(CallInfo* info, OpSize size) {
   ArmOpcode wide = (size == k64) ? WIDE(0) : UNWIDE(0);
   RegLocation rl_src_i = info->args[0];