Intrinsic Unsafe.CompareAndSwapLong() for ARM.

(cherry picked from commit cb53fcd79b1a5ce608208ec454b5c19f64aaba37)

Change-Id: Iadd3cc8b4ed390670463b80f8efd579ce6ece226
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 9727179..e839fe5 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -561,22 +561,67 @@
 }
 
 bool ArmMir2Lir::GenInlinedCas(CallInfo* info, bool is_long, bool is_object) {
-  DCHECK(!is_long);  // not supported yet
   DCHECK_EQ(cu_->instruction_set, kThumb2);
   // Unused - RegLocation rl_src_unsafe = info->args[0];
   RegLocation rl_src_obj = info->args[1];  // Object - known non-null
   RegLocation rl_src_offset = info->args[2];  // long low
   rl_src_offset.wide = 0;  // ignore high half in info->args[3]
   RegLocation rl_src_expected = info->args[4];  // int, long or Object
-  RegLocation rl_src_new_value = info->args[5];  // int, long or Object
+  // If is_long, high half is in info->args[5]
+  RegLocation rl_src_new_value = info->args[is_long ? 6 : 5];  // int, long or Object
+  // If is_long, high half is in info->args[7]
   RegLocation rl_dest = InlineTarget(info);  // boolean place for result
 
+  // We have only 5 temporary registers available and actually only 4 if the InlineTarget
+  // above locked one of the temps. For a straightforward CAS64 we need 7 registers:
+  // r_ptr (1), new_value (2), expected (2) and ldrexd result (2). If neither expected nor
+  // new_value is in a non-temp core register we shall reload them in the ldrex/strex loop
+  // into the same temps, reducing the number of required temps down to 5. We shall work
+  // around the potentially locked temp by using LR for r_ptr, unconditionally.
+  // TODO: Pass information about the need for more temps to the stack frame generation
+  // code so that we can rely on being able to allocate enough temps.
+  DCHECK(!reg_pool_->core_regs[rARM_LR].is_temp);
+  MarkTemp(rARM_LR);
+  FreeTemp(rARM_LR);
+  LockTemp(rARM_LR);
+  bool load_early = true;
+  if (is_long) {
+    bool expected_is_core_reg =
+        rl_src_expected.location == kLocPhysReg && !IsFpReg(rl_src_expected.low_reg);
+    bool new_value_is_core_reg =
+        rl_src_new_value.location == kLocPhysReg && !IsFpReg(rl_src_new_value.low_reg);
+    bool expected_is_good_reg = expected_is_core_reg && !IsTemp(rl_src_expected.low_reg);
+    bool new_value_is_good_reg = new_value_is_core_reg && !IsTemp(rl_src_new_value.low_reg);
+
+    if (!expected_is_good_reg && !new_value_is_good_reg) {
+      // None of expected/new_value is non-temp reg, need to load both late
+      load_early = false;
+      // Make sure they are not in the temp regs and the load will not be skipped.
+      if (expected_is_core_reg) {
+        FlushRegWide(rl_src_expected.low_reg, rl_src_expected.high_reg);
+        ClobberSReg(rl_src_expected.s_reg_low);
+        ClobberSReg(GetSRegHi(rl_src_expected.s_reg_low));
+        rl_src_expected.location = kLocDalvikFrame;
+      }
+      if (new_value_is_core_reg) {
+        FlushRegWide(rl_src_new_value.low_reg, rl_src_new_value.high_reg);
+        ClobberSReg(rl_src_new_value.s_reg_low);
+        ClobberSReg(GetSRegHi(rl_src_new_value.s_reg_low));
+        rl_src_new_value.location = kLocDalvikFrame;
+      }
+    }
+  }
 
   // Release store semantics, get the barrier out of the way.  TODO: revisit
   GenMemBarrier(kStoreLoad);
 
   RegLocation rl_object = LoadValue(rl_src_obj, kCoreReg);
-  RegLocation rl_new_value = LoadValue(rl_src_new_value, kCoreReg);
+  RegLocation rl_new_value;
+  if (!is_long) {
+    rl_new_value = LoadValue(rl_src_new_value, kCoreReg);
+  } else if (load_early) {
+    rl_new_value = LoadValueWide(rl_src_new_value, kCoreReg);
+  }
 
   if (is_object && !mir_graph_->IsConstantNullRef(rl_new_value)) {
     // Mark card for object assuming new value is stored.
@@ -585,7 +630,7 @@
 
   RegLocation rl_offset = LoadValue(rl_src_offset, kCoreReg);
 
-  int r_ptr = AllocTemp();
+  int r_ptr = rARM_LR;
   OpRegRegReg(kOpAdd, r_ptr, rl_object.low_reg, rl_offset.low_reg);
 
   // Free now unneeded rl_object and rl_offset to give more temps.
@@ -594,29 +639,77 @@
   ClobberSReg(rl_offset.s_reg_low);
   FreeTemp(rl_offset.low_reg);
 
-  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
-  LoadConstant(rl_result.low_reg, 0);  // r_result := 0
+  RegLocation rl_expected;
+  if (!is_long) {
+    rl_expected = LoadValue(rl_src_expected, kCoreReg);
+  } else if (load_early) {
+    rl_expected = LoadValueWide(rl_src_expected, kCoreReg);
+  } else {
+    rl_new_value.low_reg = rl_expected.low_reg = AllocTemp();
+    rl_new_value.high_reg = rl_expected.high_reg = AllocTemp();
+  }
 
-  // while ([r_ptr] == rExpected && r_result == 0) {
-  //   [r_ptr] <- r_new_value && r_result := success ? 0 : 1
-  //   r_result ^= 1
-  // }
-  int r_old_value = AllocTemp();
+  // do {
+  //   tmp = [r_ptr] - expected;
+  // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
+  // result = tmp == 0;
+
+  int r_tmp = AllocTemp();
   LIR* target = NewLIR0(kPseudoTargetLabel);
-  NewLIR3(kThumb2Ldrex, r_old_value, r_ptr, 0);
 
-  RegLocation rl_expected = LoadValue(rl_src_expected, kCoreReg);
-  OpRegReg(kOpCmp, r_old_value, rl_expected.low_reg);
-  FreeTemp(r_old_value);  // Now unneeded.
-  OpIT(kCondEq, "TT");
-  NewLIR4(kThumb2Strex /* eq */, rl_result.low_reg, rl_new_value.low_reg, r_ptr, 0);
-  FreeTemp(r_ptr);  // Now unneeded.
-  OpRegImm(kOpXor /* eq */, rl_result.low_reg, 1);
-  OpRegImm(kOpCmp /* eq */, rl_result.low_reg, 0);
+  if (is_long) {
+    int r_tmp_high = AllocTemp();
+    if (!load_early) {
+      LoadValueDirectWide(rl_src_expected, rl_expected.low_reg, rl_expected.high_reg);
+    }
+    NewLIR3(kThumb2Ldrexd, r_tmp, r_tmp_high, r_ptr);
+    OpRegReg(kOpSub, r_tmp, rl_expected.low_reg);
+    OpRegReg(kOpSub, r_tmp_high, rl_expected.high_reg);
+    if (!load_early) {
+      LoadValueDirectWide(rl_src_new_value, rl_new_value.low_reg, rl_new_value.high_reg);
+    }
+    // Make sure we use ORR that sets the ccode
+    if (ARM_LOWREG(r_tmp) && ARM_LOWREG(r_tmp_high)) {
+      NewLIR2(kThumbOrr, r_tmp, r_tmp_high);
+    } else {
+      NewLIR4(kThumb2OrrRRRs, r_tmp, r_tmp, r_tmp_high, 0);
+    }
+    FreeTemp(r_tmp_high);  // Now unneeded
+
+    DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+    OpIT(kCondEq, "T");
+    NewLIR4(kThumb2Strexd /* eq */, r_tmp, rl_new_value.low_reg, rl_new_value.high_reg, r_ptr);
+
+  } else {
+    NewLIR3(kThumb2Ldrex, r_tmp, r_ptr, 0);
+    OpRegReg(kOpSub, r_tmp, rl_expected.low_reg);
+    DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+    OpIT(kCondEq, "T");
+    NewLIR4(kThumb2Strex /* eq */, r_tmp, rl_new_value.low_reg, r_ptr, 0);
+  }
+
+  // Still one conditional left from OpIT(kCondEq, "T") from either branch
+  OpRegImm(kOpCmp /* eq */, r_tmp, 1);
   OpCondBranch(kCondEq, target);
 
+  if (!load_early) {
+    FreeTemp(rl_expected.low_reg);  // Now unneeded.
+    FreeTemp(rl_expected.high_reg);  // Now unneeded.
+  }
+
+  // result := (tmp != 0) ? 0 : 1;
+  RegLocation rl_result = EvalLoc(rl_dest, kCoreReg, true);
+  OpRegRegImm(kOpRsub, rl_result.low_reg, r_tmp, 1);
+  DCHECK(last_lir_insn_->u.m.def_mask & ENCODE_CCODE);
+  OpIT(kCondCc, "");
+  LoadConstant(rl_result.low_reg, 0); /* cc */
+  FreeTemp(r_tmp);  // Now unneeded.
+
   StoreValue(rl_dest, rl_result);
 
+  // Now, restore lr to its non-temp status.
+  Clobber(rARM_LR);
+  UnmarkTemp(rARM_LR);
   return true;
 }