Revert "Revert "ART: Split out more cases of Load/StoreRef, volatile as parameter""

This reverts commit de68676b24f61a55adc0b22fe828f036a5925c41.

Fixes an API comment and differentiates between inserting and appending.

Change-Id: I0e9a21bb1d25766e3cbd802d8b48633ae251a6bf
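
This reland folds the separate *Volatile entry points into LoadBaseDisp/StoreBaseDisp, which now
take the volatility as an explicit parameter. A rough sketch of the resulting API shape (the
signatures and the kNotVolatile/kVolatile values are taken from the diff below; barrier emission
and the LDREXD/STREXD paths are handled inside the helpers):

  // Before: callers had to pick the volatile variant themselves.
  LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);
  LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest, OpSize size);

  // After: one entry point per operation; the backend decides how to honor volatility.
  LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
                    OpSize size, VolatileKind is_volatile);
  LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
                     OpSize size, VolatileKind is_volatile);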
diff --git a/compiler/dex/quick/arm/call_arm.cc b/compiler/dex/quick/arm/call_arm.cc
index 590c767..04d6898 100644
--- a/compiler/dex/quick/arm/call_arm.cc
+++ b/compiler/dex/quick/arm/call_arm.cc
@@ -316,9 +316,9 @@
   int ex_offset = Thread::ExceptionOffset<4>().Int32Value();
   RegLocation rl_result = EvalLoc(rl_dest, kRefReg, true);
   RegStorage reset_reg = AllocTempRef();
-  LoadRefDisp(rs_rARM_SELF, ex_offset, rl_result.reg);
+  LoadRefDisp(rs_rARM_SELF, ex_offset, rl_result.reg, kNotVolatile);
   LoadConstant(reset_reg, 0);
-  StoreRefDisp(rs_rARM_SELF, ex_offset, reset_reg);
+  StoreRefDisp(rs_rARM_SELF, ex_offset, reset_reg, kNotVolatile);
   FreeTemp(reset_reg);
   StoreValue(rl_dest, rl_result);
 }
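// Sketch only, not part of this patch: the GenMoveException sequence above reads the
// thread-local exception reference and then clears the slot, both as ordinary (kNotVolatile)
// accesses. In plain C++ terms, with a hypothetical stand-in for art::Thread:
struct ThreadSketch { void* exception_; };

void* MoveExceptionSketch(ThreadSketch* self) {
  void* ex = self->exception_;  // LoadRefDisp(rs_rARM_SELF, ex_offset, ..., kNotVolatile)
  self->exception_ = nullptr;   // StoreRefDisp(rs_rARM_SELF, ex_offset, reset_reg, kNotVolatile)
  return ex;
}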
diff --git a/compiler/dex/quick/arm/codegen_arm.h b/compiler/dex/quick/arm/codegen_arm.h
index 4499862..70dce7f 100644
--- a/compiler/dex/quick/arm/codegen_arm.h
+++ b/compiler/dex/quick/arm/codegen_arm.h
@@ -33,20 +33,16 @@
     LIR* CheckSuspendUsingLoad() OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<4> offset) OVERRIDE;
     RegStorage LoadHelper(ThreadOffset<8> offset) OVERRIDE;
-    LIR* LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
-                              OpSize size) OVERRIDE;
     LIR* LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
-                      OpSize size) OVERRIDE;
+                      OpSize size, VolatileKind is_volatile) OVERRIDE;
     LIR* LoadBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_dest, int scale,
                          OpSize size) OVERRIDE;
     LIR* LoadBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
                              RegStorage r_dest, OpSize size) OVERRIDE;
     LIR* LoadConstantNoClobber(RegStorage r_dest, int value);
     LIR* LoadConstantWide(RegStorage r_dest, int64_t value);
-    LIR* StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
-                               OpSize size) OVERRIDE;
     LIR* StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
-                       OpSize size) OVERRIDE;
+                       OpSize size, VolatileKind is_volatile) OVERRIDE;
     LIR* StoreBaseIndexed(RegStorage r_base, RegStorage r_index, RegStorage r_src, int scale,
                           OpSize size) OVERRIDE;
     LIR* StoreBaseIndexedDisp(RegStorage r_base, RegStorage r_index, int scale, int displacement,
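// Sketch only, not part of this patch: VolatileKind itself is declared in the shared Mir2Lir
// layer rather than in this header. A plausible shape, assuming only the two values that
// appear at the call sites of this patch:
enum VolatileKind {
  kNotVolatile,  // Ordinary load/store; no memory barriers required.
  kVolatile      // Volatile load/store; the backend must emit the required barriers.
};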
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index 916c528..e34d944 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -723,7 +723,7 @@
   } else {
     DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
     // Unaligned load with LDR and LDRSH is allowed on ARMv7 with SCTLR.A set to 0.
-    LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size);
+    LoadBaseDisp(rl_address.reg, 0, rl_result.reg, size, kNotVolatile);
     StoreValue(rl_dest, rl_result);
   }
   return true;
@@ -737,13 +737,13 @@
   if (size == k64) {
     // Fake unaligned STRD by two unaligned STR instructions on ARMv7 with SCTLR.A set to 0.
     RegLocation rl_value = LoadValueWide(rl_src_value, kCoreReg);
-    StoreBaseDisp(rl_address.reg, 0, rl_value.reg.GetLow(), k32);
-    StoreBaseDisp(rl_address.reg, 4, rl_value.reg.GetHigh(), k32);
+    StoreBaseDisp(rl_address.reg, 0, rl_value.reg.GetLow(), k32, kNotVolatile);
+    StoreBaseDisp(rl_address.reg, 4, rl_value.reg.GetHigh(), k32, kNotVolatile);
   } else {
     DCHECK(size == kSignedByte || size == kSignedHalf || size == k32);
     // Unaligned store with STR and STRH is allowed on ARMv7 with SCTLR.A set to 0.
     RegLocation rl_value = LoadValue(rl_src_value, kCoreReg);
-    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size);
+    StoreBaseDisp(rl_address.reg, 0, rl_value.reg, size, kNotVolatile);
   }
   return true;
 }
@@ -1230,7 +1230,7 @@
       }
       FreeTemp(reg_len);
     }
-    LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size);
+    LoadBaseDisp(reg_ptr, data_offset, rl_result.reg, size, kNotVolatile);
     MarkPossibleNullPointerException(opt_flags);
     if (!constant_index) {
       FreeTemp(reg_ptr);
@@ -1330,7 +1330,7 @@
       FreeTemp(reg_len);
     }
 
-    StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size);
+    StoreBaseDisp(reg_ptr, data_offset, rl_src.reg, size, kNotVolatile);
     MarkPossibleNullPointerException(opt_flags);
   } else {
     /* reg_ptr -> array data */
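// Illustration only, not part of this patch: the "fake unaligned STRD" above writes the low
// word at offset 0 and the high word at offset 4, which is exactly what an aligned STRD
// produces for a 64-bit value on little-endian ARM. In portable C++:
#include <cstdint>
#include <cstring>

void StoreU64AsTwoWords(uint8_t* addr, uint64_t value) {
  uint32_t lo = static_cast<uint32_t>(value);        // written at addr + 0
  uint32_t hi = static_cast<uint32_t>(value >> 32);  // written at addr + 4
  std::memcpy(addr + 0, &lo, sizeof(lo));            // memcpy tolerates unaligned addresses
  std::memcpy(addr + 4, &hi, sizeof(hi));
}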
diff --git a/compiler/dex/quick/arm/utility_arm.cc b/compiler/dex/quick/arm/utility_arm.cc
index b236f99..bc8f95b 100644
--- a/compiler/dex/quick/arm/utility_arm.cc
+++ b/compiler/dex/quick/arm/utility_arm.cc
@@ -961,31 +961,37 @@
   return load;
 }
 
-LIR* ArmMir2Lir::LoadBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_dest,
-                                      OpSize size) {
-  // Only 64-bit load needs special handling.
-  if (UNLIKELY(size == k64 || size == kDouble)) {
-    DCHECK(!r_dest.IsFloat());  // See RegClassForFieldLoadSave().
-    // If the cpu supports LPAE, aligned LDRD is atomic - fall through to LoadBaseDisp().
-    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
-      // Use LDREXD for the atomic load. (Expect displacement > 0, don't optimize for == 0.)
-      RegStorage r_ptr = AllocTemp();
-      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
-      LIR* lir = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg());
-      FreeTemp(r_ptr);
-      return lir;
-    }
-  }
-  return LoadBaseDisp(r_base, displacement, r_dest, size);
-}
-
 LIR* ArmMir2Lir::LoadBaseDisp(RegStorage r_base, int displacement, RegStorage r_dest,
-                              OpSize size) {
+                              OpSize size, VolatileKind is_volatile) {
   // TODO: base this on target.
   if (size == kWord) {
     size = k32;
   }
-  return LoadBaseDispBody(r_base, displacement, r_dest, size);
+  LIR* load;
+  if (UNLIKELY(is_volatile == kVolatile &&
+               (size == k64 || size == kDouble) &&
+               !cu_->compiler_driver->GetInstructionSetFeatures().HasLpae())) {
+    // Only a 64-bit volatile load needs special handling. If the CPU supports LPAE, an
+    // aligned LDRD is already atomic and the plain LoadBaseDispBody() path below suffices.
+    DCHECK(!r_dest.IsFloat());  // See RegClassForFieldLoadSave().
+    // Use LDREXD for the atomic load. (Expect displacement > 0; don't optimize for == 0.)
+    RegStorage r_ptr = AllocTemp();
+    OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+    load = NewLIR3(kThumb2Ldrexd, r_dest.GetLowReg(), r_dest.GetHighReg(), r_ptr.GetReg());
+    FreeTemp(r_ptr);
+  } else {
+    load = LoadBaseDispBody(r_base, displacement, r_dest, size);
+  }
+
+  if (UNLIKELY(is_volatile == kVolatile)) {
+    // Without context-sensitive analysis, we must issue the most conservative barriers.
+    // Either a load or a store may follow, so issue both a LoadLoad and a LoadStore barrier.
+    GenMemBarrier(kLoadLoad);
+    GenMemBarrier(kLoadStore);
+  }
+
+  return load;
 }
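// Rough analogy only, not ART code: the kLoadLoad and kLoadStore barriers emitted after a
// volatile load above keep later loads and stores from being reordered before it, i.e. at
// least acquire semantics in portable C++ terms:
#include <atomic>
#include <cstdint>

int64_t VolatileLoadAnalogy(const std::atomic<int64_t>& field) {
  return field.load(std::memory_order_acquire);
}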
 
 
@@ -1081,49 +1087,58 @@
   return store;
 }
 
-LIR* ArmMir2Lir::StoreBaseDispVolatile(RegStorage r_base, int displacement, RegStorage r_src,
-                                       OpSize size) {
-  // Only 64-bit store needs special handling.
-  if (UNLIKELY(size == k64 || size == kDouble)) {
-    DCHECK(!r_src.IsFloat());  // See RegClassForFieldLoadSave().
-    // If the cpu supports LPAE, aligned STRD is atomic - fall through to StoreBaseDisp().
-    if (!cu_->compiler_driver->GetInstructionSetFeatures().HasLpae()) {
-      // Use STREXD for the atomic store. (Expect displacement > 0, don't optimize for == 0.)
-      RegStorage r_ptr = AllocTemp();
-      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
-      LIR* fail_target = NewLIR0(kPseudoTargetLabel);
-      // We have only 5 temporary registers available and if r_base, r_src and r_ptr already
-      // take 4, we can't directly allocate 2 more for LDREXD temps. In that case clobber r_ptr
-      // in LDREXD and recalculate it from r_base.
-      RegStorage r_temp = AllocTemp();
-      RegStorage r_temp_high = AllocFreeTemp();  // We may not have another temp.
-      if (r_temp_high.Valid()) {
-        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg());
-        FreeTemp(r_temp_high);
-        FreeTemp(r_temp);
-      } else {
-        // If we don't have another temp, clobber r_ptr in LDREXD and reload it.
-        NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg());
-        FreeTemp(r_temp);  // May need the temp for kOpAdd.
-        OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
-      }
-      LIR* lir = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(),
-                         r_ptr.GetReg());
-      OpCmpImmBranch(kCondNe, r_temp, 0, fail_target);
-      FreeTemp(r_ptr);
-      return lir;
-    }
-  }
-  return StoreBaseDisp(r_base, displacement, r_src, size);
-}
-
 LIR* ArmMir2Lir::StoreBaseDisp(RegStorage r_base, int displacement, RegStorage r_src,
-                               OpSize size) {
-  // TODO: base this on target.
-  if (size == kWord) {
-    size = k32;
+                               OpSize size, VolatileKind is_volatile) {
+  if (UNLIKELY(is_volatile == kVolatile)) {
+    // There might have been a store before this volatile one, so insert a StoreStore barrier.
+    GenMemBarrier(kStoreStore);
   }
-  return StoreBaseDispBody(r_base, displacement, r_src, size);
+
+  LIR* store;
+  if (UNLIKELY(is_volatile == kVolatile &&
+               (size == k64 || size == kDouble) &&
+               !cu_->compiler_driver->GetInstructionSetFeatures().HasLpae())) {
+    // Only a 64-bit volatile store needs special handling. If the CPU supports LPAE, an
+    // aligned STRD is already atomic and the plain StoreBaseDispBody() path below suffices;
+    // otherwise use a LDREXD/STREXD loop. (Expect displacement > 0; don't optimize for == 0.)
+    DCHECK(!r_src.IsFloat());  // See RegClassForFieldLoadSave().
+    RegStorage r_ptr = AllocTemp();
+    OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+    LIR* fail_target = NewLIR0(kPseudoTargetLabel);
+    // We have only 5 temporary registers available, and if r_base, r_src and r_ptr already
+    // take 4 of them, we can't directly allocate 2 more for the LDREXD temps. In that case,
+    // clobber r_ptr in LDREXD and recalculate it from r_base.
+    RegStorage r_temp = AllocTemp();
+    RegStorage r_temp_high = AllocFreeTemp();  // We may not have another temp.
+    if (r_temp_high.Valid()) {
+      NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_temp_high.GetReg(), r_ptr.GetReg());
+      FreeTemp(r_temp_high);
+      FreeTemp(r_temp);
+    } else {
+      // If we don't have another temp, clobber r_ptr in LDREXD and reload it.
+      NewLIR3(kThumb2Ldrexd, r_temp.GetReg(), r_ptr.GetReg(), r_ptr.GetReg());
+      FreeTemp(r_temp);  // May need the temp for kOpAdd.
+      OpRegRegImm(kOpAdd, r_ptr, r_base, displacement);
+    }
+    store = NewLIR4(kThumb2Strexd, r_temp.GetReg(), r_src.GetLowReg(), r_src.GetHighReg(),
+                    r_ptr.GetReg());
+    OpCmpImmBranch(kCondNe, r_temp, 0, fail_target);
+    FreeTemp(r_ptr);
+  } else {
+    // TODO: base this on target.
+    if (size == kWord) {
+      size = k32;
+    }
+
+    store = StoreBaseDispBody(r_base, displacement, r_src, size);
+  }
+
+  if (UNLIKELY(is_volatile == kVolatile)) {
+    // A load might follow the volatile store, so insert a StoreLoad barrier.
+    GenMemBarrier(kStoreLoad);
+  }
+
+  return store;
 }
 
 LIR* ArmMir2Lir::OpFpRegCopy(RegStorage r_dest, RegStorage r_src) {
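// Rough analogy only, not ART code: the kStoreStore barrier before and the kStoreLoad barrier
// after a volatile store in StoreBaseDisp keep earlier stores from moving past it and keep it
// from moving past a later load; the closest portable C++ analogue is a seq_cst store:
#include <atomic>
#include <cstdint>

void VolatileStoreAnalogy(std::atomic<int64_t>& field, int64_t value) {
  field.store(value, std::memory_order_seq_cst);
}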