Tidy up memory barriers.

Replace the cutils android_atomic_* calls with GCC __sync builtins and the
AtomicInteger/QuasiAtomic wrappers, add explicit QuasiAtomic::Membar*
helpers for the former ANDROID_MEMBAR_* users, and fold the 64-bit
quasi-atomic operations into atomic.h behind kNeedSwapMutexes. Use
inner-shareable dmb variants in the ARM compiler backend, add explicit
barriers to the ARM quick lock/unlock entrypoints, convert the Mutex and
ConditionVariable counters to AtomicInteger, and teach the ARM disassembler
to print the memory domain of dmb/dsb/isb.

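A minimal usage sketch of the new primitives (illustrative only, not part
of this change); payload, ready and UseValue() are stand-in names:

  int32_t payload = 0;                   // plain data
  art::AtomicInteger ready(0);           // publication flag

  // Publishing thread:
  payload = 42;
  art::QuasiAtomic::MembarStoreStore();  // order the payload before the flag
  ready = 1;

  // Consuming thread:
  if (ready.Load() == 1) {
    art::QuasiAtomic::MembarLoadLoad();  // order the flag before the payload
    UseValue(payload);
  }
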
Change-Id: I937ea93e6df1835ecfe2d4bb7d84c24fe7fc097b
diff --git a/compiler/dex/quick/arm/int_arm.cc b/compiler/dex/quick/arm/int_arm.cc
index e839fe5..d5173b0 100644
--- a/compiler/dex/quick/arm/int_arm.cc
+++ b/compiler/dex/quick/arm/int_arm.cc
@@ -760,10 +760,10 @@
   int dmb_flavor;
   // TODO: revisit Arm barrier kinds
   switch (barrier_kind) {
-    case kLoadStore: dmb_flavor = kSY; break;
-    case kLoadLoad: dmb_flavor = kSY; break;
-    case kStoreStore: dmb_flavor = kST; break;
-    case kStoreLoad: dmb_flavor = kSY; break;
+    case kLoadStore: dmb_flavor = kISH; break;
+    case kLoadLoad: dmb_flavor = kISH; break;
+    case kStoreStore: dmb_flavor = kISHST; break;
+    case kStoreLoad: dmb_flavor = kISH; break;
     default:
       LOG(FATAL) << "Unexpected MemBarrierKind: " << barrier_kind;
       dmb_flavor = kSY;  // quiet gcc.
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 1124541..9cffb3c 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1404,7 +1404,7 @@
   }
 
   size_t NextIndex() {
-    return index_.fetch_add(1);
+    return index_.FetchAndAdd(1);
   }
 
  private:
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 90d84d5..71f70c4 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -81,6 +81,19 @@
   }
 }
 
+void DisassemblerArm::DumpMemoryDomain(std::ostream& os, uint32_t domain) {
+  switch (domain) {
+    case 0b1111: os << "sy"; break;
+    case 0b1110: os << "st"; break;
+    case 0b1011: os << "ish"; break;
+    case 0b1010: os << "ishst"; break;
+    case 0b0111: os << "nsh"; break;
+    case 0b0110: os << "nshst"; break;
+    case 0b0011: os << "osh"; break;
+    case 0b0010: os << "oshst"; break;
+  }
+}
+
 void DisassemblerArm::DumpBranchTarget(std::ostream& os, const uint8_t* instr_ptr, int32_t imm32) {
   os << StringPrintf("%+d (%p)", imm32, instr_ptr + imm32);
 }
@@ -996,9 +1009,9 @@
               // Miscellaneous control instructions
               uint32_t op5 = (instr >> 4) & 0xF;
               switch (op5) {
-                case 4: opcode << "dsb"; break;
-                case 5: opcode << "dmb"; break;
-                case 6: opcode << "isb"; break;
+                case 4: opcode << "dsb"; DumpMemoryDomain(args, instr & 0xF); break;
+                case 5: opcode << "dmb"; DumpMemoryDomain(args, instr & 0xF); break;
+                case 6: opcode << "isb"; DumpMemoryDomain(args, instr & 0xF); break;
               }
             }
             break;
diff --git a/disassembler/disassembler_arm.h b/disassembler/disassembler_arm.h
index 2e699ff..e34274e 100644
--- a/disassembler/disassembler_arm.h
+++ b/disassembler/disassembler_arm.h
@@ -30,6 +30,7 @@
 
   virtual size_t Dump(std::ostream& os, const uint8_t* begin);
   virtual void Dump(std::ostream& os, const uint8_t* begin, const uint8_t* end);
+
  private:
   void DumpArm(std::ostream& os, const uint8_t* instr);
 
@@ -39,6 +40,7 @@
 
   void DumpBranchTarget(std::ostream& os, const uint8_t* instr_ptr, int32_t imm32);
   void DumpCond(std::ostream& os, uint32_t cond);
+  void DumpMemoryDomain(std::ostream& os, uint32_t domain);
 
   std::vector<const char*> it_conditions_;
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 61be14b..34de93f 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -359,6 +359,7 @@
     @ unlocked case - r2 holds thread id with count of 0
     strex  r3, r2, [r0, #LOCK_WORD_OFFSET]
     cbnz   r3, strex_fail             @ store failed, retry
+    dmb    ish                        @ full (LoadLoad) memory barrier
     bx lr
 strex_fail:
     b retry_lock                      @ unlikely forward branch, need to reload and recheck r1/r2
@@ -402,6 +403,7 @@
     bpl    recursive_thin_unlock
     @ transition to unlocked, r3 holds 0
     str    r3, [r0, #LOCK_WORD_OFFSET]
+    dmb    ish                        @ full (StoreLoad) memory barrier
     bx     lr
 recursive_thin_unlock:
     sub    r1, r1, #65536
diff --git a/runtime/atomic.cc b/runtime/atomic.cc
index 47cee6a..bac0a99 100644
--- a/runtime/atomic.cc
+++ b/runtime/atomic.cc
@@ -15,135 +15,52 @@
  */
 
 #include "atomic.h"
-
-#define NEED_SWAP_MUTEXES !defined(__arm__) && !defined(__i386__)
-
-#if NEED_SWAP_MUTEXES
-#include <vector>
 #include "base/mutex.h"
 #include "base/stl_util.h"
-#include "base/stringprintf.h"
 #include "thread-inl.h"
-#endif
 
 namespace art {
 
-#if NEED_SWAP_MUTEXES
-// We stripe across a bunch of different mutexes to reduce contention.
-static const size_t kSwapMutexCount = 32;
-static std::vector<Mutex*>* gSwapMutexes;
+std::vector<Mutex*>* QuasiAtomic::gSwapMutexes = nullptr;
 
-static Mutex& GetSwapMutex(const volatile int64_t* addr) {
-  return *(*gSwapMutexes)[(reinterpret_cast<unsigned>(addr) >> 3U) % kSwapMutexCount];
+Mutex* QuasiAtomic::GetSwapMutex(const volatile int64_t* addr) {
+  return (*gSwapMutexes)[(reinterpret_cast<uintptr_t>(addr) >> 3U) % kSwapMutexCount];
 }
-#endif
 
 void QuasiAtomic::Startup() {
-#if NEED_SWAP_MUTEXES
-  gSwapMutexes = new std::vector<Mutex*>;
-  for (size_t i = 0; i < kSwapMutexCount; ++i) {
-    gSwapMutexes->push_back(new Mutex("QuasiAtomic stripe"));
+  if (kNeedSwapMutexes) {
+    gSwapMutexes = new std::vector<Mutex*>;
+    for (size_t i = 0; i < kSwapMutexCount; ++i) {
+      gSwapMutexes->push_back(new Mutex("QuasiAtomic stripe"));
+    }
   }
-#endif
 }
 
 void QuasiAtomic::Shutdown() {
-#if NEED_SWAP_MUTEXES
-  STLDeleteElements(gSwapMutexes);
-  delete gSwapMutexes;
-#endif
+  if (kNeedSwapMutexes) {
+    STLDeleteElements(gSwapMutexes);
+    delete gSwapMutexes;
+  }
 }
 
-int64_t QuasiAtomic::Read64(volatile const int64_t* addr) {
-  int64_t value;
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
-  value = *addr;
-#elif defined(__arm__)
-  // Exclusive loads are defined not to tear, clearing the exclusive state isn't necessary. If we
-  // have LPAE (such as Cortex-A15) then ldrd would suffice.
-  __asm__ __volatile__("@ QuasiAtomic::Read64\n"
-      "ldrexd     %0, %H0, [%1]"
-      : "=&r" (value)
-      : "r" (addr));
-#elif defined(__i386__)
-  __asm__ __volatile__(
-      "movq     %1, %0\n"
-      : "=x" (value)
-      : "m" (*addr));
-#else
-#error Unexpected architecture
-#endif
-  return value;
+int64_t QuasiAtomic::SwapMutexRead64(volatile const int64_t* addr) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
+  return *addr;
 }
 
-void QuasiAtomic::Write64(volatile int64_t* addr, int64_t value) {
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+void QuasiAtomic::SwapMutexWrite64(volatile int64_t* addr, int64_t value) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
   *addr = value;
-#elif defined(__arm__)
-  // The write is done as a swap so that the cache-line is in the exclusive state for the store. If
-  // we know that ARM architecture has LPAE (such as Cortex-A15) this isn't necessary and strd will
-  // suffice.
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Write64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "strexd     %1, %4, %H4, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "r" (value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-#elif defined(__i386__)
-  __asm__ __volatile__(
-      "movq     %1, %0"
-      : "=m" (*addr)
-      : "x" (value));
-#else
-#error Unexpected architecture
-#endif
 }
 
 
-bool QuasiAtomic::Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
-#if NEED_SWAP_MUTEXES
-  MutexLock mu(Thread::Current(), GetSwapMutex(addr));
+bool QuasiAtomic::SwapMutexCas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+  MutexLock mu(Thread::Current(), *GetSwapMutex(addr));
   if (*addr == old_value) {
     *addr = new_value;
     return true;
   }
   return false;
-#elif defined(__arm__)
-  int64_t prev;
-  int status;
-  do {
-    __asm__ __volatile__("@ QuasiAtomic::Cas64\n"
-        "ldrexd     %0, %H0, [%3]\n"
-        "mov        %1, #0\n"
-        "teq        %0, %4\n"
-        "teqeq      %H0, %H4\n"
-        "strexdeq   %1, %5, %H5, [%3]"
-        : "=&r" (prev), "=&r" (status), "+m"(*addr)
-        : "r" (addr), "Ir" (old_value), "r" (new_value)
-        : "cc");
-  } while (__builtin_expect(status != 0, 0));
-  return prev == old_value;
-#elif defined(__i386__)
-  // The compiler does the right job and works better than inline assembly, especially with -O0
-  // compilation.
-  return __sync_bool_compare_and_swap(addr, old_value, new_value);
-#else
-#error Unexpected architecture
-#endif
-}
-
-bool QuasiAtomic::LongAtomicsUseMutexes() {
-#if NEED_SWAP_MUTEXES
-  return true;
-#else
-  return false;
-#endif
 }
 
 }  // namespace art
diff --git a/runtime/atomic.h b/runtime/atomic.h
index cb6f86b..b1e9870 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -18,11 +18,14 @@
 #define ART_RUNTIME_ATOMIC_H_
 
 #include <stdint.h>
+#include <vector>
 
 #include "base/macros.h"
 
 namespace art {
 
+class Mutex;
+
 // NOTE: Two "quasiatomic" operations on the exact same memory address
 // are guaranteed to operate atomically with respect to each other,
 // but no guarantees are made about quasiatomic operations mixed with
@@ -30,25 +33,108 @@
 // quasiatomic operations that are performed on partially-overlapping
 // memory.
 class QuasiAtomic {
+#if !defined(__arm__) && !defined(__i386__)
+  static constexpr bool kNeedSwapMutexes = true;
+#else
+  static constexpr bool kNeedSwapMutexes = false;
+#endif
+
  public:
   static void Startup();
 
   static void Shutdown();
 
   // Reads the 64-bit value at "addr" without tearing.
-  static int64_t Read64(volatile const int64_t* addr);
+  static int64_t Read64(volatile const int64_t* addr) {
+    if (!kNeedSwapMutexes) {
+      return *addr;
+    } else {
+      return SwapMutexRead64(addr);
+    }
+  }
 
   // Writes to the 64-bit value at "addr" without tearing.
-  static void Write64(volatile int64_t* addr, int64_t val);
+  static void Write64(volatile int64_t* addr, int64_t val) {
+    if (!kNeedSwapMutexes) {
+      *addr = val;
+    } else {
+      SwapMutexWrite64(addr, val);
+    }
+  }
 
   // Atomically compare the value at "addr" to "old_value", if equal replace it with "new_value"
   // and return true. Otherwise, don't swap, and return false.
-  static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+  static bool Cas64(int64_t old_value, int64_t new_value, volatile int64_t* addr) {
+    if (!kNeedSwapMutexes) {
+      return __sync_bool_compare_and_swap(addr, old_value, new_value);
+    } else {
+      return SwapMutexCas64(old_value, new_value, addr);
+    }
+  }
 
   // Does the architecture provide reasonable atomic long operations or do we fall back on mutexes?
-  static bool LongAtomicsUseMutexes();
+  static bool LongAtomicsUseMutexes() {
+    return kNeedSwapMutexes;
+  }
+
+  static void MembarLoadStore() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarLoadLoad() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarStoreStore() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ishst" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void MembarStoreLoad() {
+  #if defined(__arm__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__)
+    __asm__ __volatile__("mfence" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
 
  private:
+  static Mutex* GetSwapMutex(const volatile int64_t* addr);
+  static int64_t SwapMutexRead64(volatile const int64_t* addr);
+  static void SwapMutexWrite64(volatile int64_t* addr, int64_t val);
+  static bool SwapMutexCas64(int64_t old_value, int64_t new_value, volatile int64_t* addr);
+
+  // We stripe across a bunch of different mutexes to reduce contention.
+  static constexpr size_t kSwapMutexCount = 32;
+  static std::vector<Mutex*>* gSwapMutexes;
+
   DISALLOW_COPY_AND_ASSIGN(QuasiAtomic);
 };
 
diff --git a/runtime/atomic_integer.h b/runtime/atomic_integer.h
index 132f968..651ca4a 100644
--- a/runtime/atomic_integer.h
+++ b/runtime/atomic_integer.h
@@ -17,8 +17,7 @@
 #ifndef ART_RUNTIME_ATOMIC_INTEGER_H_
 #define ART_RUNTIME_ATOMIC_INTEGER_H_
 
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
+#include <stdint.h>
 
 namespace art {
 
@@ -28,53 +27,57 @@
 
   explicit AtomicInteger(int32_t value) : value_(value) { }
 
-  // Unsafe = operator for non atomic operations on the integer.
-  void store(int32_t desired) {
-    value_ = desired;
-  }
-
   AtomicInteger& operator=(int32_t desired) {
-    store(desired);
+    Store(desired);
     return *this;
   }
 
-  int32_t load() const {
+  int32_t Load() const {
     return value_;
   }
 
   operator int32_t() const {
-    return load();
+    return Load();
   }
 
-  int32_t fetch_add(const int32_t value) {
-    return android_atomic_add(value, &value_);
+  int32_t FetchAndAdd(const int32_t value) {
+    return __sync_fetch_and_add(&value_, value);  // Return old value.
   }
 
-  int32_t fetch_sub(const int32_t value) {
-    return android_atomic_add(-value, &value_);
+  int32_t FetchAndSub(const int32_t value) {
+    return __sync_fetch_and_sub(&value_, value);  // Return old value.
   }
 
-  int32_t operator++() {
-    return android_atomic_inc(&value_) + 1;
+  int32_t operator++() {  // Prefix operator.
+    return __sync_add_and_fetch(&value_, 1);  // Return new value.
   }
 
-  int32_t operator++(int32_t) {
-    return android_atomic_inc(&value_);
+  int32_t operator++(int32_t) {  // Postfix operator.
+    return __sync_fetch_and_add(&value_, 1);  // Return old value.
   }
 
-  int32_t operator--() {
-    return android_atomic_dec(&value_) - 1;
+  int32_t operator--() {  // Prefix operator.
+    return __sync_sub_and_fetch(&value_, 1);  // Return new value.
   }
 
-  int32_t operator--(int32_t) {
-    return android_atomic_dec(&value_);
+  int32_t operator--(int32_t) {  // Postfix operator.
+    return __sync_fetch_and_sub(&value_, 1);  // Return old value.
   }
 
-  bool compare_and_swap(int32_t expected_value, int32_t desired_value) {
-    return android_atomic_cas(expected_value, desired_value, &value_) == 0;
+  bool CompareAndSwap(int32_t expected_value, int32_t desired_value) {
+    return __sync_bool_compare_and_swap(&value_, expected_value, desired_value);
+  }
+
+  volatile int32_t* Address() {
+    return &value_;
   }
 
  private:
+  // Unsafe, non-atomic store used by operator= above.
+  void Store(int32_t desired) {
+    value_ = desired;
+  }
+
   volatile int32_t value_;
 };
 
diff --git a/runtime/base/bounded_fifo.h b/runtime/base/bounded_fifo.h
index cb92d40..d04840a 100644
--- a/runtime/base/bounded_fifo.h
+++ b/runtime/base/bounded_fifo.h
@@ -17,9 +17,6 @@
 #ifndef ART_RUNTIME_BASE_BOUNDED_FIFO_H_
 #define ART_RUNTIME_BASE_BOUNDED_FIFO_H_
 
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
-
 namespace art {
 
 // A bounded fifo is a fifo which has a bounded size. The power of two version uses a bit mask to
@@ -49,7 +46,7 @@
   void push_back(const T& value) {
     ++size_;
     DCHECK_LE(size_, MaxSize);
-    // Relies on integer overflow behaviour.
+    // Relies on integer overflow behavior.
     data_[back_index_++ & mask_] = value;
   }
 
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index ec79c55..05e3a83 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -21,8 +21,6 @@
 
 #include "atomic.h"
 #include "base/logging.h"
-#include "cutils/atomic.h"
-#include "cutils/atomic-inline.h"
 #include "mutex-inl.h"
 #include "runtime.h"
 #include "scoped_thread_state_change.h"
@@ -59,12 +57,12 @@
 class ScopedAllMutexesLock {
  public:
   explicit ScopedAllMutexesLock(const BaseMutex* mutex) : mutex_(mutex) {
-    while (!gAllMutexData->all_mutexes_guard.compare_and_swap(0, reinterpret_cast<int32_t>(mutex))) {
+    while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(0, reinterpret_cast<int32_t>(mutex))) {
       NanoSleep(100);
     }
   }
   ~ScopedAllMutexesLock() {
-    while (!gAllMutexData->all_mutexes_guard.compare_and_swap(reinterpret_cast<int32_t>(mutex_), 0)) {
+    while (!gAllMutexData->all_mutexes_guard.CompareAndSwap(reinterpret_cast<int32_t>(mutex_), 0)) {
       NanoSleep(100);
     }
   }
@@ -176,7 +174,7 @@
       do {
         slot = data->cur_content_log_entry;
         new_slot = (slot + 1) % kContentionLogSize;
-      } while (!data->cur_content_log_entry.compare_and_swap(slot, new_slot));
+      } while (!data->cur_content_log_entry.CompareAndSwap(slot, new_slot));
       log[new_slot].blocked_tid = blocked_tid;
       log[new_slot].owner_tid = owner_tid;
       log[new_slot].count = 1;
@@ -300,11 +298,11 @@
       int32_t cur_state = state_;
       if (LIKELY(cur_state == 0)) {
         // Change state from 0 to 1.
-        done = android_atomic_acquire_cas(0, 1, &state_) == 0;
+        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
       } else {
         // Failed to acquire, hang up.
         ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-        android_atomic_inc(&num_contenders_);
+        num_contenders_++;
         if (futex(&state_, FUTEX_WAIT, 1, NULL, NULL, 0) != 0) {
           // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
           // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -312,9 +310,10 @@
             PLOG(FATAL) << "futex wait failed for " << name_;
           }
         }
-        android_atomic_dec(&num_contenders_);
+        num_contenders_--;
       }
     } while (!done);
+    QuasiAtomic::MembarStoreLoad();
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -342,11 +341,12 @@
       int32_t cur_state = state_;
       if (cur_state == 0) {
         // Change state from 0 to 1.
-        done = android_atomic_acquire_cas(0, 1, &state_) == 0;
+        done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, 1 /* new state */);
       } else {
         return false;
       }
     } while (!done);
+    QuasiAtomic::MembarStoreLoad();
     DCHECK_EQ(state_, 1);
     exclusive_owner_ = SafeGetTid(self);
 #else
@@ -385,10 +385,11 @@
   do {
     int32_t cur_state = state_;
     if (LIKELY(cur_state == 1)) {
+      QuasiAtomic::MembarStoreStore();
       // We're no longer the owner.
       exclusive_owner_ = 0;
       // Change state to 0.
-      done = android_atomic_release_cas(cur_state, 0, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, cur_state, 0 /* new state */);
       if (LIKELY(done)) {  // Spurious fail?
         // Wake a contender
         if (UNLIKELY(num_contenders_ > 0)) {
@@ -407,6 +408,7 @@
       }
     }
   } while (!done);
+  QuasiAtomic::MembarStoreLoad();
 #else
     CHECK_MUTEX_CALL(pthread_mutex_unlock, (&mutex_));
 #endif
@@ -468,11 +470,11 @@
     int32_t cur_state = state_;
     if (LIKELY(cur_state == 0)) {
       // Change state from 0 to -1.
-      done = android_atomic_acquire_cas(0, -1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      android_atomic_inc(&num_pending_writers_);
+      num_pending_writers_++;
       if (futex(&state_, FUTEX_WAIT, cur_state, NULL, NULL, 0) != 0) {
         // EAGAIN and EINTR both indicate a spurious failure, try again from the beginning.
         // We don't use TEMP_FAILURE_RETRY so we can intentionally retry to acquire the lock.
@@ -480,7 +482,7 @@
           PLOG(FATAL) << "futex wait failed for " << name_;
         }
       }
-      android_atomic_dec(&num_pending_writers_);
+      num_pending_writers_--;
     }
   } while (!done);
   DCHECK_EQ(state_, -1);
@@ -504,7 +506,7 @@
       // We're no longer the owner.
       exclusive_owner_ = 0;
       // Change state from -1 to 0.
-      done = android_atomic_release_cas(-1, 0, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, -1 /* cur_state */, 0 /* new state */);
       if (LIKELY(done)) {  // cmpxchg may fail due to noise?
         // Wake any waiters.
         if (UNLIKELY(num_pending_readers_ > 0 || num_pending_writers_ > 0)) {
@@ -531,7 +533,7 @@
     int32_t cur_state = state_;
     if (cur_state == 0) {
       // Change state from 0 to -1.
-      done = android_atomic_acquire_cas(0, -1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, 0 /* cur_state */, -1 /* new state */);
     } else {
       // Failed to acquire, hang up.
       timespec now_abs_ts;
@@ -541,10 +543,10 @@
         return false;  // Timed out.
       }
       ScopedContentionRecorder scr(this, SafeGetTid(self), GetExclusiveOwnerTid());
-      android_atomic_inc(&num_pending_writers_);
+      num_pending_writers_++;
       if (futex(&state_, FUTEX_WAIT, cur_state, &rel_ts, NULL, 0) != 0) {
         if (errno == ETIMEDOUT) {
-          android_atomic_dec(&num_pending_writers_);
+          num_pending_writers_--;
           return false;  // Timed out.
         } else if ((errno != EAGAIN) && (errno != EINTR)) {
           // EAGAIN and EINTR both indicate a spurious failure,
@@ -553,7 +555,7 @@
           PLOG(FATAL) << "timed futex wait failed for " << name_;
         }
       }
-      android_atomic_dec(&num_pending_writers_);
+      num_pending_writers_--;
     }
   } while (!done);
   exclusive_owner_ = SafeGetTid(self);
@@ -583,7 +585,7 @@
     int32_t cur_state = state_;
     if (cur_state >= 0) {
       // Add as an extra reader.
-      done = android_atomic_acquire_cas(cur_state, cur_state + 1, &state_) == 0;
+      done = __sync_bool_compare_and_swap(&state_, cur_state, cur_state + 1);
     } else {
       // Owner holds it exclusively.
       return false;
@@ -666,13 +668,13 @@
   DCHECK_EQ(guard_.GetExclusiveOwnerTid(), SafeGetTid(self));
 #if ART_USE_FUTEXES
   if (num_waiters_ > 0) {
-    android_atomic_inc(&sequence_);  // Indicate the broadcast occurred.
+    sequence_++;  // Indicate the broadcast occurred.
     bool done = false;
     do {
       int32_t cur_sequence = sequence_;
       // Requeue waiters onto mutex. The waiter holds the contender count on the mutex high ensuring
       // mutex unlocks will awaken the requeued waiter thread.
-      done = futex(&sequence_, FUTEX_CMP_REQUEUE, 0,
+      done = futex(sequence_.Address(), FUTEX_CMP_REQUEUE, 0,
                    reinterpret_cast<const timespec*>(std::numeric_limits<int32_t>::max()),
                    &guard_.state_, cur_sequence) != -1;
       if (!done) {
@@ -692,10 +694,10 @@
   guard_.AssertExclusiveHeld(self);
 #if ART_USE_FUTEXES
   if (num_waiters_ > 0) {
-    android_atomic_inc(&sequence_);  // Indicate a signal occurred.
+    sequence_++;  // Indicate a signal occurred.
     // Futex wake 1 waiter who will then come and in contend on mutex. It'd be nice to requeue them
     // to avoid this, however, requeueing can only move all waiters.
-    int num_woken = futex(&sequence_, FUTEX_WAKE, 1, NULL, NULL, 0);
+    int num_woken = futex(sequence_.Address(), FUTEX_WAKE, 1, NULL, NULL, 0);
     // Check something was woken or else we changed sequence_ before they had chance to wait.
     CHECK((num_woken == 0) || (num_woken == 1));
   }
@@ -716,11 +718,11 @@
 #if ART_USE_FUTEXES
   num_waiters_++;
   // Ensure the Mutex is contended so that requeued threads are awoken.
-  android_atomic_inc(&guard_.num_contenders_);
+  guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
   int32_t cur_sequence = sequence_;
   guard_.ExclusiveUnlock(self);
-  if (futex(&sequence_, FUTEX_WAIT, cur_sequence, NULL, NULL, 0) != 0) {
+  if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, NULL, NULL, 0) != 0) {
     // Futex failed, check it is an expected error.
     // EAGAIN == EWOULDBLK, so we let the caller try again.
     // EINTR implies a signal was sent to this thread.
@@ -733,7 +735,7 @@
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
   CHECK_GE(guard_.num_contenders_, 0);
-  android_atomic_dec(&guard_.num_contenders_);
+  guard_.num_contenders_--;
 #else
   guard_.recursion_count_ = 0;
   CHECK_MUTEX_CALL(pthread_cond_wait, (&cond_, &guard_.mutex_));
@@ -751,11 +753,11 @@
   InitTimeSpec(false, CLOCK_REALTIME, ms, ns, &rel_ts);
   num_waiters_++;
   // Ensure the Mutex is contended so that requeued threads are awoken.
-  android_atomic_inc(&guard_.num_contenders_);
+  guard_.num_contenders_++;
   guard_.recursion_count_ = 1;
   int32_t cur_sequence = sequence_;
   guard_.ExclusiveUnlock(self);
-  if (futex(&sequence_, FUTEX_WAIT, cur_sequence, &rel_ts, NULL, 0) != 0) {
+  if (futex(sequence_.Address(), FUTEX_WAIT, cur_sequence, &rel_ts, NULL, 0) != 0) {
     if (errno == ETIMEDOUT) {
       // Timed out we're done.
     } else if ((errno == EAGAIN) || (errno == EINTR)) {
@@ -769,7 +771,7 @@
   num_waiters_--;
   // We awoke and so no longer require awakes from the guard_'s unlock.
   CHECK_GE(guard_.num_contenders_, 0);
-  android_atomic_dec(&guard_.num_contenders_);
+  guard_.num_contenders_--;
 #else
 #ifdef HAVE_TIMEDWAIT_MONOTONIC
 #define TIMEDWAIT pthread_cond_timedwait_monotonic
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index b894c0a..1c1dcaf 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -191,7 +191,7 @@
   // Exclusive owner.
   volatile uint64_t exclusive_owner_;
   // Number of waiting contenders.
-  volatile int32_t num_contenders_;
+  AtomicInteger num_contenders_;
 #else
   pthread_mutex_t mutex_;
 #endif
@@ -304,7 +304,7 @@
   // Pending readers.
   volatile int32_t num_pending_readers_;
   // Pending writers.
-  volatile int32_t num_pending_writers_;
+  AtomicInteger num_pending_writers_;
 #else
   pthread_rwlock_t rwlock_;
 #endif
@@ -339,7 +339,7 @@
   // their Mutex and another thread takes it and signals, the waiting thread observes that sequence_
   // changed and doesn't enter the wait. Modified while holding guard_, but is read by futex wait
   // without guard_ held.
-  volatile int32_t sequence_;
+  AtomicInteger sequence_;
   // Number of threads that have come into to wait, not the length of the waiters on the futex as
   // waiters may have been requeued onto guard_. Guarded by guard_.
   volatile int32_t num_waiters_;
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index 8fa5b86..02e01b8 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -68,7 +68,7 @@
         // Stack overflow.
         return false;
       }
-    } while (!back_index_.compare_and_swap(index, index + 1));
+    } while (!back_index_.CompareAndSwap(index, index + 1));
     begin_[index] = value;
     return true;
   }
@@ -93,7 +93,7 @@
   // Take an item from the front of the stack.
   T PopFront() {
     int32_t index = front_index_;
-    DCHECK_LT(index, back_index_.load());
+    DCHECK_LT(index, back_index_.Load());
     front_index_ = front_index_ + 1;
     return begin_[index];
   }
@@ -101,7 +101,7 @@
   // Pop a number of elements.
   void PopBackCount(int32_t n) {
     DCHECK_GE(Size(), static_cast<size_t>(n));
-    back_index_.fetch_sub(n);
+    back_index_.FetchAndSub(n);
   }
 
   bool IsEmpty() const {
@@ -132,11 +132,11 @@
   }
 
   void Sort() {
-    int32_t start_back_index = back_index_.load();
-    int32_t start_front_index = front_index_.load();
+    int32_t start_back_index = back_index_.Load();
+    int32_t start_front_index = front_index_.Load();
     std::sort(Begin(), End());
-    CHECK_EQ(start_back_index, back_index_.load());
-    CHECK_EQ(start_front_index, front_index_.load());
+    CHECK_EQ(start_back_index, back_index_.Load());
+    CHECK_EQ(start_front_index, front_index_.Load());
     if (kIsDebugBuild) {
       debug_is_sorted_ = true;
     }
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 28cc510..cae2a54 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -1109,8 +1109,8 @@
   // AllocSpace::FreeList clears the value in ptrs, so perform after clearing the live bit
   size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
   heap->RecordFree(freed_objects, freed_bytes);
-  mark_sweep->freed_objects_.fetch_add(freed_objects);
-  mark_sweep->freed_bytes_.fetch_add(freed_bytes);
+  mark_sweep->freed_objects_.FetchAndAdd(freed_objects);
+  mark_sweep->freed_bytes_.FetchAndAdd(freed_bytes);
 }
 
 void MarkSweep::ZygoteSweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
@@ -1192,10 +1192,10 @@
   VLOG(heap) << "Freed " << freed_objects << "/" << count
              << " objects with size " << PrettySize(freed_bytes);
   heap_->RecordFree(freed_objects + freed_large_objects, freed_bytes + freed_large_object_bytes);
-  freed_objects_.fetch_add(freed_objects);
-  freed_large_objects_.fetch_add(freed_large_objects);
-  freed_bytes_.fetch_add(freed_bytes);
-  freed_large_object_bytes_.fetch_add(freed_large_object_bytes);
+  freed_objects_.FetchAndAdd(freed_objects);
+  freed_large_objects_.FetchAndAdd(freed_large_objects);
+  freed_bytes_.FetchAndAdd(freed_bytes);
+  freed_large_object_bytes_.FetchAndAdd(freed_large_object_bytes);
   timings_.EndSplit();
 
   timings_.StartSplit("ResetStack");
@@ -1267,8 +1267,8 @@
       ++freed_objects;
     }
   }
-  freed_large_objects_.fetch_add(freed_objects);
-  freed_large_object_bytes_.fetch_add(freed_bytes);
+  freed_large_objects_.FetchAndAdd(freed_objects);
+  freed_large_object_bytes_.FetchAndAdd(freed_bytes);
   GetHeap()->RecordFree(freed_objects, freed_bytes);
 }
 
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index f29eadb..a4f7121 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -236,8 +236,8 @@
   int freed_bytes = from_bytes - to_bytes;
   int freed_objects = from_objects - to_objects;
   CHECK_GE(freed_bytes, 0);
-  freed_bytes_.fetch_add(freed_bytes);
-  freed_objects_.fetch_add(freed_objects);
+  freed_bytes_.FetchAndAdd(freed_bytes);
+  freed_objects_.FetchAndAdd(freed_objects);
   heap_->RecordFree(static_cast<size_t>(freed_objects), static_cast<size_t>(freed_bytes));
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -332,7 +332,7 @@
             // If out of space, fall back to the to-space.
             forward_address = to_space_->Alloc(self_, object_size, &bytes_allocated);
           } else {
-            GetHeap()->num_bytes_allocated_.fetch_add(bytes_promoted);
+            GetHeap()->num_bytes_allocated_.FetchAndAdd(bytes_promoted);
             bytes_promoted_ += bytes_promoted;
             // Mark forward_address on the live bit map.
             accounting::SpaceBitmap* live_bitmap = non_moving_space->GetLiveBitmap();
@@ -446,8 +446,8 @@
   Locks::heap_bitmap_lock_->AssertExclusiveHeld(self);
   size_t freed_bytes = space->FreeList(self, num_ptrs, ptrs);
   heap->RecordFree(num_ptrs, freed_bytes);
-  gc->freed_objects_.fetch_add(num_ptrs);
-  gc->freed_bytes_.fetch_add(freed_bytes);
+  gc->freed_objects_.FetchAndAdd(num_ptrs);
+  gc->freed_bytes_.FetchAndAdd(freed_bytes);
 }
 
 void SemiSpace::ZygoteSweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
@@ -526,8 +526,8 @@
       ++freed_objects;
     }
   }
-  freed_large_objects_.fetch_add(freed_objects);
-  freed_large_object_bytes_.fetch_add(freed_bytes);
+  freed_large_objects_.FetchAndAdd(freed_objects);
+  freed_large_object_bytes_.FetchAndAdd(freed_bytes);
   GetHeap()->RecordFree(freed_objects, freed_bytes);
 }
 
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 9fb5760..af1b26b 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -61,7 +61,7 @@
   pre_fence_visitor(obj);
   DCHECK_GT(bytes_allocated, 0u);
   const size_t new_num_bytes_allocated =
-      static_cast<size_t>(num_bytes_allocated_.fetch_add(bytes_allocated)) + bytes_allocated;
+      static_cast<size_t>(num_bytes_allocated_.FetchAndAdd(bytes_allocated)) + bytes_allocated;
   // TODO: Deprecate.
   if (kInstrumented) {
     if (Runtime::Current()->HasStatsEnabled()) {
@@ -200,7 +200,7 @@
     // Only if the allocation succeeded, record the time.
     if (allocated_obj != nullptr) {
       uint64_t allocation_end_time = NanoTime() / kTimeAdjust;
-      heap_->total_allocation_time_.fetch_add(allocation_end_time - allocation_start_time_);
+      heap_->total_allocation_time_.FetchAndAdd(allocation_end_time - allocation_start_time_);
     }
   }
 };
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 61c66e7..e08106b 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -413,13 +413,13 @@
 
 void Heap::RegisterGCAllocation(size_t bytes) {
   if (this != nullptr) {
-    gc_memory_overhead_.fetch_add(bytes);
+    gc_memory_overhead_.FetchAndAdd(bytes);
   }
 }
 
 void Heap::RegisterGCDeAllocation(size_t bytes) {
   if (this != nullptr) {
-    gc_memory_overhead_.fetch_sub(bytes);
+    gc_memory_overhead_.FetchAndSub(bytes);
   }
 }
 
@@ -802,7 +802,7 @@
 void Heap::VerifyObjectBody(const mirror::Object* obj) {
   CHECK(IsAligned<kObjectAlignment>(obj)) << "Object isn't aligned: " << obj;
   // Ignore early dawn of the universe verifications.
-  if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.load()) < 10 * KB)) {
+  if (UNLIKELY(static_cast<size_t>(num_bytes_allocated_.Load()) < 10 * KB)) {
     return;
   }
   const byte* raw_addr = reinterpret_cast<const byte*>(obj) +
@@ -847,7 +847,8 @@
 
 void Heap::RecordFree(size_t freed_objects, size_t freed_bytes) {
   DCHECK_LE(freed_bytes, static_cast<size_t>(num_bytes_allocated_));
-  num_bytes_allocated_.fetch_sub(freed_bytes);
+  num_bytes_allocated_.FetchAndSub(freed_bytes);
+
   if (Runtime::Current()->HasStatsEnabled()) {
     RuntimeStats* thread_stats = Thread::Current()->GetStats();
     thread_stats->freed_objects += freed_objects;
@@ -2082,7 +2083,7 @@
     native_need_to_run_finalization_ = false;
   }
   // Total number of native bytes allocated.
-  native_bytes_allocated_.fetch_add(bytes);
+  native_bytes_allocated_.FetchAndAdd(bytes);
   if (static_cast<size_t>(native_bytes_allocated_) > native_footprint_gc_watermark_) {
     collector::GcType gc_type = have_zygote_space_ ? collector::kGcTypePartial :
         collector::kGcTypeFull;
@@ -2118,7 +2119,7 @@
 void Heap::RegisterNativeFree(JNIEnv* env, int bytes) {
   int expected_size, new_size;
   do {
-    expected_size = native_bytes_allocated_.load();
+    expected_size = native_bytes_allocated_.Load();
     new_size = expected_size - bytes;
     if (UNLIKELY(new_size < 0)) {
       ScopedObjectAccess soa(env);
@@ -2127,7 +2128,7 @@
                                  "registered as allocated", bytes, expected_size).c_str());
       break;
     }
-  } while (!native_bytes_allocated_.compare_and_swap(expected_size, new_size));
+  } while (!native_bytes_allocated_.CompareAndSwap(expected_size, new_size));
 }
 
 int64_t Heap::GetTotalMemory() const {
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 82e96a4..ac20972 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -44,8 +44,8 @@
 inline mirror::Object* BumpPointerSpace::AllocNonvirtual(size_t num_bytes) {
   mirror::Object* ret = AllocNonvirtualWithoutAccounting(num_bytes);
   if (ret != nullptr) {
-    objects_allocated_.fetch_add(1);
-    bytes_allocated_.fetch_add(num_bytes);
+    objects_allocated_.FetchAndAdd(1);
+    bytes_allocated_.FetchAndAdd(num_bytes);
   }
   return ret;
 }
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index 7ea202c..d5bc667 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -172,7 +172,7 @@
 
 uint64_t BumpPointerSpace::GetBytesAllocated() {
   // Start out pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(bytes_allocated_.load());
+  uint64_t total = static_cast<uint64_t>(bytes_allocated_.Load());
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -190,7 +190,7 @@
 
 uint64_t BumpPointerSpace::GetObjectsAllocated() {
   // Start out pre-determined amount (blocks which are not being allocated into).
-  uint64_t total = static_cast<uint64_t>(objects_allocated_.load());
+  uint64_t total = static_cast<uint64_t>(objects_allocated_.Load());
   Thread* self = Thread::Current();
   MutexLock mu(self, *Locks::runtime_shutdown_lock_);
   MutexLock mu2(self, *Locks::thread_list_lock_);
@@ -207,8 +207,8 @@
 }
 
 void BumpPointerSpace::RevokeThreadLocalBuffersLocked(Thread* thread) {
-  objects_allocated_.fetch_add(thread->thread_local_objects_);
-  bytes_allocated_.fetch_add(thread->thread_local_pos_ - thread->thread_local_start_);
+  objects_allocated_.FetchAndAdd(thread->thread_local_objects_);
+  bytes_allocated_.FetchAndAdd(thread->thread_local_pos_ - thread->thread_local_start_);
   thread->SetTLAB(nullptr, nullptr);
 }
 
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index c6177bd..4777cc6 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -227,7 +227,7 @@
     *error_msg = StringPrintf("Failed to map image bitmap: %s", error_msg->c_str());
     return nullptr;
   }
-  size_t bitmap_index = bitmap_index_.fetch_add(1);
+  size_t bitmap_index = bitmap_index_.FetchAndAdd(1);
   std::string bitmap_name(StringPrintf("imagespace %s live-bitmap %u", image_file_name,
                                        bitmap_index));
   UniquePtr<accounting::SpaceBitmap> bitmap(
diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc
index 4ad9c63..47c1899 100644
--- a/runtime/instrumentation.cc
+++ b/runtime/instrumentation.cc
@@ -407,9 +407,9 @@
 void Instrumentation::InstrumentQuickAllocEntryPoints() {
   // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
   //       should be guarded by a lock.
-  DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.load(), 0);
+  DCHECK_GE(quick_alloc_entry_points_instrumentation_counter_.Load(), 0);
   const bool enable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.fetch_add(1) == 0;
+      quick_alloc_entry_points_instrumentation_counter_.FetchAndAdd(1) == 0;
   if (enable_instrumentation) {
     // Instrumentation wasn't enabled so enable it.
     SetQuickAllocEntryPointsInstrumented(true);
@@ -420,9 +420,9 @@
 void Instrumentation::UninstrumentQuickAllocEntryPoints() {
   // TODO: the read of quick_alloc_entry_points_instrumentation_counter_ is racey and this code
   //       should be guarded by a lock.
-  DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.load(), 0);
+  DCHECK_GT(quick_alloc_entry_points_instrumentation_counter_.Load(), 0);
   const bool disable_instrumentation =
-      quick_alloc_entry_points_instrumentation_counter_.fetch_sub(1) == 1;
+      quick_alloc_entry_points_instrumentation_counter_.FetchAndSub(1) == 1;
   if (disable_instrumentation) {
     SetQuickAllocEntryPointsInstrumented(false);
     ResetQuickAllocEntryPoints();
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 99c85bd..942c275 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -245,7 +245,7 @@
       // If access checks are required then the dex-to-dex compiler and analysis of
       // whether the class has final fields hasn't been performed. Conservatively
       // perform the memory barrier now.
-      ANDROID_MEMBAR_STORE();
+      QuasiAtomic::MembarStoreLoad();
     }
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
@@ -261,7 +261,7 @@
   HANDLE_INSTRUCTION_END();
 
   HANDLE_INSTRUCTION_START(RETURN_VOID_BARRIER) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreLoad();
     JValue result;
     if (UNLIKELY(self->TestAllFlags())) {
       CheckSuspend(self);
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index 675095f..75041ea 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -169,7 +169,7 @@
           // If access checks are required then the dex-to-dex compiler and analysis of
           // whether the class has final fields hasn't been performed. Conservatively
           // perform the memory barrier now.
-          ANDROID_MEMBAR_STORE();
+          QuasiAtomic::MembarStoreLoad();
         }
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
@@ -183,7 +183,7 @@
       }
       case Instruction::RETURN_VOID_BARRIER: {
         PREAMBLE();
-        ANDROID_MEMBAR_STORE();
+        QuasiAtomic::MembarStoreLoad();
         JValue result;
         if (UNLIKELY(self->TestAllFlags())) {
           CheckSuspend(self);
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 7ac2c8c..9161bc5 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -253,11 +253,40 @@
   return result;
 }
 
+inline uint32_t Object::GetField32(MemberOffset field_offset, bool is_volatile) const {
+  VerifyObject(this);
+  const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
+  const int32_t* word_addr = reinterpret_cast<const int32_t*>(raw_addr);
+  if (UNLIKELY(is_volatile)) {
+    int32_t result = *(reinterpret_cast<volatile int32_t*>(const_cast<int32_t*>(word_addr)));
+    QuasiAtomic::MembarLoadLoad();
+    return result;
+  } else {
+    return *word_addr;
+  }
+}
+
+inline void Object::SetField32(MemberOffset field_offset, uint32_t new_value, bool is_volatile,
+                               bool this_is_valid) {
+  if (this_is_valid) {
+    VerifyObject(this);
+  }
+  byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
+  uint32_t* word_addr = reinterpret_cast<uint32_t*>(raw_addr);
+  if (UNLIKELY(is_volatile)) {
+    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
+    *word_addr = new_value;
+    QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any loads.
+  } else {
+    *word_addr = new_value;
+  }
+}
+
 inline bool Object::CasField32(MemberOffset field_offset, uint32_t old_value, uint32_t new_value) {
   VerifyObject(this);
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
-  int32_t* addr = reinterpret_cast<int32_t*>(raw_addr);
-  return android_atomic_release_cas(old_value, new_value, addr) == 0;
+  volatile uint32_t* addr = reinterpret_cast<volatile uint32_t*>(raw_addr);
+  return __sync_bool_compare_and_swap(addr, old_value, new_value);
 }
 
 inline uint64_t Object::GetField64(MemberOffset field_offset, bool is_volatile) const {
@@ -266,7 +295,7 @@
   const int64_t* addr = reinterpret_cast<const int64_t*>(raw_addr);
   if (UNLIKELY(is_volatile)) {
     uint64_t result = QuasiAtomic::Read64(addr);
-    ANDROID_MEMBAR_FULL();
+    QuasiAtomic::MembarLoadLoad();
     return result;
   } else {
     return *addr;
@@ -278,9 +307,13 @@
   byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
   int64_t* addr = reinterpret_cast<int64_t*>(raw_addr);
   if (UNLIKELY(is_volatile)) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreStore();  // Ensure this store occurs after others in the queue.
     QuasiAtomic::Write64(addr, new_value);
-    // Post-store barrier not required due to use of atomic op or mutex.
+    if (!QuasiAtomic::LongAtomicsUseMutexes()) {
+      QuasiAtomic::MembarStoreLoad();  // Ensure this store occurs before any loads.
+    } else {
+      // Fence from the mutex is enough.
+    }
   } else {
     *addr = new_value;
   }
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 008a173..bdb3250 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -89,10 +89,10 @@
   static AtomicInteger seed(987654321 + std::time(nullptr));
   int32_t expected_value, new_value;
   do {
-    expected_value = static_cast<uint32_t>(seed.load());
+    expected_value = static_cast<uint32_t>(seed.Load());
     new_value = expected_value * 1103515245 + 12345;
   } while ((expected_value & LockWord::kHashMask) == 0 ||
-      !seed.compare_and_swap(expected_value, new_value));
+      !seed.CompareAndSwap(expected_value, new_value));
   return expected_value & LockWord::kHashMask;
 }
 
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index fe89b7e..058aee7 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -187,37 +187,10 @@
     return reinterpret_cast<Object**>(reinterpret_cast<byte*>(this) + field_offset.Int32Value());
   }
 
-  uint32_t GetField32(MemberOffset field_offset, bool is_volatile) const {
-    VerifyObject(this);
-    const byte* raw_addr = reinterpret_cast<const byte*>(this) + field_offset.Int32Value();
-    const int32_t* word_addr = reinterpret_cast<const int32_t*>(raw_addr);
-    if (UNLIKELY(is_volatile)) {
-      return android_atomic_acquire_load(word_addr);
-    } else {
-      return *word_addr;
-    }
-  }
+  uint32_t GetField32(MemberOffset field_offset, bool is_volatile) const;
 
   void SetField32(MemberOffset field_offset, uint32_t new_value, bool is_volatile,
-                  bool this_is_valid = true) {
-    if (this_is_valid) {
-      VerifyObject(this);
-    }
-    byte* raw_addr = reinterpret_cast<byte*>(this) + field_offset.Int32Value();
-    uint32_t* word_addr = reinterpret_cast<uint32_t*>(raw_addr);
-    if (UNLIKELY(is_volatile)) {
-      /*
-       * TODO: add an android_atomic_synchronization_store() function and
-       * use it in the 32-bit volatile set handlers.  On some platforms we
-       * can use a fast atomic instruction and avoid the barriers.
-       */
-      ANDROID_MEMBAR_STORE();
-      *word_addr = new_value;
-      ANDROID_MEMBAR_FULL();
-    } else {
-      *word_addr = new_value;
-    }
-  }
+                  bool this_is_valid = true);
 
   bool CasField32(MemberOffset field_offset, uint32_t old_value, uint32_t new_value);
 
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index ef9a9ce..4186693 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -98,12 +98,12 @@
 
 int32_t Monitor::GetHashCode() {
   while (!HasHashCode()) {
-    if (hash_code_.compare_and_swap(0, mirror::Object::GenerateIdentityHashCode())) {
+    if (hash_code_.CompareAndSwap(0, mirror::Object::GenerateIdentityHashCode())) {
       break;
     }
   }
   DCHECK(HasHashCode());
-  return hash_code_.load();
+  return hash_code_.Load();
 }
 
 bool Monitor::Install(Thread* self) {
@@ -660,6 +660,7 @@
       case LockWord::kUnlocked: {
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0));
         if (sirt_obj->CasLockWord(lock_word, thin_locked)) {
+          QuasiAtomic::MembarLoadLoad();
           return;  // Success!
         }
         continue;  // Go again.
diff --git a/runtime/monitor.h b/runtime/monitor.h
index bfd8545..16e9410 100644
--- a/runtime/monitor.h
+++ b/runtime/monitor.h
@@ -105,7 +105,7 @@
   bool IsLocked() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   bool HasHashCode() const {
-    return hash_code_.load() != 0;
+    return hash_code_.Load() != 0;
   }
 
   static void InflateThinLocked(Thread* self, SirtRef<mirror::Object>& obj, LockWord lock_word,
diff --git a/runtime/native/java_lang_reflect_Field.cc b/runtime/native/java_lang_reflect_Field.cc
index 553aeb8..269a4a3 100644
--- a/runtime/native/java_lang_reflect_Field.cc
+++ b/runtime/native/java_lang_reflect_Field.cc
@@ -222,7 +222,7 @@
   // Special handling for final fields on SMP systems.
   // We need a store/store barrier here (JMM requirement).
   if (f->IsFinal()) {
-    ANDROID_MEMBAR_STORE();
+    QuasiAtomic::MembarStoreLoad();
   }
 }
 
diff --git a/runtime/native/sun_misc_Unsafe.cc b/runtime/native/sun_misc_Unsafe.cc
index 2c6d281..b5fc7e7 100644
--- a/runtime/native/sun_misc_Unsafe.cc
+++ b/runtime/native/sun_misc_Unsafe.cc
@@ -86,7 +86,7 @@
 static void Unsafe_putOrderedInt(JNIEnv* env, jobject, jobject javaObj, jlong offset, jint newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetField32(MemberOffset(offset), newValue, false);
 }
 
@@ -117,7 +117,7 @@
 static void Unsafe_putOrderedLong(JNIEnv* env, jobject, jobject javaObj, jlong offset, jlong newValue) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetField64(MemberOffset(offset), newValue, false);
 }
 
@@ -153,7 +153,7 @@
   ScopedFastNativeObjectAccess soa(env);
   mirror::Object* obj = soa.Decode<mirror::Object*>(javaObj);
   mirror::Object* newValue = soa.Decode<mirror::Object*>(javaNewValue);
-  ANDROID_MEMBAR_STORE();
+  QuasiAtomic::MembarStoreStore();
   obj->SetFieldObject(MemberOffset(offset), newValue, false);
 }
 
diff --git a/runtime/thread_pool_test.cc b/runtime/thread_pool_test.cc
index 1b22361..2029d4b 100644
--- a/runtime/thread_pool_test.cc
+++ b/runtime/thread_pool_test.cc
@@ -94,7 +94,7 @@
   EXPECT_EQ(0, bad_count);
   // Allow tasks to finish up and delete themselves.
   thread_pool.StartWorkers(self);
-  while (count.load() != num_tasks && bad_count.load() != 1) {
+  while (count.Load() != num_tasks && bad_count.Load() != 1) {
     usleep(200);
   }
   thread_pool.StopWorkers(self);