Revert "Remove GCC atomic workarounds."

This reverts commit a29ffd505328b3d580c25fff054e463b7cac08a8.

Change-Id: Ibb4845b8a1378f3d1fb0975f9677758f420f843f
diff --git a/runtime/atomic.h b/runtime/atomic.h
index 9c36a7a..5ddafb4 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -17,8 +17,14 @@
 #ifndef ART_RUNTIME_ATOMIC_H_
 #define ART_RUNTIME_ATOMIC_H_
 
+#ifdef __clang__
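+// <atomic> is only assumed to be available when building with clang; other
+// compilers use the fallback fences and the hand-rolled Atomic defined below.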
+#define ART_HAVE_STDATOMIC 1
+#endif
+
 #include <stdint.h>
+#if ART_HAVE_STDATOMIC
 #include <atomic>
+#endif
 #include <limits>
 #include <vector>
 
@@ -151,6 +157,8 @@
     return kNeedSwapMutexes;
   }
 
+  #if ART_HAVE_STDATOMIC
+
   static void ThreadFenceAcquire() {
     std::atomic_thread_fence(std::memory_order_acquire);
   }
@@ -171,6 +179,66 @@
     std::atomic_thread_fence(std::memory_order_seq_cst);
   }
 
+  #else
+
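+  // No <atomic> available: issue the required memory barriers directly via
+  // per-architecture inline assembly.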
+  static void ThreadFenceAcquire() {
+  #if defined(__arm__) || defined(__aarch64__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+    // We could use "dmb ishld" on aarch64, but this fence is currently also
+    // used on volatile loads to enforce store atomicity, and ishld is
+    // insufficient for that purpose.
+  #elif defined(__i386__) || defined(__x86_64__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void ThreadFenceRelease() {
+  #if defined(__arm__) || defined(__aarch64__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+    // "dmb ishst" is insufficient here: it doesn't order a load followed by a store.
+  #elif defined(__i386__) || defined(__x86_64__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  // Fence at the end of a constructor with final fields or allocation.
+  // We believe this only has to order stores, and can thus be weaker than
+  // release on aarch64.
+  static void ThreadFenceForConstructor() {
+  #if defined(__arm__) || defined(__aarch64__)
+    __asm__ __volatile__("dmb ishst" : : : "memory");
+  #elif defined(__i386__) || defined(__x86_64__)
+    __asm__ __volatile__("" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+
+  static void ThreadFenceSequentiallyConsistent() {
+  #if defined(__arm__) || defined(__aarch64__)
+    __asm__ __volatile__("dmb ish" : : : "memory");
+  #elif defined(__i386__) || defined(__x86_64__)
+    __asm__ __volatile__("mfence" : : : "memory");
+  #elif defined(__mips__)
+    __asm__ __volatile__("sync" : : : "memory");
+  #else
+  #error Unexpected architecture
+  #endif
+  }
+  #endif
+
  private:
   static Mutex* GetSwapMutex(const volatile int64_t* addr);
   static int64_t SwapMutexRead64(volatile const int64_t* addr);
@@ -184,8 +252,9 @@
   DISALLOW_COPY_AND_ASSIGN(QuasiAtomic);
 };
 
+#if ART_HAVE_STDATOMIC
 template<typename T>
-class PACKED(sizeof(T)) Atomic : public std::atomic<T> {
+class Atomic : public std::atomic<T> {
  public:
   Atomic<T>() : std::atomic<T>() { }
 
@@ -291,20 +360,292 @@
   }
 };
 
+#else
+
+template<typename T> class Atomic;
+
+// Helper class for Atomic to deal separately with size 8 and small
+// objects.  Should not be used directly.
+
+template<int SZ, class T> struct AtomicHelper {
+  friend class Atomic<T>;
+
+ private:
+  COMPILE_ASSERT(sizeof(T) <= 4, bad_atomic_helper_arg);
+
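+  // For types of at most 4 bytes we assume that aligned plain loads and
+  // stores are individually atomic on all supported architectures.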
+  static T LoadRelaxed(const volatile T* loc) {
+    // sizeof(T) <= 4
+    return *loc;
+  }
+
+  static void StoreRelaxed(volatile T* loc, T desired) {
+    // sizeof(T) <= 4
+    *loc = desired;
+  }
+
+  static bool CompareExchangeStrongSequentiallyConsistent(volatile T* loc,
+                                                  T expected_value, T desired_value) {
+    // sizeof(T) <= 4
+    return __sync_bool_compare_and_swap(loc, expected_value, desired_value);
+  }
+};
+
+// Interpret the bit pattern of input (type U) as type V. Requires the sizes
+// of U and V to match (compile-time checked).
+// Reproduced here from utils.h to keep dependencies small.
+template<typename U, typename V>
+static inline V bit_cast_atomic(U in) {
+  COMPILE_ASSERT(sizeof(U) == sizeof(V), size_of_u_not_eq_size_of_v);
+  union {
+    U u;
+    V v;
+  } tmp;
+  tmp.u = in;
+  return tmp.v;
+}
+
+template<class T> struct AtomicHelper<8, T> {
+  friend class Atomic<T>;
+
+ private:
+  COMPILE_ASSERT(sizeof(T) == 8, bad_large_atomic_helper_arg);
+
+  static T LoadRelaxed(const volatile T* loc) {
+    // sizeof(T) == 8
+    volatile const int64_t* loc_ptr =
+              reinterpret_cast<volatile const int64_t*>(loc);
+    return bit_cast_atomic<int64_t, T>(QuasiAtomic::Read64(loc_ptr));
+  }
+
+  static void StoreRelaxed(volatile T* loc, T desired) {
+    // sizeof(T) == 8
+    volatile int64_t* loc_ptr =
+                reinterpret_cast<volatile int64_t*>(loc);
+    QuasiAtomic::Write64(loc_ptr, bit_cast_atomic<T, int64_t>(desired));
+  }
+
+
+  static bool CompareExchangeStrongSequentiallyConsistent(volatile T* loc,
+                                                  T expected_value, T desired_value) {
+    // sizeof(T) == 8
+    volatile int64_t* loc_ptr = reinterpret_cast<volatile int64_t*>(loc);
+    return QuasiAtomic::Cas64(bit_cast_atomic<T, int64_t>(expected_value),
+                              bit_cast_atomic<T, int64_t>(desired_value),
+                              loc_ptr);
+  }
+};
+
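+// Fallback Atomic<T> for compilers without <atomic>, implemented with the
+// __sync builtins and QuasiAtomic. PACKED(sizeof(T)) keeps the wrapper
+// exactly sizeof(T) bytes and aligns it to sizeof(T).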
+template<typename T>
+class PACKED(sizeof(T)) Atomic {
+ private:
+  COMPILE_ASSERT(sizeof(T) <= 4 || sizeof(T) == 8, bad_atomic_arg);
+
+ public:
+  Atomic<T>() : value_(0) { }
+
+  explicit Atomic<T>(T value) : value_(value) { }
+
+  // Load from memory without ordering or synchronization constraints.
+  T LoadRelaxed() const {
+    return AtomicHelper<sizeof(T), T>::LoadRelaxed(&value_);
+  }
+
+  // Word tearing is allowed, and the access may race.
+  T LoadJavaData() const {
+    return value_;
+  }
+
+  // Load from memory with a total ordering.
+  T LoadSequentiallyConsistent() const;
+
+  // Store to memory without ordering or synchronization constraints.
+  void StoreRelaxed(T desired) {
+    AtomicHelper<sizeof(T), T>::StoreRelaxed(&value_, desired);
+  }
+
+  // Word tearing is allowed, and the access may race.
+  void StoreJavaData(T desired) {
+    value_ = desired;
+  }
+
+  // Store to memory with release ordering.
+  void StoreRelease(T desired);
+
+  // Store to memory with a total ordering.
+  void StoreSequentiallyConsistent(T desired);
+
+  // Atomically replace the value with the desired value if it matches the expected value.
+  // Participates in total ordering of atomic operations.
+  bool CompareExchangeStrongSequentiallyConsistent(T expected_value, T desired_value) {
+    return AtomicHelper<sizeof(T), T>::
+        CompareExchangeStrongSequentiallyConsistent(&value_, expected_value, desired_value);
+  }
+
+  // The same, but may fail spuriously.
+  bool CompareExchangeWeakSequentiallyConsistent(T expected_value, T desired_value) {
+    // TODO: Take advantage of the fact that it may fail spuriously.
+    return AtomicHelper<sizeof(T), T>::
+        CompareExchangeStrongSequentiallyConsistent(&value_, expected_value, desired_value);
+  }
+
+  // Atomically replace the value with the desired value if it matches the expected value.
+  // Doesn't imply ordering or synchronization constraints.
+  bool CompareExchangeStrongRelaxed(T expected_value, T desired_value) {
+    // TODO: make this relaxed.
+    return CompareExchangeStrongSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // The same, but may fail spuriously.
+  bool CompareExchangeWeakRelaxed(T expected_value, T desired_value) {
+    // TODO: Take advantage of the fact that it may fail spuriously.
+    // TODO: make this relaxed.
+    return CompareExchangeStrongSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // Atomically replace the value with the desired value if it matches the expected value.
+  // Prior accesses made to other memory locations by the thread that did the release become
+  // visible in this thread.
+  bool CompareExchangeWeakAcquire(T expected_value, T desired_value) {
+    // TODO: make this acquire.
+    return CompareExchangeWeakSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  // Atomically replace the value with the desired value if it matches the expected value.
+  // Prior accesses to other memory locations become visible to the threads that do a consume
+  // or an acquire on the same location.
+  bool CompareExchangeWeakRelease(T expected_value, T desired_value) {
+    // TODO: make this release.
+    return CompareExchangeWeakSequentiallyConsistent(expected_value, desired_value);
+  }
+
+  volatile T* Address() {
+    return &value_;
+  }
+
+  T FetchAndAddSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_add(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected + value));
+      return expected;
+    }
+  }
+
+  T FetchAndSubSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_sub(&value_, value);  // Return old value.
+    } else {
+      return FetchAndAddSequentiallyConsistent(-value);
+    }
+  }
+
+  T FetchAndOrSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_or(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected | value));
+      return expected;
+    }
+  }
+
+  T FetchAndAndSequentiallyConsistent(const T value) {
+    if (sizeof(T) <= 4) {
+      return __sync_fetch_and_and(&value_, value);  // Return old value.
+    } else {
+      T expected;
+      do {
+        expected = LoadRelaxed();
+      } while (!CompareExchangeWeakSequentiallyConsistent(expected, expected & value));
+      return expected;
+    }
+  }
+
+  T operator++() {  // Prefix operator.
+    if (sizeof(T) <= 4) {
+      return __sync_add_and_fetch(&value_, 1);  // Return new value.
+    } else {
+      return FetchAndAddSequentiallyConsistent(1) + 1;
+    }
+  }
+
+  T operator++(int) {  // Postfix operator.
+    return FetchAndAddSequentiallyConsistent(1);
+  }
+
+  T operator--() {  // Prefix operator.
+    if (sizeof(T) <= 4) {
+      return __sync_sub_and_fetch(&value_, 1);  // Return new value.
+    } else {
+      return FetchAndSubSequentiallyConsistent(1) - 1;
+    }
+  }
+
+  T operator--(int) {  // Postfix operator.
+    return FetchAndSubSequentiallyConsistent(1);
+  }
+
+  static T MaxValue() {
+    return std::numeric_limits<T>::max();
+  }
+
+
+ private:
+  volatile T value_;
+};
+#endif
+
 typedef Atomic<int32_t> AtomicInteger;
 
 COMPILE_ASSERT(sizeof(AtomicInteger) == sizeof(int32_t), weird_atomic_int_size);
 COMPILE_ASSERT(alignof(AtomicInteger) == alignof(int32_t),
                atomic_int_alignment_differs_from_that_of_underlying_type);
 COMPILE_ASSERT(sizeof(Atomic<int64_t>) == sizeof(int64_t), weird_atomic_int64_size);
-
-// Assert the alignment of 64-bit integers is 64-bit. This isn't true on certain 32-bit
-// architectures (e.g. x86-32) but we know that 64-bit integers here are arranged to be 8-byte
-// aligned.
 #if defined(__LP64__)
   COMPILE_ASSERT(alignof(Atomic<int64_t>) == alignof(int64_t),
                  atomic_int64_alignment_differs_from_that_of_underlying_type);
 #endif
+// The above assertion fails on x86-32. This is OK, since we explicitly
+// arrange for 8-byte alignment of such fields.
+
+
+#if !ART_HAVE_STDATOMIC
+template<typename T>
+inline T Atomic<T>::LoadSequentiallyConsistent() const {
+  T result = value_;
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceAcquire();
+    // We optimistically assume this suffices for store atomicity.
+    // On ARMv8 we strengthen ThreadFenceAcquire to make that true.
+  }
+  return result;
+}
+
+template<typename T>
+inline void Atomic<T>::StoreRelease(T desired) {
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceRelease();
+  }
+  StoreRelaxed(desired);
+}
+
+template<typename T>
+inline void Atomic<T>::StoreSequentiallyConsistent(T desired) {
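+  // A release fence before the store orders earlier accesses before it; the
+  // trailing full fence orders the store before later accesses.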
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceRelease();
+  }
+  StoreRelaxed(desired);
+  if (sizeof(T) != 8 || !QuasiAtomic::LongAtomicsUseMutexes()) {
+    QuasiAtomic::ThreadFenceSequentiallyConsistent();
+  }
+}
+
+#endif
 
 }  // namespace art