Make .bss stores atomic release operations.

Rely on architecture-dependent behavior (address-dependency
ordering, or the strong load ordering of x86) for the .bss
entry loads.

This fixes theoretical races where one thread updates a .bss
entry and another thread uses it immediately thereafter;
previously nothing ensured that the stores initializing the
referenced object were visible to the reading thread.
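
A minimal standalone sketch of the publication pattern this
establishes (Widget, g_slot, Publish and Consume are illustrative
names, not ART code): the writer fully initializes the object before
publishing its pointer with a release store, and the reader's
dereference is ordered after the pointer load; a portable acquire
load stands in here for the architecture-dependent ordering:

  #include <atomic>

  struct Widget {
    int payload = 0;
  };

  // Hypothetical slot standing in for a .bss GC-root entry.
  std::atomic<Widget*> g_slot{nullptr};

  void Publish(Widget* w) {
    w->payload = 42;  // Initialize before publishing.
    // Release store: every write above becomes visible to any thread
    // whose load of g_slot observes this pointer.
    g_slot.store(w, std::memory_order_release);
  }

  int Consume() {
    // Portable stand-in for the plain load + dependent dereference
    // that the compiled code performs.
    Widget* w = g_slot.load(std::memory_order_acquire);
    return w != nullptr ? w->payload : -1;  // Sees 42 once published.
  }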

Test: m test-art-host-gtest
Test: testrunner.py --host --optimizing
Test: aosp_taimen-userdebug boots.
Test: run-gtests.sh
Test: testrunner.py --target --optimizing
Change-Id: Ie7b7969eb355025b9c9205f8c936e702861943f4
diff --git a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
index e939982..838b5b5 100644
--- a/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_dexcache_entrypoints.cc
@@ -53,7 +53,10 @@
   DCHECK_LT(slot, oat_file->GetBssGcRoots().data() + oat_file->GetBssGcRoots().size());
   if (slot->IsNull()) {
     // This may race with another thread trying to store the very same value but that's OK.
-    *slot = GcRoot<mirror::Object>(object);
+    std::atomic<GcRoot<mirror::Object>>* atomic_slot =
+        reinterpret_cast<std::atomic<GcRoot<mirror::Object>>*>(slot);
+    static_assert(sizeof(*slot) == sizeof(*atomic_slot), "Size check");
+    atomic_slot->store(GcRoot<mirror::Object>(object), std::memory_order_release);
     // We need a write barrier for the class loader that holds the GC roots in the .bss.
     ObjPtr<mirror::ClassLoader> class_loader = outer_method->GetClassLoader();
     Runtime* runtime = Runtime::Current();
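
For reference, a hedged standalone sketch of the cast-to-atomic idiom
the hunk uses (Root is a stand-in type, not GcRoot): the static_assert
guards the layout assumption behind the reinterpret_cast, and since
C++20 std::atomic_ref expresses the same intent without the cast:

  #include <atomic>
  #include <cstdint>

  struct Root { uintptr_t ref; };  // Stand-in for GcRoot<mirror::Object>.

  void StoreRelease(Root* plain_slot, Root value) {
    // Reuse non-atomic storage for an atomic release store; this relies
    // on std::atomic<Root> having the same size and layout as Root.
    auto* atomic_slot = reinterpret_cast<std::atomic<Root>*>(plain_slot);
    static_assert(sizeof(Root) == sizeof(std::atomic<Root>), "Size check");
    atomic_slot->store(value, std::memory_order_release);
  }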