Merge "Reduce the number of fences needed for monitors"
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 6d29ed3..354410e 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -97,6 +97,12 @@
       OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
 }
 
+inline bool Object::CasLockWordWeakAcquire(LockWord old_val, LockWord new_val) {
+  // Force use of non-transactional mode and do not check.
+  return CasFieldWeakAcquire32<false, false>(
+      OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
+}
+
 inline bool Object::CasLockWordWeakRelease(LockWord old_val, LockWord new_val) {
   // Force use of non-transactional mode and do not check.
   return CasFieldWeakRelease32<false, false>(
@@ -759,6 +765,24 @@
 }
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
+inline bool Object::CasFieldWeakAcquire32(MemberOffset field_offset,
+                                          int32_t old_value, int32_t new_value) {
+  if (kCheckTransaction) {
+    DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
+  }
+  if (kTransactionActive) {
+    Runtime::Current()->RecordWriteField32(this, field_offset, old_value, true);
+  }
+  if (kVerifyFlags & kVerifyThis) {
+    VerifyObject(this);
+  }
+  uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
+  AtomicInteger* atomic_addr = reinterpret_cast<AtomicInteger*>(raw_addr);
+
+  return atomic_addr->CompareExchangeWeakAcquire(old_value, new_value);
+}
+
+template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakRelease32(MemberOffset field_offset,
                                           int32_t old_value, int32_t new_value) {
   if (kCheckTransaction) {
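
The new CasFieldWeakAcquire32 above bottoms out in
AtomicInteger::CompareExchangeWeakAcquire. Since ART's Atomic<T> is a thin
wrapper over std::atomic<T>, the operation corresponds roughly to the
following sketch (illustrative, not the runtime/atomic.h source):

    #include <atomic>
    #include <cstdint>

    inline bool CompareExchangeWeakAcquire(std::atomic<int32_t>& addr,
                                           int32_t expected, int32_t desired) {
      // Weak CAS may fail spuriously; that is acceptable because every
      // caller in monitor.cc sits in a retry loop. Ordering is acquire on
      // success, relaxed on failure.
      return addr.compare_exchange_weak(expected, desired,
                                        std::memory_order_acquire,
                                        std::memory_order_relaxed);
    }
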
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 67b5ddb..db58a60 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -153,6 +153,8 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
   bool CasLockWordWeakRelaxed(LockWord old_val, LockWord new_val)
       REQUIRES_SHARED(Locks::mutator_lock_);
+  bool CasLockWordWeakAcquire(LockWord old_val, LockWord new_val)
+      REQUIRES_SHARED(Locks::mutator_lock_);
   bool CasLockWordWeakRelease(LockWord old_val, LockWord new_val)
       REQUIRES_SHARED(Locks::mutator_lock_);
   uint32_t GetLockOwnerThreadId();
@@ -460,6 +462,12 @@
 
   template<bool kTransactionActive, bool kCheckTransaction = true,
       VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool CasFieldWeakAcquire32(MemberOffset field_offset, int32_t old_value,
+                             int32_t new_value) ALWAYS_INLINE
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  template<bool kTransactionActive, bool kCheckTransaction = true,
+      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldWeakRelease32(MemberOffset field_offset, int32_t old_value,
                              int32_t new_value) ALWAYS_INLINE
       REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 222eb5c..893abd5 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -161,7 +161,7 @@
   }
   LockWord fat(this, lw.GCState());
   // Publish the updated lock word, which may race with other threads.
-  bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
+  bool success = GetObject()->CasLockWordWeakRelease(lw, fat);
   // Lock profiling.
   if (success && owner_ != nullptr && lock_profiling_threshold_ != 0) {
     // Do not abort on dex pc errors. This can easily happen when we want to dump a stack trace on
@@ -879,13 +879,16 @@
   StackHandleScope<1> hs(self);
   Handle<mirror::Object> h_obj(hs.NewHandle(obj));
   while (true) {
-    LockWord lock_word = h_obj->GetLockWord(true);
+    // We initially read the lockword with ordinary Java/relaxed semantics. Where stronger
+    // semantics are needed, we handle that below. Since GetLockWord bottoms out to a relaxed
+    // load, we can upgrade it later, in an infrequently executed case, with a fence.
+    LockWord lock_word = h_obj->GetLockWord(false);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
+        // No ordering required for preceding lockword read, since we retest.
         LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.GCState()));
-        if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+        if (h_obj->CasLockWordWeakAcquire(lock_word, thin_locked)) {
           AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
-          // CasLockWord enforces more than the acquire ordering we need here.
           return h_obj.Get();  // Success!
         }
         continue;  // Go again.
@@ -893,19 +896,22 @@
       case LockWord::kThinLocked: {
         uint32_t owner_thread_id = lock_word.ThinLockOwner();
         if (owner_thread_id == thread_id) {
+          // No ordering required for initial lockword read.
           // We own the lock, increase the recursion count.
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
             LockWord thin_locked(LockWord::FromThinLockId(thread_id,
                                                           new_count,
                                                           lock_word.GCState()));
+            // Only this thread pays attention to the count. Thus there is no need for stronger
+            // than relaxed memory ordering.
             if (!kUseReadBarrier) {
-              h_obj->SetLockWord(thin_locked, true);
+              h_obj->SetLockWord(thin_locked, false /* volatile */);
               AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
               return h_obj.Get();  // Success!
             } else {
               // Use CAS to preserve the read barrier state.
-              if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+              if (h_obj->CasLockWordWeakRelaxed(lock_word, thin_locked)) {
                 AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
                 return h_obj.Get();  // Success!
               }
@@ -922,20 +928,28 @@
           // Contention.
           contention_count++;
           Runtime* runtime = Runtime::Current();
-          if (contention_count <= runtime->GetMaxSpinsBeforeThinkLockInflation()) {
+          if (contention_count <= runtime->GetMaxSpinsBeforeThinLockInflation()) {
             // TODO: Consider switching the thread state to kBlocked when we are yielding.
             // Use sched_yield instead of NanoSleep since NanoSleep can wait much longer than the
             // parameter you pass in. This can cause thread suspension to take excessively long
             // and make long pauses. See b/16307460.
+            // TODO: We should literally spin first, without sched_yield. Sched_yield either does
+            // nothing (at significant expense), or guarantees a wait of at least several
+            // microseconds. If the owner is running, I would expect the median lock hold time
+            // to be hundreds of nanoseconds or less.
             sched_yield();
           } else {
             contention_count = 0;
+            // No ordering required for initial lockword read. Install rereads it anyway.
             InflateThinLocked(self, h_obj, lock_word, 0);
           }
         }
         continue;  // Start from the beginning.
       }
       case LockWord::kFatLocked: {
+        // We should have done an acquire read of the lockword initially, to ensure
+        // visibility of the monitor data structure. Use an explicit fence instead.
+        QuasiAtomic::ThreadFenceAcquire();
         Monitor* mon = lock_word.FatLockMonitor();
         if (trylock) {
           return mon->TryLock(self) ? h_obj.Get() : nullptr;
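
The QuasiAtomic::ThreadFenceAcquire() added in the kFatLocked case above is
a wrapper over std::atomic_thread_fence(std::memory_order_acquire). It
implements the load-relaxed-then-fence idiom: the common-case lockword read
stays relaxed, and only the infrequent fat-lock path pays for ordering. A
standalone sketch of that shape (the flag and pointer encoding here are
hypothetical, not ART's actual LockWord layout):

    #include <atomic>
    #include <cstdint>

    struct Monitor { void Lock() {} };
    constexpr uintptr_t kFatLockedFlag = 1;

    std::atomic<uintptr_t> lock_word;

    void FatLockPathSketch() {
      // Common case: a relaxed load, since the thin-lock paths retest via CAS.
      uintptr_t w = lock_word.load(std::memory_order_relaxed);
      if ((w & kFatLockedFlag) != 0) {  // Infrequent fat-locked case.
        // The fence upgrades the preceding relaxed load to acquire, so the
        // Monitor published by the inflating thread's release store is
        // fully initialized before we dereference it.
        std::atomic_thread_fence(std::memory_order_acquire);
        Monitor* mon = reinterpret_cast<Monitor*>(w & ~kFatLockedFlag);
        mon->Lock();
      }
    }
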
@@ -946,6 +960,8 @@
       }
       case LockWord::kHashCode:
         // Inflate with the existing hashcode.
+        // Again no ordering required for initial lockword read, since we don't rely
+        // on the visibility of any prior computation.
         Inflate(self, nullptr, h_obj.Get(), lock_word.GetHashCode());
         continue;  // Start from the beginning.
       default: {
@@ -988,13 +1004,16 @@
           }
           if (!kUseReadBarrier) {
             DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
+            // TODO: This really only needs memory_order_release, but we currently have
+            // no way to specify that. In fact there seem to be no legitimate uses of SetLockWord
+            // with a final argument of true. This slows down x86 and ARMv7, but probably not ARMv8.
             h_obj->SetLockWord(new_lw, true);
             AtraceMonitorUnlock();
             // Success!
             return true;
           } else {
             // Use CAS to preserve the read barrier state.
-            if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, new_lw)) {
+            if (h_obj->CasLockWordWeakRelease(lock_word, new_lw)) {
               AtraceMonitorUnlock();
               // Success!
               return true;
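
On the SetLockWord TODO above: ART's "volatile" boolean maps to
sequentially consistent atomics, so SetLockWord(new_lw, true) pays for a
full fence on ARMv7 and an implicitly locked instruction on x86, where a
release store would do. The difference, in a sketch (not the ART API):

    #include <atomic>
    #include <cstdint>

    std::atomic<int32_t> lock_word;

    void UnlockSeqCst(int32_t new_lw) {
      // What the volatile store does today: sequentially consistent,
      // costing extra synchronization on x86 and ARMv7.
      lock_word.store(new_lw, std::memory_order_seq_cst);
    }

    void UnlockRelease(int32_t new_lw) {
      // What unlock needs: release ordering publishes the critical
      // section's writes to the next acquirer without a full fence on
      // x86 or ARMv7.
      lock_word.store(new_lw, std::memory_order_release);
    }
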
diff --git a/runtime/runtime.h b/runtime/runtime.h
index d40c631..8fc211c 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -268,7 +268,7 @@
     return java_vm_.get();
   }
 
-  size_t GetMaxSpinsBeforeThinkLockInflation() const {
+  size_t GetMaxSpinsBeforeThinLockInflation() const {
     return max_spins_before_thin_lock_inflation_;
   }
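
For the spinning TODO in monitor.cc: a bounded busy-wait before falling
back to sched_yield() might look like the sketch below. The iteration bound
and helper name are illustrative, not ART code or tuning.

    #include <sched.h>
    #include <atomic>
    #include <cstdint>

    constexpr int kSpinsBeforeYield = 100;  // Hypothetical tuning constant.

    bool SpinThenYield(const std::atomic<uint32_t>& lock_word) {
      // With median hold times of hundreds of nanoseconds, a short spin
      // often observes the unlock without paying sched_yield's cost.
      for (int i = 0; i < kSpinsBeforeYield; ++i) {
        if (lock_word.load(std::memory_order_relaxed) == 0) {
          return true;  // Appears unlocked; caller retries its lock CAS.
        }
      }
      sched_yield();  // Give up the CPU; this may wait for microseconds.
      return false;
    }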