ART: JNI thread state transition optimization
This patch improves JNI performance by removing the explicit acquisition and
release of the mutator lock when a thread transitions between the suspended
and runnable states.

The state transition functions were found to be the costliest part of a JNI
call. Previously, a thread had to acquire a share of the mutator lock with a
CAS instruction when entering the runnable state, and release that share with
another CAS when entering the native state from runnable. This patch removes
these CAS operations for transitions between the suspended and runnable
states. A thread in the runnable state is now considered to hold shared
ownership of the mutator lock, so transitions into and out of the runnable
state carry the corresponding implications for lock ownership. In addition, a
suspend barrier is introduced so that all running threads can still be
suspended.

This patch reduces JNI transition overhead by 25% on the IA platform and by
17% on ARM, while having little impact on GC pause times (measured with the
"suspend all histogram").
Change-Id: Icee95d8ffff1bbfc95309a41cc48836536fec689
Signed-off-by: Yu, Li <yu.l.li@intel.com>
Signed-off-by: Haitao, Feng <haitao.feng@intel.com>
Signed-off-by: Lei, Li <lei.l.li@intel.com>
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 20d75f3..87c4af5 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -108,7 +108,7 @@
ADD_TEST_EQ(THREAD_SELF_OFFSET,
art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
-#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 147 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 150 * __SIZEOF_POINTER__)
ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
#define THREAD_LOCAL_END_OFFSET (THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__)
diff --git a/runtime/base/mutex-inl.h b/runtime/base/mutex-inl.h
index 87840e7..bd8de87 100644
--- a/runtime/base/mutex-inl.h
+++ b/runtime/base/mutex-inl.h
@@ -218,6 +218,16 @@
#endif
}
+inline void MutatorMutex::TransitionFromRunnableToSuspended(Thread* self) {
+ AssertSharedHeld(self);
+ RegisterAsUnlocked(self);
+}
+
+inline void MutatorMutex::TransitionFromSuspendedToRunnable(Thread* self) {
+ RegisterAsLocked(self);
+ AssertSharedHeld(self);
+}
+
} // namespace art
#endif // ART_RUNTIME_BASE_MUTEX_INL_H_
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 5c6065d..e48d170 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -47,7 +47,7 @@
Mutex* Locks::logging_lock_ = nullptr;
Mutex* Locks::mem_maps_lock_ = nullptr;
Mutex* Locks::modify_ldt_lock_ = nullptr;
-ReaderWriterMutex* Locks::mutator_lock_ = nullptr;
+MutatorMutex* Locks::mutator_lock_ = nullptr;
Mutex* Locks::profiler_lock_ = nullptr;
Mutex* Locks::reference_processor_lock_ = nullptr;
Mutex* Locks::reference_queue_cleared_references_lock_ = nullptr;
@@ -738,6 +738,11 @@
return os;
}
+std::ostream& operator<<(std::ostream& os, const MutatorMutex& mu) {
+ mu.Dump(os);
+ return os;
+}
+
ConditionVariable::ConditionVariable(const char* name, Mutex& guard)
: name_(name), guard_(guard) {
#if ART_USE_FUTEXES
@@ -958,7 +963,7 @@
UPDATE_CURRENT_LOCK_LEVEL(kMutatorLock);
DCHECK(mutator_lock_ == nullptr);
- mutator_lock_ = new ReaderWriterMutex("mutator lock", current_lock_level);
+ mutator_lock_ = new MutatorMutex("mutator lock", current_lock_level);
UPDATE_CURRENT_LOCK_LEVEL(kHeapBitmapLock);
DCHECK(heap_bitmap_lock_ == nullptr);
diff --git a/runtime/base/mutex.h b/runtime/base/mutex.h
index 678d55b..f87467a 100644
--- a/runtime/base/mutex.h
+++ b/runtime/base/mutex.h
@@ -44,6 +44,7 @@
namespace art {
class LOCKABLE ReaderWriterMutex;
+class LOCKABLE MutatorMutex;
class ScopedContentionRecorder;
class Thread;
@@ -138,6 +139,7 @@
virtual bool IsMutex() const { return false; }
virtual bool IsReaderWriterMutex() const { return false; }
+ virtual bool IsMutatorMutex() const { return false; }
virtual void Dump(std::ostream& os) const = 0;
@@ -385,6 +387,36 @@
DISALLOW_COPY_AND_ASSIGN(ReaderWriterMutex);
};
+// MutatorMutex is a special kind of ReaderWriterMutex created specifically for the
+// Locks::mutator_lock_ mutex. The behaviour is identical to the ReaderWriterMutex except that
+// thread state changes also play a part in lock ownership. The mutator_lock_ will not be truly
+// held by any mutator threads. However, a thread in the kRunnable state is considered to have
+// shared ownership of the mutator lock and therefore transitions in and out of the kRunnable
+// state have associated implications on lock ownership. Extra methods to handle the state
+// transitions have been added to the interface but are only accessible to the methods dealing
+// with state transitions. The thread state and flags attributes are used to ensure thread state
+// transitions are consistent with the permitted behaviour of the mutex.
+//
+// *) The most important consequence of this behaviour is that all threads must be in one of the
+// suspended states before exclusive ownership of the mutator mutex is sought.
+//
+std::ostream& operator<<(std::ostream& os, const MutatorMutex& mu);
+class LOCKABLE MutatorMutex : public ReaderWriterMutex {
+ public:
+ explicit MutatorMutex(const char* name, LockLevel level = kDefaultMutexLevel)
+ : ReaderWriterMutex(name, level) {}
+ ~MutatorMutex() {}
+
+ virtual bool IsMutatorMutex() const { return true; }
+
+ private:
+ friend class Thread;
+ void TransitionFromRunnableToSuspended(Thread* self) UNLOCK_FUNCTION() ALWAYS_INLINE;
+ void TransitionFromSuspendedToRunnable(Thread* self) SHARED_LOCK_FUNCTION() ALWAYS_INLINE;
+
+ DISALLOW_COPY_AND_ASSIGN(MutatorMutex);
+};
+
// ConditionVariables allow threads to queue and sleep. Threads may then be resumed individually
// (Signal) or all at once (Broadcast).
class ConditionVariable {
@@ -495,35 +527,28 @@
// Guards allocation entrypoint instrumenting.
static Mutex* instrument_entrypoints_lock_;
- // The mutator_lock_ is used to allow mutators to execute in a shared (reader) mode or to block
- // mutators by having an exclusive (writer) owner. In normal execution each mutator thread holds
- // a share on the mutator_lock_. The garbage collector may also execute with shared access but
- // at times requires exclusive access to the heap (not to be confused with the heap meta-data
- // guarded by the heap_lock_ below). When the garbage collector requires exclusive access it asks
- // the mutators to suspend themselves which also involves usage of the thread_suspend_count_lock_
- // to cover weaknesses in using ReaderWriterMutexes with ConditionVariables. We use a condition
- // variable to wait upon in the suspension logic as releasing and then re-acquiring a share on
- // the mutator lock doesn't necessarily allow the exclusive user (e.g the garbage collector)
- // chance to acquire the lock.
+ // A barrier is used to synchronize the GC/Debugger thread with the mutator threads. When the
+ // GC/Debugger thread wants to suspend all mutator threads, it needs to wait for all mutator
+ // threads to pass a barrier. Threads that are already suspended have their barrier passed for
+ // them by the GC/Debugger thread; threads in the runnable state pass the barrier themselves
+ // when they transition to a suspended state. The GC/Debugger thread is woken up once all
+ // mutator threads are suspended.
//
// Thread suspension:
- // Shared users | Exclusive user
- // (holding mutator lock and in kRunnable state) | .. running ..
+ // mutator thread | GC/Debugger
+ // .. running .. | .. running ..
// .. running .. | Request thread suspension by:
// .. running .. | - acquiring thread_suspend_count_lock_
// .. running .. | - incrementing Thread::suspend_count_ on
// .. running .. | all mutator threads
// .. running .. | - releasing thread_suspend_count_lock_
- // .. running .. | Block trying to acquire exclusive mutator lock
+ // .. running .. | Block wait for all threads to pass a barrier
// Poll Thread::suspend_count_ and enter full | .. blocked ..
// suspend code. | .. blocked ..
- // Change state to kSuspended | .. blocked ..
- // x: Release share on mutator_lock_ | Carry out exclusive access
- // Acquire thread_suspend_count_lock_ | .. exclusive ..
- // while Thread::suspend_count_ > 0 | .. exclusive ..
- // - wait on Thread::resume_cond_ | .. exclusive ..
- // (releases thread_suspend_count_lock_) | .. exclusive ..
- // .. waiting .. | Release mutator_lock_
+ // Change state to kSuspended (pass the barrier) | Wake up when all threads pass the barrier
+ // x: Acquire thread_suspend_count_lock_ | .. running ..
+ // while Thread::suspend_count_ > 0 | .. running ..
+ // - wait on Thread::resume_cond_ | .. running ..
+ // (releases thread_suspend_count_lock_) | .. running ..
// .. waiting .. | Request thread resumption by:
// .. waiting .. | - acquiring thread_suspend_count_lock_
// .. waiting .. | - decrementing Thread::suspend_count_ on
@@ -531,29 +556,13 @@
// .. waiting .. | - notifying on Thread::resume_cond_
// - re-acquire thread_suspend_count_lock_ | - releasing thread_suspend_count_lock_
// Release thread_suspend_count_lock_ | .. running ..
- // Acquire share on mutator_lock_ | .. running ..
- // - This could block but the thread still | .. running ..
- // has a state of kSuspended and so this | .. running ..
- // isn't an issue. | .. running ..
- // Acquire thread_suspend_count_lock_ | .. running ..
- // - we poll here as we're transitioning into | .. running ..
- // kRunnable and an individual thread suspend | .. running ..
- // request (e.g for debugging) won't try | .. running ..
- // to acquire the mutator lock (which would | .. running ..
- // block as we hold the mutator lock). This | .. running ..
- // poll ensures that if the suspender thought | .. running ..
- // we were suspended by incrementing our | .. running ..
- // Thread::suspend_count_ and then reading | .. running ..
- // our state we go back to waiting on | .. running ..
- // Thread::resume_cond_. | .. running ..
- // can_go_runnable = Thread::suspend_count_ == 0 | .. running ..
- // Release thread_suspend_count_lock_ | .. running ..
- // if can_go_runnable | .. running ..
- // Change state to kRunnable | .. running ..
- // else | .. running ..
- // Goto x | .. running ..
+ // Change to kRunnable | .. running ..
+ // - this uses a CAS operation to ensure the | .. running ..
+ // suspend request flag isn't raised as the | .. running ..
+ // state is changed | .. running ..
+ // - if the CAS operation fails then goto x | .. running ..
// .. running .. | .. running ..
- static ReaderWriterMutex* mutator_lock_ ACQUIRED_AFTER(instrument_entrypoints_lock_);
+ static MutatorMutex* mutator_lock_ ACQUIRED_AFTER(instrument_entrypoints_lock_);
// Allow reader-writer mutual exclusion on the mark and live bitmaps of the heap.
static ReaderWriterMutex* heap_bitmap_lock_ ACQUIRED_AFTER(mutator_lock_);
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index 0a5ebfa..656944a 100644
--- a/runtime/entrypoints_order_test.cc
+++ b/runtime/entrypoints_order_test.cc
@@ -116,7 +116,7 @@
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, last_no_thread_suspension_cause, checkpoint_functions,
sizeof(void*));
EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, checkpoint_functions, interpreter_entrypoints,
- sizeof(void*) * 3);
+ sizeof(void*) * 6);
// Skip across the entrypoints structures.
diff --git a/runtime/oat.h b/runtime/oat.h
index 5706c4e..3451d0f 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
class PACKED(4) OatHeader {
public:
static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
- static constexpr uint8_t kOatVersion[] = { '0', '6', '5', '\0' };
+ static constexpr uint8_t kOatVersion[] = { '0', '6', '6', '\0' };
static constexpr const char* kImageLocationKey = "image-location";
static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 5f965f1..39ef68a 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -67,8 +67,10 @@
}
inline ThreadState Thread::SetState(ThreadState new_state) {
- // Cannot use this code to change into Runnable as changing to Runnable should fail if
- // old_state_and_flags.suspend_request is true.
+ // Should only be used to change between suspended states.
+ // Cannot use this code to change into or from Runnable as changing to Runnable should
+ // fail if old_state_and_flags.suspend_request is true and changing from Runnable might
+ // miss passing an active suspend barrier.
DCHECK_NE(new_state, kRunnable);
if (kIsDebugBuild && this != Thread::Current()) {
std::string name;
@@ -78,6 +80,7 @@
}
union StateAndFlags old_state_and_flags;
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
+ CHECK_NE(old_state_and_flags.as_struct.state, kRunnable);
tls32_.state_and_flags.as_struct.state = new_state;
return static_cast<ThreadState>(old_state_and_flags.as_struct.state);
}
@@ -126,20 +129,34 @@
new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags;
new_state_and_flags.as_struct.state = new_state;
- // CAS the value without a memory ordering as that is given by the lock release below.
+ // CAS the value with release ordering, since there is no longer an explicit mutator lock
+ // release below to provide the ordering.
bool done =
- tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
+ tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelease(old_state_and_flags.as_int,
new_state_and_flags.as_int);
if (LIKELY(done)) {
break;
}
}
- // Release share on mutator_lock_.
- Locks::mutator_lock_->SharedUnlock(this);
+
+ // Change to non-runnable state, thereby appearing suspended to the system.
+ // Mark the release of the share of the mutator_lock_.
+ Locks::mutator_lock_->TransitionFromRunnableToSuspended(this);
+
+ // Once suspended, check the active suspend barrier flag.
+ while (true) {
+ uint16_t current_flags = tls32_.state_and_flags.as_struct.flags;
+ if (LIKELY((current_flags & (kCheckpointRequest | kActiveSuspendBarrier)) == 0)) {
+ break;
+ } else if ((current_flags & kActiveSuspendBarrier) != 0) {
+ PassActiveSuspendBarriers(this);
+ } else {
+ // Impossible: any checkpoint request should have been run before the state change.
+ LOG(FATAL) << "Thread transitioned into a suspended state without running the checkpoint";
+ }
+ }
}
inline ThreadState Thread::TransitionFromSuspendedToRunnable() {
- bool done = false;
union StateAndFlags old_state_and_flags;
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
int16_t old_state = old_state_and_flags.as_struct.state;
@@ -148,7 +165,26 @@
Locks::mutator_lock_->AssertNotHeld(this); // Otherwise we starve GC..
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
- if (UNLIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0)) {
+ if (LIKELY(old_state_and_flags.as_struct.flags == 0)) {
+ // Optimize for the return from native code case - this is the fast path.
+ // Atomically change from suspended to runnable if no suspend request pending.
+ union StateAndFlags new_state_and_flags;
+ new_state_and_flags.as_int = old_state_and_flags.as_int;
+ new_state_and_flags.as_struct.state = kRunnable;
+ // CAS the value with acquire ordering, pairing with the release CAS in
+ // TransitionFromRunnableToSuspended().
+ if (LIKELY(tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakAcquire(
+ old_state_and_flags.as_int,
+ new_state_and_flags.as_int))) {
+ // Mark the acquisition of a share of the mutator_lock_.
+ Locks::mutator_lock_->TransitionFromSuspendedToRunnable(this);
+ break;
+ }
+ } else if ((old_state_and_flags.as_struct.flags & kActiveSuspendBarrier) != 0) {
+ PassActiveSuspendBarriers(this);
+ } else if ((old_state_and_flags.as_struct.flags & kCheckpointRequest) != 0) {
+ // Impossible: checkpoint requests are never left pending on a suspended thread.
+ LOG(FATAL) << "Checkpoint request flag set on a suspended thread";
+ } else if ((old_state_and_flags.as_struct.flags & kSuspendRequest) != 0) {
// Wait while our suspend count is non-zero.
MutexLock mu(this, *Locks::thread_suspend_count_lock_);
old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
@@ -161,32 +197,13 @@
}
DCHECK_EQ(GetSuspendCount(), 0);
}
- // Re-acquire shared mutator_lock_ access.
- Locks::mutator_lock_->SharedLock(this);
- // Atomically change from suspended to runnable if no suspend request pending.
- old_state_and_flags.as_int = tls32_.state_and_flags.as_int;
- DCHECK_EQ(old_state_and_flags.as_struct.state, old_state);
- if (LIKELY((old_state_and_flags.as_struct.flags & kSuspendRequest) == 0)) {
- union StateAndFlags new_state_and_flags;
- new_state_and_flags.as_int = old_state_and_flags.as_int;
- new_state_and_flags.as_struct.state = kRunnable;
- // CAS the value without a memory ordering as that is given by the lock acquisition above.
- done =
- tls32_.state_and_flags.as_atomic_int.CompareExchangeWeakRelaxed(old_state_and_flags.as_int,
- new_state_and_flags.as_int);
- }
- if (UNLIKELY(!done)) {
- // Failed to transition to Runnable. Release shared mutator_lock_ access and try again.
- Locks::mutator_lock_->SharedUnlock(this);
- } else {
- // Run the flip function, if set.
- Closure* flip_func = GetFlipFunction();
- if (flip_func != nullptr) {
- flip_func->Run(this);
- }
- return static_cast<ThreadState>(old_state);
- }
} while (true);
+ // Run the flip function, if set.
+ Closure* flip_func = GetFlipFunction();
+ if (flip_func != nullptr) {
+ flip_func->Run(this);
+ }
+ return static_cast<ThreadState>(old_state);
}
inline void Thread::VerifyStack() {
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 37a86f1..bde463e 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -74,6 +74,14 @@
#include "vmap_table.h"
#include "well_known_classes.h"
+#if ART_USE_FUTEXES
+#include "linux/futex.h"
+#include "sys/syscall.h"
+#ifndef SYS_futex
+#define SYS_futex __NR_futex
+#endif
+#endif // ART_USE_FUTEXES
+
namespace art {
bool Thread::is_started_ = false;
@@ -758,7 +766,8 @@
LOG(FATAL) << ss.str();
}
-void Thread::ModifySuspendCount(Thread* self, int delta, bool for_debugger) {
+bool Thread::ModifySuspendCount(Thread* self, int delta, AtomicInteger* suspend_barrier,
+ bool for_debugger) {
if (kIsDebugBuild) {
DCHECK(delta == -1 || delta == +1 || delta == -tls32_.debug_suspend_count)
<< delta << " " << tls32_.debug_suspend_count << " " << this;
@@ -770,7 +779,24 @@
}
if (UNLIKELY(delta < 0 && tls32_.suspend_count <= 0)) {
UnsafeLogFatalForSuspendCount(self, this);
- return;
+ return false;
+ }
+
+ uint16_t flags = kSuspendRequest;
+ if (delta > 0 && suspend_barrier != nullptr) {
+ uint32_t available_barrier = kMaxSuspendBarriers;
+ for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+ if (tlsPtr_.active_suspend_barriers[i] == nullptr) {
+ available_barrier = i;
+ break;
+ }
+ }
+ if (available_barrier == kMaxSuspendBarriers) {
+ // No barrier slots available; we can't add another.
+ return false;
+ }
+ tlsPtr_.active_suspend_barriers[available_barrier] = suspend_barrier;
+ flags |= kActiveSuspendBarrier;
}
tls32_.suspend_count += delta;
@@ -781,9 +807,76 @@
if (tls32_.suspend_count == 0) {
AtomicClearFlag(kSuspendRequest);
} else {
- AtomicSetFlag(kSuspendRequest);
+ // Two bits might be set simultaneously.
+ tls32_.state_and_flags.as_atomic_int.FetchAndOrSequentiallyConsistent(flags);
TriggerSuspend();
}
+ return true;
+}
+
+bool Thread::PassActiveSuspendBarriers(Thread* self) {
+ // Grab the suspend_count lock and copy the current set of
+ // barriers. Then clear the list and the flag. The ModifySuspendCount
+ // function requires the lock so we prevent a race between setting
+ // the kActiveSuspendBarrier flag and clearing it.
+ AtomicInteger* pass_barriers[kMaxSuspendBarriers];
+ {
+ MutexLock mu(self, *Locks::thread_suspend_count_lock_);
+ if (!ReadFlag(kActiveSuspendBarrier)) {
+ // Quick exit test: the barriers have already been claimed. This is possible because
+ // there may be a race to claim them, and it doesn't matter who wins.
+ // All callers of this function (except SuspendAllInternal) first test the
+ // kActiveSuspendBarrier flag without holding the lock; here we double-check, under the
+ // suspend_count lock, whether the barriers have already been passed.
+ return false;
+ }
+
+ for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+ pass_barriers[i] = tlsPtr_.active_suspend_barriers[i];
+ tlsPtr_.active_suspend_barriers[i] = nullptr;
+ }
+ AtomicClearFlag(kActiveSuspendBarrier);
+ }
+
+ uint32_t barrier_count = 0;
+ for (uint32_t i = 0; i < kMaxSuspendBarriers; i++) {
+ AtomicInteger* pending_threads = pass_barriers[i];
+ if (pending_threads != nullptr) {
+ bool done = false;
+ do {
+ int32_t cur_val = pending_threads->LoadRelaxed();
+ CHECK_GT(cur_val, 0) << "Unexpected value for PassActiveSuspendBarriers(): " << cur_val;
+ // Reduce value by 1.
+ done = pending_threads->CompareExchangeWeakRelaxed(cur_val, cur_val - 1);
+#if ART_USE_FUTEXES
+ if (done && (cur_val - 1) == 0) { // Weak CAS may fail spuriously.
+ futex(pending_threads->Address(), FUTEX_WAKE, -1, nullptr, nullptr, 0);
+ }
+#endif
+ } while (!done);
+ ++barrier_count;
+ }
+ }
+ CHECK_GT(barrier_count, 0U);
+ return true;
+}
+
+void Thread::ClearSuspendBarrier(AtomicInteger* target) {
+ CHECK(ReadFlag(kActiveSuspendBarrier));
+ bool clear_flag = true;
+ for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+ AtomicInteger* ptr = tlsPtr_.active_suspend_barriers[i];
+ if (ptr == target) {
+ tlsPtr_.active_suspend_barriers[i] = nullptr;
+ } else if (ptr != nullptr) {
+ clear_flag = false;
+ }
+ }
+ if (LIKELY(clear_flag)) {
+ AtomicClearFlag(kActiveSuspendBarrier);
+ }
}
void Thread::RunCheckpointFunction() {
@@ -1288,6 +1381,9 @@
for (uint32_t i = 0; i < kMaxCheckpoints; ++i) {
tlsPtr_.checkpoint_functions[i] = nullptr;
}
+ for (uint32_t i = 0; i < kMaxSuspendBarriers; ++i) {
+ tlsPtr_.active_suspend_barriers[i] = nullptr;
+ }
tlsPtr_.flip_function = nullptr;
tls32_.suspended_at_suspend_check = false;
}
diff --git a/runtime/thread.h b/runtime/thread.h
index 0e71c08..3595026 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -98,7 +98,8 @@
enum ThreadFlag {
kSuspendRequest = 1, // If set implies that suspend_count_ > 0 and the Thread should enter the
// safepoint handler.
- kCheckpointRequest = 2 // Request that the thread do some checkpoint work and then continue.
+ kCheckpointRequest = 2, // Request that the thread do some checkpoint work and then continue.
+ kActiveSuspendBarrier = 4 // Register that at least 1 suspend barrier needs to be passed.
};
enum class StackedShadowFrameType {
@@ -223,7 +224,7 @@
(state_and_flags.as_struct.flags & kSuspendRequest) != 0;
}
- void ModifySuspendCount(Thread* self, int delta, bool for_debugger)
+ bool ModifySuspendCount(Thread* self, int delta, AtomicInteger* suspend_barrier, bool for_debugger)
EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
bool RequestCheckpoint(Closure* function)
@@ -846,6 +847,12 @@
void RunCheckpointFunction();
+ bool PassActiveSuspendBarriers(Thread* self)
+ LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_);
+
+ void ClearSuspendBarrier(AtomicInteger* target)
+ EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
+
bool ReadFlag(ThreadFlag flag) const {
return (tls32_.state_and_flags.as_struct.flags & flag) != 0;
}
@@ -964,6 +971,11 @@
ThreadState SetStateUnsafe(ThreadState new_state) {
ThreadState old_state = GetState();
tls32_.state_and_flags.as_struct.state = new_state;
+ // If transitioning to a suspended state, check for a pending suspend barrier.
+ if (UNLIKELY((new_state != kRunnable) &&
+ (tls32_.state_and_flags.as_struct.flags & kActiveSuspendBarrier))) {
+ PassActiveSuspendBarriers(this);
+ }
return old_state;
}
@@ -1034,6 +1046,9 @@
// Maximum number of checkpoint functions.
static constexpr uint32_t kMaxCheckpoints = 3;
+ // Maximum number of suspend barriers.
+ static constexpr uint32_t kMaxSuspendBarriers = 3;
+
// Has Thread::Startup been called?
static bool is_started_;
@@ -1238,6 +1253,12 @@
// Locks::thread_suspend_count_lock_.
Closure* checkpoint_functions[kMaxCheckpoints];
+ // Pending barriers that require passing, or null if none are pending. Installation is guarded
+ // by Locks::thread_suspend_count_lock_.
+ // These work effectively as an art::Barrier, but are implemented directly with an AtomicInteger
+ // and futex to avoid the additional cost of the mutex and condition variable used by
+ // art::Barrier.
+ AtomicInteger* active_suspend_barriers[kMaxSuspendBarriers];
+
// Entrypoint function pointers.
// TODO: move this to more of a global offset table model to avoid per-thread duplication.
InterpreterEntryPoints interpreter_entrypoints;
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 7e8128f..a3c9ae5 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -40,6 +40,14 @@
#include "trace.h"
#include "well_known_classes.h"
+#if ART_USE_FUTEXES
+#include "linux/futex.h"
+#include "sys/syscall.h"
+#ifndef SYS_futex
+#define SYS_futex __NR_futex
+#endif
+#endif // ART_USE_FUTEXES
+
namespace art {
static constexpr uint64_t kLongThreadSuspendThreshold = MsToNs(5);
@@ -278,7 +286,7 @@
// Spurious fail, try again.
continue;
}
- thread->ModifySuspendCount(self, +1, false);
+ thread->ModifySuspendCount(self, +1, nullptr, false);
suspended_count_modified_threads.push_back(thread);
break;
}
@@ -316,7 +324,7 @@
checkpoint_function->Run(thread);
{
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
- thread->ModifySuspendCount(self, -1, false);
+ thread->ModifySuspendCount(self, -1, nullptr, false);
}
}
@@ -386,7 +394,7 @@
if (thread == self) {
continue;
}
- thread->ModifySuspendCount(self, +1, false);
+ thread->ModifySuspendCount(self, +1, nullptr, false);
}
}
@@ -413,7 +421,7 @@
thread->SetFlipFunction(thread_flip_visitor);
if (thread->IsSuspendedAtSuspendCheck()) {
// The thread will resume right after the broadcast.
- thread->ModifySuspendCount(self, -1, false);
+ thread->ModifySuspendCount(self, -1, nullptr, false);
runnable_threads.push_back(thread);
} else {
other_threads.push_back(thread);
@@ -439,7 +447,7 @@
{
MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
for (const auto& thread : other_threads) {
- thread->ModifySuspendCount(self, -1, false);
+ thread->ModifySuspendCount(self, -1, nullptr, false);
}
Thread::resume_cond_->Broadcast(self);
}
@@ -458,28 +466,9 @@
ATRACE_BEGIN("Suspending mutator threads");
const uint64_t start_time = NanoTime();
- Locks::mutator_lock_->AssertNotHeld(self);
- Locks::thread_list_lock_->AssertNotHeld(self);
- Locks::thread_suspend_count_lock_->AssertNotHeld(self);
- if (kDebugLocking && self != nullptr) {
- CHECK_NE(self->GetState(), kRunnable);
- }
- {
- MutexLock mu(self, *Locks::thread_list_lock_);
- MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
- // Update global suspend all state for attaching threads.
- ++suspend_all_count_;
- // Increment everybody's suspend count (except our own).
- for (const auto& thread : list_) {
- if (thread == self) {
- continue;
- }
- VLOG(threads) << "requesting thread suspend: " << *thread;
- thread->ModifySuspendCount(self, +1, false);
- }
- }
-
- // Block on the mutator lock until all Runnable threads release their share of access.
+ SuspendAllInternal(self, self);
+ // All threads are known to have suspended (but a thread may still own the mutator lock).
+ // Make sure this thread grabs exclusive access to the mutator lock and its protected data.
#if HAVE_TIMED_RWLOCK
while (true) {
if (Locks::mutator_lock_->ExclusiveLockWithTimeout(self, kThreadSuspendTimeoutMs, 0)) {
@@ -519,6 +508,112 @@
}
}
+// Ensures all threads running Java suspend and that those not running Java don't start.
+// The debugger thread might be set to kRunnable for a short period of time after
+// SuspendAllInternal. This is safe because it will be set back to a suspended state before
+// SuspendAll returns.
+void ThreadList::SuspendAllInternal(Thread* self, Thread* ignore1, Thread* ignore2,
+ bool debug_suspend) {
+ Locks::mutator_lock_->AssertNotExclusiveHeld(self);
+ Locks::thread_list_lock_->AssertNotHeld(self);
+ Locks::thread_suspend_count_lock_->AssertNotHeld(self);
+ if (kDebugLocking && self != nullptr) {
+ CHECK_NE(self->GetState(), kRunnable);
+ }
+
+ // First request that all threads suspend, then wait for them to suspend before
+ // returning. This suspension scheme also relies on other behaviour:
+ // 1. Threads cannot be deleted while they are suspended or have a suspend-
+ // request flag set - (see Unregister() below).
+ // 2. When threads are created, they are created in a suspended state (actually
+ // kNative) and will never begin executing Java code without first checking
+ // the suspend-request flag.
+
+ // The atomic counter for the number of threads that need to pass the barrier.
+ AtomicInteger pending_threads;
+ uint32_t num_ignored = 0;
+ if (ignore1 != nullptr) {
+ ++num_ignored;
+ }
+ if (ignore2 != nullptr && ignore1 != ignore2) {
+ ++num_ignored;
+ }
+ {
+ MutexLock mu(self, *Locks::thread_list_lock_);
+ MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+ // Update global suspend all state for attaching threads.
+ ++suspend_all_count_;
+ if (debug_suspend)
+ ++debug_suspend_all_count_;
+ pending_threads.StoreRelaxed(list_.size() - num_ignored);
+ // Increment everybody's suspend count (except those that should be ignored).
+ for (const auto& thread : list_) {
+ if (thread == ignore1 || thread == ignore2) {
+ continue;
+ }
+ VLOG(threads) << "requesting thread suspend: " << *thread;
+ while (true) {
+ if (LIKELY(thread->ModifySuspendCount(self, +1, &pending_threads, debug_suspend))) {
+ break;
+ } else {
+ // Failure means the list of active_suspend_barriers is full; we should release the
+ // thread_suspend_count_lock_ (to avoid deadlock) and wait until the target thread has
+ // executed Thread::PassActiveSuspendBarriers(). Note that we cannot simply wait for
+ // the thread to change to a suspended state, because it might need to run a checkpoint
+ // function before the state change, which also needs thread_suspend_count_lock_.
+
+ // This is very unlikely to happen, since it would require more than kMaxSuspendBarriers
+ // threads to execute SuspendAllInternal() simultaneously while the target thread stays
+ // in kRunnable the whole time.
+ Locks::thread_suspend_count_lock_->ExclusiveUnlock(self);
+ NanoSleep(100000);
+ Locks::thread_suspend_count_lock_->ExclusiveLock(self);
+ }
+ }
+
+ // Must install the pending_threads counter first, then check thread->IsSuspended() and clear
+ // the counter. Otherwise there's a race with Thread::TransitionFromRunnableToSuspended()
+ // that could lead a thread to miss a call to PassActiveSuspendBarriers().
+ if (thread->IsSuspended()) {
+ // Only clear the counter for the current thread.
+ thread->ClearSuspendBarrier(&pending_threads);
+ pending_threads.FetchAndSubSequentiallyConsistent(1);
+ }
+ }
+ }
+
+ // Wait for the barrier to be passed by all runnable threads. This wait
+ // is done with a timeout so that we can detect problems.
+ timespec wait_timeout;
+ InitTimeSpec(true, CLOCK_MONOTONIC, 10000, 0, &wait_timeout);
+ while (true) {
+ int32_t cur_val = pending_threads.LoadRelaxed();
+ if (LIKELY(cur_val > 0)) {
+#if ART_USE_FUTEXES
+ if (futex(pending_threads.Address(), FUTEX_WAIT, cur_val, &wait_timeout, nullptr, 0) != 0) {
+ // EAGAIN and EINTR both indicate a spurious failure; try again from the beginning.
+ if ((errno != EAGAIN) && (errno != EINTR)) {
+ if (errno == ETIMEDOUT) {
+ LOG(kIsDebugBuild ? FATAL : ERROR) << "Unexpected timeout during suspend all.";
+ } else {
+ PLOG(FATAL) << "futex wait failed for SuspendAllInternal()";
+ }
+ }
+ } else {
+ cur_val = pending_threads.LoadRelaxed();
+ CHECK_EQ(cur_val, 0);
+ break;
+ }
+#else
+ // Spin wait. This is likely to be slow, but ART_USE_FUTEXES is set on most architectures.
+#endif
+ } else {
+ CHECK_EQ(cur_val, 0);
+ break;
+ }
+ }
+}
+
void ThreadList::ResumeAll() {
Thread* self = Thread::Current();
@@ -549,7 +644,7 @@
if (thread == self) {
continue;
}
- thread->ModifySuspendCount(self, -1, false);
+ thread->ModifySuspendCount(self, -1, nullptr, false);
}
// Broadcast a notification to all suspended threads, some or all of
@@ -592,7 +687,7 @@
<< ") thread not within thread list";
return;
}
- thread->ModifySuspendCount(self, -1, for_debugger);
+ thread->ModifySuspendCount(self, -1, nullptr, for_debugger);
}
{
@@ -644,7 +739,7 @@
// If we incremented the suspend count but the thread reset its peer, we need to
// re-decrement it since it is shutting down and may deadlock the runtime in
// ThreadList::WaitForOtherNonDaemonThreadsToExit.
- suspended_thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+ suspended_thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
}
ThreadSuspendByPeerWarning(self, WARNING, "No such thread for suspend", peer);
return nullptr;
@@ -667,7 +762,7 @@
}
CHECK(suspended_thread == nullptr);
suspended_thread = thread;
- suspended_thread->ModifySuspendCount(self, +1, debug_suspension);
+ suspended_thread->ModifySuspendCount(self, +1, nullptr, debug_suspension);
request_suspension = false;
} else {
// If the caller isn't requesting suspension, a suspension should have already occurred.
@@ -696,7 +791,7 @@
ThreadSuspendByPeerWarning(self, FATAL, "Thread suspension timed out", peer);
if (suspended_thread != nullptr) {
CHECK_EQ(suspended_thread, thread);
- suspended_thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+ suspended_thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
}
*timed_out = true;
return nullptr;
@@ -765,7 +860,7 @@
// which will allow this thread to be suspended.
continue;
}
- thread->ModifySuspendCount(self, +1, debug_suspension);
+ thread->ModifySuspendCount(self, +1, nullptr, debug_suspension);
suspended_thread = thread;
} else {
CHECK_EQ(suspended_thread, thread);
@@ -794,7 +889,7 @@
if (total_delay >= MsToNs(kThreadSuspendTimeoutMs)) {
ThreadSuspendByThreadIdWarning(WARNING, "Thread suspension timed out", thread_id);
if (suspended_thread != nullptr) {
- thread->ModifySuspendCount(soa.Self(), -1, debug_suspension);
+ thread->ModifySuspendCount(soa.Self(), -1, nullptr, debug_suspension);
}
*timed_out = true;
return nullptr;
@@ -831,25 +926,7 @@
VLOG(threads) << *self << " SuspendAllForDebugger starting...";
- {
- MutexLock thread_list_mu(self, *Locks::thread_list_lock_);
- {
- MutexLock suspend_count_mu(self, *Locks::thread_suspend_count_lock_);
- // Update global suspend all state for attaching threads.
- DCHECK_GE(suspend_all_count_, debug_suspend_all_count_);
- ++suspend_all_count_;
- ++debug_suspend_all_count_;
- // Increment everybody's suspend count (except our own).
- for (const auto& thread : list_) {
- if (thread == self || thread == debug_thread) {
- continue;
- }
- VLOG(threads) << "requesting thread suspend: " << *thread;
- thread->ModifySuspendCount(self, +1, true);
- }
- }
- }
-
+ SuspendAllInternal(self, self, debug_thread, true);
// Block on the mutator lock until all Runnable threads release their share of access then
// immediately unlock again.
#if HAVE_TIMED_RWLOCK
@@ -887,7 +964,7 @@
// to ensure that we're the only one fiddling with the suspend count
// though.
MutexLock mu(self, *Locks::thread_suspend_count_lock_);
- self->ModifySuspendCount(self, +1, true);
+ self->ModifySuspendCount(self, +1, nullptr, true);
CHECK_GT(self->GetSuspendCount(), 0);
VLOG(threads) << *self << " self-suspending (debugger)";
@@ -971,7 +1048,7 @@
continue;
}
VLOG(threads) << "requesting thread resume: " << *thread;
- thread->ModifySuspendCount(self, -1, true);
+ thread->ModifySuspendCount(self, -1, nullptr, true);
}
}
}
@@ -1000,7 +1077,7 @@
if (thread == self || thread->GetDebugSuspendCount() == 0) {
continue;
}
- thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), true);
+ thread->ModifySuspendCount(self, -thread->GetDebugSuspendCount(), nullptr, true);
}
}
@@ -1053,7 +1130,7 @@
// daemons.
CHECK(thread->IsDaemon()) << *thread;
if (thread != self) {
- thread->ModifySuspendCount(self, +1, false);
+ thread->ModifySuspendCount(self, +1, nullptr, false);
}
}
}
@@ -1094,10 +1171,10 @@
// Modify suspend count in increments of 1 to maintain invariants in ModifySuspendCount. While
// this isn't particularly efficient the suspend counts are most commonly 0 or 1.
for (int delta = debug_suspend_all_count_; delta > 0; delta--) {
- self->ModifySuspendCount(self, +1, true);
+ self->ModifySuspendCount(self, +1, nullptr, true);
}
for (int delta = suspend_all_count_ - debug_suspend_all_count_; delta > 0; delta--) {
- self->ModifySuspendCount(self, +1, false);
+ self->ModifySuspendCount(self, +1, nullptr, false);
}
CHECK(!Contains(self));
list_.push_back(self);
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 2c1f813..edd1e05 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -155,6 +155,8 @@
bool Contains(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_);
bool Contains(pid_t tid) EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_list_lock_);
+ size_t RunCheckpoint(Closure* checkpoint_function, bool includeSuspended)
+ LOCKS_EXCLUDED(Locks::thread_list_lock_, Locks::thread_suspend_count_lock_);
void DumpUnattachedThreads(std::ostream& os)
LOCKS_EXCLUDED(Locks::thread_list_lock_);
@@ -166,6 +168,11 @@
LOCKS_EXCLUDED(Locks::thread_list_lock_,
Locks::thread_suspend_count_lock_);
+ void SuspendAllInternal(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr,
+ bool debug_suspend = false)
+ LOCKS_EXCLUDED(Locks::thread_list_lock_,
+ Locks::thread_suspend_count_lock_);
+
void AssertThreadsAreSuspended(Thread* self, Thread* ignore1, Thread* ignore2 = nullptr)
LOCKS_EXCLUDED(Locks::thread_list_lock_,
Locks::thread_suspend_count_lock_);