Merge "Reduce the number of fences needed for monitors"
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 6d29ed3..354410e 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -97,6 +97,12 @@
OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
}
+inline bool Object::CasLockWordWeakAcquire(LockWord old_val, LockWord new_val) {
+ // Force use of non-transactional mode and do not check.
+ return CasFieldWeakAcquire32<false, false>(
+ OFFSET_OF_OBJECT_MEMBER(Object, monitor_), old_val.GetValue(), new_val.GetValue());
+}
+
inline bool Object::CasLockWordWeakRelease(LockWord old_val, LockWord new_val) {
// Force use of non-transactional mode and do not check.
return CasFieldWeakRelease32<false, false>(
@@ -759,6 +765,24 @@
}
template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
+inline bool Object::CasFieldWeakAcquire32(MemberOffset field_offset,
+ int32_t old_value, int32_t new_value) {
+ if (kCheckTransaction) {
+ DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
+ }
+ if (kTransactionActive) {
+ Runtime::Current()->RecordWriteField32(this, field_offset, old_value, true);
+ }
+ if (kVerifyFlags & kVerifyThis) {
+ VerifyObject(this);
+ }
+ uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
+ AtomicInteger* atomic_addr = reinterpret_cast<AtomicInteger*>(raw_addr);
+
+ return atomic_addr->CompareExchangeWeakAcquire(old_value, new_value);
+}
+
+template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
inline bool Object::CasFieldWeakRelease32(MemberOffset field_offset,
int32_t old_value, int32_t new_value) {
if (kCheckTransaction) {
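
For readers less familiar with ART's Atomic helpers: CasFieldWeakAcquire32 bottoms out in AtomicInteger::CompareExchangeWeakAcquire, which is essentially a weak compare-and-swap with acquire ordering on success. A minimal standalone sketch of the same idea, using std::atomic rather than ART's Atomic<int32_t> (the function name here is illustrative, not part of the patch):

#include <atomic>
#include <cstdint>

// Weak acquire CAS sketch: on success it is an acquire operation, so writes
// made before the matching release store/CAS become visible to this thread;
// on (possibly spurious) failure nothing is ordered, which is acceptable
// because callers of the weak form always retry in a loop.
inline bool CasWeakAcquire32(std::atomic<int32_t>& word,
                             int32_t expected, int32_t desired) {
  return word.compare_exchange_weak(expected, desired,
                                    std::memory_order_acquire,
                                    std::memory_order_relaxed);
}
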
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 67b5ddb..db58a60 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -153,6 +153,8 @@
REQUIRES_SHARED(Locks::mutator_lock_);
bool CasLockWordWeakRelaxed(LockWord old_val, LockWord new_val)
REQUIRES_SHARED(Locks::mutator_lock_);
+ bool CasLockWordWeakAcquire(LockWord old_val, LockWord new_val)
+ REQUIRES_SHARED(Locks::mutator_lock_);
bool CasLockWordWeakRelease(LockWord old_val, LockWord new_val)
REQUIRES_SHARED(Locks::mutator_lock_);
uint32_t GetLockOwnerThreadId();
@@ -460,6 +462,12 @@
template<bool kTransactionActive, bool kCheckTransaction = true,
VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+ bool CasFieldWeakAcquire32(MemberOffset field_offset, int32_t old_value,
+ int32_t new_value) ALWAYS_INLINE
+ REQUIRES_SHARED(Locks::mutator_lock_);
+
+ template<bool kTransactionActive, bool kCheckTransaction = true,
+ VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
bool CasFieldWeakRelease32(MemberOffset field_offset, int32_t old_value,
int32_t new_value) ALWAYS_INLINE
REQUIRES_SHARED(Locks::mutator_lock_);
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index 222eb5c..893abd5 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -161,7 +161,7 @@
}
LockWord fat(this, lw.GCState());
// Publish the updated lock word, which may race with other threads.
- bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
+ bool success = GetObject()->CasLockWordWeakRelease(lw, fat);
// Lock profiling.
if (success && owner_ != nullptr && lock_profiling_threshold_ != 0) {
// Do not abort on dex pc errors. This can easily happen when we want to dump a stack trace on
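
The weakened CAS above relies on the standard release-publication pattern: Inflate fully initializes the Monitor before the CAS, and the release ordering on a successful CAS is what lets another thread that later observes the fat lock word (via an acquire operation or fence, as in the kFatLocked case further down) safely read the monitor's fields. A hedged sketch in std::atomic terms, with a stand-in struct rather than art::Monitor:

#include <atomic>

struct FakeMonitor { int owner_tid = 0; int lock_count = 0; };  // stand-in, not art::Monitor

std::atomic<FakeMonitor*> lock_word{nullptr};  // stand-in for the object's lock word

// Publish a fully initialized monitor. The release CAS orders the field writes
// before the pointer becomes visible; a reader that sees the new pointer via an
// acquire load or fence is guaranteed to see owner_tid and lock_count as written.
bool PublishMonitor(FakeMonitor* mon, int tid) {
  mon->owner_tid = tid;
  mon->lock_count = 1;
  FakeMonitor* expected = nullptr;
  return lock_word.compare_exchange_weak(expected, mon,
                                         std::memory_order_release,
                                         std::memory_order_relaxed);
}
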
@@ -879,13 +879,16 @@
StackHandleScope<1> hs(self);
Handle<mirror::Object> h_obj(hs.NewHandle(obj));
while (true) {
- LockWord lock_word = h_obj->GetLockWord(true);
+ // We initially read the lockword with ordinary Java/relaxed semantics. When stronger
+ // semantics are needed, we address it below. Since GetLockWord bottoms out to a relaxed load,
+ // we can fix it later, in an infrequently executed case, with a fence.
+ LockWord lock_word = h_obj->GetLockWord(false);
switch (lock_word.GetState()) {
case LockWord::kUnlocked: {
+ // No ordering required for preceding lockword read, since we retest.
LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.GCState()));
- if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+ if (h_obj->CasLockWordWeakAcquire(lock_word, thin_locked)) {
AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
- // CasLockWord enforces more than the acquire ordering we need here.
return h_obj.Get(); // Success!
}
continue; // Go again.
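
The fast path above now pairs a relaxed read of the lock word with an acquire CAS. Roughly, in std::atomic terms and with a simplified word encoding (kUnlockedWord and the function name are illustrative, not ART's):

#include <atomic>
#include <cstdint>

constexpr int32_t kUnlockedWord = 0;  // simplified stand-in for an unlocked LockWord

// A relaxed initial read is enough because the CAS retests the value; the
// acquire ordering on a successful CAS is what makes writes done by the
// previous lock holder (published by its release unlock) visible here.
bool TryThinLock(std::atomic<int32_t>& lock_word, int32_t thin_locked_word) {
  int32_t observed = lock_word.load(std::memory_order_relaxed);
  if (observed != kUnlockedWord) {
    return false;  // Some other state; the real code switches on it and retries.
  }
  return lock_word.compare_exchange_weak(observed, thin_locked_word,
                                         std::memory_order_acquire,
                                         std::memory_order_relaxed);
}
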
@@ -893,19 +896,22 @@
case LockWord::kThinLocked: {
uint32_t owner_thread_id = lock_word.ThinLockOwner();
if (owner_thread_id == thread_id) {
+ // No ordering required for initial lockword read.
// We own the lock, increase the recursion count.
uint32_t new_count = lock_word.ThinLockCount() + 1;
if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
LockWord thin_locked(LockWord::FromThinLockId(thread_id,
new_count,
lock_word.GCState()));
+ // Only this thread pays attention to the count. Thus there is no need for stronger
+ // than relaxed memory ordering.
if (!kUseReadBarrier) {
- h_obj->SetLockWord(thin_locked, true);
+ h_obj->SetLockWord(thin_locked, false /* volatile */);
AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
return h_obj.Get(); // Success!
} else {
// Use CAS to preserve the read barrier state.
- if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
+ if (h_obj->CasLockWordWeakRelaxed(lock_word, thin_locked)) {
AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
return h_obj.Get(); // Success!
}
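
The recursion bump above only needs relaxed ordering because the count is private to the owning thread while the lock is held; the eventual unlock is a release, which publishes everything including the final count. A simplified sketch of the two variants (plain store without read barriers, CAS when barrier bits must be preserved), with an illustrative word encoding:

#include <atomic>
#include <cstdint>

// No-read-barrier path: a relaxed store of the new word suffices, since only
// the owner reads or writes the recursion count while the lock is held.
void BumpCountPlain(std::atomic<int32_t>& lock_word, int32_t new_word) {
  lock_word.store(new_word, std::memory_order_relaxed);
}

// Read-barrier path: a relaxed weak CAS preserves the barrier state bits
// atomically, but still needs no ordering for the same reason.
bool BumpCountCas(std::atomic<int32_t>& lock_word,
                  int32_t old_word, int32_t new_word) {
  return lock_word.compare_exchange_weak(old_word, new_word,
                                         std::memory_order_relaxed);
}
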
@@ -922,20 +928,28 @@
// Contention.
contention_count++;
Runtime* runtime = Runtime::Current();
- if (contention_count <= runtime->GetMaxSpinsBeforeThinkLockInflation()) {
+ if (contention_count <= runtime->GetMaxSpinsBeforeThinLockInflation()) {
// TODO: Consider switching the thread state to kBlocked when we are yielding.
// Use sched_yield instead of NanoSleep since NanoSleep can wait much longer than the
// parameter you pass in. This can cause thread suspension to take excessively long
// and make long pauses. See b/16307460.
+ // TODO: We should literally spin first, without sched_yield. Sched_yield either does
+ // nothing (at significant expense), or guarantees that we wait at least microseconds.
+ // If the owner is running, I would expect the median lock hold time to be hundreds
+ // of nanoseconds or less.
sched_yield();
} else {
contention_count = 0;
+ // No ordering required for initial lockword read. InflateThinLocked rereads it anyway.
InflateThinLocked(self, h_obj, lock_word, 0);
}
}
continue; // Start from the beginning.
}
case LockWord::kFatLocked: {
+ // We should have done an acquire read of the lockword initially, to ensure
+ // visibility of the monitor data structure. Use an explicit fence instead.
+ QuasiAtomic::ThreadFenceAcquire();
Monitor* mon = lock_word.FatLockMonitor();
if (trylock) {
return mon->TryLock(self) ? h_obj.Get() : nullptr;
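
The explicit ThreadFenceAcquire above is the fence-based equivalent of having done an acquire load of the lock word in the first place: an acquire fence issued after the relaxed load, and before the monitor pointer is dereferenced, synchronizes with the release CAS that published the fat lock word in Inflate. In std::atomic terms, with a stand-in monitor type:

#include <atomic>

struct FakeMonitor { int owner_tid = 0; };  // stand-in, not art::Monitor

std::atomic<FakeMonitor*> lock_word{nullptr};  // stand-in for a fat-locked lock word

// Relaxed load up front, then an acquire fence (the std counterpart of
// QuasiAtomic::ThreadFenceAcquire) before following the pointer. The fence
// upgrades the earlier relaxed load so the monitor fields written before the
// release publication are guaranteed to be visible here.
int ReadOwner() {
  FakeMonitor* mon = lock_word.load(std::memory_order_relaxed);
  if (mon == nullptr) {
    return -1;  // Not fat-locked in this sketch.
  }
  std::atomic_thread_fence(std::memory_order_acquire);
  return mon->owner_tid;
}
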
@@ -946,6 +960,8 @@
}
case LockWord::kHashCode:
// Inflate with the existing hashcode.
+ // Again no ordering required for initial lockword read, since we don't rely
+ // on the visibility of any prior computation.
Inflate(self, nullptr, h_obj.Get(), lock_word.GetHashCode());
continue; // Start from the beginning.
default: {
@@ -988,13 +1004,16 @@
}
if (!kUseReadBarrier) {
DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
+ // TODO: This really only needs memory_order_release, but we currently have
+ // no way to specify that. In fact there seem to be no legitimate uses of SetLockWord
+ // with a final argument of true. This slows down x86 and ARMv7, but probably not ARMv8.
h_obj->SetLockWord(new_lw, true);
AtraceMonitorUnlock();
// Success!
return true;
} else {
// Use CAS to preserve the read barrier state.
- if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, new_lw)) {
+ if (h_obj->CasLockWordWeakRelease(lock_word, new_lw)) {
AtraceMonitorUnlock();
// Success!
return true;
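
Unlock is the mirror image: a release store (or the release CAS above when read-barrier state must be preserved) pairs with the acquire CAS or fence on the lock path, so writes from the critical section are visible to the next owner; nothing stronger than release is required. A simplified std::atomic sketch (word encoding illustrative):

#include <atomic>
#include <cstdint>

constexpr int32_t kUnlockedWord = 0;  // simplified stand-in for an unlocked LockWord

// No-read-barrier path: release store of the unlocked word.
void ThinUnlockPlain(std::atomic<int32_t>& lock_word) {
  lock_word.store(kUnlockedWord, std::memory_order_release);
}

// Read-barrier path: release weak CAS that preserves the barrier bits;
// callers retry on spurious failure.
bool ThinUnlockCas(std::atomic<int32_t>& lock_word,
                   int32_t old_word, int32_t new_word) {
  return lock_word.compare_exchange_weak(old_word, new_word,
                                         std::memory_order_release,
                                         std::memory_order_relaxed);
}
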
diff --git a/runtime/runtime.h b/runtime/runtime.h
index d40c631..8fc211c 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -268,7 +268,7 @@
return java_vm_.get();
}
- size_t GetMaxSpinsBeforeThinkLockInflation() const {
+ size_t GetMaxSpinsBeforeThinLockInflation() const {
return max_spins_before_thin_lock_inflation_;
}