Reserve bits in the lock word for read barriers.

This prepares for the concurrent copying (CC) collector to use the standard
object header model by storing the read barrier state in the lock word.
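
For context, the lock word layout implied by the masks and shifts used in this
change is: bits 30-31 hold the state, bits 28-29 hold the read barrier state,
bits 16-27 hold the thin lock count, and bits 0-15 hold the owner thread id.
The following is a rough C++ sketch, illustrative only and not ART's actual
LockWord/Monitor code (the helper name is made up), of the thin-lock fast path
that the updated assembly implements: the read barrier bits are preserved and
the lock word is updated with a CAS instead of a plain store.

  #include <atomic>
  #include <cstdint>

  constexpr uint32_t kStateShift = 30;              // LOCK_WORD_STATE_SHIFT
  constexpr uint32_t kRbStateShift = 28;            // LOCK_WORD_READ_BARRIER_STATE_SHIFT
  constexpr uint32_t kRbStateMask = 3u << kRbStateShift;   // LOCK_WORD_READ_BARRIER_STATE_MASK
  constexpr uint32_t kRbStateMaskToggled = ~kRbStateMask;  // ..._MASK_TOGGLED
  constexpr uint32_t kThinLockCountOne = 1u << 16;  // LOCK_WORD_THIN_LOCK_COUNT_ONE

  // Returns true if the lock was acquired on the fast path, false for the slow path.
  bool ThinLockFastPath(std::atomic<uint32_t>* lock_word, uint32_t self_thread_id) {
    while (true) {
      uint32_t lw = lock_word->load(std::memory_order_relaxed);
      if ((lw & kRbStateMaskToggled) == 0) {
        // Unlocked: lw is zero except for the read barrier bits; keep them.
        if (lock_word->compare_exchange_weak(lw, lw | self_thread_id,
                                             std::memory_order_acquire)) {
          return true;
        }
        continue;  // CAS failed (the read barrier bits may have changed), retry.
      }
      if ((lw >> kStateShift) != 0) return false;                // fat lock or hash: slow path.
      if (((lw ^ self_thread_id) & 0xFFFFu) != 0) return false;  // owned by another thread.
      // Recursive lock: make sure the count does not overflow into bits 28-29.
      if ((((lw & kRbStateMaskToggled) + kThinLockCountOne) >> kRbStateShift) != 0) {
        return false;                                            // count overflow: slow path.
      }
      // CAS rather than a plain store so concurrent read barrier bit updates are not lost.
      if (lock_word->compare_exchange_weak(lw, lw + kThinLockCountOne,
                                           std::memory_order_acquire)) {
        return true;
      }
    }
  }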

Bug: 19355854
Bug: 12687968
Change-Id: Ia7585662dd2cebf0479a3e74f734afe5059fb70f
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index fec1ce5..aff3880 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -469,26 +469,33 @@
 .Lretry_lock:
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   r1, .Lnot_unlocked         @ already thin locked
-    @ unlocked case - r2 holds thread id with count of 0
+    mov    r3, r1
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    cbnz   r3, .Lnot_unlocked         @ already thin locked
+    @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
+    orr    r2, r1, r2                 @ r2 holds thread id with count of 0 with preserved read barrier bits
     strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    cbnz   r3, .Lstrex_fail           @ store failed, retry
+    cbnz   r3, .Llock_strex_fail      @ store failed, retry
     dmb    ish                        @ full (LoadLoad|LoadStore) memory barrier
     bx lr
-.Lstrex_fail:
-    b .Lretry_lock                    @ unlikely forward branch, need to reload and recheck r1/r2
-.Lnot_unlocked:
-    lsr    r3, r1, 30
+.Lnot_unlocked:  @ r1: original lock word, r2: thread_id with count of 0 and zero read barrier bits
+    lsr    r3, r1, LOCK_WORD_STATE_SHIFT
     cbnz   r3, .Lslow_lock            @ if either of the top two bits are set, go slow path
     eor    r2, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r2, r2                     @ zero top 16 bits
     cbnz   r2, .Lslow_lock            @ lock word and self thread id's match -> recursive lock
                                       @ else contention, go to slow path
-    add    r2, r1, #65536             @ increment count in lock word placing in r2 for storing
-    lsr    r1, r2, 30                 @ if either of the top two bits are set, we overflowed.
-    cbnz   r1, .Lslow_lock            @ if we overflow the count go slow path
-    str    r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ no need for strex as we hold the lock
+    mov    r3, r1                     @ copy the lock word to check count overflow.
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits.
+    add    r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count in lock word placing in r2 to check overflow
+    lsr    r3, r2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  @ if either of the upper two bits (28-29) are set, we overflowed.
+    cbnz   r3, .Lslow_lock            @ if we overflow the count go slow path
+    add    r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count for real
+    strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
+    cbnz   r3, .Llock_strex_fail      @ strex failed, retry
     bx lr
+.Llock_strex_fail:
+    b      .Lretry_lock               @ retry
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r1, r2  @ save callee saves in case we block
     mov    r1, r9                     @ pass Thread::Current
@@ -505,23 +512,46 @@
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
     cbz    r0, .Lslow_unlock
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
     ldr    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    lsr    r2, r1, 30
+#else
+    ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ Need to use atomic instructions for read barrier
+#endif
+    lsr    r2, r1, #LOCK_WORD_STATE_SHIFT
     cbnz   r2, .Lslow_unlock          @ if either of the top two bits are set, go slow path
     ldr    r2, [r9, #THREAD_ID_OFFSET]
-    eor    r3, r1, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
+    mov    r3, r1                     @ copy lock word to check thread id equality
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    eor    r3, r3, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r3, r3                     @ zero top 16 bits
     cbnz   r3, .Lslow_unlock          @ do lock word and self thread id's match?
-    cmp    r1, #65536
+    mov    r3, r1                     @ copy lock word to detect transition to unlocked
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    cmp    r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
-    @ transition to unlocked, r3 holds 0
+    @ transition to unlocked
+    mov    r3, r1
+    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK  @ r3: zero except for the preserved read barrier bits
     dmb    ish                        @ full (LoadStore|StoreStore) memory barrier
+#ifndef USE_READ_BARRIER
     str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#else
+    strex  r2, r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
+    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+#endif
     bx     lr
-.Lrecursive_thin_unlock:
-    sub    r1, r1, #65536
+.Lrecursive_thin_unlock:  @ r1: original lock word
+    sub    r1, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ decrement count
+#ifndef USE_READ_BARRIER
     str    r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#else
+    strex  r2, r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  @ strex necessary for read barrier bits
+    cbnz   r2, .Lunlock_strex_fail    @ store failed, retry
+#endif
     bx     lr
+.Lunlock_strex_fail:
+    b      .Lretry_unlock             @ retry
 .Lslow_unlock:
     @ save callee saves in case exception allocation triggers GC
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r1, r2
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 770073b5..382a4c2 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1000,25 +1000,33 @@
 .Lretry_lock:
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
     ldxr   w1, [x4]
-    cbnz   w1, .Lnot_unlocked         // already thin locked
+    mov    x3, x1
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    cbnz   w3, .Lnot_unlocked         // already thin locked
+    // unlocked case - x1: original lock word that's zero except for the read barrier bits.
+    orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
     stxr   w3, w2, [x4]
-    cbnz   w3, .Lstrex_fail           // store failed, retry
+    cbnz   w3, .Llock_stxr_fail       // store failed, retry
     dmb    ishld                      // full (LoadLoad|LoadStore) memory barrier
     ret
-.Lstrex_fail:
-    b .Lretry_lock                    // unlikely forward branch, need to reload and recheck r1/r2
-.Lnot_unlocked:
-    lsr    w3, w1, 30
+.Lnot_unlocked:  // x1: original lock word
+    lsr    w3, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w3, .Lslow_lock            // if either of the top two bits are set, go slow path
     eor    w2, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w2, w2                     // zero top 16 bits
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
-    add    w2, w1, #65536             // increment count in lock word placing in w2 for storing
-    lsr    w1, w2, 30                 // if either of the top two bits are set, we overflowed.
-    cbnz   w1, .Lslow_lock            // if we overflow the count go slow path
-    str    w2, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]  // no need for stxr as we hold the lock
+    mov    x3, x1                     // copy the lock word to check count overflow.
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits.
+    add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
+    lsr    w3, w2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  // if either of the upper two bits (28-29) are set, we overflowed.
+    cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
+    add    w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count for real
+    stxr   w3, w2, [x4]
+    cbnz   w3, .Llock_stxr_fail       // store failed, retry
     ret
+.Llock_stxr_fail:
+    b      .Lretry_lock               // retry
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case we block
     mov    x1, xSELF                  // pass Thread::Current
@@ -1036,23 +1044,47 @@
     .extern artUnlockObjectFromCode
 ENTRY art_quick_unlock_object
     cbz    x0, .Lslow_unlock
-    ldr    w1, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
-    lsr    w2, w1, 30
+    add    x4, x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET  // exclusive load/store has no immediate anymore
+.Lretry_unlock:
+#ifndef USE_READ_BARRIER
+    ldr    w1, [x4]
+#else
+    ldxr   w1, [x4]                   // Need to use atomic instructions for read barrier
+#endif
+    lsr    w2, w1, LOCK_WORD_STATE_SHIFT
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
-    eor    w3, w1, w2                 // lock_word.ThreadId() ^ self->ThreadId()
+    mov    x3, x1                     // copy lock word to check thread id equality
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
-    cmp    w1, #65536
+    mov    x3, x1                     // copy lock word to detect transition to unlocked
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
-    // transition to unlocked, w3 holds 0
+    // transition to unlocked
+    mov    x3, x1
+    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK  // w3: zero except for the preserved read barrier bits
     dmb    ish                        // full (LoadStore|StoreStore) memory barrier
-    str    w3, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+#ifndef USE_READ_BARRIER
+    str    w3, [x4]
+#else
+    stxr   w2, w3, [x4]               // Need to use atomic instructions for read barrier
+    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    sub    w1, w1, #65536
-    str    w1, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+.Lrecursive_thin_unlock:  // w1: original lock word
+    sub    w1, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // decrement count
+#ifndef USE_READ_BARRIER
+    str    w1, [x4]
+#else
+    stxr   w2, w1, [x4]               // Need to use atomic instructions for read barrier
+    cbnz   w2, .Lunlock_stxr_fail     // store failed, retry
+#endif
     ret
+.Lunlock_stxr_fail:
+    b      .Lretry_unlock             // retry
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  // save callee saves in case exception allocation triggers GC
     mov    x1, xSELF                  // pass Thread::Current
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index c2acdd1..c437428 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -922,29 +922,39 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
-    test LITERAL(0xC0000000), %ecx        // test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // test the 2 high bits.
     jne  .Lslow_lock                      // slow path if either of the two high bits are set.
-    movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // lock word contains a thin lock
-    // unlocked case - %edx holds thread id with count of 0
+    // unlocked case - edx: original lock word, eax: obj.
     movl %eax, %ecx                       // remember object in case of retry
-    xor  %eax, %eax                       // eax == 0 for comparison with lock word in cmpxchg
-    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)
-    jnz  .Lcmpxchg_fail                   // cmpxchg failed retry
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %fs:THREAD_ID_OFFSET, %edx       // load thread id.
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
     ret
-.Lcmpxchg_fail:
-    movl  %ecx, %eax                      // restore eax
-    jmp  .Lretry_lock
-.Lalready_thin:
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), eax: obj.
+    movl %fs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
-    addl LITERAL(65536), %ecx             // increment recursion count
-    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count for overflow check.
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set.
     jne  .Lslow_lock                      // count overflowed so go slow
-    // update lockword, cmpxchg not necessary as we hold lock
-    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    movl %eax, %ecx                       // save obj to use eax for cmpxchg.
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx  // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx)  // eax: old val, edx: new val.
+    jnz  .Llock_cmpxchg_fail              // cmpxchg failed retry
     ret
+.Llock_cmpxchg_fail:
+    movl  %ecx, %eax                      // restore eax
+    jmp  .Lretry_lock
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  ebx, ebx  // save ref containing registers for GC
     // Outgoing argument set up
@@ -963,20 +973,43 @@
 DEFINE_FUNCTION art_quick_unlock_object
     testl %eax, %eax                      // null check object/eax
     jz   .Lslow_unlock
+.Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax), %ecx  // ecx := lock word
     movl %fs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(0xC0000000), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
-    cmpl LITERAL(65536), %ecx
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
-    movl LITERAL(0), MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    subl LITERAL(65536), %ecx
-    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+.Lrecursive_thin_unlock:  // ecx: original lock word, eax: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %eax, %edx                       // edx: obj
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // ecx: new lock word with decremented count.
+#ifndef USE_READ_BARRIER
+    mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)  // eax: old val, ecx: new val.
+    jnz  .Lunlock_cmpxchg_fail            // cmpxchg failed retry
+#endif
     ret
+.Lunlock_cmpxchg_fail:  // edx: obj
+    movl %edx, %eax                       // restore eax
+    jmp  .Lretry_unlock
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  ebx, ebx  // save ref containing registers for GC
     // Outgoing argument set up
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index c865541..9b6b367 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -955,24 +955,33 @@
     jz   .Lslow_lock
 .Lretry_lock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word.
-    test LITERAL(0xC0000000), %ecx        // Test the 2 high bits.
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // Test the 2 high bits.
     jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
-    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // Lock word contains a thin lock.
-    // unlocked case - %edx holds thread id with count of 0
-    xor  %eax, %eax                       // eax == 0 for comparison with lock word in cmpxchg
+    // unlocked case - edx: original lock word, edi: obj.
+    movl %edx, %eax                       // eax: lock word zero except for read barrier bits.
+    movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
+    or   %eax, %edx                       // edx: thread id with count of 0 + read barrier bits.
     lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
     jnz  .Lretry_lock                     // cmpxchg failed retry
     ret
-.Lalready_thin:
+.Lalready_thin:  // edx: lock word (with high 2 bits zero and original rb bits), edi: obj.
+    movl %gs:THREAD_ID_OFFSET, %ecx       // ecx := thread id
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
-    addl LITERAL(65536), %ecx             // increment recursion count
-    test LITERAL(0xC0000000), %ecx        // overflowed if either of top two bits are set
+    movl %edx, %ecx                       // copy the lock word to check count overflow.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set
     jne  .Lslow_lock                      // count overflowed so go slow
-    // update lockword, cmpxchg not necessary as we hold lock
-    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+    movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
+    addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx   // increment recursion count again for real.
+    // update lockword, cmpxchg necessary for read barrier bits.
+    lock cmpxchg  %edx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, edx: new val.
+    jnz  .Lretry_lock                     // cmpxchg failed retry
     ret
 .Lslow_lock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME
@@ -985,19 +994,37 @@
 DEFINE_FUNCTION art_quick_unlock_object
     testl %edi, %edi                      // null check object/edi
     jz   .Lslow_unlock
+.Lretry_unlock:
     movl MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi), %ecx  // ecx := lock word
     movl %gs:THREAD_ID_OFFSET, %edx       // edx := thread id
-    test LITERAL(0xC0000000), %ecx
+    test LITERAL(LOCK_WORD_STATE_MASK), %ecx
     jnz  .Lslow_unlock                    // lock word contains a monitor
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
-    cmpl LITERAL(65536), %ecx
+    movl %ecx, %edx                       // copy the lock word to detect new count of 0.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
-    movl LITERAL(0), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+#ifndef USE_READ_BARRIER
+    movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
     ret
-.Lrecursive_thin_unlock:
-    subl LITERAL(65536), %ecx
+.Lrecursive_thin_unlock:  // ecx: original lock word, edi: obj
+    // update lockword, cmpxchg necessary for read barrier bits.
+    movl %ecx, %eax                       // eax: old lock word.
+    subl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx
+#ifndef USE_READ_BARRIER
     mov  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
+#else
+    lock cmpxchg  %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)  // eax: old val, ecx: new val.
+    jnz  .Lretry_unlock                   // cmpxchg failed retry
+#endif
     ret
 .Lslow_unlock:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME