Change one read barrier bit to mark bit
Optimization to improve read barrier slow path performance. When the
GC marks an object through the read barrier slow path, it also sets
the mark bit in the lock word of that reference. The assembly
entrypoints check this bit first; in the common case it is already
set, so the read barrier knows the object is already marked and there
is no work to do.
To prevent dirty pages in the zygote and image spaces, the bit is
pre-set by the image writer and during zygote space creation.
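For illustration, the fast path that the assembly entrypoints below
implement corresponds roughly to the C++ sketch that follows. The
names here (kMarkBitShift, Object::lock_word, MarkSlowPath) are
placeholders for this sketch only, not ART's actual declarations, and
the bit position is an assumption:

  #include <cstdint>

  // Assumed bit position of the mark bit inside the 32-bit lock word
  // (illustrative; the real layout is defined by the runtime).
  static constexpr uint32_t kMarkBitShift = 29;
  static constexpr uint32_t kMarkBitMaskShifted = 1u << kMarkBitShift;

  struct Object {
    uint32_t lock_word;  // thin lock / state / gc bits, including the mark bit
  };

  // Stub standing in for the real out-of-line marking routine: mark the
  // object by setting the bit, then return the (possibly moved) reference.
  Object* MarkSlowPath(Object* ref) {
    ref->lock_word |= kMarkBitMaskShifted;
    return ref;
  }

  // Read barrier mark fast path: return immediately when the reference is
  // null or its mark bit is already set; otherwise fall back to the slow
  // path that actually marks the object.
  inline Object* ReadBarrierMark(Object* ref) {
    if (ref == nullptr) {
      return ref;                // null: nothing to mark
    }
    if ((ref->lock_word & kMarkBitMaskShifted) != 0) {
      return ref;                // already marked this GC cycle, no work to do
    }
    return MarkSlowPath(ref);
  }

The entrypoints below are hand-written assembly equivalents of this
check, plus the register save/restore needed to call the slow path.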
EAAC score (lower is better):
N9: 777 -> 700 (average of 31 runs)
N6P (960000 mhz): 1737.48 -> 1442.31 (average of 25 runs)
Bug: 30162165
Bug: 12687968
Test: N9, N6P booting, test-art-host, test-art-target all with CC
Change-Id: Iae0cacfae221e33151d3c0ab65338d1c822ab63d
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 34d3158..8828dce 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -520,7 +520,7 @@
ldr r2, [r9, #THREAD_ID_OFFSET]
ldrex r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
mov r3, r1
- and r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED @ zero the read barrier bits
+ and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
cbnz r3, .Lnot_unlocked @ already thin locked
@ unlocked case - r1: original lock word that's zero except for the read barrier bits.
orr r2, r1, r2 @ r2 holds thread id with count of 0 with preserved read barrier bits
@@ -536,9 +536,9 @@
cbnz r2, .Lslow_lock @ lock word and self thread id's match -> recursive lock
@ else contention, go to slow path
mov r3, r1 @ copy the lock word to check count overflow.
- and r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED @ zero the read barrier bits.
+ and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits.
add r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ increment count in lock word placing in r2 to check overflow
- lsr r3, r2, LOCK_WORD_READ_BARRIER_STATE_SHIFT @ if either of the upper two bits (28-29) are set, we overflowed.
+ lsr r3, r2, #LOCK_WORD_GC_STATE_SHIFT @ if the first gc state bit is set, we overflowed.
cbnz r3, .Lslow_lock @ if we overflow the count go slow path
add r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE @ increment count for real
strex r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
@@ -581,17 +581,17 @@
cbnz r2, .Lslow_unlock @ if either of the top two bits are set, go slow path
ldr r2, [r9, #THREAD_ID_OFFSET]
mov r3, r1 @ copy lock word to check thread id equality
- and r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED @ zero the read barrier bits
+ and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
eor r3, r3, r2 @ lock_word.ThreadId() ^ self->ThreadId()
uxth r3, r3 @ zero top 16 bits
cbnz r3, .Lslow_unlock @ do lock word and self thread id's match?
mov r3, r1 @ copy lock word to detect transition to unlocked
- and r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED @ zero the read barrier bits
+ and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED @ zero the gc bits
cmp r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
bpl .Lrecursive_thin_unlock
@ transition to unlocked
mov r3, r1
- and r3, #LOCK_WORD_READ_BARRIER_STATE_MASK @ r3: zero except for the preserved read barrier bits
+ and r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED @ r3: zero except for the preserved gc bits
dmb ish @ full (LoadStore|StoreStore) memory barrier
#ifndef USE_READ_BARRIER
str r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
@@ -1772,6 +1772,20 @@
*/
.macro READ_BARRIER_MARK_REG name, reg
ENTRY \name
+ // Null check so that we can load the lock word.
+ cmp \reg, #0
+ beq .Lret_rb_\name
+ // Check lock word for mark bit, if marked return.
+ push {r0}
+ ldr r0, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ and r0, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+ cbz r0, .Lslow_rb_\name
+ // Restore r0 and return.
+ pop {r0}
+ bx lr
+
+.Lslow_rb_\name:
+ pop {r0}
push {r0-r4, r9, r12, lr} @ save return address and core caller-save registers
.cfi_adjust_cfa_offset 32
.cfi_rel_offset r0, 0
@@ -1831,6 +1845,8 @@
.endif
.endif
pop {r0-r4, r9, r12, pc} @ restore caller-save registers and return
+.Lret_rb_\name:
+ bx lr
END \name
.endm
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index a5be52d..51094d5 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1090,7 +1090,7 @@
ldr w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
ldxr w1, [x4]
mov x3, x1
- and w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED // zero the read barrier bits
+ and w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
cbnz w3, .Lnot_unlocked // already thin locked
// unlocked case - x1: original lock word that's zero except for the read barrier bits.
orr x2, x1, x2 // x2 holds thread id with count of 0 with preserved read barrier bits
@@ -1106,9 +1106,9 @@
cbnz w2, .Lslow_lock // lock word and self thread id's match -> recursive lock
// else contention, go to slow path
mov x3, x1 // copy the lock word to check count overflow.
- and w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED // zero the read barrier bits.
+ and w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits.
add w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count in lock word placing in w2 to check overflow
- lsr w3, w2, LOCK_WORD_READ_BARRIER_STATE_SHIFT // if either of the upper two bits (28-29) are set, we overflowed.
+ lsr w3, w2, #LOCK_WORD_GC_STATE_SHIFT // if the first gc state bit is set, we overflowed.
cbnz w3, .Lslow_lock // if we overflow the count go slow path
add w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE // increment count for real
stxr w3, w2, [x4]
@@ -1152,17 +1152,17 @@
cbnz w2, .Lslow_unlock // if either of the top two bits are set, go slow path
ldr w2, [xSELF, #THREAD_ID_OFFSET]
mov x3, x1 // copy lock word to check thread id equality
- and w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED // zero the read barrier bits
+ and w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
eor w3, w3, w2 // lock_word.ThreadId() ^ self->ThreadId()
uxth w3, w3 // zero top 16 bits
cbnz w3, .Lslow_unlock // do lock word and self thread id's match?
mov x3, x1 // copy lock word to detect transition to unlocked
- and w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED // zero the read barrier bits
+ and w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED // zero the gc bits
cmp w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
bpl .Lrecursive_thin_unlock
// transition to unlocked
mov x3, x1
- and w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK // w3: zero except for the preserved read barrier bits
+ and w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED // w3: zero except for the preserved gc bits
dmb ish // full (LoadStore|StoreStore) memory barrier
#ifndef USE_READ_BARRIER
str w3, [x4]
@@ -1791,12 +1791,20 @@
ldr x2, [x1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_64] // Load dex cache resolved types array
// Load the class (x2)
ldr w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- // Read barrier for class load.
+
+ // Most common case: GC is not marking.
ldr w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
- cbnz x3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+ cbnz x3, .Lart_quick_alloc_object_region_tlab_marking
+.Lart_quick_alloc_object_region_tlab_do_allocation:
ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+.Lart_quick_alloc_object_region_tlab_marking:
+ // GC is marking, check the lock word of the class for the mark bit.
+ // If the class is null, go slow path. The check is required to read the lock word.
+ cbz w2, .Lart_quick_alloc_object_region_tlab_slow_path
+ // Class is not null, check mark bit in lock word.
+ ldr w3, [x2, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ // If the bit is not zero, do the allocation.
+ tbnz w3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_alloc_object_region_tlab_do_allocation
// The read barrier slow path. Mark
// the class.
stp x0, x1, [sp, #-32]! // Save registers (x0, x1, lr).
@@ -1807,7 +1815,7 @@
ldp x0, x1, [sp, #0] // Restore registers.
ldr xLR, [sp, #16]
add sp, sp, #32
- b .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ b .Lart_quick_alloc_object_region_tlab_do_allocation
.Lart_quick_alloc_object_region_tlab_slow_path:
SETUP_REFS_ONLY_CALLEE_SAVE_FRAME // Save callee saves in case of GC.
mov x2, xSELF // Pass Thread::Current.
@@ -2265,6 +2273,8 @@
*/
.macro READ_BARRIER_MARK_REG name, wreg, xreg
ENTRY \name
+ // Reference is null, no work to do at all.
+ cbz \wreg, .Lret_rb_\name
/*
* Allocate 46 stack slots * 8 = 368 bytes:
* - 20 slots for core registers X0-X19
@@ -2272,6 +2282,11 @@
* - 1 slot for return address register XLR
* - 1 padding slot for 16-byte stack alignment
*/
+ // Use wIP0 as temp and check the mark bit of the reference. wIP0 is not used by the compiler.
+ ldr wIP0, [\xreg, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+ tbz wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_path_rb_\name
+ ret
+.Lslow_path_rb_\name:
// Save all potentially live caller-save core registers.
stp x0, x1, [sp, #-368]!
.cfi_adjust_cfa_offset 368
@@ -2360,6 +2375,7 @@
.cfi_restore x30
add sp, sp, #368
.cfi_adjust_cfa_offset -368
+.Lret_rb_\name:
ret
END \name
.endm
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 77e04e7..6c9239f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1028,7 +1028,13 @@
movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
// Read barrier for class load.
cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
- jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ // Null check so that we can load the lock word.
+ testl %edx, %edx
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ // Check the mark bit; if it is not set, take the read barrier slow path to mark the class.
+ testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1065,7 +1071,7 @@
test LITERAL(LOCK_WORD_STATE_MASK), %ecx // test the 2 high bits.
jne .Lslow_lock // slow path if either of the two high bits are set.
movl %ecx, %edx // save lock word (edx) to keep read barrier bits.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
test %ecx, %ecx
jnz .Lalready_thin // lock word contains a thin lock
// unlocked case - edx: original lock word, eax: obj.
@@ -1081,9 +1087,9 @@
cmpw %cx, %dx // do we hold the lock already?
jne .Lslow_lock
movl %edx, %ecx // copy the lock word to check count overflow.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx // increment recursion count for overflow check.
- test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx // overflowed if either of the upper two bits (28-29) are set.
+ test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx // overflowed if the first gc state bit is set.
jne .Lslow_lock // count overflowed so go slow
movl %eax, %ecx // save obj to use eax for cmpxchg.
movl %edx, %eax // copy the lock word as the old val for cmpxchg.
@@ -1137,13 +1143,13 @@
cmpw %cx, %dx // does the thread id match?
jne .Lslow_unlock
movl %ecx, %edx // copy the lock word to detect new count of 0.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx // zero the gc bits.
cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
jae .Lrecursive_thin_unlock
// update lockword, cmpxchg necessary for read barrier bits.
movl %eax, %edx // edx: obj
movl %ecx, %eax // eax: old lock word.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx // ecx: new lock word zero except original rb bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx // ecx: new lock word zero except original gc bits.
#ifndef USE_READ_BARRIER
movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
#else
@@ -1923,6 +1929,14 @@
// convention (e.g. standard callee-save registers are preserved).
MACRO2(READ_BARRIER_MARK_REG, name, reg)
DEFINE_FUNCTION VAR(name)
+ // Null check so that we can load the lock word.
+ test REG_VAR(reg), REG_VAR(reg)
+ jz .Lret_rb_\name
+ // Check the mark bit, if it is 1 return.
+ testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+ jz .Lslow_rb_\name
+ ret
+.Lslow_rb_\name:
// Save all potentially live caller-save core registers.
PUSH eax
PUSH ecx
@@ -1970,6 +1984,7 @@
POP_REG_NE edx, RAW_VAR(reg)
POP_REG_NE ecx, RAW_VAR(reg)
POP_REG_NE eax, RAW_VAR(reg)
+.Lret_rb_\name:
ret
END_FUNCTION VAR(name)
END_MACRO
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 784ec39..127d7db 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -989,7 +989,13 @@
// Load the class
movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
- jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ // Null check so that we can load the lock word.
+ testl %edx, %edx
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+ // Check the mark bit; if it is not set, take the read barrier slow path to mark the class.
+ testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+ jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1022,7 +1028,7 @@
test LITERAL(LOCK_WORD_STATE_MASK), %ecx // Test the 2 high bits.
jne .Lslow_lock // Slow path if either of the two high bits are set.
movl %ecx, %edx // save lock word (edx) to keep read barrier bits.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
test %ecx, %ecx
jnz .Lalready_thin // Lock word contains a thin lock.
// unlocked case - edx: original lock word, edi: obj.
@@ -1037,9 +1043,9 @@
cmpw %cx, %dx // do we hold the lock already?
jne .Lslow_lock
movl %edx, %ecx // copy the lock word to check count overflow.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx // zero the gc bits.
addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx // increment recursion count
- test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx // overflowed if either of the upper two bits (28-29) are set
+ test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx // overflowed if the first gc state bit is set
jne .Lslow_lock // count overflowed so go slow
movl %edx, %eax // copy the lock word as the old val for cmpxchg.
addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx // increment recursion count again for real.
@@ -1074,12 +1080,12 @@
cmpw %cx, %dx // does the thread id match?
jne .Lslow_unlock
movl %ecx, %edx // copy the lock word to detect new count of 0.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx // zero the read barrier bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx // zero the gc bits.
cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
jae .Lrecursive_thin_unlock
// update lockword, cmpxchg necessary for read barrier bits.
movl %ecx, %eax // eax: old lock word.
- andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx // ecx: new lock word zero except original rb bits.
+ andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx // ecx: new lock word zero except original gc bits.
#ifndef USE_READ_BARRIER
movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
#else
@@ -1833,6 +1839,14 @@
// convention (e.g. standard callee-save registers are preserved).
MACRO2(READ_BARRIER_MARK_REG, name, reg)
DEFINE_FUNCTION VAR(name)
+ // Null check so that we can load the lock word.
+ testq REG_VAR(reg), REG_VAR(reg)
+ jz .Lret_rb_\name
+ // Check the mark bit, if it is 1 return.
+ testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+ jz .Lslow_rb_\name
+ ret
+.Lslow_rb_\name:
// Save all potentially live caller-save core registers.
PUSH rax
PUSH rcx
@@ -1897,6 +1911,7 @@
POP_REG_NE rdx, RAW_VAR(reg)
POP_REG_NE rcx, RAW_VAR(reg)
POP_REG_NE rax, RAW_VAR(reg)
+.Lret_rb_\name:
ret
END_FUNCTION VAR(name)
END_MACRO