Faster allocation fast path
Added a new object size field to mirror::Class. This field contains the
aligned object size if the class is initialized and its objects are not
finalizable. If the class is finalizable or not yet initialized, the
field is set to a large sentinel value that forces the ASM allocators
to take the slow path.
Only implemented for the region/normal TLAB allocators for now;
support will be added to the RosAlloc stubs soon.
CC N6P MemAllocTest: 1067 -> 1039 (25 samples)
CC N6P EAAC: 1281 -> 1260 (25 samples)
RAM overhead is technically 0 since mirror::Class was not 8 byte
aligned previously. Because the allocators require 8 byte alignment,
there would have been one 32-bit word of padding at the end of the
class, which the new field now occupies. If there actually were 4 extra
bytes per class, the system-wide overhead would be 36000 * 4 = 144000
bytes (~144KB) based on old N6P numbers for the number of loaded
classes after boot.
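
Illustrative arithmetic only (the 1084-byte class size below is a
made-up example, not the real sizeof(mirror::Class)) showing why the
new 4-byte field is free under 8-byte alignment:

    #include <cstdio>

    int main() {
      const unsigned kAlign = 8;
      unsigned class_size = 1084;  // Hypothetical class size, 4 mod 8.
      unsigned padded = (class_size + kAlign - 1) & ~(kAlign - 1u);
      // 4 bytes of tail padding already existed; the new field reuses it.
      printf("existing padding: %u bytes\n", padded - class_size);  // 4
      // Worst case if the field really cost 4 bytes per loaded class:
      printf("worst case: %u bytes\n", 36000u * 4u);  // 144000 (~144KB)
      return 0;
    }
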
Bug: 9986565
Test: test-art-host CC baker, N6P phone boot and EAAC runs.
Change-Id: I119a87b8cc6c980bff980a0c62f42610dab5e531
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 3f87a14..76e503c 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2027,48 +2027,24 @@
ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED \slowPathLabel
.endm
+// TODO: delete ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since it is the same as
+// ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED.
.macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel
- ldr w3, [x2, #MIRROR_CLASS_STATUS_OFFSET] // Check class status.
- cmp x3, #MIRROR_CLASS_STATUS_INITIALIZED
- bne \slowPathLabel
- // Add a fake dependence from the
- // following access flag and size
- // loads to the status load.
- // This is to prevent those loads
- // from being reordered above the
- // status load and reading wrong
- // values (an alternative is to use
- // a load-acquire for the status).
- eor x3, x3, x3
- add x2, x2, x3
ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED \slowPathLabel
.endm
.macro ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED slowPathLabel
- // Check access flags has
- // kAccClassIsFinalizable.
- ldr w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
- tbnz x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE_BIT, \slowPathLabel
- // Load thread_local_pos (x4) and
- // thread_local_end (x5).
ldr x4, [xSELF, #THREAD_LOCAL_POS_OFFSET]
ldr x5, [xSELF, #THREAD_LOCAL_END_OFFSET]
- sub x6, x5, x4 // Compute the remaining buf size.
- ldr w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET] // Load the object size (x7).
- cmp x7, x6 // Check if it fits. OK to do this
- // before rounding up the object size
- // assuming the buf size alignment.
+ ldr w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (x7).
+ add x6, x4, x7 // Add object size to tlab pos.
+ cmp x6, x5 // Check if it fits. No wraparound:
+ // x7 is a zero-extended 32-bit size
+ // added to a 64-bit pos, so a
+ // sentinel size simply exceeds end.
bhi \slowPathLabel
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1.
- // Round up the object size by the
- // object alignment. (addr + 7) & ~7.
- add x7, x7, #OBJECT_ALIGNMENT_MASK
- and x7, x7, #OBJECT_ALIGNMENT_MASK_TOGGLED
- // Move old thread_local_pos to x0
- // for the return value.
mov x0, x4
- add x5, x0, x7
- str x5, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
+ str x6, [xSELF, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
ldr x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
add x5, x5, #1
str x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
@@ -2080,7 +2056,7 @@
// the fields of the class.
// Alternatively we could use "ishst"
// if we use load-acquire for the
- // class status load.)
+ // object size load.)
dmb ish
ret
.endm