Make object allocation entrypoints only take a class.
Change motivated by:
- Dex cache compression: having the allocation fast path do a
dex cache lookup will be too expensive. So instead, rely on the
compiler having direct access to the class (either through BSS for
AOT, or JIT tables for JIT).
- Inlining: the entrypoints relied on the caller of the allocation to
have the same dex cache as the outer method (stored at the bottom of
the stack). This meant we could not inline methods from a different
dex file that do allocations. By avoiding the dex cache lookup in
the entrypoint, we can now remove this restriction.
Code expansion on average for Docs/Gms/FB/Framework (go/lem numbers):
- Around 0.8% on arm64
- Around 1% for x64, arm
- Around 1.5% on x86
Test: test-art-host, test-art-target, ART_USE_READ_BARRIER=true/false
Test: test-art-host, test-art-target, ART_DEFAULT_GC_TYPE=SS ART_USE_TLAB=true
Change-Id: I41f3748bb4d251996aaf6a90fae4c50176f9295f
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index a71ab4b..4d4ebdc 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1124,28 +1124,23 @@
// Generate the allocation entrypoints for each allocator.
GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
-ENTRY art_quick_alloc_object_rosalloc
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_rosalloc, RosAlloc).
+ENTRY art_quick_alloc_object_resolved_rosalloc
// Fast path rosalloc allocation.
- // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
- // r2, r3, r12: free.
- ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
- // Load the class (r2)
- ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- cbz r2, .Lart_quick_alloc_object_rosalloc_slow_path // Check null class
-
+ // r0: type/return value, r9: Thread::Current
+ // r1, r2, r3, r12: free.
ldr r3, [r9, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET] // Check if the thread local
// allocation stack has room.
// TODO: consider using ldrd.
ldr r12, [r9, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
cmp r3, r12
- bhs .Lart_quick_alloc_object_rosalloc_slow_path
+ bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path
- ldr r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3)
+ ldr r3, [r0, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET] // Load the object size (r3)
cmp r3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE // Check if the size is for a thread
// local allocation. Also does the
// initialized and finalizable checks.
- bhs .Lart_quick_alloc_object_rosalloc_slow_path
+ bhs .Lart_quick_alloc_object_resolved_rosalloc_slow_path
// Compute the rosalloc bracket index
// from the size. Since the size is
// already aligned we can combine the
@@ -1159,7 +1154,7 @@
// Load the free list head (r3). This
// will be the return val.
ldr r3, [r12, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
- cbz r3, .Lart_quick_alloc_object_rosalloc_slow_path
+ cbz r3, .Lart_quick_alloc_object_resolved_rosalloc_slow_path
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
ldr r1, [r3, #ROSALLOC_SLOT_NEXT_OFFSET] // Load the next pointer of the head
// and update the list head with the
@@ -1172,8 +1167,8 @@
#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
#error "Class pointer needs to overwrite next pointer."
#endif
- POISON_HEAP_REF r2
- str r2, [r3, #MIRROR_OBJECT_CLASS_OFFSET]
+ POISON_HEAP_REF r0
+ str r0, [r3, #MIRROR_OBJECT_CLASS_OFFSET]
// Fence. This is "ish" not "ishst" so
// that it also ensures ordering of
// the class status load with respect
@@ -1204,20 +1199,20 @@
mov r0, r3 // Set the return value and return.
bx lr
-.Lart_quick_alloc_object_rosalloc_slow_path:
+.Lart_quick_alloc_object_resolved_rosalloc_slow_path:
SETUP_SAVE_REFS_ONLY_FRAME r2 @ save callee saves in case of GC
- mov r2, r9 @ pass Thread::Current
- bl artAllocObjectFromCodeRosAlloc @ (uint32_t type_idx, Method* method, Thread*)
+ mov r1, r9 @ pass Thread::Current
+ bl artAllocObjectFromCodeResolvedRosAlloc @ (mirror::Class* cls, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_rosalloc
+END art_quick_alloc_object_resolved_rosalloc
-// The common fast path code for art_quick_alloc_object_tlab and art_quick_alloc_object_region_tlab.
+// The common fast path code for art_quick_alloc_object_resolved_tlab
+// and art_quick_alloc_object_resolved_region_tlab.
//
-// r0: type_idx/return value, r1: ArtMethod*, r2: class, r9: Thread::Current, r3, r12: free.
-// Need to preserve r0 and r1 to the slow path.
-.macro ALLOC_OBJECT_TLAB_FAST_PATH slowPathLabel
- cbz r2, \slowPathLabel // Check null class
+// r0: type r9: Thread::Current, r1, r2, r3, r12: free.
+// Need to preserve r0 to the slow path.
+.macro ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH slowPathLabel
// Load thread_local_pos (r12) and
// thread_local_end (r3) with ldrd.
// Check constraints for ldrd.
@@ -1232,14 +1227,14 @@
// "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
// Reload old thread_local_pos (r0)
// for the return value.
- ldr r0, [r9, #THREAD_LOCAL_POS_OFFSET]
- add r1, r0, r3
+ ldr r2, [r9, #THREAD_LOCAL_POS_OFFSET]
+ add r1, r2, r3
str r1, [r9, #THREAD_LOCAL_POS_OFFSET] // Store new thread_local_pos.
ldr r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET] // Increment thread_local_objects.
add r1, r1, #1
str r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
- POISON_HEAP_REF r2
- str r2, [r0, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
+ POISON_HEAP_REF r0
+ str r0, [r2, #MIRROR_OBJECT_CLASS_OFFSET] // Store the class pointer.
// Fence. This is "ish" not "ishst" so
// that the code after this allocation
// site will see the right values in
@@ -1247,71 +1242,46 @@
// Alternatively we could use "ishst"
// if we use load-acquire for the
// object size load.)
+ mov r0, r2
dmb ish
bx lr
.endm
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
-ENTRY art_quick_alloc_object_tlab
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_RESOLVED_OBJECT(_tlab, TLAB).
+ENTRY art_quick_alloc_object_resolved_tlab
// Fast path tlab allocation.
- // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
- // r2, r3, r12: free.
+ // r0: type, r9: Thread::Current
+ // r1, r2, r3, r12: free.
#if defined(USE_READ_BARRIER)
mvn r0, #0 // Read barrier not supported here.
bx lr // Return -1.
#endif
- ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
- // Load the class (r2)
- ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_tlab_slow_path
-.Lart_quick_alloc_object_tlab_slow_path:
+ ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path
+.Lart_quick_alloc_object_resolved_tlab_slow_path:
SETUP_SAVE_REFS_ONLY_FRAME r2 // Save callee saves in case of GC.
- mov r2, r9 // Pass Thread::Current.
- bl artAllocObjectFromCodeTLAB // (uint32_t type_idx, Method* method, Thread*)
+ mov r1, r9 // Pass Thread::Current.
+ bl artAllocObjectFromCodeResolvedTLAB // (mirror::Class* klass, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_tlab
+END art_quick_alloc_object_resolved_tlab
-// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB)
-ENTRY art_quick_alloc_object_region_tlab
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB)
+ENTRY art_quick_alloc_object_resolved_region_tlab
// Fast path tlab allocation.
- // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current, r2, r3, r12: free.
+ // r0: type, r9: Thread::Current, r1, r2, r3, r12: free.
#if !defined(USE_READ_BARRIER)
eor r0, r0, r0 // Read barrier must be enabled here.
sub r0, r0, #1 // Return -1.
bx lr
#endif
- ldr r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32] // Load dex cache resolved types array
- // Load the class (r2)
- ldr r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
- // Read barrier for class load.
- ldr r3, [r9, #THREAD_IS_GC_MARKING_OFFSET]
- cbnz r3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
- ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking:
- cbz r2, .Lart_quick_alloc_object_region_tlab_slow_path // Null check for loading lock word.
- // Check lock word for mark bit, if marked do the allocation.
- ldr r3, [r2, MIRROR_OBJECT_LOCK_WORD_OFFSET]
- ands r3, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
- bne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
- // The read barrier slow path. Mark
- // the class.
- push {r0, r1, r3, lr} // Save registers. r3 is pushed only
- // to align sp by 16 bytes.
- mov r0, r2 // Pass the class as the first param.
- bl artReadBarrierMark
- mov r2, r0 // Get the (marked) class back.
- pop {r0, r1, r3, lr}
- b .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
-.Lart_quick_alloc_object_region_tlab_slow_path:
+ ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path
+.Lart_quick_alloc_object_resolved_region_tlab_slow_path:
SETUP_SAVE_REFS_ONLY_FRAME r2 // Save callee saves in case of GC.
- mov r2, r9 // Pass Thread::Current.
- bl artAllocObjectFromCodeRegionTLAB // (uint32_t type_idx, Method* method, Thread*)
+ mov r1, r9 // Pass Thread::Current.
+ bl artAllocObjectFromCodeResolvedRegionTLAB // (mirror::Class* klass, Thread*)
RESTORE_SAVE_REFS_ONLY_FRAME
RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
-END art_quick_alloc_object_region_tlab
+END art_quick_alloc_object_resolved_region_tlab
/*
* Called by managed code when the value in rSUSPEND has been decremented to 0.