X86_64: Add allocation entrypoint switching for CC is_marking
Only X86_64 is implemented so far. Use the normal TLAB allocators when the
GC is not marking.
Allocation speed goes up by ~8% based on perf sampling.
Without change:
1.19%: art_quick_alloc_object_region_tlab
With change:
0.63%: art_quick_alloc_object_tlab
0.47%: art_quick_alloc_object_region_tlab
Bug: 31018974
Bug: 12687968
Test: test-art-host-run-test
Change-Id: I4c4d9eb229d4ad2f41b856ba5c2958a5eb3b7ffa
diff --git a/runtime/thread.cc b/runtime/thread.cc
index b99df26..debd13a 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -122,21 +122,27 @@
CHECK(kUseReadBarrier);
tls32_.is_gc_marking = is_marking;
UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+ if (kRuntimeISA == kX86_64) {
+ // Entrypoint switching is only implemented for X86_64.
+ ResetQuickAllocEntryPointsForThread(is_marking);
+ }
}
void Thread::InitTlsEntryPoints() {
// Insert a placeholder so we can easily tell if we call an unimplemented entry point.
uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.jni_entrypoints);
- uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) +
- sizeof(tlsPtr_.quick_entrypoints));
+ uintptr_t* end = reinterpret_cast<uintptr_t*>(
+ reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints));
for (uintptr_t* it = begin; it != end; ++it) {
*it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
}
InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints);
}
-void Thread::ResetQuickAllocEntryPointsForThread() {
- ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
+void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) {
+ // Entrypoint switching is currently only faster for X86_64 since other archs don't have TLAB
+ // fast path for non region space entrypoints.
+ ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking);
}
class DeoptimizationContextRecord {