X86_64: Add allocation entrypoint switching for CC is_marking

Only X86_64 is done so far. Use the normal TLAB allocators when the GC is
not marking.
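
A minimal sketch of the switching idea follows. The pAllocObject slot name
and the entrypoint signatures are illustrative assumptions, not the real
QuickEntryPoints layout; only the two entrypoint names are taken from the
perf output below.

  #include <cstdint>

  // Assembly fast paths; names match the symbols in the perf samples below.
  extern "C" void* art_quick_alloc_object_tlab(uint32_t type_idx);
  extern "C" void* art_quick_alloc_object_region_tlab(uint32_t type_idx);

  // Illustrative stand-in for the per-thread quick entrypoint table.
  struct QuickEntryPoints {
    void* (*pAllocObject)(uint32_t);  // assumed slot name, for illustration
  };

  void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) {
    // While the concurrent copying GC is marking, allocations must take the
    // region TLAB path so new objects get read barrier treatment; when it is
    // not marking, the plain TLAB fast path suffices and is cheaper.
    qpoints->pAllocObject = is_marking ? art_quick_alloc_object_region_tlab
                                       : art_quick_alloc_object_tlab;
  }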

Allocation speed goes up by ~8% based on perf sampling (time spent in the
allocation entrypoints drops from 1.19% to 1.10% of samples):

Without change:
1.19%: art_quick_alloc_object_region_tlab

With change:
0.63%: art_quick_alloc_object_tlab
0.47%: art_quick_alloc_object_region_tlab

Bug: 31018974
Bug: 12687968

Test: test-art-host-run-test

Change-Id: I4c4d9eb229d4ad2f41b856ba5c2958a5eb3b7ffa
diff --git a/runtime/thread.cc b/runtime/thread.cc
index b99df26..debd13a 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -122,21 +122,27 @@
   CHECK(kUseReadBarrier);
   tls32_.is_gc_marking = is_marking;
   UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking);
+  if (kRuntimeISA == kX86_64) {
+    // Entrypoint switching is only implemented for X86_64.
+    ResetQuickAllocEntryPointsForThread(is_marking);
+  }
 }
 
 void Thread::InitTlsEntryPoints() {
   // Insert a placeholder so we can easily tell if we call an unimplemented entry point.
   uintptr_t* begin = reinterpret_cast<uintptr_t*>(&tlsPtr_.jni_entrypoints);
-  uintptr_t* end = reinterpret_cast<uintptr_t*>(reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) +
-      sizeof(tlsPtr_.quick_entrypoints));
+  uintptr_t* end = reinterpret_cast<uintptr_t*>(
+      reinterpret_cast<uint8_t*>(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints));
   for (uintptr_t* it = begin; it != end; ++it) {
     *it = reinterpret_cast<uintptr_t>(UnimplementedEntryPoint);
   }
   InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints);
 }
 
-void Thread::ResetQuickAllocEntryPointsForThread() {
-  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints);
+void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) {
+  // Entrypoint switching is currently only faster for X86_64 since other archs don't have a
+  // TLAB fast path for the non-region-space entrypoints.
+  ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking);
 }
 
 class DeoptimizationContextRecord {