X86_64: Add allocation entrypoint switching for CC is_marking

Only X86_64 is done so far. Use the normal TLAB allocators when the GC
is not marking.
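
For illustration only (the names below are hypothetical, not ART's
actual entrypoint-switching API), a minimal sketch of the idea:

  // Hypothetical sketch: repoint the allocation entrypoint when the
  // concurrent copying GC starts or stops marking, so the allocation
  // fast path does not have to test an is_marking flag on every call.
  #include <atomic>

  enum class AllocEntrypoint { kTlab, kRegionTlab };

  std::atomic<AllocEntrypoint> current_entrypoint{AllocEntrypoint::kTlab};

  void OnGcMarkingChanged(bool is_marking) {
    // Marking needs the read-barrier-aware region TLAB path; otherwise
    // the plain TLAB fast path is enough.
    current_entrypoint.store(is_marking ? AllocEntrypoint::kRegionTlab
                                        : AllocEntrypoint::kTlab,
                             std::memory_order_release);
  }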

Allocation speed goes up by ~8% based on perf sampling.

Without change:
1.19%: art_quick_alloc_object_region_tlab

With change:
0.63%: art_quick_alloc_object_tlab
0.47%: art_quick_alloc_object_region_tlab
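
For reference, 0.63% + 0.47% = 1.10% of samples with the change versus
1.19% without, i.e. roughly 8% less sampled time in the allocation
entrypoints.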

Bug: 31018974
Bug: 12687968

Test: test-art-host-run-test

Change-Id: I4c4d9eb229d4ad2f41b856ba5c2958a5eb3b7ffa
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 97129e8..54f2210 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -247,7 +247,7 @@
   if (allocator_type != kAllocatorTypeTLAB &&
       allocator_type != kAllocatorTypeRegionTLAB &&
       allocator_type != kAllocatorTypeRosAlloc &&
-      UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
+      UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size, kGrow))) {
     return nullptr;
   }
   mirror::Object* ret;
@@ -267,8 +267,9 @@
       if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
         // If running on valgrind or asan, we should be using the instrumented path.
         size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size);
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
-                                                      max_bytes_tl_bulk_allocated))) {
+        if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+                                               max_bytes_tl_bulk_allocated,
+                                               kGrow))) {
           return nullptr;
         }
         ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
@@ -277,14 +278,18 @@
         DCHECK(!is_running_on_memory_tool_);
         size_t max_bytes_tl_bulk_allocated =
             rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size);
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type,
-                                                      max_bytes_tl_bulk_allocated))) {
+        if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type,
+                                               max_bytes_tl_bulk_allocated,
+                                               kGrow))) {
           return nullptr;
         }
         if (!kInstrumented) {
           DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size));
         }
-        ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+        ret = rosalloc_space_->AllocNonvirtual(self,
+                                               alloc_size,
+                                               bytes_allocated,
+                                               usable_size,
                                                bytes_tl_bulk_allocated);
       }
       break;
@@ -292,22 +297,34 @@
     case kAllocatorTypeDlMalloc: {
       if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) {
         // If running on valgrind, we should be using the instrumented path.
-        ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+        ret = dlmalloc_space_->Alloc(self,
+                                     alloc_size,
+                                     bytes_allocated,
+                                     usable_size,
                                      bytes_tl_bulk_allocated);
       } else {
         DCHECK(!is_running_on_memory_tool_);
-        ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size,
+        ret = dlmalloc_space_->AllocNonvirtual(self,
+                                               alloc_size,
+                                               bytes_allocated,
+                                               usable_size,
                                                bytes_tl_bulk_allocated);
       }
       break;
     }
     case kAllocatorTypeNonMoving: {
-      ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+      ret = non_moving_space_->Alloc(self,
+                                     alloc_size,
+                                     bytes_allocated,
+                                     usable_size,
                                      bytes_tl_bulk_allocated);
       break;
     }
     case kAllocatorTypeLOS: {
-      ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size,
+      ret = large_object_space_->Alloc(self,
+                                       alloc_size,
+                                       bytes_allocated,
+                                       usable_size,
                                        bytes_tl_bulk_allocated);
       // Note that the bump pointer spaces aren't necessarily next to
       // the other continuous spaces like the non-moving alloc space or
@@ -315,80 +332,38 @@
       DCHECK(ret == nullptr || large_object_space_->Contains(ret));
       break;
     }
-    case kAllocatorTypeTLAB: {
-      DCHECK_ALIGNED(alloc_size, space::BumpPointerSpace::kAlignment);
-      if (UNLIKELY(self->TlabSize() < alloc_size)) {
-        const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
-        if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(allocator_type, new_tlab_size))) {
-          return nullptr;
-        }
-        // Try allocating a new thread local buffer, if the allocaiton fails the space must be
-        // full so return null.
-        if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
-          return nullptr;
-        }
-        *bytes_tl_bulk_allocated = new_tlab_size;
-      } else {
-        *bytes_tl_bulk_allocated = 0;
-      }
-      // The allocation can't fail.
-      ret = self->AllocTlab(alloc_size);
-      DCHECK(ret != nullptr);
-      *bytes_allocated = alloc_size;
-      *usable_size = alloc_size;
-      break;
-    }
     case kAllocatorTypeRegion: {
       DCHECK(region_space_ != nullptr);
       alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment);
-      ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
+      ret = region_space_->AllocNonvirtual<false>(alloc_size,
+                                                  bytes_allocated,
+                                                  usable_size,
                                                   bytes_tl_bulk_allocated);
       break;
     }
+    case kAllocatorTypeTLAB:
+      FALLTHROUGH_INTENDED;
     case kAllocatorTypeRegionTLAB: {
-      DCHECK(region_space_ != nullptr);
-      DCHECK_ALIGNED(alloc_size, space::RegionSpace::kAlignment);
+      DCHECK_ALIGNED(alloc_size, kObjectAlignment);
+      static_assert(space::RegionSpace::kAlignment == space::BumpPointerSpace::kAlignment,
+                    "mismatched alignments");
+      static_assert(kObjectAlignment == space::BumpPointerSpace::kAlignment,
+                    "mismatched alignments");
       if (UNLIKELY(self->TlabSize() < alloc_size)) {
-        if (space::RegionSpace::kRegionSize >= alloc_size) {
-          // Non-large. Check OOME for a tlab.
-          if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, space::RegionSpace::kRegionSize))) {
-            // Try to allocate a tlab.
-            if (!region_space_->AllocNewTlab(self)) {
-              // Failed to allocate a tlab. Try non-tlab.
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                          bytes_tl_bulk_allocated);
-              return ret;
-            }
-            *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
-            // Fall-through.
-          } else {
-            // Check OOME for a non-tlab allocation.
-            if (!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size)) {
-              ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                          bytes_tl_bulk_allocated);
-              return ret;
-            } else {
-              // Neither tlab or non-tlab works. Give up.
-              return nullptr;
-            }
-          }
-        } else {
-          // Large. Check OOME.
-          if (LIKELY(!IsOutOfMemoryOnAllocation<kGrow>(allocator_type, alloc_size))) {
-            ret = region_space_->AllocNonvirtual<false>(alloc_size, bytes_allocated, usable_size,
-                                                        bytes_tl_bulk_allocated);
-            return ret;
-          } else {
-            return nullptr;
-          }
-        }
-      } else {
-        *bytes_tl_bulk_allocated = 0;  // Allocated in an existing buffer.
+        // kAllocatorTypeTLAB may be the allocator for region space TLAB if the GC is not marking;
+        // that is why the allocator is not passed down.
+        return AllocWithNewTLAB(self,
+                                alloc_size,
+                                kGrow,
+                                bytes_allocated,
+                                usable_size,
+                                bytes_tl_bulk_allocated);
       }
       // The allocation can't fail.
       ret = self->AllocTlab(alloc_size);
       DCHECK(ret != nullptr);
       *bytes_allocated = alloc_size;
+      *bytes_tl_bulk_allocated = 0;  // Allocated in an existing buffer.
       *usable_size = alloc_size;
       break;
     }
@@ -408,15 +383,16 @@
   return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass());
 }
 
-template <bool kGrow>
-inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) {
+inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+                                            size_t alloc_size,
+                                            bool grow) {
   size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
     }
     if (!AllocatorMayHaveConcurrentGC(allocator_type) || !IsGcConcurrent()) {
-      if (!kGrow) {
+      if (!grow) {
         return true;
       }
       // TODO: Grow for allocation is racy, fix it.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index ddc3852..3e1cbeb 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -1815,7 +1815,7 @@
           break;
         }
         // Try to transition the heap if the allocation failure was due to the space being full.
-        if (!IsOutOfMemoryOnAllocation<false>(allocator, alloc_size)) {
+        if (!IsOutOfMemoryOnAllocation(allocator, alloc_size, /*grow*/ false)) {
           // If we aren't out of memory then the OOM was probably from the non moving space being
           // full. Attempt to disable compaction and turn the main space into a non moving space.
           DisableMovingGc();
@@ -4221,5 +4221,72 @@
   gc_pause_listener_.StoreRelaxed(nullptr);
 }
 
+mirror::Object* Heap::AllocWithNewTLAB(Thread* self,
+                                       size_t alloc_size,
+                                       bool grow,
+                                       size_t* bytes_allocated,
+                                       size_t* usable_size,
+                                       size_t* bytes_tl_bulk_allocated) {
+  const AllocatorType allocator_type = GetCurrentAllocator();
+  if (allocator_type == kAllocatorTypeTLAB) {
+    DCHECK(bump_pointer_space_ != nullptr);
+    const size_t new_tlab_size = alloc_size + kDefaultTLABSize;
+    if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) {
+      return nullptr;
+    }
+    // Try allocating a new thread local buffer; if the allocation fails, the space must be
+    // full, so return null.
+    if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) {
+      return nullptr;
+    }
+    *bytes_tl_bulk_allocated = new_tlab_size;
+  } else {
+    DCHECK(allocator_type == kAllocatorTypeRegionTLAB);
+    DCHECK(region_space_ != nullptr);
+    if (space::RegionSpace::kRegionSize >= alloc_size) {
+      // Non-large. Check OOME for a tlab.
+      if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type,
+                                            space::RegionSpace::kRegionSize,
+                                            grow))) {
+        // Try to allocate a tlab.
+        if (!region_space_->AllocNewTlab(self)) {
+          // Failed to allocate a tlab. Try non-tlab.
+          return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                       bytes_allocated,
+                                                       usable_size,
+                                                       bytes_tl_bulk_allocated);
+        }
+        *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize;
+        // Fall-through to using the TLAB below.
+      } else {
+        // Check OOME for a non-tlab allocation.
+        if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) {
+          return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                       bytes_allocated,
+                                                       usable_size,
+                                                       bytes_tl_bulk_allocated);
+        }
+        // Neither tlab nor non-tlab works. Give up.
+        return nullptr;
+      }
+    } else {
+      // Large. Check OOME.
+      if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) {
+        return region_space_->AllocNonvirtual<false>(alloc_size,
+                                                     bytes_allocated,
+                                                     usable_size,
+                                                     bytes_tl_bulk_allocated);
+      }
+      return nullptr;
+    }
+  }
+  // Refilled TLAB, return.
+  mirror::Object* ret = self->AllocTlab(alloc_size);
+  DCHECK(ret != nullptr);
+  *bytes_allocated = alloc_size;
+  *usable_size = alloc_size;
+  return ret;
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 0c671d2..3a8e29b 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -854,6 +854,10 @@
         allocator_type != kAllocatorTypeRegionTLAB;
   }
   static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) {
+    if (kUseReadBarrier) {
+      // With the read barrier, the TLAB allocator may be in use but the GC is always concurrent. TODO: clean this up.
+      return true;
+    }
     return
         allocator_type != kAllocatorTypeBumpPointer &&
         allocator_type != kAllocatorTypeTLAB;
@@ -923,11 +927,20 @@
                                               size_t* bytes_tl_bulk_allocated)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  mirror::Object* AllocWithNewTLAB(Thread* self,
+                                   size_t alloc_size,
+                                   bool grow,
+                                   size_t* bytes_allocated,
+                                   size_t* usable_size,
+                                   size_t* bytes_tl_bulk_allocated)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template <bool kGrow>
-  ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size);
+  ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type,
+                                               size_t alloc_size,
+                                               bool grow);
 
   // Run the finalizers. If timeout is non zero, then we use the VMRuntime version.
   void RunFinalization(JNIEnv* env, uint64_t timeout);