RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/valgrind_malloc_space-inl.h b/runtime/gc/space/valgrind_malloc_space-inl.h
index ae8e892..bc329e1 100644
--- a/runtime/gc/space/valgrind_malloc_space-inl.h
+++ b/runtime/gc/space/valgrind_malloc_space-inl.h
@@ -32,10 +32,15 @@
 template <size_t kValgrindRedZoneBytes, bool kUseObjSizeForUsable>
 inline mirror::Object* AdjustForValgrind(void* obj_with_rdz, size_t num_bytes,
                                          size_t bytes_allocated, size_t usable_size,
-                                         size_t* bytes_allocated_out, size_t* usable_size_out) {
+                                         size_t bytes_tl_bulk_allocated,
+                                         size_t* bytes_allocated_out, size_t* usable_size_out,
+                                         size_t* bytes_tl_bulk_allocated_out) {
   if (bytes_allocated_out != nullptr) {
     *bytes_allocated_out = bytes_allocated;
   }
+  if (bytes_tl_bulk_allocated_out != nullptr) {
+    *bytes_tl_bulk_allocated_out = bytes_tl_bulk_allocated;
+  }
 
   // This cuts over-provision and is a trade-off between testing the over-provisioning code paths
   // vs checking overflows in the regular paths.
@@ -82,20 +87,25 @@
                     kValgrindRedZoneBytes,
                     kAdjustForRedzoneInAllocSize,
                     kUseObjSizeForUsable>::AllocWithGrowth(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::AllocWithGrowth(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                          &bytes_allocated, &usable_size);
+                                          &bytes_allocated, &usable_size,
+                                          &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
-                                             kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
-                                                                   bytes_allocated, usable_size,
-                                                                   bytes_allocated_out,
-                                                                   usable_size_out);
+  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+      obj_with_rdz, num_bytes,
+      bytes_allocated, usable_size,
+      bytes_tl_bulk_allocated,
+      bytes_allocated_out,
+      usable_size_out,
+      bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -106,11 +116,13 @@
                                     kValgrindRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::Alloc(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::Alloc(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                &bytes_allocated, &usable_size);
+                                &bytes_allocated, &usable_size, &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
@@ -118,8 +130,10 @@
   return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
                                              kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
                                                                    bytes_allocated, usable_size,
+                                                                   bytes_tl_bulk_allocated,
                                                                    bytes_allocated_out,
-                                                                   usable_size_out);
+                                                                   usable_size_out,
+                                                                   bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -130,20 +144,25 @@
                                     kValgrindRedZoneBytes,
                                     kAdjustForRedzoneInAllocSize,
                                     kUseObjSizeForUsable>::AllocThreadUnsafe(
-    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out) {
+    Thread* self, size_t num_bytes, size_t* bytes_allocated_out, size_t* usable_size_out,
+    size_t* bytes_tl_bulk_allocated_out) {
   size_t bytes_allocated;
   size_t usable_size;
+  size_t bytes_tl_bulk_allocated;
   void* obj_with_rdz = S::AllocThreadUnsafe(self, num_bytes + 2 * kValgrindRedZoneBytes,
-                                &bytes_allocated, &usable_size);
+                                            &bytes_allocated, &usable_size,
+                                            &bytes_tl_bulk_allocated);
   if (obj_with_rdz == nullptr) {
     return nullptr;
   }
 
-  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes,
-                                             kUseObjSizeForUsable>(obj_with_rdz, num_bytes,
-                                                                   bytes_allocated, usable_size,
-                                                                   bytes_allocated_out,
-                                                                   usable_size_out);
+  return valgrind_details::AdjustForValgrind<kValgrindRedZoneBytes, kUseObjSizeForUsable>(
+      obj_with_rdz, num_bytes,
+      bytes_allocated, usable_size,
+      bytes_tl_bulk_allocated,
+      bytes_allocated_out,
+      usable_size_out,
+      bytes_tl_bulk_allocated_out);
 }
 
 template <typename S,
@@ -226,6 +245,17 @@
                               mem_map->Size() - initial_size);
 }
 
+template <typename S,  // The query is answered by S::MaxBytesBulkAllocatedFor.
+          size_t kValgrindRedZoneBytes,  // Added twice to num_bytes before asking S.
+          bool kAdjustForRedzoneInAllocSize,
+          bool kUseObjSizeForUsable>
+size_t ValgrindMallocSpace<S,
+                           kValgrindRedZoneBytes,
+                           kAdjustForRedzoneInAllocSize,
+                           kUseObjSizeForUsable>::MaxBytesBulkAllocatedFor(size_t num_bytes) {
+  return S::MaxBytesBulkAllocatedFor(num_bytes + 2 * kValgrindRedZoneBytes);
+}
+
 }  // namespace space
 }  // namespace gc
 }  // namespace art