RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/dlmalloc_space-inl.h b/runtime/gc/space/dlmalloc_space-inl.h
index 4c8a35e..9eace89 100644
--- a/runtime/gc/space/dlmalloc_space-inl.h
+++ b/runtime/gc/space/dlmalloc_space-inl.h
@@ -27,11 +27,13 @@
 
 inline mirror::Object* DlMallocSpace::AllocNonvirtual(Thread* self, size_t num_bytes,
                                                       size_t* bytes_allocated,
-                                                      size_t* usable_size) {
+                                                      size_t* usable_size,
+                                                      size_t* bytes_tl_bulk_allocated) {
   mirror::Object* obj;
   {
     MutexLock mu(self, lock_);
-    obj = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size);
+    obj = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size,
+                                   bytes_tl_bulk_allocated);
   }
   if (LIKELY(obj != NULL)) {
     // Zero freshly allocated memory, done while not holding the space's lock.
@@ -49,9 +51,11 @@
   return size + kChunkOverhead;
 }
 
-inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(Thread* /*self*/, size_t num_bytes,
-                                                               size_t* bytes_allocated,
-                                                               size_t* usable_size) {
+inline mirror::Object* DlMallocSpace::AllocWithoutGrowthLocked(
+    Thread* /*self*/, size_t num_bytes,
+    size_t* bytes_allocated,
+    size_t* usable_size,
+    size_t* bytes_tl_bulk_allocated) {
   mirror::Object* result = reinterpret_cast<mirror::Object*>(mspace_malloc(mspace_, num_bytes));
   if (LIKELY(result != NULL)) {
     if (kDebugSpaces) {
@@ -61,6 +65,7 @@
     size_t allocation_size = AllocationSizeNonvirtual(result, usable_size);
     DCHECK(bytes_allocated != NULL);
     *bytes_allocated = allocation_size;
+    *bytes_tl_bulk_allocated = allocation_size;
   }
   return result;
 }