RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/dlmalloc_space.cc b/runtime/gc/space/dlmalloc_space.cc
index b8a9dd6..225861d 100644
--- a/runtime/gc/space/dlmalloc_space.cc
+++ b/runtime/gc/space/dlmalloc_space.cc
@@ -123,7 +123,8 @@
 }
 
 mirror::Object* DlMallocSpace::AllocWithGrowth(Thread* self, size_t num_bytes,
-                                               size_t* bytes_allocated, size_t* usable_size) {
+                                               size_t* bytes_allocated, size_t* usable_size,
+                                               size_t* bytes_tl_bulk_allocated) {
   mirror::Object* result;
   {
     MutexLock mu(self, lock_);
@@ -131,7 +132,8 @@
     size_t max_allowed = Capacity();
     mspace_set_footprint_limit(mspace_, max_allowed);
     // Try the allocation.
-    result = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size);
+    result = AllocWithoutGrowthLocked(self, num_bytes, bytes_allocated, usable_size,
+                                      bytes_tl_bulk_allocated);
     // Shrink back down as small as possible.
     size_t footprint = mspace_footprint(mspace_);
     mspace_set_footprint_limit(mspace_, footprint);