Rosalloc thread local allocation path without a cas.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/large_object_space.cc b/runtime/gc/space/large_object_space.cc
index 7523de5..5c8e4b9 100644
--- a/runtime/gc/space/large_object_space.cc
+++ b/runtime/gc/space/large_object_space.cc
@@ -38,10 +38,11 @@
   }
 
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                size_t* usable_size) OVERRIDE {
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE {
     mirror::Object* obj =
         LargeObjectMapSpace::Alloc(self, num_bytes + kValgrindRedZoneBytes * 2, bytes_allocated,
-                                   usable_size);
+                                   usable_size, bytes_tl_bulk_allocated);
     mirror::Object* object_without_rdz = reinterpret_cast<mirror::Object*>(
         reinterpret_cast<uintptr_t>(obj) + kValgrindRedZoneBytes);
     VALGRIND_MAKE_MEM_NOACCESS(reinterpret_cast<void*>(obj), kValgrindRedZoneBytes);
@@ -108,7 +109,8 @@
 }
 
 mirror::Object* LargeObjectMapSpace::Alloc(Thread* self, size_t num_bytes,
-                                           size_t* bytes_allocated, size_t* usable_size) {
+                                           size_t* bytes_allocated, size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated) {
   std::string error_msg;
   MemMap* mem_map = MemMap::MapAnonymous("large object space allocation", nullptr, num_bytes,
                                          PROT_READ | PROT_WRITE, true, false, &error_msg);
@@ -131,6 +133,8 @@
   if (usable_size != nullptr) {
     *usable_size = allocation_size;
   }
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
+  *bytes_tl_bulk_allocated = allocation_size;
   num_bytes_allocated_ += allocation_size;
   total_bytes_allocated_ += allocation_size;
   ++num_objects_allocated_;
@@ -413,7 +417,7 @@
 }
 
 mirror::Object* FreeListSpace::Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                     size_t* usable_size) {
+                                     size_t* usable_size, size_t* bytes_tl_bulk_allocated) {
   MutexLock mu(self, lock_);
   const size_t allocation_size = RoundUp(num_bytes, kAlignment);
   AllocationInfo temp_info;
@@ -451,6 +455,8 @@
   if (usable_size != nullptr) {
     *usable_size = allocation_size;
   }
+  DCHECK(bytes_tl_bulk_allocated != nullptr);
+  *bytes_tl_bulk_allocated = allocation_size;
   // Need to do these inside of the lock.
   ++num_objects_allocated_;
   ++total_objects_allocated_;