RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~29% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/region_space.h b/runtime/gc/space/region_space.h
index 4160547..b88ce24 100644
--- a/runtime/gc/space/region_space.h
+++ b/runtime/gc/space/region_space.h
@@ -42,18 +42,20 @@
 
   // Allocate num_bytes, returns nullptr if the space is full.
   mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE;
+                        size_t* usable_size, size_t* bytes_tl_bulk_allocated) OVERRIDE;
   // Thread-unsafe allocation for when mutators are suspended, used by the semispace collector.
   mirror::Object* AllocThreadUnsafe(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                    size_t* usable_size)
+                                    size_t* usable_size, size_t* bytes_tl_bulk_allocated)
       OVERRIDE EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_);
   // The main allocation routine.
   template<bool kForEvac>
   ALWAYS_INLINE mirror::Object* AllocNonvirtual(size_t num_bytes, size_t* bytes_allocated,
-                                                size_t* usable_size);
+                                                size_t* usable_size,
+                                                size_t* bytes_tl_bulk_allocated);
   // Allocate/free large objects (objects that are larger than the region size.)
   template<bool kForEvac>
-  mirror::Object* AllocLarge(size_t num_bytes, size_t* bytes_allocated, size_t* usable_size);
+  mirror::Object* AllocLarge(size_t num_bytes, size_t* bytes_allocated, size_t* usable_size,
+                             size_t* bytes_tl_bulk_allocated);
   void FreeLarge(mirror::Object* large_obj, size_t bytes_allocated);
 
   // Return the storage space required by obj.
@@ -87,10 +89,10 @@
   void DumpRegions(std::ostream& os);
   void DumpNonFreeRegions(std::ostream& os);
 
-  void RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(region_lock_);
+  size_t RevokeThreadLocalBuffers(Thread* thread) LOCKS_EXCLUDED(region_lock_);
   void RevokeThreadLocalBuffersLocked(Thread* thread) EXCLUSIVE_LOCKS_REQUIRED(region_lock_);
-  void RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
-                                                    Locks::thread_list_lock_);
+  size_t RevokeAllThreadLocalBuffers() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
+                                                      Locks::thread_list_lock_);
   void AssertThreadLocalBuffersAreRevoked(Thread* thread) LOCKS_EXCLUDED(region_lock_);
   void AssertAllThreadLocalBuffersAreRevoked() LOCKS_EXCLUDED(Locks::runtime_shutdown_lock_,
                                                               Locks::thread_list_lock_);
@@ -269,7 +271,8 @@
     }
 
     ALWAYS_INLINE mirror::Object* Alloc(size_t num_bytes, size_t* bytes_allocated,
-                                        size_t* usable_size);
+                                        size_t* usable_size,
+                                        size_t* bytes_tl_bulk_allocated);
 
     bool IsFree() const {
       bool is_free = state_ == RegionState::kRegionStateFree;