RosAlloc thread-local allocation path without a CAS.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/bump_pointer_space-inl.h b/runtime/gc/space/bump_pointer_space-inl.h
index 9f1f953..14a93d1 100644
--- a/runtime/gc/space/bump_pointer_space-inl.h
+++ b/runtime/gc/space/bump_pointer_space-inl.h
@@ -24,7 +24,8 @@
 namespace space {
 
 inline mirror::Object* BumpPointerSpace::Alloc(Thread*, size_t num_bytes, size_t* bytes_allocated,
-                                               size_t* usable_size) {
+                                               size_t* usable_size,
+                                               size_t* bytes_tl_bulk_allocated) {
   num_bytes = RoundUp(num_bytes, kAlignment);
   mirror::Object* ret = AllocNonvirtual(num_bytes);
   if (LIKELY(ret != nullptr)) {
@@ -32,13 +33,15 @@
     if (usable_size != nullptr) {
       *usable_size = num_bytes;
     }
+    *bytes_tl_bulk_allocated = num_bytes;
   }
   return ret;
 }
 
 inline mirror::Object* BumpPointerSpace::AllocThreadUnsafe(Thread* self, size_t num_bytes,
                                                            size_t* bytes_allocated,
-                                                           size_t* usable_size) {
+                                                           size_t* usable_size,
+                                                           size_t* bytes_tl_bulk_allocated) {
   Locks::mutator_lock_->AssertExclusiveHeld(self);
   num_bytes = RoundUp(num_bytes, kAlignment);
   uint8_t* end = end_.LoadRelaxed();
@@ -54,6 +57,7 @@
   if (UNLIKELY(usable_size != nullptr)) {
     *usable_size = num_bytes;
   }
+  *bytes_tl_bulk_allocated = num_bytes;
   return obj;
 }