Rosalloc thread local allocation path without a cas.

Speedup on N4:
MemAllocTest 3044 -> 2396 (~21% reduction)
BinaryTrees  4101 -> 2929 (~26% reduction)

Bug: 9986565
Change-Id: Ia1d1a37b9e001f903c3c056e8ec68fc8c623a78b
diff --git a/runtime/gc/space/dlmalloc_space.h b/runtime/gc/space/dlmalloc_space.h
index 6ce138c..1f80f1f 100644
--- a/runtime/gc/space/dlmalloc_space.h
+++ b/runtime/gc/space/dlmalloc_space.h
@@ -48,11 +48,15 @@
 
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* AllocWithGrowth(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                          size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_);
+                                          size_t* usable_size,
+                                          size_t* bytes_tl_bulk_allocated)
+      OVERRIDE LOCKS_EXCLUDED(lock_);
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual mirror::Object* Alloc(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                        size_t* usable_size) OVERRIDE LOCKS_EXCLUDED(lock_) {
-    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size);
+                                size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      OVERRIDE LOCKS_EXCLUDED(lock_) {
+    return AllocNonvirtual(self, num_bytes, bytes_allocated, usable_size,
+                           bytes_tl_bulk_allocated);
   }
   // Virtual to allow ValgrindMallocSpace to intercept.
   virtual size_t AllocationSize(mirror::Object* obj, size_t* usable_size) OVERRIDE {
@@ -67,15 +71,22 @@
       LOCKS_EXCLUDED(lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  // DlMallocSpaces don't have thread local state.
-  void RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+  size_t MaxBytesBulkAllocatedFor(size_t num_bytes) OVERRIDE {
+    return num_bytes;
   }
-  void RevokeAllThreadLocalBuffers() OVERRIDE {
+
+  // DlMallocSpaces don't have thread local state.
+  size_t RevokeThreadLocalBuffers(art::Thread*) OVERRIDE {
+    return 0U;
+  }
+  size_t RevokeAllThreadLocalBuffers() OVERRIDE {
+    return 0U;
   }
 
   // Faster non-virtual allocation path.
   mirror::Object* AllocNonvirtual(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                  size_t* usable_size) LOCKS_EXCLUDED(lock_);
+                                  size_t* usable_size, size_t* bytes_tl_bulk_allocated)
+      LOCKS_EXCLUDED(lock_);
 
   // Faster non-virtual allocation size path.
   size_t AllocationSizeNonvirtual(mirror::Object* obj, size_t* usable_size);
@@ -134,7 +145,8 @@
 
  private:
   mirror::Object* AllocWithoutGrowthLocked(Thread* self, size_t num_bytes, size_t* bytes_allocated,
-                                           size_t* usable_size)
+                                           size_t* usable_size,
+                                           size_t* bytes_tl_bulk_allocated)
       EXCLUSIVE_LOCKS_REQUIRED(lock_);
 
   void* CreateAllocator(void* base, size_t morecore_start, size_t initial_size,