Faster allocation fast path

Added a new object size field to mirror::Class. This field contains
the aligned object size if the class is initialized and its instances
are not finalizable. If the class is finalizable or uninitialized, the
field is set to a large value that forces the ASM allocators to take
the slow path.

Only implemented for the region/normal TLAB allocators for now;
support will be added to the RosAlloc stubs soon.

CC N6P MemAllocTest: 1067 -> 1039 (25 samples)
CC N6P EAAC: 1281 -> 1260 (25 samples)

The RAM overhead is technically zero since mirror::Class was not 8
byte aligned previously. Since the allocators require 8 byte
alignment, there would have been one word of padding at the end of the
class anyway. If there were actually 4 extra bytes per class, the
system-wide overhead would be 36000 * 4 = 144000 bytes (~140KB), based
on old N6P numbers for the number of loaded classes after boot.
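
For concreteness, a small sketch of the padding argument with a
made-up pre-change size (the real sizeof(mirror::Class) is not 1036;
only its remainder mod 8 matters):

    #include <cstddef>
    #include <cstdio>

    int main() {
      size_t class_size = 1036;                           // made-up value
      size_t alloc_size = (class_size + 7) & ~size_t{7};  // 8 byte alignment
      // The allocator hands out alloc_size bytes either way, so the new
      // 4 byte field fits in padding that already existed.
      std::printf("padding: %zu bytes\n", alloc_size - class_size);  // 4
      return 0;
    }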

Bug: 9986565

Test: test-art-host CC baker, N6P phone boot and EAAC runs.

Change-Id: I119a87b8cc6c980bff980a0c62f42610dab5e531
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 3f87a14..76e503c 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -2027,48 +2027,24 @@
     ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED \slowPathLabel
 .endm
 
+// TODO: delete ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED since it is the same as
+// ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED.
 .macro ALLOC_OBJECT_TLAB_FAST_PATH_RESOLVED slowPathLabel
-    ldr    w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]              // Check class status.
-    cmp    x3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    \slowPathLabel
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    x3, x3, x3
-    add    x2, x2, x3
     ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED \slowPathLabel
 .endm
 
 .macro ALLOC_OBJECT_TLAB_FAST_PATH_INITIALIZED slowPathLabel
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable.
-    ldr    w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tbnz   x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE_BIT, \slowPathLabel
-                                                              // Load thread_local_pos (x4) and
-                                                              // thread_local_end (x5).
     ldr    x4, [xSELF, #THREAD_LOCAL_POS_OFFSET]
     ldr    x5, [xSELF, #THREAD_LOCAL_END_OFFSET]
-    sub    x6, x5, x4                                         // Compute the remaining buf size.
-    ldr    w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (x7).
-    cmp    x7, x6                                             // Check if it fits. OK to do this
-                                                              // before rounding up the object size
-                                                              // assuming the buf size alignment.
+    ldr    w7, [x2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (x7).
+    add    x6, x4, x7                                         // Add object size to tlab pos.
+    cmp    x6, x5                                             // Check if it fits. The add cannot
+                                                              // wrap around since the object size
+                                                              // is a 32 bit value added to a valid
+                                                              // 64 bit TLAB pointer.
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1.
-                                                              // Round up the object size by the
-                                                              // object alignment. (addr + 7) & ~7.
-    add    x7, x7, #OBJECT_ALIGNMENT_MASK
-    and    x7, x7, #OBJECT_ALIGNMENT_MASK_TOGGLED
-                                                              // Move old thread_local_pos to x0
-                                                              // for the return value.
     mov    x0, x4
-    add    x5, x0, x7
-    str    x5, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
+    str    x6, [xSELF, #THREAD_LOCAL_POS_OFFSET]              // Store new thread_local_pos.
     ldr    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]          // Increment thread_local_objects.
     add    x5, x5, #1
     str    x5, [xSELF, #THREAD_LOCAL_OBJECTS_OFFSET]
@@ -2080,7 +2056,7 @@
                                                               // the fields of the class.
                                                               // Alternatively we could use "ishst"
                                                               // if we use load-acquire for the
-                                                              // class status load.)
+                                                              // object size load.)
     dmb    ish
     ret
 .endm
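
For readers following the stub, a hedged C++ rendering of the new fast
path; field names paraphrase the Thread TLAB fields referenced above,
and this is a sketch rather than the runtime's actual allocation code:

    #include <cstddef>
    #include <cstdint>

    struct Thread {
      uint8_t* thread_local_pos;    // THREAD_LOCAL_POS_OFFSET
      uint8_t* thread_local_end;    // THREAD_LOCAL_END_OFFSET
      size_t thread_local_objects;  // THREAD_LOCAL_OBJECTS_OFFSET
    };

    // Returns nullptr when the stub would branch to the slow path.
    inline void* AllocObjectTlabFastPath(Thread* self, uint32_t size_fast_path) {
      uint8_t* pos = self->thread_local_pos;
      uint8_t* new_pos = pos + size_fast_path;  // Add object size to TLAB pos.
      // Finalizable/uninitialized classes store a huge size in the new
      // field, so this check fails for them and forces the slow path.
      if (new_pos > self->thread_local_end) {
        return nullptr;
      }
      self->thread_local_pos = new_pos;  // Bump the TLAB position.
      self->thread_local_objects++;
      return pos;  // Old pos is the address of the new object.
    }

The stub additionally stores the class pointer into the new object's
header and issues dmb ish so that a thread observing the object also
observes the initialized class fields.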