Faster allocation fast path

Added a new object size field to class. This field contains the
aligned object size if the class is initialized and not finalizable.
If the class is finalizable or uninitialized, the field is set to
some large value that forces the ASM allocators to take the slow
path.

Only implemented for region/normal TLAB for now; will add it to the
RosAlloc stubs soon.

CC N6P MemAllocTest: 1067 -> 1039 (25 samples)
CC N6P EAAC: 1281 -> 1260 (25 samples)

RAM overhead technically 0 since mirror::Class was not 8 byte aligned
previously. Since the allocators require 8 byte alignment, there
would have been 1 word of padding at the end of the class. If there
was actually 4 extra bytes per class, the system overhead would be
36000 * 4 bytes = ~140KB based on old N6P numbers for the number of
loaded classes after boot.

Bug: 9986565

Test: test-art-host CC baker, N6P phone boot and EAAC runs.

Change-Id: I119a87b8cc6c980bff980a0c62f42610dab5e531
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index e25e93f..bc4c999 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -1191,25 +1191,6 @@
 // Need to preserve r0 and r1 to the slow path.
 .macro ALLOC_OBJECT_TLAB_FAST_PATH slowPathLabel
     cbz    r2, \slowPathLabel                                 // Check null class
-                                                              // Check class status.
-    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
-    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
-    bne    \slowPathLabel
-                                                              // Add a fake dependence from the
-                                                              // following access flag and size
-                                                              // loads to the status load.
-                                                              // This is to prevent those loads
-                                                              // from being reordered above the
-                                                              // status load and reading wrong
-                                                              // values (an alternative is to use
-                                                              // a load-acquire for the status).
-    eor    r3, r3, r3
-    add    r2, r2, r3
-                                                              // Check access flags has
-                                                              // kAccClassIsFinalizable.
-    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
-    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
-    bne    \slowPathLabel
                                                               // Load thread_local_pos (r12) and
                                                               // thread_local_end (r3) with ldrd.
                                                               // Check constraints for ldrd.
@@ -1218,16 +1199,12 @@
 #endif
     ldrd   r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
     sub    r12, r3, r12                                       // Compute the remaining buf size.
-    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3).
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_ALLOC_FAST_PATH_OFFSET]  // Load the object size (r3).
     cmp    r3, r12                                            // Check if it fits. OK to do this
                                                               // before rounding up the object size
                                                               // assuming the buf size alignment.
     bhi    \slowPathLabel
     // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
-                                                              // Round up the object size by the
-                                                              // object alignment. (addr + 7) & ~7.
-    add    r3, r3, #OBJECT_ALIGNMENT_MASK
-    and    r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
                                                               // Reload old thread_local_pos (r0)
                                                               // for the return value.
     ldr    r0, [r9, #THREAD_LOCAL_POS_OFFSET]
@@ -1244,7 +1221,7 @@
                                                               // the fields of the class.
                                                               // Alternatively we could use "ishst"
                                                               // if we use load-acquire for the
-                                                              // class status load.)
+                                                              // object size load.)
     dmb    ish
     bx     lr
 .endm