Some optimizations for the array alloc path.

- Force Array::Alloc() to be inlined.
- Simplify the array size overflow check.
- Turn fill_usable into a template parameter.
- Remove a branch in Array::DataOffset() and avoid
  Primitive::ComponentSize(), which has a switch, in the array alloc
  path.
- Strength reductions in the array size computation by using component
  size shifts instead of component sizes. Store component size shift
  in the upper 16 bits of primitive_type field.
- Speedup: ~4% (3435->3284) in MemAllocTest on N4.

Bug: 9986565

Change-Id: I4b142ffac4ab8b5b915836f1660a949d6442344c
diff --git a/runtime/mirror/array-inl.h b/runtime/mirror/array-inl.h
index 213dbc2..6582226 100644
--- a/runtime/mirror/array-inl.h
+++ b/runtime/mirror/array-inl.h
@@ -35,13 +35,13 @@
 template<VerifyObjectFlags kVerifyFlags, ReadBarrierOption kReadBarrierOption>
 inline size_t Array::SizeOf() {
   // This is safe from overflow because the array was already allocated, so we know it's sane.
-  size_t component_size =
-      GetClass<kVerifyFlags, kReadBarrierOption>()->template GetComponentSize<kReadBarrierOption>();
+  size_t component_size_shift = GetClass<kVerifyFlags, kReadBarrierOption>()->
+      template GetComponentSizeShift<kReadBarrierOption>();
   // Don't need to check this since we already check this in GetClass.
   int32_t component_count =
       GetLength<static_cast<VerifyObjectFlags>(kVerifyFlags & ~kVerifyThis)>();
-  size_t header_size = DataOffset(component_size).SizeValue();
-  size_t data_size = component_count * component_size;
+  size_t header_size = DataOffset(1U << component_size_shift).SizeValue();
+  size_t data_size = component_count << component_size_shift;
   return header_size + data_size;
 }
 
@@ -56,24 +56,36 @@
 }
 
 static inline size_t ComputeArraySize(Thread* self, Class* array_class, int32_t component_count,
-                                      size_t component_size)
+                                      size_t component_size_shift)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   DCHECK(array_class != NULL);
   DCHECK_GE(component_count, 0);
   DCHECK(array_class->IsArrayClass());
 
+  size_t component_size = 1U << component_size_shift;
   size_t header_size = Array::DataOffset(component_size).SizeValue();
-  size_t data_size = component_count * component_size;
+  size_t data_size = static_cast<size_t>(component_count) << component_size_shift;
   size_t size = header_size + data_size;
 
-  // Check for overflow and throw OutOfMemoryError if this was an unreasonable request.
-  size_t component_shift = sizeof(size_t) * 8 - 1 - CLZ(component_size);
-  if (UNLIKELY(data_size >> component_shift != size_t(component_count) || size < data_size)) {
+  // Check for size_t overflow and throw OutOfMemoryError if this was
+  // an unreasonable request.
+#ifdef __LP64__
+  // 64-bit. No overflow as component_count is 32-bit and the maximum
+  // component size is 8.
+  DCHECK_LE((1U << component_size_shift), 8U);
+#else
+  // 32-bit.
+  DCHECK_NE(header_size, 0U);
+  DCHECK_EQ(RoundUp(header_size, component_size), header_size);
+  // The array length limit (exclusive).
+  const size_t length_limit = (0U - header_size) >> component_size_shift;
+  if (UNLIKELY(length_limit <= static_cast<size_t>(component_count))) {
     self->ThrowOutOfMemoryError(StringPrintf("%s of length %d would overflow",
                                              PrettyDescriptor(array_class).c_str(),
                                              component_count).c_str());
     return 0;  // failure
   }
+#endif
   return size;
 }
 
@@ -103,8 +115,10 @@
 // array.
 class SetLengthToUsableSizeVisitor {
  public:
-  SetLengthToUsableSizeVisitor(int32_t min_length, size_t header_size, size_t component_size) :
-      minimum_length_(min_length), header_size_(header_size), component_size_(component_size) {
+  SetLengthToUsableSizeVisitor(int32_t min_length, size_t header_size,
+                               size_t component_size_shift) :
+      minimum_length_(min_length), header_size_(header_size),
+      component_size_shift_(component_size_shift) {
   }
 
   void operator()(Object* obj, size_t usable_size) const
@@ -112,10 +126,12 @@
     // Avoid AsArray as object is not yet in live bitmap or allocation stack.
     Array* array = down_cast<Array*>(obj);
     // DCHECK(array->IsArrayInstance());
-    int32_t length = (usable_size - header_size_) / component_size_;
+    int32_t length = (usable_size - header_size_) >> component_size_shift_;
     DCHECK_GE(length, minimum_length_);
-    byte* old_end = reinterpret_cast<byte*>(array->GetRawData(component_size_, minimum_length_));
-    byte* new_end = reinterpret_cast<byte*>(array->GetRawData(component_size_, length));
+    byte* old_end = reinterpret_cast<byte*>(array->GetRawData(1U << component_size_shift_,
+                                                              minimum_length_));
+    byte* new_end = reinterpret_cast<byte*>(array->GetRawData(1U << component_size_shift_,
+                                                              length));
     // Ensure space beyond original allocation is zeroed.
     memset(old_end, 0, new_end - old_end);
     array->SetLength(length);
@@ -124,38 +140,46 @@
  private:
   const int32_t minimum_length_;
   const size_t header_size_;
-  const size_t component_size_;
+  const size_t component_size_shift_;
 
   DISALLOW_COPY_AND_ASSIGN(SetLengthToUsableSizeVisitor);
 };
 
-template <bool kIsInstrumented>
+template <bool kIsInstrumented, bool kFillUsable>
 inline Array* Array::Alloc(Thread* self, Class* array_class, int32_t component_count,
-                           size_t component_size, gc::AllocatorType allocator_type,
-                           bool fill_usable) {
+                           size_t component_size_shift, gc::AllocatorType allocator_type) {
   DCHECK(allocator_type != gc::kAllocatorTypeLOS);
-  size_t size = ComputeArraySize(self, array_class, component_count, component_size);
+  DCHECK_EQ(array_class->GetComponentSizeShift(), component_size_shift);
+  DCHECK_EQ(array_class->GetComponentSize(), (1U << component_size_shift));
+  size_t size = ComputeArraySize(self, array_class, component_count, component_size_shift);
+#ifdef __LP64__
+  // 64-bit. No size_t overflow.
+  DCHECK_NE(size, 0U);
+#else
+  // 32-bit.
   if (UNLIKELY(size == 0)) {
     return nullptr;
   }
+#endif
   gc::Heap* heap = Runtime::Current()->GetHeap();
   Array* result;
-  if (!fill_usable) {
+  if (!kFillUsable) {
     SetLengthVisitor visitor(component_count);
     result = down_cast<Array*>(
         heap->AllocObjectWithAllocator<kIsInstrumented, true>(self, array_class, size,
                                                               allocator_type, visitor));
   } else {
-    SetLengthToUsableSizeVisitor visitor(component_count, DataOffset(component_size).SizeValue(),
-                                         component_size);
+    SetLengthToUsableSizeVisitor visitor(component_count,
+                                         DataOffset(1U << component_size_shift).SizeValue(),
+                                         component_size_shift);
     result = down_cast<Array*>(
         heap->AllocObjectWithAllocator<kIsInstrumented, true>(self, array_class, size,
                                                               allocator_type, visitor));
   }
   if (kIsDebugBuild && result != nullptr && Runtime::Current()->IsStarted()) {
     array_class = result->GetClass();  // In case the array class moved.
-    CHECK_EQ(array_class->GetComponentSize(), component_size);
-    if (!fill_usable) {
+    CHECK_EQ(array_class->GetComponentSize(), 1U << component_size_shift);
+    if (!kFillUsable) {
       CHECK_EQ(result->SizeOf(), size);
     } else {
       CHECK_GE(result->SizeOf(), size);
@@ -173,7 +197,8 @@
 
 template<typename T>
 inline PrimitiveArray<T>* PrimitiveArray<T>::Alloc(Thread* self, size_t length) {
-  Array* raw_array = Array::Alloc<true>(self, GetArrayClass(), length, sizeof(T),
+  Array* raw_array = Array::Alloc<true>(self, GetArrayClass(), length,
+                                        ComponentSizeShiftWidth<sizeof(T)>(),
                                         Runtime::Current()->GetHeap()->GetCurrentAllocator());
   return down_cast<PrimitiveArray<T>*>(raw_array);
 }
diff --git a/runtime/mirror/array.cc b/runtime/mirror/array.cc
index 4535f6c..636be33 100644
--- a/runtime/mirror/array.cc
+++ b/runtime/mirror/array.cc
@@ -48,7 +48,8 @@
   StackHandleScope<1> hs(self);
   Handle<Array> new_array(
       hs.NewHandle(
-          Array::Alloc<true>(self, array_class.Get(), array_length, array_class->GetComponentSize(),
+          Array::Alloc<true>(self, array_class.Get(), array_length,
+                             array_class->GetComponentSizeShift(),
                              Runtime::Current()->GetHeap()->GetCurrentAllocator())));
   if (UNLIKELY(new_array.Get() == nullptr)) {
     CHECK(self->IsExceptionPending());
diff --git a/runtime/mirror/array.h b/runtime/mirror/array.h
index 7af88d6..521d7e7 100644
--- a/runtime/mirror/array.h
+++ b/runtime/mirror/array.h
@@ -33,13 +33,12 @@
   // The size of a java.lang.Class representing an array.
   static uint32_t ClassSize();
 
-  // Allocates an array with the given properties, if fill_usable is true the array will be of at
+  // Allocates an array with the given properties, if kFillUsable is true the array will be of at
   // least component_count size, however, if there's usable space at the end of the allocation the
   // array will fill it.
-  template <bool kIsInstrumented>
-  static Array* Alloc(Thread* self, Class* array_class, int32_t component_count,
-                      size_t component_size, gc::AllocatorType allocator_type,
-                      bool fill_usable = false)
+  template <bool kIsInstrumented, bool kFillUsable = false>
+  ALWAYS_INLINE static Array* Alloc(Thread* self, Class* array_class, int32_t component_count,
+                                    size_t component_size_shift, gc::AllocatorType allocator_type)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   static Array* CreateMultiArray(Thread* self, Handle<Class> element_class,
@@ -66,12 +65,11 @@
   }
 
   static MemberOffset DataOffset(size_t component_size) {
-    if (component_size != sizeof(int64_t)) {
-      return OFFSET_OF_OBJECT_MEMBER(Array, first_element_);
-    } else {
-      // Align longs and doubles.
-      return MemberOffset(OFFSETOF_MEMBER(Array, first_element_) + 4);
-    }
+    DCHECK(IsPowerOfTwo(component_size)) << component_size;
+    size_t data_offset = RoundUp(OFFSETOF_MEMBER(Array, first_element_), component_size);
+    DCHECK_EQ(RoundUp(data_offset, component_size), data_offset)
+        << "Array data offset isn't aligned with component size";
+    return MemberOffset(data_offset);
   }
 
   void* GetRawData(size_t component_size, int32_t index)
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 3f67468..3d3ae16 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -510,8 +510,19 @@
 template<VerifyObjectFlags kVerifyFlags>
 inline Primitive::Type Class::GetPrimitiveType() {
   DCHECK_EQ(sizeof(Primitive::Type), sizeof(int32_t));
-  return static_cast<Primitive::Type>(
-      GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(Class, primitive_type_)));
+  int32_t v32 = GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(Class, primitive_type_));
+  Primitive::Type type = static_cast<Primitive::Type>(v32 & 0xFFFF);
+  DCHECK_EQ(static_cast<size_t>(v32 >> 16), Primitive::ComponentSizeShift(type));
+  return type;
+}
+
+template<VerifyObjectFlags kVerifyFlags>
+inline size_t Class::GetPrimitiveTypeSizeShift() {
+  DCHECK_EQ(sizeof(Primitive::Type), sizeof(int32_t));
+  int32_t v32 = GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(Class, primitive_type_));
+  size_t size_shift = static_cast<Primitive::Type>(v32 >> 16);
+  DCHECK_EQ(size_shift, Primitive::ComponentSizeShift(static_cast<Primitive::Type>(v32 & 0xFFFF)));
+  return size_shift;
 }
 
 inline void Class::CheckObjectAlloc() {
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 4a8d6dc..0acf695 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -345,9 +345,16 @@
 
   void SetPrimitiveType(Primitive::Type new_type) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     DCHECK_EQ(sizeof(Primitive::Type), sizeof(int32_t));
-    SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, primitive_type_), new_type);
+    int32_t v32 = static_cast<int32_t>(new_type);
+    DCHECK_EQ(v32 & 0xFFFF, v32) << "upper 16 bits aren't zero";
+    // Store the component size shift in the upper 16 bits.
+    v32 |= Primitive::ComponentSizeShift(new_type) << 16;
+    SetField32<false>(OFFSET_OF_OBJECT_MEMBER(Class, primitive_type_), v32);
   }
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  size_t GetPrimitiveTypeSizeShift() ALWAYS_INLINE SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Returns true if the class is a primitive type.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool IsPrimitive() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -457,8 +464,12 @@
 
   template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
   size_t GetComponentSize() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return Primitive::ComponentSize(
-        GetComponentType<kDefaultVerifyFlags, kReadBarrierOption>()->GetPrimitiveType());
+    return 1U << GetComponentSizeShift();
+  }
+
+  template<ReadBarrierOption kReadBarrierOption = kWithReadBarrier>
+  size_t GetComponentSizeShift() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    return GetComponentType<kDefaultVerifyFlags, kReadBarrierOption>()->GetPrimitiveTypeSizeShift();
   }
 
   bool IsObjectClass() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
@@ -1149,8 +1160,9 @@
   // See also class_size_.
   uint32_t object_size_;
 
-  // Primitive type value, or Primitive::kPrimNot (0); set for generated primitive classes.
-  Primitive::Type primitive_type_;
+  // The lower 16 bits contains a Primitive::Type value. The upper 16
+  // bits contains the size shift of the primitive type.
+  uint32_t primitive_type_;
 
   // Bitmap of offsets of ifields.
   uint32_t reference_instance_offsets_;
diff --git a/runtime/mirror/object_array-inl.h b/runtime/mirror/object_array-inl.h
index c7540dc..0ca44f8 100644
--- a/runtime/mirror/object_array-inl.h
+++ b/runtime/mirror/object_array-inl.h
@@ -35,10 +35,13 @@
 inline ObjectArray<T>* ObjectArray<T>::Alloc(Thread* self, Class* object_array_class,
                                              int32_t length, gc::AllocatorType allocator_type) {
   Array* array = Array::Alloc<true>(self, object_array_class, length,
-                                    sizeof(HeapReference<Object>), allocator_type);
+                                    ComponentSizeShiftWidth<sizeof(HeapReference<Object>)>(),
+                                    allocator_type);
   if (UNLIKELY(array == nullptr)) {
     return nullptr;
   } else {
+    DCHECK_EQ(array->GetClass()->GetComponentSizeShift(),
+              ComponentSizeShiftWidth<sizeof(HeapReference<Object>)>());
     return array->AsObjectArray<T>();
   }
 }
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 1290a3d..7fa664d 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -162,19 +162,19 @@
   Class* c = class_linker_->FindSystemClass(soa.Self(), "[I");
   StackHandleScope<1> hs(soa.Self());
   MutableHandle<Array> a(
-      hs.NewHandle(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSize(),
+      hs.NewHandle(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSizeShift(),
                                       Runtime::Current()->GetHeap()->GetCurrentAllocator())));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_EQ(1, a->GetLength());
 
   c = class_linker_->FindSystemClass(soa.Self(), "[Ljava/lang/Object;");
-  a.Assign(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSize(),
+  a.Assign(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSizeShift(),
                               Runtime::Current()->GetHeap()->GetCurrentAllocator()));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_EQ(1, a->GetLength());
 
   c = class_linker_->FindSystemClass(soa.Self(), "[[Ljava/lang/Object;");
-  a.Assign(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSize(),
+  a.Assign(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSizeShift(),
                               Runtime::Current()->GetHeap()->GetCurrentAllocator()));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_EQ(1, a->GetLength());
@@ -185,26 +185,26 @@
   Class* c = class_linker_->FindSystemClass(soa.Self(), "[B");
   StackHandleScope<1> hs(soa.Self());
   MutableHandle<Array> a(
-      hs.NewHandle(Array::Alloc<true>(soa.Self(), c, 1, c->GetComponentSize(),
-                                      Runtime::Current()->GetHeap()->GetCurrentAllocator(), true)));
+      hs.NewHandle(Array::Alloc<true, true>(soa.Self(), c, 1, c->GetComponentSizeShift(),
+                                            Runtime::Current()->GetHeap()->GetCurrentAllocator())));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_LE(1, a->GetLength());
 
   c = class_linker_->FindSystemClass(soa.Self(), "[I");
-  a.Assign(Array::Alloc<true>(soa.Self(), c, 2, c->GetComponentSize(),
-                              Runtime::Current()->GetHeap()->GetCurrentAllocator(), true));
+  a.Assign(Array::Alloc<true, true>(soa.Self(), c, 2, c->GetComponentSizeShift(),
+                                    Runtime::Current()->GetHeap()->GetCurrentAllocator()));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_LE(2, a->GetLength());
 
   c = class_linker_->FindSystemClass(soa.Self(), "[Ljava/lang/Object;");
-  a.Assign(Array::Alloc<true>(soa.Self(), c, 2, c->GetComponentSize(),
-                              Runtime::Current()->GetHeap()->GetCurrentAllocator(), true));
+  a.Assign(Array::Alloc<true, true>(soa.Self(), c, 2, c->GetComponentSizeShift(),
+                                    Runtime::Current()->GetHeap()->GetCurrentAllocator()));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_LE(2, a->GetLength());
 
   c = class_linker_->FindSystemClass(soa.Self(), "[[Ljava/lang/Object;");
-  a.Assign(Array::Alloc<true>(soa.Self(), c, 2, c->GetComponentSize(),
-                             Runtime::Current()->GetHeap()->GetCurrentAllocator(), true));
+  a.Assign(Array::Alloc<true, true>(soa.Self(), c, 2, c->GetComponentSizeShift(),
+                                    Runtime::Current()->GetHeap()->GetCurrentAllocator()));
   EXPECT_TRUE(c == a->GetClass());
   EXPECT_LE(2, a->GetLength());
 }