ARM/AOT: Allow 16-bit LDR for Baker read barrier loads.

Test: m test-art-target-gtest
Test: testrunner.py --target on Nexus 6P.
Test: testrunner.py --target on Nexus 6P with heap poisoning enabled.
Test: Repeat the above tests with ART_USE_OLD_ARM_BACKEND=true.
Bug: 29516974
Bug: 30126666
Bug: 36141117
Change-Id: I458f2ec5fe9abead4db06c7595d992945096fb68
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 35dccd6..8650aee 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -90,13 +90,17 @@
 }
 
 static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) {
-  DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit());
+  ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler()));
   __ BindTrackedLabel(bne_label);
   Label placeholder_label;
   __ b(&placeholder_label, NE);  // Placeholder, patched at link-time.
   __ Bind(&placeholder_label);
 }
 
+static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) {
+  return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u;
+}
+
 static constexpr int kRegListThreshold = 4;
 
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
@@ -8049,8 +8053,9 @@
         //   return_address:
 
         CheckLastTempIsBakerCcEntrypointRegister(instruction);
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
         uint32_t custom_data =
-            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg);
+            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow);
         Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
         // entrypoint_reg =
@@ -8063,16 +8068,18 @@
         Label return_address;
         __ AdrCode(LR, &return_address);
         __ CmpConstant(kBakerCcEntrypointRegister, 0);
-        static_assert(
-            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
-            "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
-        ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+        DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+        ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+        int old_position = GetAssembler()->GetBuffer()->GetPosition();
         __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
         EmitPlaceholderBne(codegen_, bne_label);
         __ Bind(&return_address);
+        DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
         // Note that we do not actually check the value of
         // `GetIsGcMarking()` to decide whether to mark the loaded GC
@@ -8172,10 +8179,12 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    Register ref_reg = ref.AsRegister<Register>();
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
     Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = temp.AsRegister<Register>();
@@ -8183,10 +8192,14 @@
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u));
       offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating narrow encoding LDR for the large
+      // offsets with `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size when taking the generated thunks into account.
+      DCHECK(!narrow);
     }
     CheckLastTempIsBakerCcEntrypointRegister(instruction);
     uint32_t custom_data =
-        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj);
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow);
     Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
     // entrypoint_reg =
@@ -8199,19 +8212,20 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
-    Register ref_reg = ref.AsRegister<Register>();
     DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
     return;
   }
 
@@ -8257,7 +8271,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
@@ -8282,15 +8296,15 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
     DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
     return;
   }