ARM/AOT: Allow 16-bit LDR for Baker read barrier loads.

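Emit the 16-bit (narrow) Thumb2 LDR encoding for Baker read barrier GC root and
field loads when both the destination and base registers are low (R0-R7) and the
offset is small, and record that choice in the Baker read barrier patch data so
the link-time thunks know where the LDR sits relative to the return address.
Array loads keep the wide LDR. With heap poisoning, the unpoisoning negation
follows the load width (narrow RSBS vs. wide RSB).

A minimal standalone sketch of the eligibility rule (plain C++, not ART code;
the names are illustrative, the limits mirror the CanEmitNarrowLdr() helpers
added below):

    #include <cstdint>

    // Thumb 16-bit LDR (immediate) encodings can only address the low registers R0-R7.
    static inline bool IsLowRegCode(uint32_t reg_code) { return reg_code < 8u; }

    // Both the destination (rt) and the base (rn) must be low registers. The patch
    // additionally caps the immediate offset at 32 bytes; allowing narrow loads for
    // the split large-offset case would most likely increase the overall code size
    // once the generated thunks are counted.
    static inline bool CanUseNarrowLdr(uint32_t rt_code, uint32_t rn_code, uint32_t offset) {
      return IsLowRegCode(rt_code) && IsLowRegCode(rn_code) && offset < 32u;
    }
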
Test: m test-art-target-gtest
Test: testrunner.py --target on Nexus 6P.
Test: testrunner.py --target on Nexus 6P with heap poisoning enabled.
Test: Repeat the above tests with ART_USE_OLD_ARM_BACKEND=true.
Bug: 29516974
Bug: 30126666
Bug: 36141117
Change-Id: I458f2ec5fe9abead4db06c7595d992945096fb68
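
For reference, the *_LDR_NARROW_OFFSET / *_LDR_WIDE_OFFSET constants checked by
the new DCHECK_EQs are the (negative) distances from the return address label
back to the LDR; they differ because a narrow Thumb2 instruction is 2 bytes and
a wide one is 4. The real definitions live outside this diff; the sketch below
only spells out the arithmetic the encoding sizes imply (no-heap-poisoning
values, illustrative names):

    // Thumb2 encoding sizes in bytes.
    constexpr int kNarrowInsn = 2;  // 16-bit encoding.
    constexpr int kWideInsn = 4;    // 32-bit encoding.

    // GC root sequence: LDR root_reg, [obj, #offset]; BNE.W <thunk>; return_address:
    constexpr int kGcRootLdrWide = -(kWideInsn + kWideInsn);      // -8 (old static_assert value).
    constexpr int kGcRootLdrNarrow = -(kNarrowInsn + kWideInsn);  // -6 expected.

    // Field sequence: BNE.W <thunk>; LDR ref_reg, [base, #offset]; return_address:
    // (with heap poisoning a wide RSB or narrow RSBS precedes return_address and
    // adds another 4 or 2 bytes to the distance).
    constexpr int kFieldLdrWide = -kWideInsn;      // -4 (old static_assert value).
    constexpr int kFieldLdrNarrow = -kNarrowInsn;  // -2 expected.

    static_assert(kGcRootLdrWide == -8, "matches the removed static_assert");
    static_assert(kFieldLdrWide == -4, "matches the removed static_assert");
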
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 35dccd6..8650aee 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -90,13 +90,17 @@
 }
 
 static inline void EmitPlaceholderBne(CodeGeneratorARM* codegen, Label* bne_label) {
-  DCHECK(down_cast<Thumb2Assembler*>(codegen->GetAssembler())->IsForced32Bit());
+  ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(codegen->GetAssembler()));
   __ BindTrackedLabel(bne_label);
   Label placeholder_label;
   __ b(&placeholder_label, NE);  // Placeholder, patched at link-time.
   __ Bind(&placeholder_label);
 }
 
+static inline bool CanEmitNarrowLdr(Register rt, Register rn, uint32_t offset) {
+  return ArmAssembler::IsLowRegister(rt) && ArmAssembler::IsLowRegister(rn) && offset < 32u;
+}
+
 static constexpr int kRegListThreshold = 4;
 
 // SaveLiveRegisters and RestoreLiveRegisters from SlowPathCodeARM operate on sets of S registers,
@@ -8049,8 +8053,9 @@
         //   return_address:
 
         CheckLastTempIsBakerCcEntrypointRegister(instruction);
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
         uint32_t custom_data =
-            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg);
+            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg, narrow);
         Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
         // entrypoint_reg =
@@ -8063,16 +8068,18 @@
         Label return_address;
         __ AdrCode(LR, &return_address);
         __ CmpConstant(kBakerCcEntrypointRegister, 0);
-        static_assert(
-            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
-            "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
-        ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+        DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+        ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+        int old_position = GetAssembler()->GetBuffer()->GetPosition();
         __ LoadFromOffset(kLoadWord, root_reg, obj, offset);
         EmitPlaceholderBne(codegen_, bne_label);
         __ Bind(&return_address);
+        DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
         // Note that we do not actually check the value of
         // `GetIsGcMarking()` to decide whether to mark the loaded GC
@@ -8172,10 +8179,12 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    Register ref_reg = ref.AsRegister<Register>();
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
     Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = temp.AsRegister<Register>();
@@ -8183,10 +8192,14 @@
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ AddConstant(base, obj, offset & ~(kReferenceLoadMinFarOffset - 1u));
       offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating a narrow LDR for large offsets
+      // where `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size once the generated thunks are taken into account.
+      DCHECK(!narrow);
     }
     CheckLastTempIsBakerCcEntrypointRegister(instruction);
     uint32_t custom_data =
-        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj);
+        linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(base, obj, narrow);
     Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
     // entrypoint_reg =
@@ -8199,19 +8212,20 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
-    Register ref_reg = ref.AsRegister<Register>();
     DCHECK_LT(offset, kReferenceLoadMinFarOffset);
+    DCHECK(!down_cast<Thumb2Assembler*>(GetAssembler())->IsForced32Bit());
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()), !narrow);
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ LoadFromOffset(kLoadWord, ref_reg, base, offset);
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
     return;
   }
 
@@ -8257,7 +8271,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
@@ -8282,15 +8296,15 @@
     Label return_address;
     __ AdrCode(LR, &return_address);
     __ CmpConstant(kBakerCcEntrypointRegister, 0);
-    ScopedForce32Bit force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
+    ScopedForce32Bit maybe_force_32bit(down_cast<Thumb2Assembler*>(GetAssembler()));
+    int old_position = GetAssembler()->GetBuffer()->GetPosition();
     __ ldr(ref_reg, Address(data_reg, index_reg, LSL, scale_factor));
     DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
     GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
     __ Bind(&return_address);
+    DCHECK_EQ(old_position - GetAssembler()->GetBuffer()->GetPosition(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
     return;
   }
 
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index ed0a64c..54aa03c 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -6094,7 +6094,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
@@ -6189,7 +6189,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 8417f84..b2e0a91 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -124,6 +124,10 @@
   __ bind(&placeholder_label);
 }
 
+static inline bool CanEmitNarrowLdr(vixl32::Register rt, vixl32::Register rn, uint32_t offset) {
+  return rt.IsLow() && rn.IsLow() && offset < 32u;
+}
+
 class EmitAdrCode {
  public:
   EmitAdrCode(ArmVIXLMacroAssembler* assembler, vixl32::Register rd, vixl32::Label* label)
@@ -8158,8 +8162,9 @@
 
         UseScratchRegisterScope temps(GetVIXLAssembler());
         ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
-        uint32_t custom_data =
-            linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(root_reg.GetCode());
+        bool narrow = CanEmitNarrowLdr(root_reg, obj, offset);
+        uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierGcRootData(
+            root_reg.GetCode(), narrow);
         vixl32::Label* bne_label = codegen_->NewBakerReadBarrierPatch(custom_data);
 
         // entrypoint_reg =
@@ -8174,15 +8179,16 @@
         vixl32::Label return_address;
         EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
         __ cmp(kBakerCcEntrypointRegister, Operand(0));
-        static_assert(
-            BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_OFFSET == -8,
-            "GC root LDR must be 2 32-bit instructions (8B) before the return address label.");
         // Currently the offset is always within range. If that changes,
         // we shall have to split the load the same way as for fields.
         DCHECK_LT(offset, kReferenceLoadMinFarOffset);
-        __ ldr(EncodingSize(Wide), root_reg, MemOperand(obj, offset));
+        ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+        __ ldr(EncodingSize(narrow ? Narrow : Wide), root_reg, MemOperand(obj, offset));
         EmitPlaceholderBne(codegen_, bne_label);
         __ Bind(&return_address);
+        DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+                  narrow ? BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_NARROW_OFFSET
+                         : BAKER_MARK_INTROSPECTION_GC_ROOT_LDR_WIDE_OFFSET);
       } else {
         // Note that we do not actually check the value of
         // `GetIsGcMarking()` to decide whether to mark the loaded GC
@@ -8283,10 +8289,12 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = *(obj+offset);
+    //     HeapReference<mirror::Object> reference = *(obj+offset);
     //   gray_return_address:
 
     DCHECK_ALIGNED(offset, sizeof(mirror::HeapReference<mirror::Object>));
+    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
+    bool narrow = CanEmitNarrowLdr(ref_reg, obj, offset);
     vixl32::Register base = obj;
     if (offset >= kReferenceLoadMinFarOffset) {
       base = RegisterFrom(temp);
@@ -8294,12 +8302,15 @@
       static_assert(IsPowerOfTwo(kReferenceLoadMinFarOffset), "Expecting a power of 2.");
       __ Add(base, obj, Operand(offset & ~(kReferenceLoadMinFarOffset - 1u)));
       offset &= (kReferenceLoadMinFarOffset - 1u);
+      // Use narrow LDR only for small offsets. Generating a narrow LDR for large offsets
+      // where `(offset & (kReferenceLoadMinFarOffset - 1u)) < 32u` would most likely
+      // increase the overall code size once the generated thunks are taken into account.
+      DCHECK(!narrow);
     }
     UseScratchRegisterScope temps(GetVIXLAssembler());
     ExcludeIPAndBakerCcEntrypointRegister(&temps, instruction);
     uint32_t custom_data = linker::Thumb2RelativePatcher::EncodeBakerReadBarrierFieldData(
-        base.GetCode(),
-        obj.GetCode());
+        base.GetCode(), obj.GetCode(), narrow);
     vixl32::Label* bne_label = NewBakerReadBarrierPatch(custom_data);
 
     // entrypoint_reg =
@@ -8316,19 +8327,24 @@
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(kBakerCcEntrypointRegister, Operand(0));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_FIELD_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Field LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
-    vixl32::Register ref_reg = RegisterFrom(ref, Primitive::kPrimNot);
-    __ ldr(EncodingSize(Wide), ref_reg, MemOperand(base, offset));
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
+    __ ldr(EncodingSize(narrow ? Narrow : Wide), ref_reg, MemOperand(base, offset));
     if (needs_null_check) {
       MaybeRecordImplicitNullCheck(instruction);
     }
-    // Note: We need a Wide NEG for the unpoisoning.
+    // Note: We need a specific width for the unpoisoning NEG.
     if (kPoisonHeapReferences) {
-      __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+      if (narrow) {
+        // The only 16-bit encoding is T1 which sets flags outside IT block (i.e. RSBS, not RSB).
+        __ rsbs(EncodingSize(Narrow), ref_reg, ref_reg, Operand(0));
+      } else {
+        __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
+      }
     }
     __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              narrow ? BAKER_MARK_INTROSPECTION_FIELD_LDR_NARROW_OFFSET
+                     : BAKER_MARK_INTROSPECTION_FIELD_LDR_WIDE_OFFSET);
     return;
   }
 
@@ -8374,7 +8390,7 @@
     //   not_gray_return_address:
     //     // Original reference load. If the offset is too large to fit
     //     // into LDR, we use an adjusted base register here.
-    //     GcRoot<mirror::Object> reference = data[index];
+    //     HeapReference<mirror::Object> reference = data[index];
     //   gray_return_address:
 
     DCHECK(index.IsValid());
@@ -8404,9 +8420,7 @@
     EmitAdrCode adr(GetVIXLAssembler(), lr, &return_address);
     __ cmp(kBakerCcEntrypointRegister, Operand(0));
     EmitPlaceholderBne(this, bne_label);
-    static_assert(BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET == (kPoisonHeapReferences ? -8 : -4),
-                  "Array LDR must be 1 32-bit instruction (4B) before the return address label; "
-                  " 2 32-bit instructions (8B) for heap poisoning.");
+    ptrdiff_t old_offset = GetVIXLAssembler()->GetBuffer()->GetCursorOffset();
     __ ldr(ref_reg, MemOperand(data_reg, index_reg, vixl32::LSL, scale_factor));
     DCHECK(!needs_null_check);  // The thunk cannot handle the null check.
     // Note: We need a Wide NEG for the unpoisoning.
@@ -8414,6 +8428,8 @@
       __ rsb(EncodingSize(Wide), ref_reg, ref_reg, Operand(0));
     }
     __ Bind(&return_address);
+    DCHECK_EQ(old_offset - GetVIXLAssembler()->GetBuffer()->GetCursorOffset(),
+              BAKER_MARK_INTROSPECTION_ARRAY_LDR_OFFSET);
     return;
   }