Merge "ART: Make run-test temp dir consistent"
diff --git a/cmdline/cmdline_types.h b/cmdline/cmdline_types.h
index 9b4042c..f05648c 100644
--- a/cmdline/cmdline_types.h
+++ b/cmdline/cmdline_types.h
@@ -462,7 +462,7 @@
 struct XGcOption {
   // These defaults are used when the command line arguments for -Xgc:
   // are either omitted completely or partially.
-  gc::CollectorType collector_type_ =  kUseReadBarrier ?
+  gc::CollectorType collector_type_ = kUseReadBarrier ?
                                            // If RB is enabled (currently a build-time decision),
                                            // use CC as the default GC.
                                            gc::kCollectorTypeCC :
@@ -473,6 +473,7 @@
   bool verify_pre_gc_rosalloc_ = kIsDebugBuild;
   bool verify_pre_sweeping_rosalloc_ = false;
   bool verify_post_gc_rosalloc_ = false;
+  bool measure_ = kIsDebugBuild;
   bool gcstress_ = false;
 };
 
@@ -515,6 +516,8 @@
         xgc.gcstress_ = true;
       } else if (gc_option == "nogcstress") {
         xgc.gcstress_ = false;
+      } else if (gc_option == "measure") {
+        xgc.measure_ = true;
       } else if ((gc_option == "precise") ||
                  (gc_option == "noprecise") ||
                  (gc_option == "verifycardtable") ||
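For context on the `measure` token added above: the value of `-Xgc:` is a comma-separated list, and each recognized token flips one field of `XGcOption`. A minimal, self-contained sketch of that dispatch pattern follows; it is not the actual cmdline_types.h parser (which also handles the collector-type and heap-verification tokens), just the shape of it:

```cpp
#include <sstream>
#include <string>

struct XGcOption {
  bool measure_ = false;   // defaults to kIsDebugBuild in the real struct
  bool gcstress_ = false;
};

// Parse e.g. "measure,nogcstress" the way the chain above does.
XGcOption ParseXGcValue(const std::string& value) {
  XGcOption xgc;
  std::istringstream tokens(value);
  std::string gc_option;
  while (std::getline(tokens, gc_option, ',')) {
    if (gc_option == "measure") {
      xgc.measure_ = true;
    } else if (gc_option == "gcstress") {
      xgc.gcstress_ = true;
    } else if (gc_option == "nogcstress") {
      xgc.gcstress_ = false;
    }
    // ...remaining tokens elided...
  }
  return xgc;
}
```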
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 47e6625..5e6e175 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -80,7 +80,11 @@
 
   virtual void EmitNativeCode(CodeGenerator* codegen) = 0;
 
+  // Save live core and floating-point caller-save registers and
+  // update the stack mask in `locations` for registers holding object
+  // references.
   virtual void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
+  // Restore live core and floating-point caller-save registers.
   virtual void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
 
   bool IsCoreRegisterSaved(int reg) const {
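The comments added above document the pair that ordinary slow paths wrap around runtime calls. They are worth spelling out here because the read-barrier-mark slow paths in the code generators below stop using them: the new entrypoints preserve all caller-save registers themselves and never trigger GC, so there is nothing to spill and no stack mask to update. A schematic of the conventional shape, with stand-in declarations (assumptions, not real ART signatures beyond the two documented above):

```cpp
class CodeGenerator;
class LocationSummary;
void SaveLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
void RestoreLiveRegisters(CodeGenerator* codegen, LocationSummary* locations);
void InvokeRuntimeStub(CodeGenerator* codegen);  // hypothetical placeholder

// Conventional slow-path shape: save, call, restore.
void EmitSlowPathSketch(CodeGenerator* codegen, LocationSummary* locations) {
  SaveLiveRegisters(codegen, locations);     // spill live caller-saves, update stack mask
  InvokeRuntimeStub(codegen);                // the runtime call that may clobber them
  RestoreLiveRegisters(codegen, locations);  // reload the spilled registers
}
```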
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 47bafb5..236ed20 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -437,11 +437,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // R0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     DCHECK_NE(reg, SP);
@@ -469,8 +467,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ b(GetExitLabel());
   }
 
@@ -4437,6 +4433,10 @@
   Location out_loc = locations->Out();
   uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
   Primitive::Type type = instruction->GetType();
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -4451,8 +4451,21 @@
         LoadOperandType load_type = GetLoadOperandType(type);
         __ LoadFromOffset(load_type, out_loc.AsRegister<Register>(), obj, full_offset);
       } else {
-        __ add(IP, obj, ShifterOperand(data_offset));
-        codegen_->LoadFromShiftedRegOffset(type, out_loc, IP, index.AsRegister<Register>());
+        Register temp = IP;
+
+        if (has_intermediate_address) {
+          // We do not need to compute the intermediate address from the array: the
+          // input instruction has done it already. See the comment in
+          // `TryExtractArrayAccessAddress()`.
+          if (kIsDebugBuild) {
+            HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+            DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+          }
+          temp = obj;
+        } else {
+          __ add(temp, obj, ShifterOperand(data_offset));
+        }
+        codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
       }
       break;
     }
@@ -4481,8 +4494,21 @@
           // reference, if heap poisoning is enabled).
           codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset);
         } else {
-          __ add(IP, obj, ShifterOperand(data_offset));
-          codegen_->LoadFromShiftedRegOffset(type, out_loc, IP, index.AsRegister<Register>());
+          Register temp = IP;
+
+          if (has_intermediate_address) {
+            // We do not need to compute the intermediate address from the array: the
+            // input instruction has done it already. See the comment in
+            // `TryExtractArrayAccessAddress()`.
+            if (kIsDebugBuild) {
+              HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+              DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+            }
+            temp = obj;
+          } else {
+            __ add(temp, obj, ShifterOperand(data_offset));
+          }
+          codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, index.AsRegister<Register>());
 
           codegen_->MaybeRecordImplicitNullCheck(instruction);
           // If read barriers are enabled, emit read barriers other than
@@ -4585,6 +4611,10 @@
   uint32_t data_offset =
       mirror::Array::DataOffset(Primitive::ComponentSize(value_type)).Uint32Value();
   Location value_loc = locations->InAt(2);
+  HInstruction* array_instr = instruction->GetArray();
+  bool has_intermediate_address = array_instr->IsIntermediateAddress();
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
@@ -4599,10 +4629,23 @@
         StoreOperandType store_type = GetStoreOperandType(value_type);
         __ StoreToOffset(store_type, value_loc.AsRegister<Register>(), array, full_offset);
       } else {
-        __ add(IP, array, ShifterOperand(data_offset));
+        Register temp = IP;
+
+        if (has_intermediate_address) {
+          // We do not need to compute the intermediate address from the array: the
+          // input instruction has done it already. See the comment in
+          // `TryExtractArrayAccessAddress()`.
+          if (kIsDebugBuild) {
+            HIntermediateAddress* tmp = array_instr->AsIntermediateAddress();
+            DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), data_offset);
+          }
+          temp = array;
+        } else {
+          __ add(temp, array, ShifterOperand(data_offset));
+        }
         codegen_->StoreToShiftedRegOffset(value_type,
                                           value_loc,
-                                          IP,
+                                          temp,
                                           index.AsRegister<Register>());
       }
       break;
@@ -4610,6 +4653,9 @@
 
     case Primitive::kPrimNot: {
       Register value = value_loc.AsRegister<Register>();
+      // TryExtractArrayAccessAddress optimization is never applied for non-primitive ArraySet.
+      // See the comment in instruction_simplifier_shared.cc.
+      DCHECK(!has_intermediate_address);
 
       if (instruction->InputAt(2)->IsNullConstant()) {
         // Just setting null.
@@ -4832,6 +4878,37 @@
   codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
+void LocationsBuilderARM::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!kEmitCompilerReadBarrier);
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
+
+  locations->SetInAt(0, Location::RequiresRegister());
+  locations->SetInAt(1, Location::RegisterOrConstant(instruction->GetOffset()));
+  locations->SetOut(Location::RequiresRegister(), Location::kNoOutputOverlap);
+}
+
+void InstructionCodeGeneratorARM::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  LocationSummary* locations = instruction->GetLocations();
+  Location out = locations->Out();
+  Location first = locations->InAt(0);
+  Location second = locations->InAt(1);
+
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+  DCHECK(!kEmitCompilerReadBarrier);
+
+  if (second.IsRegister()) {
+    __ add(out.AsRegister<Register>(),
+           first.AsRegister<Register>(),
+           ShifterOperand(second.AsRegister<Register>()));
+  } else {
+    __ AddConstant(out.AsRegister<Register>(),
+                   first.AsRegister<Register>(),
+                   second.GetConstant()->AsIntConstant()->GetValue());
+  }
+}
+
 void LocationsBuilderARM::VisitBoundsCheck(HBoundsCheck* instruction) {
   LocationSummary::CallKind call_kind = instruction->CanThrowIntoCatchBlock()
       ? LocationSummary::kCallOnSlowPath
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index a2d126d..76b0797 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -603,11 +603,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // W0 (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     DCHECK_NE(obj_.reg(), LR);
@@ -635,8 +633,6 @@
                                  instruction_,
                                  instruction_->GetDexPc(),
                                  this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ B(GetExitLabel());
   }
 
@@ -690,10 +686,9 @@
             instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier for heap reference slow path: "
         << instruction_->DebugName();
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
+    // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
     DCHECK(!(instruction_->IsArrayGet() &&
-             instruction_->AsArrayGet()->GetArray()->IsArm64IntermediateAddress()));
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
 
     __ Bind(GetEntryLabel());
 
@@ -1983,9 +1978,8 @@
   }
 }
 
-void LocationsBuilderARM64::VisitArm64IntermediateAddress(HArm64IntermediateAddress* instruction) {
-  // The read barrier instrumentation does not support the
-  // HArm64IntermediateAddress instruction yet.
+void LocationsBuilderARM64::VisitIntermediateAddress(HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
   DCHECK(!kEmitCompilerReadBarrier);
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
@@ -1994,10 +1988,9 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
-void InstructionCodeGeneratorARM64::VisitArm64IntermediateAddress(
-    HArm64IntermediateAddress* instruction) {
-  // The read barrier instrumentation does not support the
-  // HArm64IntermediateAddress instruction yet.
+void InstructionCodeGeneratorARM64::VisitIntermediateAddress(
+    HIntermediateAddress* instruction) {
+  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
   DCHECK(!kEmitCompilerReadBarrier);
   __ Add(OutputRegister(instruction),
          InputRegisterAt(instruction, 0),
@@ -2097,9 +2090,8 @@
   if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
     // Object ArrayGet with Baker's read barrier case.
     Register temp = temps.AcquireW();
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
-    DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress());
+    // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
+    DCHECK(!instruction->GetArray()->IsIntermediateAddress());
     // Note that a potential implicit null check is handled in the
     // CodeGeneratorARM64::GenerateArrayLoadWithBakerReadBarrier call.
     codegen_->GenerateArrayLoadWithBakerReadBarrier(
@@ -2112,15 +2104,15 @@
       source = HeapOperand(obj, offset);
     } else {
       Register temp = temps.AcquireSameSizeAs(obj);
-      if (instruction->GetArray()->IsArm64IntermediateAddress()) {
+      if (instruction->GetArray()->IsIntermediateAddress()) {
         // The read barrier instrumentation does not support the
-        // HArm64IntermediateAddress instruction yet.
+        // HIntermediateAddress instruction yet.
         DCHECK(!kEmitCompilerReadBarrier);
         // We do not need to compute the intermediate address from the array: the
         // input instruction has done it already. See the comment in
-        // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`.
+        // `TryExtractArrayAccessAddress()`.
         if (kIsDebugBuild) {
-          HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress();
+          HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress();
           DCHECK_EQ(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64(), offset);
         }
         temp = obj;
@@ -2204,15 +2196,15 @@
     } else {
       UseScratchRegisterScope temps(masm);
       Register temp = temps.AcquireSameSizeAs(array);
-      if (instruction->GetArray()->IsArm64IntermediateAddress()) {
+      if (instruction->GetArray()->IsIntermediateAddress()) {
         // The read barrier instrumentation does not support the
-        // HArm64IntermediateAddress instruction yet.
+        // HIntermediateAddress instruction yet.
         DCHECK(!kEmitCompilerReadBarrier);
         // We do not need to compute the intermediate address from the array: the
         // input instruction has done it already. See the comment in
-        // `InstructionSimplifierArm64::TryExtractArrayAccessAddress()`.
+        // `TryExtractArrayAccessAddress()`.
         if (kIsDebugBuild) {
-          HArm64IntermediateAddress* tmp = instruction->GetArray()->AsArm64IntermediateAddress();
+          HIntermediateAddress* tmp = instruction->GetArray()->AsIntermediateAddress();
           DCHECK(tmp->GetOffset()->AsIntConstant()->GetValueAsUint64() == offset);
         }
         temp = array;
@@ -2228,7 +2220,7 @@
     codegen_->MaybeRecordImplicitNullCheck(instruction);
   } else {
     DCHECK(needs_write_barrier);
-    DCHECK(!instruction->GetArray()->IsArm64IntermediateAddress());
+    DCHECK(!instruction->GetArray()->IsIntermediateAddress());
     vixl::aarch64::Label done;
     SlowPathCodeARM64* slow_path = nullptr;
     {
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index c5c0aad..82baaa0 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -472,11 +472,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // EAX (if it is live), as it is clobbered by functions
-    // art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     DCHECK_NE(reg, ESP);
@@ -502,8 +500,6 @@
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 900c790..b6ba30e 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -493,11 +493,9 @@
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
-    // Save live registers before the runtime call, and in particular
-    // RDI and/or RAX (if they are live), as they are clobbered by
-    // functions art_quick_read_barrier_mark_regX.
-    SaveLiveRegisters(codegen, locations);
-
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
     InvokeRuntimeCallingConvention calling_convention;
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     DCHECK_NE(reg, RSP);
@@ -523,8 +521,6 @@
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
-
-    RestoreLiveRegisters(codegen, locations);
     __ jmp(GetExitLabel());
   }
 
diff --git a/compiler/optimizing/common_arm64.h b/compiler/optimizing/common_arm64.h
index d2afa5b..af0ee4e 100644
--- a/compiler/optimizing/common_arm64.h
+++ b/compiler/optimizing/common_arm64.h
@@ -227,7 +227,7 @@
     return vixl::aarch64::Assembler::IsImmMovn(value, vixl::aarch64::kXRegSize);
   } else {
     DCHECK(instr->IsAdd() ||
-           instr->IsArm64IntermediateAddress() ||
+           instr->IsIntermediateAddress() ||
            instr->IsBoundsCheck() ||
            instr->IsCompare() ||
            instr->IsCondition() ||
diff --git a/compiler/optimizing/instruction_simplifier_arm.cc b/compiler/optimizing/instruction_simplifier_arm.cc
index cd026b8..495f3fd 100644
--- a/compiler/optimizing/instruction_simplifier_arm.cc
+++ b/compiler/optimizing/instruction_simplifier_arm.cc
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
+#include "code_generator.h"
 #include "instruction_simplifier_arm.h"
 #include "instruction_simplifier_shared.h"
+#include "mirror/array-inl.h"
 
 namespace art {
 namespace arm {
@@ -38,6 +40,46 @@
   }
 }
 
+void InstructionSimplifierArmVisitor::VisitArrayGet(HArrayGet* instruction) {
+  size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
+  Primitive::Type type = instruction->GetType();
+
+  if (type == Primitive::kPrimLong ||
+      type == Primitive::kPrimFloat ||
+      type == Primitive::kPrimDouble) {
+    // T32 doesn't support the ShiftedRegOffset memory addressing mode for
+    // these types, so the optimization cannot be applied.
+    return;
+  }
+
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
+}
+
+void InstructionSimplifierArmVisitor::VisitArraySet(HArraySet* instruction) {
+  size_t access_size = Primitive::ComponentSize(instruction->GetComponentType());
+  size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value();
+  Primitive::Type type = instruction->GetComponentType();
+
+  if (type == Primitive::kPrimLong ||
+      type == Primitive::kPrimFloat ||
+      type == Primitive::kPrimDouble) {
+    // T32 doesn't support the ShiftedRegOffset memory addressing mode for
+    // these types, so the optimization cannot be applied.
+    return;
+  }
+
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
+}
 
 }  // namespace arm
 }  // namespace art
diff --git a/compiler/optimizing/instruction_simplifier_arm.h b/compiler/optimizing/instruction_simplifier_arm.h
index 14c940e..3d297da 100644
--- a/compiler/optimizing/instruction_simplifier_arm.h
+++ b/compiler/optimizing/instruction_simplifier_arm.h
@@ -38,6 +38,8 @@
   void VisitMul(HMul* instruction) OVERRIDE;
   void VisitOr(HOr* instruction) OVERRIDE;
   void VisitAnd(HAnd* instruction) OVERRIDE;
+  void VisitArrayGet(HArrayGet* instruction) OVERRIDE;
+  void VisitArraySet(HArraySet* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
diff --git a/compiler/optimizing/instruction_simplifier_arm64.cc b/compiler/optimizing/instruction_simplifier_arm64.cc
index 983d31d..6d107d5 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.cc
+++ b/compiler/optimizing/instruction_simplifier_arm64.cc
@@ -28,56 +28,6 @@
 using helpers::HasShifterOperand;
 using helpers::ShifterOperandSupportsExtension;
 
-void InstructionSimplifierArm64Visitor::TryExtractArrayAccessAddress(HInstruction* access,
-                                                                     HInstruction* array,
-                                                                     HInstruction* index,
-                                                                     size_t data_offset) {
-  if (kEmitCompilerReadBarrier) {
-    // The read barrier instrumentation does not support the
-    // HArm64IntermediateAddress instruction yet.
-    //
-    // TODO: Handle this case properly in the ARM64 code generator and
-    // re-enable this optimization; otherwise, remove this TODO.
-    // b/26601270
-    return;
-  }
-  if (index->IsConstant() ||
-      (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) {
-    // When the index is a constant all the addressing can be fitted in the
-    // memory access instruction, so do not split the access.
-    return;
-  }
-  if (access->IsArraySet() &&
-      access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) {
-    // The access may require a runtime call or the original array pointer.
-    return;
-  }
-
-  // Proceed to extract the base address computation.
-  ArenaAllocator* arena = GetGraph()->GetArena();
-
-  HIntConstant* offset = GetGraph()->GetIntConstant(data_offset);
-  HArm64IntermediateAddress* address =
-      new (arena) HArm64IntermediateAddress(array, offset, kNoDexPc);
-  address->SetReferenceTypeInfo(array->GetReferenceTypeInfo());
-  access->GetBlock()->InsertInstructionBefore(address, access);
-  access->ReplaceInput(address, 0);
-  // Both instructions must depend on GC to prevent any instruction that can
-  // trigger GC to be inserted between the two.
-  access->AddSideEffects(SideEffects::DependsOnGC());
-  DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC()));
-  DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC()));
-  // TODO: Code generation for HArrayGet and HArraySet will check whether the input address
-  // is an HArm64IntermediateAddress and generate appropriate code.
-  // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe
-  // `HArm64Load` and `HArm64Store`). We defer these changes because these new instructions would
-  // not bring any advantages yet.
-  // Also see the comments in
-  // `InstructionCodeGeneratorARM64::VisitArrayGet()` and
-  // `InstructionCodeGeneratorARM64::VisitArraySet()`.
-  RecordSimplification();
-}
-
 bool InstructionSimplifierArm64Visitor::TryMergeIntoShifterOperand(HInstruction* use,
                                                                    HInstruction* bitfield_op,
                                                                    bool do_merge) {
@@ -190,19 +140,23 @@
 
 void InstructionSimplifierArm64Visitor::VisitArrayGet(HArrayGet* instruction) {
   size_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
-  TryExtractArrayAccessAddress(instruction,
-                               instruction->GetArray(),
-                               instruction->GetIndex(),
-                               data_offset);
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
 }
 
 void InstructionSimplifierArm64Visitor::VisitArraySet(HArraySet* instruction) {
   size_t access_size = Primitive::ComponentSize(instruction->GetComponentType());
   size_t data_offset = mirror::Array::DataOffset(access_size).Uint32Value();
-  TryExtractArrayAccessAddress(instruction,
-                               instruction->GetArray(),
-                               instruction->GetIndex(),
-                               data_offset);
+  if (TryExtractArrayAccessAddress(instruction,
+                                   instruction->GetArray(),
+                                   instruction->GetIndex(),
+                                   data_offset)) {
+    RecordSimplification();
+  }
 }
 
 void InstructionSimplifierArm64Visitor::VisitMul(HMul* instruction) {
diff --git a/compiler/optimizing/instruction_simplifier_arm64.h b/compiler/optimizing/instruction_simplifier_arm64.h
index 4735f85..28648b3 100644
--- a/compiler/optimizing/instruction_simplifier_arm64.h
+++ b/compiler/optimizing/instruction_simplifier_arm64.h
@@ -35,10 +35,6 @@
     }
   }
 
-  void TryExtractArrayAccessAddress(HInstruction* access,
-                                    HInstruction* array,
-                                    HInstruction* index,
-                                    size_t data_offset);
   bool TryMergeIntoUsersShifterOperand(HInstruction* instruction);
   bool TryMergeIntoShifterOperand(HInstruction* use,
                                   HInstruction* bitfield_op,
diff --git a/compiler/optimizing/instruction_simplifier_shared.cc b/compiler/optimizing/instruction_simplifier_shared.cc
index dab1ebc..8f7778f 100644
--- a/compiler/optimizing/instruction_simplifier_shared.cc
+++ b/compiler/optimizing/instruction_simplifier_shared.cc
@@ -226,4 +226,59 @@
   return false;
 }
 
+
+bool TryExtractArrayAccessAddress(HInstruction* access,
+                                  HInstruction* array,
+                                  HInstruction* index,
+                                  size_t data_offset) {
+  if (kEmitCompilerReadBarrier) {
+    // The read barrier instrumentation does not support the
+    // HIntermediateAddress instruction yet.
+    //
+    // TODO: Handle this case properly in the ARM64 and ARM code generator and
+    // re-enable this optimization; otherwise, remove this TODO.
+    // b/26601270
+    return false;
+  }
+  if (index->IsConstant() ||
+      (index->IsBoundsCheck() && index->AsBoundsCheck()->GetIndex()->IsConstant())) {
+    // When the index is a constant all the addressing can be fitted in the
+    // memory access instruction, so do not split the access.
+    return false;
+  }
+  if (access->IsArraySet() &&
+      access->AsArraySet()->GetValue()->GetType() == Primitive::kPrimNot) {
+    // The access may require a runtime call or the original array pointer.
+    return false;
+  }
+
+  // Proceed to extract the base address computation.
+  HGraph* graph = access->GetBlock()->GetGraph();
+  ArenaAllocator* arena = graph->GetArena();
+
+  HIntConstant* offset = graph->GetIntConstant(data_offset);
+  HIntermediateAddress* address =
+      new (arena) HIntermediateAddress(array, offset, kNoDexPc);
+  address->SetReferenceTypeInfo(array->GetReferenceTypeInfo());
+  access->GetBlock()->InsertInstructionBefore(address, access);
+  access->ReplaceInput(address, 0);
+  // Both instructions must depend on GC to prevent any instruction that can
+  // trigger GC to be inserted between the two.
+  access->AddSideEffects(SideEffects::DependsOnGC());
+  DCHECK(address->GetSideEffects().Includes(SideEffects::DependsOnGC()));
+  DCHECK(access->GetSideEffects().Includes(SideEffects::DependsOnGC()));
+  // TODO: Code generation for HArrayGet and HArraySet will check whether the input address
+  // is an HIntermediateAddress and generate appropriate code.
+  // We would like to replace the `HArrayGet` and `HArraySet` with custom instructions (maybe
+  // `HArm64Load` and `HArm64Store`, `HArmLoad` and `HArmStore`). We defer these changes
+  // because these new instructions would not bring any advantages yet.
+  // Also see the comments in
+  // `InstructionCodeGeneratorARM::VisitArrayGet()`
+  // `InstructionCodeGeneratorARM::VisitArraySet()`
+  // `InstructionCodeGeneratorARM64::VisitArrayGet()`
+  // `InstructionCodeGeneratorARM64::VisitArraySet()`.
+  return true;
+}
+
+
 }  // namespace art
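To make the effect of `TryExtractArrayAccessAddress` concrete: it rewrites `ArrayGet(array, index)` into `ArrayGet(IntermediateAddress(array, data_offset), index)`, so the data-offset add is computed once (and, after GVN, shared between accesses) while each access becomes a single shifted-register load. A self-contained model of the address arithmetic, assuming 32-bit `int` elements (shift of 2):

```cpp
#include <cstddef>
#include <cstdint>

// Before extraction: every access folds in the data offset.
uintptr_t ElementAddressNaive(uintptr_t array, uint32_t index, size_t data_offset) {
  return array + data_offset + (uintptr_t{index} << 2);
}

// After extraction: `base` is the HIntermediateAddress value, computed once;
// each access is then LDR reg, [base, index, LSL #2] on ARM/ARM64.
uintptr_t ElementAddressExtracted(uintptr_t base /* array + data_offset */,
                                  uint32_t index) {
  return base + (uintptr_t{index} << 2);
}
```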
diff --git a/compiler/optimizing/instruction_simplifier_shared.h b/compiler/optimizing/instruction_simplifier_shared.h
index b1fe8f4..56804f5 100644
--- a/compiler/optimizing/instruction_simplifier_shared.h
+++ b/compiler/optimizing/instruction_simplifier_shared.h
@@ -26,6 +26,11 @@
 // a negated bitwise instruction.
 bool TryMergeNegatedInput(HBinaryOperation* op);
 
+bool TryExtractArrayAccessAddress(HInstruction* access,
+                                  HInstruction* array,
+                                  HInstruction* index,
+                                  size_t data_offset);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 0f0ef26..23ac457 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1289,7 +1289,8 @@
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_SHARED(M)                         \
   M(BitwiseNegatedRight, Instruction)                                   \
-  M(MultiplyAccumulate, Instruction)
+  M(MultiplyAccumulate, Instruction)                                    \
+  M(IntermediateAddress, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_arm
@@ -1303,8 +1304,7 @@
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)
 #else
 #define FOR_EACH_CONCRETE_INSTRUCTION_ARM64(M)                          \
-  M(Arm64DataProcWithShifterOp, Instruction)                            \
-  M(Arm64IntermediateAddress, Instruction)
+  M(Arm64DataProcWithShifterOp, Instruction)
 #endif
 
 #ifndef ART_ENABLE_CODEGEN_mips
diff --git a/compiler/optimizing/nodes_arm64.h b/compiler/optimizing/nodes_arm64.h
index 06b073c..3f88717 100644
--- a/compiler/optimizing/nodes_arm64.h
+++ b/compiler/optimizing/nodes_arm64.h
@@ -94,32 +94,6 @@
 
 std::ostream& operator<<(std::ostream& os, const HArm64DataProcWithShifterOp::OpKind op);
 
-// This instruction computes an intermediate address pointing in the 'middle' of an object. The
-// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
-// never used across anything that can trigger GC.
-class HArm64IntermediateAddress FINAL : public HExpression<2> {
- public:
-  HArm64IntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc)
-      : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) {
-    SetRawInputAt(0, base_address);
-    SetRawInputAt(1, offset);
-  }
-
-  bool CanBeMoved() const OVERRIDE { return true; }
-  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
-    return true;
-  }
-  bool IsActualObject() const OVERRIDE { return false; }
-
-  HInstruction* GetBaseAddress() const { return InputAt(0); }
-  HInstruction* GetOffset() const { return InputAt(1); }
-
-  DECLARE_INSTRUCTION(Arm64IntermediateAddress);
-
- private:
-  DISALLOW_COPY_AND_ASSIGN(HArm64IntermediateAddress);
-};
-
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_ARM64_H_
diff --git a/compiler/optimizing/nodes_shared.h b/compiler/optimizing/nodes_shared.h
index f2d5cf3..8bd8667 100644
--- a/compiler/optimizing/nodes_shared.h
+++ b/compiler/optimizing/nodes_shared.h
@@ -113,6 +113,34 @@
   DISALLOW_COPY_AND_ASSIGN(HBitwiseNegatedRight);
 };
 
+
+// This instruction computes an intermediate address pointing in the 'middle' of an object. The
+// result pointer cannot be handled by GC, so extra care is taken to make sure that this value is
+// never used across anything that can trigger GC.
+class HIntermediateAddress FINAL : public HExpression<2> {
+ public:
+  HIntermediateAddress(HInstruction* base_address, HInstruction* offset, uint32_t dex_pc)
+      : HExpression(Primitive::kPrimNot, SideEffects::DependsOnGC(), dex_pc) {
+    SetRawInputAt(0, base_address);
+    SetRawInputAt(1, offset);
+  }
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+  bool InstructionDataEquals(const HInstruction* other ATTRIBUTE_UNUSED) const OVERRIDE {
+    return true;
+  }
+  bool IsActualObject() const OVERRIDE { return false; }
+
+  HInstruction* GetBaseAddress() const { return InputAt(0); }
+  HInstruction* GetOffset() const { return InputAt(1); }
+
+  DECLARE_INSTRUCTION(IntermediateAddress);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HIntermediateAddress);
+};
+
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_NODES_SHARED_H_
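The class comment above is the safety argument for the whole optimization: the computed value is an interior pointer, so a moving collector would invalidate it, which is why both the address and the access that consumes it carry `SideEffects::DependsOnGC()` (see `TryExtractArrayAccessAddress` above). A toy illustration of the hazard being ruled out, assuming a hypothetical moving GC:

```cpp
#include <cstdint>

// If a GC could move `array` between computing `base` and using it, `base`
// would dangle. The DependsOnGC side effect on both instructions prevents
// the scheduler from inserting anything GC-triggering between them.
int32_t ReadViaInteriorPointer(const int32_t* array, uint32_t index) {
  const int32_t* base = array + 3;  // interior pointer (the IntermediateAddress)
  // MaybeTriggerGc();              // hypothetical call that must NOT land here
  return base[index];
}
```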
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 77ae10a..0bca186 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -448,8 +448,12 @@
       arm::DexCacheArrayFixups* fixups = new (arena) arm::DexCacheArrayFixups(graph, stats);
       arm::InstructionSimplifierArm* simplifier =
           new (arena) arm::InstructionSimplifierArm(graph, stats);
+      SideEffectsAnalysis* side_effects = new (arena) SideEffectsAnalysis(graph);
+      GVNOptimization* gvn = new (arena) GVNOptimization(graph, *side_effects, "GVN_after_arch");
       HOptimization* arm_optimizations[] = {
         simplifier,
+        side_effects,
+        gvn,
         fixups
       };
       RunOptimizations(arm_optimizations, arraysize(arm_optimizations), pass_observer);
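Scheduling GVN right after the ARM simplifier is what makes the extraction pay off across accesses: the simplifier creates one `HIntermediateAddress` per access, and because the node is movable and compares equal by inputs (`CanBeMoved()` / `InstructionDataEquals()` in nodes_shared.h), GVN can merge the duplicates. A hand-applied model of that merge (illustrative only; `kDataOffsetWords` is a stand-in for the real data offset):

```cpp
#include <cstddef>
#include <cstdint>

constexpr std::ptrdiff_t kDataOffsetWords = 3;  // stand-in for the data offset

// Before GVN: each access carries its own intermediate-address computation.
int32_t SumBeforeGvn(const int32_t* array, uint32_t i, uint32_t j) {
  const int32_t* base1 = array + kDataOffsetWords;  // node #1
  const int32_t* base2 = array + kDataOffsetWords;  // node #2, equal by inputs
  return base1[i] + base2[j];
}

// After GVN: the equal, movable nodes are merged into one shared base.
int32_t SumAfterGvn(const int32_t* array, uint32_t i, uint32_t j) {
  const int32_t* base = array + kDataOffsetWords;
  return base[i] + base[j];
}
```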
diff --git a/runtime/arch/arm/entrypoints_init_arm.cc b/runtime/arch/arm/entrypoints_init_arm.cc
index e48a164..966587d 100644
--- a/runtime/arch/arm/entrypoints_init_arm.cc
+++ b/runtime/arch/arm/entrypoints_init_arm.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -119,7 +121,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index f9c34f5..34d3158 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -672,6 +672,12 @@
     .endif
 .endm
 
+// Save rReg's value to [sp, #offset].
+.macro PUSH_REG rReg, offset
+    str \rReg, [sp, #\offset]       @ save rReg
+    .cfi_rel_offset \rReg, \offset
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * rObj and rDest are registers, offset is a defined literal such as MIRROR_OBJECT_CLASS_OFFSET.
@@ -1752,30 +1758,83 @@
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
      * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * `reg`, saving and restoring all caller-save registers.
+     *
+     * If `reg` is different from `r0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `reg` is used to pass the (sole) argument of this
+     *   function (instead of R0);
+     * - register `reg` is used to return the result of this function
      *   (instead of R0);
-     * - `reg` is used to return the result of this function (instead of R0);
      * - R0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
-    push  {lr}                          @ save return address
-    .cfi_adjust_cfa_offset 4
-    .cfi_rel_offset lr, 0
-    sub   sp, #4                        @ push padding (native calling convention 8-byte alignment)
-    .cfi_adjust_cfa_offset 4
-    mov   r0, \reg                      @ pass arg1 - obj from `reg`
-    bl    artReadBarrierMark            @ artReadBarrierMark(obj)
-    mov   \reg, r0                      @ return result into `reg`
-    add   sp, #4                        @ pop padding
-    .cfi_adjust_cfa_offset -4
-    pop   {pc}                          @ return
+    push  {r0-r4, r9, r12, lr}          @ save return address and core caller-save registers
+    .cfi_adjust_cfa_offset 32
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    .cfi_rel_offset r9, 20
+    .cfi_rel_offset r12, 24
+    .cfi_rel_offset lr, 28
+    vpush {s0-s15}                      @ save floating-point caller-save registers
+    .cfi_adjust_cfa_offset 64
+
+    .ifnc \reg, r0
+      mov   r0, \reg                    @ pass arg1 - obj from `reg`
+    .endif
+    bl    artReadBarrierMark            @ r0 <- artReadBarrierMark(obj)
+
+    vpop {s0-s15}                       @ restore floating-point registers
+    .cfi_adjust_cfa_offset -64
+    @ If `reg` is a caller-save register, save the result to its
+    @ corresponding stack slot; it will be restored by the "pop"
+    @ instruction below. Otherwise, move result into `reg`.
+    @
+    @ (Note that saving `reg` to its stack slot will overwrite the value
+    @ previously stored by the "push" instruction above. That is
+    @ alright, as in that case we know that `reg` is not a live
+    @ register, as it is used to pass the argument and return the result
+    @ of this function.)
+    .ifc \reg, r0
+      PUSH_REG r0, 0                    @ copy result to r0's stack location
+    .else
+      .ifc \reg, r1
+        PUSH_REG r0, 4                  @ copy result to r1's stack location
+      .else
+        .ifc \reg, r2
+          PUSH_REG r0, 8                @ copy result to r2's stack location
+        .else
+          .ifc \reg, r3
+            PUSH_REG r0, 12             @ copy result to r3's stack location
+          .else
+            .ifc \reg, r4
+              PUSH_REG r0, 16           @ copy result to r4's stack location
+            .else
+              .ifc \reg, r9
+                PUSH_REG r0, 20         @ copy result to r9's stack location
+              .else
+                .ifc \reg, r12
+                  PUSH_REG r0, 24       @ copy result to r12's stack location
+                .else
+                  mov   \reg, r0        @ return result into `reg`
+                .endif
+              .endif
+            .endif
+          .endif
+        .endif
+      .endif
+    .endif
+    pop   {r0-r4, r9, r12, pc}          @ restore caller-save registers and return
 END \name
 .endm
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, r0
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, r1
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, r2
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, r3
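One subtlety in the `.ifc` ladder above: when `reg` is itself one of the pushed caller-saves, the result is written into `reg`'s own stack slot, so the final bulk `pop {r0-r4, r9, r12, pc}` restores every other register and simultaneously delivers the result in `reg`. A self-contained model of that slot-overwrite trick (the array stands in for the pushed register area):

```cpp
#include <cstdint>

// Slots as pushed above: {r0, r1, r2, r3, r4, r9, r12, lr} at offsets 0..28.
// Overwriting the argument register's slot with the call's result means the
// single bulk restore returns it in `reg` with no extra move instruction.
void StoreResultToSlot(uint32_t slots[8], int reg_slot_index, uint32_t result) {
  slots[reg_slot_index] = result;  // replaces the stale value pushed on entry
  // ...the subsequent pop reloads all slots, including the result...
}
```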
diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc
index 5385a2f..2e5f5ad 100644
--- a/runtime/arch/arm64/entrypoints_init_arm64.cc
+++ b/runtime/arch/arm64/entrypoints_init_arm64.cc
@@ -33,7 +33,9 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
+extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg03(mirror::Object*);
@@ -122,7 +124,7 @@
 
   // Read barrier.
   qpoints->pReadBarrierJni = ReadBarrierJni;
-  qpoints->pReadBarrierMarkReg00 = artReadBarrierMark;
+  qpoints->pReadBarrierMarkReg00 = art_quick_read_barrier_mark_reg00;
   qpoints->pReadBarrierMarkReg01 = art_quick_read_barrier_mark_reg01;
   qpoints->pReadBarrierMarkReg02 = art_quick_read_barrier_mark_reg02;
   qpoints->pReadBarrierMarkReg03 = art_quick_read_barrier_mark_reg03;
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index c893e77..6173ae7 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1253,6 +1253,22 @@
     .endif
 .endm
 
+// Restore xReg1's value from [sp, #offset] if xReg1 is not the same as xExclude.
+// Restore xReg2's value from [sp, #(offset + 8)] if xReg2 is not the same as xExclude.
+.macro POP_REGS_NE xReg1, xReg2, offset, xExclude
+    .ifc \xReg1, \xExclude
+        ldr \xReg2, [sp, #(\offset + 8)]        // restore xReg2
+    .else
+        .ifc \xReg2, \xExclude
+            ldr \xReg1, [sp, #\offset]          // restore xReg1
+        .else
+            ldp \xReg1, \xReg2, [sp, #\offset]  // restore xReg1 and xReg2
+        .endif
+    .endif
+    .cfi_restore \xReg1
+    .cfi_restore \xReg2
+.endm
+
     /*
      * Macro to insert read barrier, only used in art_quick_aput_obj.
      * xDest, wDest and xObj are registers, offset is a defined literal such as
@@ -2222,56 +2238,148 @@
 
     /*
      * Create a function `name` calling the ReadBarrier::Mark routine,
-     * getting its argument and returning its result through register
-     * `reg`, thus following a non-standard runtime calling convention:
-     * - `reg` is used to pass the (sole) argument of this function
+     * getting its argument and returning its result through W register
+     * `wreg` (corresponding to X register `xreg`), saving and restoring
+     * all caller-save registers.
+     *
+     * If `wreg` is different from `w0`, the generated function follows a
+     * non-standard runtime calling convention:
+     * - register `wreg` is used to pass the (sole) argument of this
+     *   function (instead of W0);
+     * - register `wreg` is used to return the result of this function
      *   (instead of W0);
-     * - `reg` is used to return the result of this function (instead of W0);
      * - W0 is treated like a normal (non-argument) caller-save register;
      * - everything else is the same as in the standard runtime calling
-     *   convention (e.g. same callee-save registers).
+     *   convention (e.g. standard callee-save registers are preserved).
      */
-.macro READ_BARRIER_MARK_REG name, reg
+.macro READ_BARRIER_MARK_REG name, wreg, xreg
 ENTRY \name
-    str   xLR, [sp, #-16]!              // Save return address and add padding (16B align stack).
-    .cfi_adjust_cfa_offset 16
-    .cfi_rel_offset x30, 0
-    mov   w0, \reg                      // Pass arg1 - obj from `reg`
+    /*
+     * Allocate 46 stack slots * 8 = 368 bytes:
+     * - 20 slots for core registers X0-X19
+     * - 24 slots for floating-point registers D0-D7 and D16-D31
+     * -  1 slot for return address register XLR
+     * -  1 padding slot for 16-byte stack alignment
+     */
+    // Save all potentially live caller-save core registers.
+    stp   x0, x1,   [sp, #-368]!
+    .cfi_adjust_cfa_offset 368
+    .cfi_rel_offset x0, 0
+    .cfi_rel_offset x1, 8
+    stp   x2, x3,   [sp, #16]
+    .cfi_rel_offset x2, 16
+    .cfi_rel_offset x3, 24
+    stp   x4, x5,   [sp, #32]
+    .cfi_rel_offset x4, 32
+    .cfi_rel_offset x5, 40
+    stp   x6, x7,   [sp, #48]
+    .cfi_rel_offset x6, 48
+    .cfi_rel_offset x7, 56
+    stp   x8, x9,   [sp, #64]
+    .cfi_rel_offset x8, 64
+    .cfi_rel_offset x9, 72
+    stp   x10, x11, [sp, #80]
+    .cfi_rel_offset x10, 80
+    .cfi_rel_offset x11, 88
+    stp   x12, x13, [sp, #96]
+    .cfi_rel_offset x12, 96
+    .cfi_rel_offset x13, 104
+    stp   x14, x15, [sp, #112]
+    .cfi_rel_offset x14, 112
+    .cfi_rel_offset x15, 120
+    stp   x16, x17, [sp, #128]
+    .cfi_rel_offset x16, 128
+    .cfi_rel_offset x17, 136
+    stp   x18, x19, [sp, #144]
+    .cfi_rel_offset x18, 144
+    .cfi_rel_offset x19, 152
+    // Save all potentially live caller-save floating-point registers.
+    stp   d0, d1,   [sp, #160]
+    stp   d2, d3,   [sp, #176]
+    stp   d4, d5,   [sp, #192]
+    stp   d6, d7,   [sp, #208]
+    stp   d16, d17, [sp, #224]
+    stp   d18, d19, [sp, #240]
+    stp   d20, d21, [sp, #256]
+    stp   d22, d23, [sp, #272]
+    stp   d24, d25, [sp, #288]
+    stp   d26, d27, [sp, #304]
+    stp   d28, d29, [sp, #320]
+    stp   d30, d31, [sp, #336]
+    // Save return address.
+    str   xLR,      [sp, #352]
+    .cfi_rel_offset x30, 352
+    // (sp + #360 is a padding slot)
+
+    .ifnc \wreg, w0
+      mov   w0, \wreg                   // Pass arg1 - obj from `wreg`
+    .endif
     bl    artReadBarrierMark            // artReadBarrierMark(obj)
-    mov   \reg, w0                      // Return result into `reg`
-    ldr   xLR, [sp], #16                // Restore return address and remove padding.
+    .ifnc \wreg, w0
+      mov   \wreg, w0                   // Return result into `wreg`
+    .endif
+
+    // Restore core regs, except `xreg`, as `wreg` is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REGS_NE x0, x1,   0,   \xreg
+    POP_REGS_NE x2, x3,   16,  \xreg
+    POP_REGS_NE x4, x5,   32,  \xreg
+    POP_REGS_NE x6, x7,   48,  \xreg
+    POP_REGS_NE x8, x9,   64,  \xreg
+    POP_REGS_NE x10, x11, 80,  \xreg
+    POP_REGS_NE x12, x13, 96,  \xreg
+    POP_REGS_NE x14, x15, 112, \xreg
+    POP_REGS_NE x16, x17, 128, \xreg
+    POP_REGS_NE x18, x19, 144, \xreg
+    // Restore floating-point registers.
+    ldp   d0, d1,   [sp, #160]
+    ldp   d2, d3,   [sp, #176]
+    ldp   d4, d5,   [sp, #192]
+    ldp   d6, d7,   [sp, #208]
+    ldp   d16, d17, [sp, #224]
+    ldp   d18, d19, [sp, #240]
+    ldp   d20, d21, [sp, #256]
+    ldp   d22, d23, [sp, #272]
+    ldp   d24, d25, [sp, #288]
+    ldp   d26, d27, [sp, #304]
+    ldp   d28, d29, [sp, #320]
+    ldp   d30, d31, [sp, #336]
+    // Restore return address and remove padding.
+    ldr   xLR,      [sp, #352]
     .cfi_restore x30
-    .cfi_adjust_cfa_offset -16
+    add sp, sp, #368
+    .cfi_adjust_cfa_offset -368
     ret
 END \name
 .endm
 
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28
-READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, w0,  x0
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, w1,  x1
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, w2,  x2
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, w3,  x3
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg04, w4,  x4
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, w5,  x5
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, w6,  x6
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, w7,  x7
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, w8,  x8
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, w9,  x9
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, w10, x10
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg11, w11, x11
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg12, w12, x12
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg13, w13, x13
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg14, w14, x14
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg15, w15, x15
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg16, w16, x16
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg17, w17, x17
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg18, w18, x18
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg19, w19, x19
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg20, w20, x20
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg21, w21, x21
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg22, w22, x22
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg23, w23, x23
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg24, w24, x24
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg25, w25, x25
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg26, w26, x26
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg27, w27, x27
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg28, w28, x28
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg29, w29, x29
diff --git a/runtime/arch/mips64/entrypoints_init_mips64.cc b/runtime/arch/mips64/entrypoints_init_mips64.cc
index b19aa01..b02edb6 100644
--- a/runtime/arch/mips64/entrypoints_init_mips64.cc
+++ b/runtime/arch/mips64/entrypoints_init_mips64.cc
@@ -28,8 +28,8 @@
 namespace art {
 
 // Cast entrypoints.
-extern "C" uint32_t artIsAssignableFromCode(const mirror::Class* klass,
-                                            const mirror::Class* ref_class);
+extern "C" size_t artIsAssignableFromCode(const mirror::Class* klass,
+                                          const mirror::Class* ref_class);
 // Math entrypoints.
 extern int32_t CmpgDouble(double a, double b);
 extern int32_t CmplDouble(double a, double b);
diff --git a/runtime/arch/x86/entrypoints_init_x86.cc b/runtime/arch/x86/entrypoints_init_x86.cc
index 8f13d58..4e9756c 100644
--- a/runtime/arch/x86/entrypoints_init_x86.cc
+++ b/runtime/arch/x86/entrypoints_init_x86.cc
@@ -31,7 +31,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index e75fecb..77e04e7 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1908,41 +1908,73 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrierMark entry point, getting input and returning
-// result through EAX (register 0), following the standard runtime
-// calling convention.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
-    PUSH eax                         // pass arg1 - obj
-    call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function
+// `reg`, saving and restoring all caller-save registers.
+//
+// If `reg` is different from `eax`, the generated function follows a
+// non-standard runtime calling convention:
+// - register `reg` is used to pass the (sole) argument of this function
 //   (instead of EAX);
-// - `reg` is used to return the result of this function (instead of EAX);
+// - register `reg` is used to return the result of this function
+//   (instead of EAX);
 // - EAX is treated like a normal (non-argument) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
-    subl LITERAL(8), %esp            // alignment padding
-    CFI_ADJUST_CFA_OFFSET(8)
+    // Save all potentially live caller-save core registers.
+    PUSH eax
+    PUSH ecx
+    PUSH edx
+    PUSH ebx
+    // 8-byte align the stack to improve (8-byte) XMM register saving and restoring,
+    // and create space for caller-save floating-point registers.
+    subl MACRO_LITERAL(4 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(4 + 8 * 8)
+    // Save all potentially live caller-save floating-point registers.
+    movsd %xmm0, 0(%esp)
+    movsd %xmm1, 8(%esp)
+    movsd %xmm2, 16(%esp)
+    movsd %xmm3, 24(%esp)
+    movsd %xmm4, 32(%esp)
+    movsd %xmm5, 40(%esp)
+    movsd %xmm6, 48(%esp)
+    movsd %xmm7, 56(%esp)
+
+    subl LITERAL(4), %esp            // alignment padding
+    CFI_ADJUST_CFA_OFFSET(4)
     PUSH RAW_VAR(reg)                // pass arg1 - obj from `reg`
     call SYMBOL(artReadBarrierMark)  // artReadBarrierMark(obj)
-    movl %eax, REG_VAR(reg)          // return result into `reg`
-    addl LITERAL(12), %esp           // pop argument and remove padding
-    CFI_ADJUST_CFA_OFFSET(-12)
+    .ifnc RAW_VAR(reg), eax
+      movl %eax, REG_VAR(reg)        // return result into `reg`
+    .endif
+    addl LITERAL(8), %esp            // pop argument and remove padding
+    CFI_ADJUST_CFA_OFFSET(-8)
+
+    // Restore floating-point registers.
+    movsd 0(%esp), %xmm0
+    movsd 8(%esp), %xmm1
+    movsd 16(%esp), %xmm2
+    movsd 24(%esp), %xmm3
+    movsd 32(%esp), %xmm4
+    movsd 40(%esp), %xmm5
+    movsd 48(%esp), %xmm6
+    movsd 56(%esp), %xmm7
+    // Remove floating-point registers and padding.
+    addl MACRO_LITERAL(8 * 8 + 4), %esp
+    CFI_ADJUST_CFA_OFFSET(-(8 * 8 + 4))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE ebx, RAW_VAR(reg)
+    POP_REG_NE edx, RAW_VAR(reg)
+    POP_REG_NE ecx, RAW_VAR(reg)
+    POP_REG_NE eax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, eax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, ecx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, edx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, ebx
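
For reference, the stack arithmetic in the macro above keeps ESP 16-byte aligned at the call to artReadBarrierMark: the caller's return address, four core-register pushes, the XMM spill area with its padding, the extra alignment padding, and the pushed argument add up to 96 bytes. A minimal hypothetical C++ check of that arithmetic (illustration only, not ART code):

    // Hypothetical sanity check of the x86 stack depth at the artReadBarrierMark call.
    #include <cstdint>

    int main() {
      std::uint32_t depth = 4;         // return address pushed by the caller's call
      depth += 4 * 4;                  // PUSH eax / ecx / edx / ebx
      depth += 4 + 8 * 8;              // padding + eight 8-byte XMM save slots
      depth += 4;                      // alignment padding before the argument
      depth += 4;                      // PUSH reg (the argument)
      return depth % 16 == 0 ? 0 : 1;  // exits 0: ESP is 16-byte aligned at the call
    }
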
diff --git a/runtime/arch/x86_64/asm_support_x86_64.S b/runtime/arch/x86_64/asm_support_x86_64.S
index cf0039c..c4e723c 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.S
+++ b/runtime/arch/x86_64/asm_support_x86_64.S
@@ -52,7 +52,7 @@
 
 #define LITERAL(value) $value
 #if defined(__APPLE__)
-    #define MACRO_LITERAL(value) $$(value)
+    #define MACRO_LITERAL(value) $(value)
 #else
     #define MACRO_LITERAL(value) $value
 #endif
diff --git a/runtime/arch/x86_64/entrypoints_init_x86_64.cc b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
index b566fb1..c2e3023 100644
--- a/runtime/arch/x86_64/entrypoints_init_x86_64.cc
+++ b/runtime/arch/x86_64/entrypoints_init_x86_64.cc
@@ -34,7 +34,8 @@
 // Read barrier entrypoints.
 // art_quick_read_barrier_mark_regX uses a non-standard calling
 // convention: it expects its input in register X and returns its
-// result in that same register.
+// result in that same register, and saves and restores all
+// caller-save registers.
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg00(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg01(mirror::Object*);
 extern "C" mirror::Object* art_quick_read_barrier_mark_reg02(mirror::Object*);
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 496e6a8..784ec39 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1815,73 +1815,93 @@
     UNREACHABLE
 END_FUNCTION art_nested_signal_return
 
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RAX (register 0), thus following a non-standard
-// runtime calling convention:
-// - RAX is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RAX is still used to return the result
-//   of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg00
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq %rax, %rdi                 // Pass arg1 - obj from RAX.
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg00
-
-// Call the ReadBarrier::Mark routine, getting argument and returning
-// result through RDI (register 7), thus following a non-standard
-// runtime calling convention:
-// - RDI is used to return the result of this function (instead of RAX);
-// - RAX is treated like a normal (non-result) caller-save register;
-// - everything else is the same as in the standard runtime calling
-//   convention; in particular, RDI is still used to pass the (sole)
-//   argument of this function.
-DEFINE_FUNCTION art_quick_read_barrier_mark_reg07
-    SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, %rdi                 // Return result into RDI.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
-    RESTORE_FP_CALLEE_SAVE_FRAME
-    ret
-END_FUNCTION art_quick_read_barrier_mark_reg07
-
 // Create a function `name` calling the ReadBarrier::Mark routine,
 // getting its argument and returning its result through register
-// `reg`, thus following a non-standard runtime calling convention:
-// - `reg` is used to pass the (sole) argument of this function (instead
-//   of RDI);
-// - `reg` is used to return the result of this function (instead of RAX);
-// - RDI is treated like a normal (non-argument) caller-save register;
-// - RAX is treated like a normal (non-result) caller-save register;
+// `reg`, saving and restoring all caller-save registers.
+//
+// The generated function follows a non-standard runtime calling
+// convention:
+// - register `reg` (which may be different from RDI) is used to pass
+//   the (sole) argument of this function;
+// - register `reg` (which may be different from RAX) is used to return
+//   the result of this function;
+// - if `reg` is different from `rdi`, RDI is treated like a normal
+//   (non-argument) caller-save register;
+// - if `reg` is different from `rax`, RAX is treated like a normal
+//   (non-result) caller-save register;
 // - everything else is the same as in the standard runtime calling
-//   convention (e.g. same callee-save registers).
+//   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Save all potentially live caller-save core registers.
+    PUSH rax
+    PUSH rcx
+    PUSH rdx
+    PUSH rsi
+    PUSH rdi
+    PUSH r8
+    PUSH r9
+    PUSH r10
+    PUSH r11
+    // Create space for caller-save floating-point registers.
+    subq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(12 * 8)
+    // Save all potentially live caller-save floating-point registers.
+    movq %xmm0, 0(%rsp)
+    movq %xmm1, 8(%rsp)
+    movq %xmm2, 16(%rsp)
+    movq %xmm3, 24(%rsp)
+    movq %xmm4, 32(%rsp)
+    movq %xmm5, 40(%rsp)
+    movq %xmm6, 48(%rsp)
+    movq %xmm7, 56(%rsp)
+    movq %xmm8, 64(%rsp)
+    movq %xmm9, 72(%rsp)
+    movq %xmm10, 80(%rsp)
+    movq %xmm11, 88(%rsp)
     SETUP_FP_CALLEE_SAVE_FRAME
-    subq LITERAL(8), %rsp           // Alignment padding.
-    CFI_ADJUST_CFA_OFFSET(8)
-    movq REG_VAR(reg), %rdi         // Pass arg1 - obj from `reg`.
+
+    .ifnc RAW_VAR(reg), rdi
+      movq REG_VAR(reg), %rdi       // Pass arg1 - obj from `reg`.
+    .endif
     call SYMBOL(artReadBarrierMark) // artReadBarrierMark(obj)
-    movq %rax, REG_VAR(reg)         // Return result into `reg`.
-    addq LITERAL(8), %rsp           // Remove padding.
-    CFI_ADJUST_CFA_OFFSET(-8)
+    .ifnc RAW_VAR(reg), rax
+      movq %rax, REG_VAR(reg)       // Return result into `reg`.
+    .endif
+
     RESTORE_FP_CALLEE_SAVE_FRAME
+    // Restore floating-point registers.
+    movq 0(%rsp), %xmm0
+    movq 8(%rsp), %xmm1
+    movq 16(%rsp), %xmm2
+    movq 24(%rsp), %xmm3
+    movq 32(%rsp), %xmm4
+    movq 40(%rsp), %xmm5
+    movq 48(%rsp), %xmm6
+    movq 56(%rsp), %xmm7
+    movq 64(%rsp), %xmm8
+    movq 72(%rsp), %xmm9
+    movq 80(%rsp), %xmm10
+    movq 88(%rsp), %xmm11
+    // Remove floating-point registers.
+    addq MACRO_LITERAL(12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(12 * 8))
+    // Restore core regs, except `reg`, as it is used to return the
+    // result of this function (simply remove it from the stack instead).
+    POP_REG_NE r11, RAW_VAR(reg)
+    POP_REG_NE r10, RAW_VAR(reg)
+    POP_REG_NE r9, RAW_VAR(reg)
+    POP_REG_NE r8, RAW_VAR(reg)
+    POP_REG_NE rdi, RAW_VAR(reg)
+    POP_REG_NE rsi, RAW_VAR(reg)
+    POP_REG_NE rdx, RAW_VAR(reg)
+    POP_REG_NE rcx, RAW_VAR(reg)
+    POP_REG_NE rax, RAW_VAR(reg)
     ret
     END_FUNCTION VAR(name)
 END_MACRO
 
-// Note: art_quick_read_barrier_mark_reg00 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg00, rax
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg01, rcx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg02, rdx
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg03, rbx
@@ -1889,7 +1909,7 @@
 // cannot be used to pass arguments.
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg05, rbp
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg06, rsi
-// Note: art_quick_read_barrier_mark_reg07 is implemented above.
+READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg07, rdi
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg08, r8
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg09, r9
 READ_BARRIER_MARK_REG art_quick_read_barrier_mark_reg10, r10
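
POP_REG_NE is what lets the epilogue restore every saved core register except the one carrying the marked reference: when the register being popped is `reg` itself, only the stack slot is discarded. A small hypothetical C++ model of that behavior (names invented for illustration):

    // Hypothetical model of POP_REG_NE: restore a saved register unless it is
    // the result register, in which case only the stack slot is discarded.
    #include <cstdint>
    #include <map>
    #include <string>
    #include <vector>

    void PopRegNe(std::map<std::string, std::uint64_t>& regs,
                  std::vector<std::uint64_t>& stack,
                  const std::string& popped,
                  const std::string& result_reg) {
      std::uint64_t value = stack.back();
      stack.pop_back();                // the slot is always removed
      if (popped != result_reg) {
        regs[popped] = value;          // restore; skipped for the result register
      }
    }

    int main() {
      std::map<std::string, std::uint64_t> regs;
      std::vector<std::uint64_t> stack = {42};
      PopRegNe(regs, stack, "rax", "rax");  // rax holds the marked reference
      return (regs.count("rax") == 0 && stack.empty()) ? 0 : 1;
    }
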
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index e5a2f36..d0dad64 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -7695,7 +7695,7 @@
   }
 
   if (is_static) {
-    resolved = mirror::Class::FindStaticField(self, klass, dex_cache.Get(), field_idx);
+    resolved = mirror::Class::FindStaticField(self, klass.Get(), dex_cache.Get(), field_idx);
   } else {
     resolved = klass->FindInstanceField(dex_cache.Get(), field_idx);
   }
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 3011112..4019a5b 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -153,6 +153,14 @@
   }
 }
 
+inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
+  // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
+  if (UNLIKELY(mark_from_read_barrier_measurements_)) {
+    return MarkFromReadBarrierWithMeasurements(from_ref);
+  }
+  return Mark(from_ref);
+}
+
 inline mirror::Object* ConcurrentCopying::GetFwdPtr(mirror::Object* from_ref) {
   DCHECK(region_space_->IsInFromSpace(from_ref));
   LockWord lw = from_ref->GetLockWord(false);
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index b7b5aa0..155e032 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -17,7 +17,9 @@
 #include "concurrent_copying.h"
 
 #include "art_field-inl.h"
+#include "base/histogram-inl.h"
 #include "base/stl_util.h"
+#include "base/systrace.h"
 #include "debugger.h"
 #include "gc/accounting/heap_bitmap-inl.h"
 #include "gc/accounting/space_bitmap-inl.h"
@@ -39,7 +41,9 @@
 
 static constexpr size_t kDefaultGcMarkStackSize = 2 * MB;
 
-ConcurrentCopying::ConcurrentCopying(Heap* heap, const std::string& name_prefix)
+ConcurrentCopying::ConcurrentCopying(Heap* heap,
+                                     const std::string& name_prefix,
+                                     bool measure_read_barrier_slow_path)
     : GarbageCollector(heap,
                        name_prefix + (name_prefix.empty() ? "" : " ") +
                        "concurrent copying + mark sweep"),
@@ -54,6 +58,14 @@
       heap_mark_bitmap_(nullptr), live_stack_freeze_size_(0), mark_stack_mode_(kMarkStackModeOff),
       weak_ref_access_enabled_(true),
       skipped_blocks_lock_("concurrent copying bytes blocks lock", kMarkSweepMarkStackLock),
+      measure_read_barrier_slow_path_(measure_read_barrier_slow_path),
+      rb_slow_path_ns_(0),
+      rb_slow_path_count_(0),
+      rb_slow_path_count_gc_(0),
+      rb_slow_path_histogram_lock_("Read barrier histogram lock"),
+      rb_slow_path_time_histogram_("Mutator time in read barrier slow path", 500, 32),
+      rb_slow_path_count_total_(0),
+      rb_slow_path_count_gc_total_(0),
       rb_table_(heap_->GetReadBarrierTable()),
       force_evacuate_all_(false),
       immune_gray_stack_lock_("concurrent copying immune gray stack lock",
@@ -162,6 +174,14 @@
     MutexLock mu(Thread::Current(), mark_stack_lock_);
     CHECK(false_gray_stack_.empty());
   }
+
+  mark_from_read_barrier_measurements_ = measure_read_barrier_slow_path_;
+  if (measure_read_barrier_slow_path_) {
+    rb_slow_path_ns_.StoreRelaxed(0);
+    rb_slow_path_count_.StoreRelaxed(0);
+    rb_slow_path_count_gc_.StoreRelaxed(0);
+  }
+
   immune_spaces_.Reset();
   bytes_moved_.StoreRelaxed(0);
   objects_moved_.StoreRelaxed(0);
@@ -1996,9 +2016,17 @@
     MutexLock mu(Thread::Current(), skipped_blocks_lock_);
     skipped_blocks_map_.clear();
   }
-  ReaderMutexLock mu(self, *Locks::mutator_lock_);
-  WriterMutexLock mu2(self, *Locks::heap_bitmap_lock_);
-  heap_->ClearMarkedObjects();
+  {
+    ReaderMutexLock mu(self, *Locks::mutator_lock_);
+    WriterMutexLock mu2(self, *Locks::heap_bitmap_lock_);
+    heap_->ClearMarkedObjects();
+  }
+  if (measure_read_barrier_slow_path_) {
+    MutexLock mu(self, rb_slow_path_histogram_lock_);
+    rb_slow_path_time_histogram_.AdjustAndAddValue(rb_slow_path_ns_.LoadRelaxed());
+    rb_slow_path_count_total_ += rb_slow_path_count_.LoadRelaxed();
+    rb_slow_path_count_gc_total_ += rb_slow_path_count_gc_.LoadRelaxed();
+  }
 }
 
 bool ConcurrentCopying::IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) {
@@ -2036,6 +2064,37 @@
   region_space_->RevokeAllThreadLocalBuffers();
 }
 
+mirror::Object* ConcurrentCopying::MarkFromReadBarrierWithMeasurements(mirror::Object* from_ref) {
+  if (Thread::Current() != thread_running_gc_) {
+    rb_slow_path_count_.FetchAndAddRelaxed(1u);
+  } else {
+    rb_slow_path_count_gc_.FetchAndAddRelaxed(1u);
+  }
+  ScopedTrace tr(__FUNCTION__);
+  const uint64_t start_time = measure_read_barrier_slow_path_ ? NanoTime() : 0u;
+  mirror::Object* ret = Mark(from_ref);
+  if (measure_read_barrier_slow_path_) {
+    rb_slow_path_ns_.FetchAndAddRelaxed(NanoTime() - start_time);
+  }
+  return ret;
+}
+
+void ConcurrentCopying::DumpPerformanceInfo(std::ostream& os) {
+  GarbageCollector::DumpPerformanceInfo(os);
+  MutexLock mu(Thread::Current(), rb_slow_path_histogram_lock_);
+  if (rb_slow_path_time_histogram_.SampleSize() > 0) {
+    Histogram<uint64_t>::CumulativeData cumulative_data;
+    rb_slow_path_time_histogram_.CreateHistogram(&cumulative_data);
+    rb_slow_path_time_histogram_.PrintConfidenceIntervals(os, 0.99, cumulative_data);
+  }
+  if (rb_slow_path_count_total_ > 0) {
+    os << "Slow path count " << rb_slow_path_count_total_ << "\n";
+  }
+  if (rb_slow_path_count_gc_total_ > 0) {
+    os << "GC slow path count " << rb_slow_path_count_gc_total_ << "\n";
+  }
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
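
The measurement scheme above is deliberately two-level: the per-call counters are relaxed atomics touched on the mutator-visible slow path, and FinishPhase folds them into the histogram totals under rb_slow_path_histogram_lock_ once per collection. A stripped-down hypothetical sketch of that shape, with std::chrono standing in for NanoTime() (illustration only):

    // Hypothetical sketch of the slow-path measurement scheme; names are
    // invented and std::chrono stands in for ART's NanoTime().
    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <mutex>

    std::atomic<std::uint64_t> slow_path_ns{0};     // hot path: relaxed updates only
    std::atomic<std::uint64_t> slow_path_count{0};
    std::mutex histogram_lock;
    std::uint64_t count_total = 0;                  // guarded by histogram_lock

    std::uint64_t NowNs() {
      return std::chrono::duration_cast<std::chrono::nanoseconds>(
                 std::chrono::steady_clock::now().time_since_epoch()).count();
    }

    void MarkWithMeasurements() {
      slow_path_count.fetch_add(1, std::memory_order_relaxed);
      const std::uint64_t start = NowNs();
      // ... the real code runs Mark(from_ref) here ...
      slow_path_ns.fetch_add(NowNs() - start, std::memory_order_relaxed);
    }

    void FoldIntoTotals() {  // once per GC, as in FinishPhase()
      std::lock_guard<std::mutex> lock(histogram_lock);
      count_total += slow_path_count.exchange(0, std::memory_order_relaxed);
    }

    int main() {
      MarkWithMeasurements();
      FoldIntoTotals();
      return count_total == 1 ? 0 : 1;
    }
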
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 166a1f0..6a8d052 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -58,17 +58,24 @@
   // Enable verbose mode.
   static constexpr bool kVerboseMode = false;
 
-  ConcurrentCopying(Heap* heap, const std::string& name_prefix = "");
+  ConcurrentCopying(Heap* heap,
+                    const std::string& name_prefix = "",
+                    bool measure_read_barrier_slow_path = false);
   ~ConcurrentCopying();
 
   virtual void RunPhases() OVERRIDE
-      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+      REQUIRES(!immune_gray_stack_lock_,
+               !mark_stack_lock_,
+               !rb_slow_path_histogram_lock_,
+               !skipped_blocks_lock_);
   void InitializePhase() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !immune_gray_stack_lock_);
   void MarkingPhase() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   void ReclaimPhase() SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!mark_stack_lock_);
-  void FinishPhase() REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_);
+  void FinishPhase() REQUIRES(!mark_stack_lock_,
+                              !rb_slow_path_histogram_lock_,
+                              !skipped_blocks_lock_);
 
   void BindBitmaps() SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!Locks::heap_bitmap_lock_);
@@ -95,7 +102,11 @@
     return IsMarked(ref) == ref;
   }
   template<bool kGrayImmuneObject = true>
-  ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref) SHARED_REQUIRES(Locks::mutator_lock_)
+  ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+  ALWAYS_INLINE mirror::Object* MarkFromReadBarrier(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   bool IsMarking() const {
     return is_marking_;
@@ -203,6 +214,10 @@
       REQUIRES(!mark_stack_lock_);
   void ScanImmuneObject(mirror::Object* obj)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!mark_stack_lock_);
+  mirror::Object* MarkFromReadBarrierWithMeasurements(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_)
+      REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
+  void DumpPerformanceInfo(std::ostream& os) OVERRIDE REQUIRES(!rb_slow_path_histogram_lock_);
 
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
@@ -251,6 +266,20 @@
   Atomic<size_t> to_space_bytes_skipped_;
   Atomic<size_t> to_space_objects_skipped_;
 
+  // If measure_read_barrier_slow_path_ is true, we count how much time is spent in
+  // MarkFromReadBarrier and also log the results.
+  bool measure_read_barrier_slow_path_;
+  // mark_from_read_barrier_measurements_ is true if systrace is enabled or
+  // measure_read_barrier_slow_path_ is true.
+  bool mark_from_read_barrier_measurements_;
+  Atomic<uint64_t> rb_slow_path_ns_;
+  Atomic<uint64_t> rb_slow_path_count_;
+  Atomic<uint64_t> rb_slow_path_count_gc_;
+  mutable Mutex rb_slow_path_histogram_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  Histogram<uint64_t> rb_slow_path_time_histogram_ GUARDED_BY(rb_slow_path_histogram_lock_);
+  uint64_t rb_slow_path_count_total_ GUARDED_BY(rb_slow_path_histogram_lock_);
+  uint64_t rb_slow_path_count_gc_total_ GUARDED_BY(rb_slow_path_histogram_lock_);
+
   accounting::ReadBarrierTable* rb_table_;
   bool force_evacuate_all_;  // True if all regions are evacuated.
   Atomic<bool> updated_all_immune_objects_;
diff --git a/runtime/gc/collector/garbage_collector.h b/runtime/gc/collector/garbage_collector.h
index 580486a..e0b71a7 100644
--- a/runtime/gc/collector/garbage_collector.h
+++ b/runtime/gc/collector/garbage_collector.h
@@ -181,7 +181,7 @@
   void RecordFree(const ObjectBytePair& freed);
   // Record a free of large objects.
   void RecordFreeLOS(const ObjectBytePair& freed);
-  void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
+  virtual void DumpPerformanceInfo(std::ostream& os) REQUIRES(!pause_histogram_lock_);
 
   // Helper functions for querying if objects are marked. These are used for processing references,
   // and will be used for reading system weaks while the GC is running.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a6d62a9..6f4767e 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -157,6 +157,7 @@
            bool verify_pre_sweeping_rosalloc,
            bool verify_post_gc_rosalloc,
            bool gc_stress_mode,
+           bool measure_gc_performance,
            bool use_homogeneous_space_compaction_for_oom,
            uint64_t min_interval_homogeneous_space_compaction_by_oom)
     : non_moving_space_(nullptr),
@@ -599,7 +600,9 @@
       garbage_collectors_.push_back(semi_space_collector_);
     }
     if (MayUseCollector(kCollectorTypeCC)) {
-      concurrent_copying_collector_ = new collector::ConcurrentCopying(this);
+      concurrent_copying_collector_ = new collector::ConcurrentCopying(this,
+                                                                       "",
+                                                                       measure_gc_performance);
       garbage_collectors_.push_back(concurrent_copying_collector_);
     }
     if (MayUseCollector(kCollectorTypeMC)) {
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 6fb048a..bb0d11a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -182,6 +182,7 @@
        bool verify_pre_sweeping_rosalloc,
        bool verify_post_gc_rosalloc,
        bool gc_stress_mode,
+       bool measure_gc_performance,
        bool use_homogeneous_space_compaction,
        uint64_t min_interval_homogeneous_space_compaction_by_oom);
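
The new flag rides through construction as a defaulted trailing parameter (see the ConcurrentCopying declaration above), so existing call sites stay unchanged while Heap can pass measure_gc_performance through. A hypothetical, self-contained sketch of that plumbing pattern (names invented):

    #include <string>

    // Hypothetical sketch: a defaulted constructor parameter threads a new
    // measurement flag through without touching existing call sites.
    class Collector {
     public:
      explicit Collector(const std::string& name_prefix = "",
                         bool measure_slow_path = false)
          : name_prefix_(name_prefix), measure_slow_path_(measure_slow_path) {}
      bool measuring() const { return measure_slow_path_; }
     private:
      std::string name_prefix_;
      bool measure_slow_path_;
    };

    int main() {
      Collector default_collector;      // old call sites: flag defaults to false
      Collector measured("cc", true);   // new call site passes the option through
      return (!default_collector.measuring() && measured.measuring()) ? 0 : 1;
    }
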
 
diff --git a/runtime/interpreter/mterp/arm64/fbinop2addr.S b/runtime/interpreter/mterp/arm64/fbinop2addr.S
index 0d57cbf..04236ad 100644
--- a/runtime/interpreter/mterp/arm64/fbinop2addr.S
+++ b/runtime/interpreter/mterp/arm64/fbinop2addr.S
@@ -7,8 +7,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     $instr                              // s2<- op
diff --git a/runtime/interpreter/mterp/arm64/footer.S b/runtime/interpreter/mterp/arm64/footer.S
index 2d3a11e..7628ed3 100644
--- a/runtime/interpreter/mterp/arm64/footer.S
+++ b/runtime/interpreter/mterp/arm64/footer.S
@@ -234,7 +234,7 @@
 #if MTERP_LOGGING
     mov  x0, xSELF
     add  x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm x2, xINST, 0, 31
+    sxtw x2, wINST
     bl MterpLogOSR
 #endif
     mov  x0, #1                         // Signal normal return
diff --git a/runtime/interpreter/mterp/arm64/funopNarrow.S b/runtime/interpreter/mterp/arm64/funopNarrow.S
index 9f5ad1e..aed830b 100644
--- a/runtime/interpreter/mterp/arm64/funopNarrow.S
+++ b/runtime/interpreter/mterp/arm64/funopNarrow.S
@@ -8,10 +8,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG $tgtreg, w4                // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopNarrower.S b/runtime/interpreter/mterp/arm64/funopNarrower.S
index 411396b..6fddfea 100644
--- a/runtime/interpreter/mterp/arm64/funopNarrower.S
+++ b/runtime/interpreter/mterp/arm64/funopNarrower.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG $tgtreg, w4                // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopWide.S b/runtime/interpreter/mterp/arm64/funopWide.S
index d83b39c..409e26b 100644
--- a/runtime/interpreter/mterp/arm64/funopWide.S
+++ b/runtime/interpreter/mterp/arm64/funopWide.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE $tgtreg, w4           // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/funopWider.S b/runtime/interpreter/mterp/arm64/funopWider.S
index 50a73f1..4c91ebc 100644
--- a/runtime/interpreter/mterp/arm64/funopWider.S
+++ b/runtime/interpreter/mterp/arm64/funopWider.S
@@ -7,10 +7,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG $srcreg, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     $instr                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE $tgtreg, w4           // vA<- d0
diff --git a/runtime/interpreter/mterp/arm64/op_const_wide_16.S b/runtime/interpreter/mterp/arm64/op_const_wide_16.S
index e43628b..553d481 100644
--- a/runtime/interpreter/mterp/arm64/op_const_wide_16.S
+++ b/runtime/interpreter/mterp/arm64/op_const_wide_16.S
@@ -1,8 +1,7 @@
     /* const-wide/16 vAA, #+BBBB */
-    FETCH_S w0, 1                       // w0<- ssssBBBB (sign-extended
+    FETCH_S x0, 1                       // x0<- ssssssssssssBBBB (sign-extended)
     lsr     w3, wINST, #8               // w3<- AA
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    sbfm    x0, x0, 0, 31
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_const_wide_32.S b/runtime/interpreter/mterp/arm64/op_const_wide_32.S
index 527f7d8..9dc4fc3 100644
--- a/runtime/interpreter/mterp/arm64/op_const_wide_32.S
+++ b/runtime/interpreter/mterp/arm64/op_const_wide_32.S
@@ -1,10 +1,9 @@
     /* const-wide/32 vAA, #+BBBBbbbb */
-    FETCH w0, 1                         // w0<- 0000bbbb (low)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (low)
     lsr     w3, wINST, #8               // w3<- AA
-    FETCH_S w2, 2                       // w2<- ssssBBBB (high)
+    FETCH_S x2, 2                       // x2<- ssssssssssssBBBB (high)
     FETCH_ADVANCE_INST 3                // advance rPC, load wINST
     GET_INST_OPCODE ip                  // extract opcode from wINST
-    orr     w0, w0, w2, lsl #16         // w0<- BBBBbbbb
-    sbfm    x0, x0, 0, 31
+    orr     x0, x0, x2, lsl #16         // x0<- ssssssssBBBBbbbb
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/op_iget_quick.S b/runtime/interpreter/mterp/arm64/op_iget_quick.S
index 45c68a3..699b2c4 100644
--- a/runtime/interpreter/mterp/arm64/op_iget_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iget_quick.S
@@ -5,8 +5,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     $load   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     $extend
diff --git a/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S b/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
index 2480d2d..30b30c2 100644
--- a/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iget_wide_quick.S
@@ -3,7 +3,7 @@
     FETCH w4, 1                         // w4<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cbz     w3, common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     add     x4, x3, x4                  // create direct pointer
     ldr     x0, [x4]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
diff --git a/runtime/interpreter/mterp/arm64/op_instance_of.S b/runtime/interpreter/mterp/arm64/op_instance_of.S
index 647bc75..a56705a 100644
--- a/runtime/interpreter/mterp/arm64/op_instance_of.S
+++ b/runtime/interpreter/mterp/arm64/op_instance_of.S
@@ -13,8 +13,7 @@
     mov       x3, xSELF                 // w3<- self
     bl        MterpInstanceOf           // (index, &obj, method, self)
     ldr       x1, [xSELF, #THREAD_EXCEPTION_OFFSET]
-    lsr       w2, wINST, #8             // w2<- A+
-    and       w2, w2, #15               // w2<- A
+    ubfx      w2, wINST, #8, #4         // w2<- A
     PREFETCH_INST 2
     cbnz      x1, MterpException
     ADVANCE 2                           // advance rPC
diff --git a/runtime/interpreter/mterp/arm64/op_int_to_long.S b/runtime/interpreter/mterp/arm64/op_int_to_long.S
index 13d2120..35830f3 100644
--- a/runtime/interpreter/mterp/arm64/op_int_to_long.S
+++ b/runtime/interpreter/mterp/arm64/op_int_to_long.S
@@ -1 +1 @@
-%include "arm64/funopWider.S" {"instr":"sbfm x0, x0, 0, 31", "srcreg":"w0", "tgtreg":"x0"}
+%include "arm64/funopWider.S" {"instr":"sxtw x0, w0", "srcreg":"w0", "tgtreg":"x0"}
diff --git a/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S b/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
index 27b5dc5..566e2bf 100644
--- a/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
+++ b/runtime/interpreter/mterp/arm64/op_iput_wide_quick.S
@@ -3,8 +3,7 @@
     FETCH w3, 1                         // w3<- field byte offset
     GET_VREG w2, w2                     // w2<- fp[B], the object pointer
     ubfx    w0, wINST, #8, #4           // w0<- A
-    cmp     w2, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w2, common_errNullObject    // object was null
     GET_VREG_WIDE x0, w0                // x0<- fp[A]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
     add     x1, x2, x3                  // create a direct pointer
diff --git a/runtime/interpreter/mterp/arm64/op_packed_switch.S b/runtime/interpreter/mterp/arm64/op_packed_switch.S
index 1456f1a..4faa6d2 100644
--- a/runtime/interpreter/mterp/arm64/op_packed_switch.S
+++ b/runtime/interpreter/mterp/arm64/op_packed_switch.S
@@ -9,12 +9,12 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      $func                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
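
The rewritten fetch sequence builds the sign-extended 32-bit branch offset directly in a 64-bit register: the low half-word is fetched zero-extended, the high half-word sign-extended, and the `orr` with `lsl #16` merges them. A hypothetical C++ rendering of that arithmetic (illustration only):

    #include <cstdint>

    // x0 <- ssssssssBBBBbbbb, assembled from the two 16-bit code units.
    std::int64_t SwitchOffset(std::uint16_t bbbb, std::uint16_t BBBB) {
      std::uint64_t x0 = bbbb;                            // FETCH   w0, 1 (zero-extended)
      std::int64_t x1 = static_cast<std::int16_t>(BBBB);  // FETCH_S x1, 2 (sign-extended)
      return static_cast<std::int64_t>(x0 | (static_cast<std::uint64_t>(x1) << 16));
    }

    int main() {
      // 0xFFFF1234 interpreted as a signed 32-bit offset, i.e. -60876.
      return SwitchOffset(0x1234, 0xFFFF) == -60876 ? 0 : 1;
    }
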
diff --git a/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S b/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
index 0b91891..95f81c5 100644
--- a/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_rem_float_2addr.S
@@ -1,12 +1,10 @@
     /* rem vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     bl  fmodf
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG s0, w9
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int.S b/runtime/interpreter/mterp/arm64/op_shl_int.S
index bd0f237..3062a3f 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S b/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
index b4671d2..9a7e09f 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S b/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
index 4dd32e0..17f57f9 100644
--- a/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_shl_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"lsl     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"lsl     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int.S b/runtime/interpreter/mterp/arm64/op_shr_int.S
index c214a18..493b740 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S b/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
index 3c1484b..6efe8ee 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S b/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
index 26d5024..274080c 100644
--- a/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_shr_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"asr     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"asr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int.S b/runtime/interpreter/mterp/arm64/op_ushr_int.S
index bb8382b..005452b 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int.S
@@ -1 +1 @@
-%include "arm64/binop.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binop.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S b/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
index dbccb99..1cb8cb7 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int_2addr.S
@@ -1 +1 @@
-%include "arm64/binop2addr.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binop2addr.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S b/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
index 35090c4..ff30e1f 100644
--- a/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
+++ b/runtime/interpreter/mterp/arm64/op_ushr_int_lit8.S
@@ -1 +1 @@
-%include "arm64/binopLit8.S" {"preinstr":"and     w1, w1, #31", "instr":"lsr     w0, w0, w1"}
+%include "arm64/binopLit8.S" {"instr":"lsr     w0, w0, w1"}
diff --git a/runtime/interpreter/mterp/arm64/shiftWide.S b/runtime/interpreter/mterp/arm64/shiftWide.S
index 6306fca..dcb2fb7 100644
--- a/runtime/interpreter/mterp/arm64/shiftWide.S
+++ b/runtime/interpreter/mterp/arm64/shiftWide.S
@@ -12,8 +12,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    $opcode  x0, x1, x2                 // Do the shift.
+    $opcode  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
diff --git a/runtime/interpreter/mterp/arm64/shiftWide2addr.S b/runtime/interpreter/mterp/arm64/shiftWide2addr.S
index 77d104a..b860dfd 100644
--- a/runtime/interpreter/mterp/arm64/shiftWide2addr.S
+++ b/runtime/interpreter/mterp/arm64/shiftWide2addr.S
@@ -8,8 +8,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    $opcode x0, x0, x1
+    $opcode x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
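
Dropping the explicit `and ... #63` in these shift helpers is safe because the AArch64 variable shift instructions (LSLV/ASRV/LSRV) consume only the low six bits of the shift register; the hardware applies the `& 63` that the removed instruction duplicated. A hypothetical C++ model of that semantics (illustration only):

    #include <cstdint>

    // Semantics of AArch64 LSLV: shift by the low six bits of the shift register.
    std::uint64_t LslvModel(std::uint64_t value, std::uint64_t shift) {
      return value << (shift & 63);
    }

    int main() {
      // A shift amount of 65 behaves like a shift by 1.
      return LslvModel(1, 65) == 2 ? 0 : 1;
    }
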
diff --git a/runtime/interpreter/mterp/out/mterp_arm64.S b/runtime/interpreter/mterp/out/mterp_arm64.S
index df0b686..d470551 100644
--- a/runtime/interpreter/mterp/out/mterp_arm64.S
+++ b/runtime/interpreter/mterp/out/mterp_arm64.S
@@ -747,10 +747,9 @@
 .L_op_const_wide_16: /* 0x16 */
 /* File: arm64/op_const_wide_16.S */
     /* const-wide/16 vAA, #+BBBB */
-    FETCH_S w0, 1                       // w0<- ssssBBBB (sign-extended
+    FETCH_S x0, 1                       // x0<- ssssssssssssBBBB (sign-extended)
     lsr     w3, wINST, #8               // w3<- AA
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    sbfm    x0, x0, 0, 31
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
@@ -760,13 +759,12 @@
 .L_op_const_wide_32: /* 0x17 */
 /* File: arm64/op_const_wide_32.S */
     /* const-wide/32 vAA, #+BBBBbbbb */
-    FETCH w0, 1                         // w0<- 0000bbbb (low)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (low)
     lsr     w3, wINST, #8               // w3<- AA
-    FETCH_S w2, 2                       // w2<- ssssBBBB (high)
+    FETCH_S x2, 2                       // x2<- ssssssssssssBBBB (high)
     FETCH_ADVANCE_INST 3                // advance rPC, load wINST
     GET_INST_OPCODE ip                  // extract opcode from wINST
-    orr     w0, w0, w2, lsl #16         // w0<- BBBBbbbb
-    sbfm    x0, x0, 0, 31
+    orr     x0, x0, x2, lsl #16         // x0<- ssssssssBBBBbbbb
     SET_VREG_WIDE x0, w3
     GOTO_OPCODE ip                      // jump to next instruction
 
@@ -934,8 +932,7 @@
     mov       x3, xSELF                 // w3<- self
     bl        MterpInstanceOf           // (index, &obj, method, self)
     ldr       x1, [xSELF, #THREAD_EXCEPTION_OFFSET]
-    lsr       w2, wINST, #8             // w2<- A+
-    and       w2, w2, #15               // w2<- A
+    ubfx      w2, wINST, #8, #4         // w2<- A
     PREFETCH_INST 2
     cbnz      x1, MterpException
     ADVANCE 2                           // advance rPC
@@ -1143,14 +1140,14 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      MterpDoPackedSwitch                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
 
 /* ------------------------------ */
@@ -1168,14 +1165,14 @@
      * for: packed-switch, sparse-switch
      */
     /* op vAA, +BBBB */
-    FETCH w0, 1                         // w0<- bbbb (lo)
-    FETCH w1, 2                         // w1<- BBBB (hi)
+    FETCH   w0, 1                       // x0<- 000000000000bbbb (lo)
+    FETCH_S x1, 2                       // x1<- ssssssssssssBBBB (hi)
     lsr     w3, wINST, #8               // w3<- AA
-    orr     w0, w0, w1, lsl #16         // w0<- BBBBbbbb
+    orr     x0, x0, x1, lsl #16         // x0<- ssssssssBBBBbbbb
     GET_VREG w1, w3                     // w1<- vAA
-    add     x0, xPC, w0, lsl #1         // w0<- PC + BBBBbbbb*2
+    add     x0, xPC, x0, lsl #1         // x0<- PC + BBBBbbbb*2
     bl      MterpDoSparseSwitch                       // w0<- code-unit branch offset
-    sbfm    xINST, x0, 0, 31
+    sxtw    xINST, w0
     b       MterpCommonTakenBranchNoFlags
 
 
@@ -3345,11 +3342,10 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
-    sbfm x0, x0, 0, 31                              // d0<- op
+    sxtw x0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -3369,10 +3365,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf s0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -3392,10 +3387,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG w0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf d0, w0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3415,10 +3409,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
                                   // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3438,10 +3431,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf s0, x0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -3461,10 +3453,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE x0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     scvtf d0, x0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3485,10 +3476,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs w0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3508,10 +3498,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs x0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
@@ -3531,10 +3520,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG s0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvt  d0, s0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE d0, w4           // vA<- d0
@@ -3554,10 +3542,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs w0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG w0, w4                // vA<- d0
@@ -3577,10 +3564,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvtzs x0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG_WIDE x0, w4           // vA<- d0
@@ -3600,10 +3586,9 @@
      */
     /* unop vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w4, wINST, #8               // w4<- A+
+    ubfx    w4, wINST, #8, #4           // w4<- A
     GET_VREG_WIDE d0, w3
     FETCH_ADVANCE_INST 1                // advance rPC, load wINST
-    and     w4, w4, #15                 // w4<- A
     fcvt s0, d0                              // d0<- op
     GET_INST_OPCODE ip                  // extract opcode from wINST
     SET_VREG s0, w4                // vA<- d0
@@ -4032,7 +4017,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4071,7 +4056,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4110,7 +4095,7 @@
     cbz     w1, common_errDivideByZero  // is second operand zero?
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -4424,8 +4409,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    lsl  x0, x1, x2                 // Do the shift.
+    lsl  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -4450,8 +4434,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    asr  x0, x1, x2                 // Do the shift.
+    asr  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -4476,8 +4459,7 @@
     and      w1, w0, #255                // w1<- BB
     GET_VREG_WIDE x1, w1                // x1<- vBB
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and      x2, x2, #63                 // Mask low 6
-    lsr  x0, x1, x2                 // Do the shift.
+    lsr  x0, x1, x2                 // Do the shift. Only low 6 bits of x2 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w3                // vAA<- x0
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5089,7 +5071,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5125,7 +5107,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5161,7 +5143,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -5463,8 +5445,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    lsl x0, x0, x1
+    lsl x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5485,8 +5466,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    asr x0, x0, x1
+    asr x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5507,8 +5487,7 @@
     GET_VREG w1, w1                     // x1<- vB
     GET_VREG_WIDE x0, w2                // x0<- vA
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
-    and     x1, x1, #63                 // Mask low 6 bits.
-    lsr x0, x0, x1
+    lsr x0, x0, x1                  // Do the shift. Only low 6 bits of x1 are used.
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG_WIDE x0, w2               // vAA<- result
     GOTO_OPCODE ip                      // jump to next instruction
@@ -5529,8 +5508,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fadd   s2, s0, s1                              // s2<- op
@@ -5554,8 +5532,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fsub   s2, s0, s1                              // s2<- op
@@ -5579,8 +5556,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fmul   s2, s0, s1                              // s2<- op
@@ -5604,8 +5580,7 @@
      */
     /* binop/2addr vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     fdiv   s2, s0, s1                              // s2<- op
@@ -5621,13 +5596,11 @@
 /* File: arm64/op_rem_float_2addr.S */
     /* rem vA, vB */
     lsr     w3, wINST, #12              // w3<- B
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     GET_VREG s1, w3
     GET_VREG s0, w9
     bl  fmodf
-    lsr     w9, wINST, #8               // w9<- A+
-    and     w9, w9, #15                 // w9<- A
+    ubfx    w9, wINST, #8, #4           // w9<- A
     FETCH_ADVANCE_INST 1                // advance rPC, load rINST
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG s0, w9
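
The ubfx rewrite: "ubfx w9, wINST, #8, #4" extracts bits [11:8] of the
instruction word (the 4-bit register index A) in a single instruction,
replacing the two-instruction lsr + and sequence. The equivalent
operation, sketched in C++:

    #include <cstdint>

    // Models UBFX Wd, Wn, #lsb, #width: shift right, keep `width` bits.
    // ubfx w9, wINST, #8, #4 computes (wINST >> 8) & 0xf, the A nibble.
    uint32_t Ubfx(uint32_t value, unsigned lsb, unsigned width) {
      return (value >> lsb) & ((1u << width) - 1u);
    }
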
@@ -6381,7 +6354,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsl     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -6417,7 +6390,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     asr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
@@ -6453,7 +6426,7 @@
     cbz     w1, common_errDivideByZero
     .endif
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
-    and     w1, w1, #31                           // optional op; may set condition codes
+                               // optional op; may set condition codes
     lsr     w0, w0, w1                              // w0<- op, w0-w3 changed
     GET_INST_OPCODE ip                  // extract opcode from rINST
     SET_VREG w0, w9                // vAA<- w0
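
Same reasoning as for the 64-bit shifts above: W-register variable shifts
(lsl/asr/lsr Wd, Wn, Wm) take the shift amount modulo 32, so the dropped
"and w1, w1, #31" was already implied by the instruction. Sketch:

    #include <cstdint>

    // Models LSL Wd, Wn, Wm: only the low 5 bits of the shift register
    // are used, matching Dalvik's 32-bit shift semantics.
    uint32_t ShiftLeft32(uint32_t value, uint32_t shift_reg) {
      return value << (shift_reg & 31);
    }
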
@@ -6471,8 +6444,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldr   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6489,7 +6461,7 @@
     FETCH w4, 1                         // w4<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cbz     w3, common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     add     x4, x3, x4                  // create direct pointer
     ldr     x0, [x4]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
@@ -6544,8 +6516,7 @@
     FETCH w3, 1                         // w3<- field byte offset
     GET_VREG w2, w2                     // w2<- fp[B], the object pointer
     ubfx    w0, wINST, #8, #4           // w0<- A
-    cmp     w2, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w2, common_errNullObject    // object was null
     GET_VREG_WIDE x0, w0                // x0<- fp[A]
     FETCH_ADVANCE_INST 2                // advance rPC, load wINST
     add     x1, x2, x3                  // create a direct pointer
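
In this hunk and the neighboring iget/iput-quick ones, cbz folds the null
comparison and the branch into one instruction (and, unlike cmp, leaves
the condition flags untouched). Where the operand also changes from x3 to
w3, it now matches the 32-bit width GET_VREG actually loads; the upper
half of the register is zero after GET_VREG anyway, so that part is
cleanup rather than a behavior change. What the instruction tests, as a
sketch:

    #include <cstdint>

    // What "cbz w3, common_errNullObject" checks: branch if the 32-bit
    // object reference is null, in one compare-and-branch instruction.
    bool IsNullRef(uint32_t ref) {
      return ref == 0;
    }
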
@@ -6710,8 +6681,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrb   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6731,8 +6701,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrsb   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6752,8 +6721,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrh   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -6773,8 +6741,7 @@
     FETCH w1, 1                         // w1<- field byte offset
     GET_VREG w3, w2                     // w3<- object we're operating on
     ubfx    w2, wINST, #8, #4           // w2<- A
-    cmp     x3, #0                      // check object for null
-    beq     common_errNullObject        // object was null
+    cbz     w3, common_errNullObject    // object was null
     ldrsh   w0, [x3, x1]                // w0<- obj.field
     FETCH_ADVANCE_INST 2                // advance rPC, load rINST
     
@@ -11521,7 +11488,7 @@
 #if MTERP_LOGGING
     mov  x0, xSELF
     add  x1, xFP, #OFF_FP_SHADOWFRAME
-    sbfm x2, xINST, 0, 31
+    sxtw x2, wINST
     bl MterpLogOSR
 #endif
     mov  x0, #1                         // Signal normal return
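
sxtw x2, wINST is the preferred alias of sbfm x2, xINST, #0, #31: both
sign-extend the low 32 bits into the full X register, but the alias
states the intent directly. The equivalent widening in C++ (sketch):

    #include <cstdint>

    // Models SXTW Xd, Wn (alias of SBFM Xd, Xn, #0, #31): interpret the
    // low 32 bits as signed and widen to 64 bits.
    int64_t Sxtw(uint32_t w) {
      return static_cast<int32_t>(w);
    }
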
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 9c77d38..1c31c57 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -748,21 +748,24 @@
   return nullptr;
 }
 
-ArtField* Class::FindStaticField(Thread* self, Handle<Class> klass, const DexCache* dex_cache,
+ArtField* Class::FindStaticField(Thread* self,
+                                 Class* klass,
+                                 const DexCache* dex_cache,
                                  uint32_t dex_field_idx) {
-  for (Class* k = klass.Get(); k != nullptr; k = k->GetSuperClass()) {
+  for (Class* k = klass; k != nullptr; k = k->GetSuperClass()) {
     // Is the field in this class?
     ArtField* f = k->FindDeclaredStaticField(dex_cache, dex_field_idx);
     if (f != nullptr) {
       return f;
     }
-    // Wrap k incase it moves during GetDirectInterface.
+    // Though GetDirectInterface() should not cause thread suspension when called
+    // from here, it takes a Handle as an argument, so we need to wrap `k`.
+    ScopedAssertNoThreadSuspension ants(self, __FUNCTION__);
     StackHandleScope<1> hs(self);
-    HandleWrapper<mirror::Class> h_k(hs.NewHandleWrapper(&k));
+    Handle<mirror::Class> h_k(hs.NewHandle(k));
     // Is this field in any of this class' interfaces?
     for (uint32_t i = 0; i < h_k->NumDirectInterfaces(); ++i) {
-      StackHandleScope<1> hs2(self);
-      Handle<mirror::Class> interface(hs2.NewHandle(GetDirectInterface(self, h_k, i)));
+      mirror::Class* interface = GetDirectInterface(self, h_k, i);
       f = FindStaticField(self, interface, dex_cache, dex_field_idx);
       if (f != nullptr) {
         return f;
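
Taking a raw Class* instead of a Handle<Class> is sound only while the
thread cannot suspend, since a moving GC could otherwise relocate the
object mid-walk; ScopedAssertNoThreadSuspension turns that precondition
into a debug-checked invariant, and the one remaining Handle exists only
to satisfy GetDirectInterface()'s signature. A toy model of the rule
(stand-in types, not ART code):

    // Obj stands in for a movable managed object; raw pointers to it
    // are only held across code that cannot trigger a GC-induced move.
    struct Obj {
      Obj* parent;
    };

    // Stand-in for ScopedAssertNoThreadSuspension: documents (and, in
    // ART, debug-CHECKs) that no suspension happens in this scope.
    struct ScopedNoSuspension {
      explicit ScopedNoSuspension(const char* cause) { (void)cause; }
    };

    Obj* WalkToRoot(Obj* o) {
      ScopedNoSuspension ants("WalkToRoot");
      while (o->parent != nullptr) {
        o = o->parent;  // raw pointer chase is safe in this scope
      }
      return o;
    }
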
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index f044b59..9be9f01 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -1091,7 +1091,9 @@
 
   // Finds the given static field in this class or superclass, only searches classes that
   // have the same dex cache.
-  static ArtField* FindStaticField(Thread* self, Handle<Class> klass, const DexCache* dex_cache,
+  static ArtField* FindStaticField(Thread* self,
+                                   Class* klass,
+                                   const DexCache* dex_cache,
                                    uint32_t dex_field_idx)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
diff --git a/runtime/native/dalvik_system_VMRuntime.cc b/runtime/native/dalvik_system_VMRuntime.cc
index 79b18aa..d987f65 100644
--- a/runtime/native/dalvik_system_VMRuntime.cc
+++ b/runtime/native/dalvik_system_VMRuntime.cc
@@ -342,7 +342,7 @@
     return;
   }
   if (is_static) {
-    field = mirror::Class::FindStaticField(self, klass, dex_cache.Get(), field_idx);
+    field = mirror::Class::FindStaticField(self, klass.Get(), dex_cache.Get(), field_idx);
   } else {
     field = klass->FindInstanceField(dex_cache.Get(), field_idx);
   }
diff --git a/runtime/oat.h b/runtime/oat.h
index e506e3c..9b8f545 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '8', '3', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '8', '4', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/read_barrier-inl.h b/runtime/read_barrier-inl.h
index 0c3eb3b..92efa21 100644
--- a/runtime/read_barrier-inl.h
+++ b/runtime/read_barrier-inl.h
@@ -220,7 +220,7 @@
 }
 
 inline mirror::Object* ReadBarrier::Mark(mirror::Object* obj) {
-  return Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->Mark(obj);
+  return Runtime::Current()->GetHeap()->ConcurrentCopyingCollector()->MarkFromReadBarrier(obj);
 }
 
 inline bool ReadBarrier::HasGrayReadBarrierPointer(mirror::Object* obj,
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 21cd2aa..079c079 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -989,6 +989,7 @@
                        xgc_option.verify_pre_sweeping_rosalloc_,
                        xgc_option.verify_post_gc_rosalloc_,
                        xgc_option.gcstress_,
+                       xgc_option.measure_,
                        runtime_options.GetOrDefault(Opt::EnableHSpaceCompactForOOM),
                        runtime_options.GetOrDefault(Opt::HSpaceCompactForOOMMinIntervalsMs));
 
diff --git a/test/501-regression-packed-switch/info.txt b/test/501-regression-packed-switch/info.txt
index fbd93fa..988b220 100644
--- a/test/501-regression-packed-switch/info.txt
+++ b/test/501-regression-packed-switch/info.txt
@@ -1,2 +1,4 @@
 Regression test for the interpreter and optimizing's builder which used
 to trip when compiled code contained a packed switch with no targets.
+Regression test for the arm64 mterp miscalculating the switch table
+address, zero-extending a register instead of sign-extending.
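
A packed-switch payload may legally precede the instruction that
references it, in which case the 31t branch offset (a signed 32-bit
count of 16-bit code units, relative to the opcode) is negative.
Computing the payload address therefore requires sign-extension; a
zero-extended offset lands far past the method instead. Sketch of the
computation (illustrative, not the mterp code):

    #include <cstdint>

    // pc points at the packed-switch opcode; raw_offset is the signed
    // 32-bit payload offset in 16-bit code units read from the bytecode.
    const uint16_t* SwitchPayload(const uint16_t* pc, uint32_t raw_offset) {
      int64_t offset = static_cast<int32_t>(raw_offset);  // sign-extend (sxtw)
      return pc + offset;  // negative when the payload precedes the switch
    }
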
diff --git a/test/501-regression-packed-switch/smali/Test.smali b/test/501-regression-packed-switch/smali/Test.smali
index 8756ed5..5a760c7 100644
--- a/test/501-regression-packed-switch/smali/Test.smali
+++ b/test/501-regression-packed-switch/smali/Test.smali
@@ -27,3 +27,28 @@
   .packed-switch 0x0
   .end packed-switch
 .end method
+
+.method public static PackedSwitchAfterData(I)I
+  .registers 1
+  goto :pswitch_instr
+
+  :case0
+  const/4 v0, 0x1
+  return v0
+
+  :pswitch_data
+  .packed-switch 0x0
+    :case0
+    :case1
+  .end packed-switch
+
+  :pswitch_instr
+  packed-switch v0, :pswitch_data
+  const/4 v0, 0x7
+  return v0
+
+  :case1
+  const/4 v0, 0x4
+  return v0
+
+.end method
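
For readability, the control flow of PackedSwitchAfterData corresponds to
the following C++ rendering (a hand-written equivalent, not generated
from the smali):

    // Input 0 takes :case0, input 1 takes :case1; any other value falls
    // through the packed-switch to the "return 7" path.
    int PackedSwitchAfterData(int x) {
      switch (x) {
        case 0: return 1;   // :case0
        case 1: return 4;   // :case1
        default: return 7;  // fall-through after the packed-switch
      }
    }
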
diff --git a/test/501-regression-packed-switch/src/Main.java b/test/501-regression-packed-switch/src/Main.java
index b80bc62..12bc1a8 100644
--- a/test/501-regression-packed-switch/src/Main.java
+++ b/test/501-regression-packed-switch/src/Main.java
@@ -29,5 +29,10 @@
     if (result != 5) {
       throw new Error("Expected 5, got " + result);
     }
+    m = c.getMethod("PackedSwitchAfterData", new Class[] { int.class });
+    result = (Integer) m.invoke(null, new Integer(0));
+    if (result != 1) {
+      throw new Error("Expected 1, got " + result);
+    }
   }
 }
diff --git a/test/527-checker-array-access-split/src/Main.java b/test/527-checker-array-access-split/src/Main.java
index ead9446..3366f20 100644
--- a/test/527-checker-array-access-split/src/Main.java
+++ b/test/527-checker-array-access-split/src/Main.java
@@ -34,9 +34,21 @@
   /// CHECK-START-ARM64: int Main.constantIndexGet(int[]) instruction_simplifier_arm64 (after)
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
 
+
+  /// CHECK-START-ARM: int Main.constantIndexGet(int[]) instruction_simplifier_arm (before)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.constantIndexGet(int[]) instruction_simplifier_arm (after)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
   public static int constantIndexGet(int array[]) {
     return array[1];
   }
@@ -55,10 +67,23 @@
   /// CHECK:             <<Const2:i\d+>>        IntConstant 2
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
 
 
+  /// CHECK-START-ARM:   void Main.constantIndexSet(int[]) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
+
+  /// CHECK-START-ARM:   void Main.constantIndexSet(int[]) instruction_simplifier_arm (after)
+  /// CHECK:             <<Const2:i\d+>>        IntConstant 2
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Const2>>]
+
   public static void constantIndexSet(int array[]) {
     array[1] = 2;
   }
@@ -76,7 +101,20 @@
   /// CHECK:             <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArrayGet [<<Address>>,<<Index>>]
+
+
+  /// CHECK-START-ARM:   int Main.get(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArrayGet [<<Array>>,<<Index>>]
+
+  /// CHECK-START-ARM:   int Main.get(int[], int) instruction_simplifier_arm (after)
+  /// CHECK:             <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArrayGet [<<Address>>,<<Index>>]
 
   public static int get(int array[], int index) {
@@ -102,7 +140,26 @@
   /// CHECK:             <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address>>,<<Index>>,<<Arg>>]
+
+
+  /// CHECK-START-ARM:   void Main.set(int[], int, int) instruction_simplifier_arm (before)
+  /// CHECK:                                    ParameterValue
+  /// CHECK:                                    ParameterValue
+  /// CHECK:             <<Arg:i\d+>>           ParameterValue
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Arg>>]
+
+  /// CHECK-START-ARM:   void Main.set(int[], int, int) instruction_simplifier_arm (after)
+  /// CHECK:                                    ParameterValue
+  /// CHECK:                                    ParameterValue
+  /// CHECK:             <<Arg:i\d+>>           ParameterValue
+  /// CHECK:             <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address>>,<<Index>>,<<Arg>>]
 
   public static void set(int array[], int index, int value) {
@@ -126,10 +183,10 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: void Main.getSet(int[], int) GVN_after_arch (after)
@@ -137,12 +194,42 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
 
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   void Main.getSet(int[], int) GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
   public static void getSet(int array[], int index) {
     array[index] = array[index] + 1;
   }
@@ -166,11 +253,11 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    NewArray
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: int[] Main.accrossGC(int[], int) GVN_after_arch (after)
@@ -178,11 +265,45 @@
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
   /// CHECK:             <<Array:l\d+>>         NullCheck
   /// CHECK:             <<Index:i\d+>>         BoundsCheck
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    NewArray
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:                                    ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int[] Main.accrossGC(int[], int) GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant
+  /// CHECK:             <<Array:l\d+>>         NullCheck
+  /// CHECK:             <<Index:i\d+>>         BoundsCheck
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    NewArray
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:                                    ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   public static int[] accrossGC(int array[], int index) {
@@ -196,6 +317,14 @@
    * Test that the intermediate address is shared between array accesses after
    * the bounds checks have been removed by BCE.
    */
+  // For the `instruction_simplifier_<arch> (after)` checker tests below: by the time we
+  // reach the architecture-specific instruction simplifier, BCE has removed the bounds
+  // checks in the loop.
+
+  // Note that we do not care that the `DataOffset` is `12`. But if we do not
+  // specify it and any other `IntConstant` appears before that instruction,
+  // checker will match the previous `IntConstant`, and we will thus fail the
+  // check.
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() instruction_simplifier_arm64 (before)
   /// CHECK:             <<Const1:i\d+>>        IntConstant 1
@@ -207,14 +336,6 @@
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
   /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
 
-  // By the time we reach the architecture-specific instruction simplifier, BCE
-  // has removed the bounds checks in the loop.
-
-  // Note that we do not care that the `DataOffset` is `12`. But if we do not
-  // specify it and any other `IntConstant` appears before that instruction,
-  // checker will match the previous `IntConstant`, and we will thus fail the
-  // check.
-
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() instruction_simplifier_arm64 (after)
   /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
   /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
@@ -222,10 +343,10 @@
   /// CHECK:             <<Index:i\d+>>         Phi
   /// CHECK:                                    If
   //  -------------- Loop
-  /// CHECK:             <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK:             <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE1() GVN_after_arch (after)
@@ -235,10 +356,47 @@
   /// CHECK:             <<Index:i\d+>>         Phi
   /// CHECK:                                    If
   //  -------------- Loop
-  /// CHECK:             <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
+
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:        <<ArrayGet:i\d+>>      ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK:             <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-NEXT:                               ArraySet [<<Address2>>,<<Index>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE1() GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK:             <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<ArrayGet:i\d+>>      ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGet>>,<<Const1>>]
+  /// CHECK-NOT:                                IntermediateAddress
   /// CHECK:                                    ArraySet [<<Address>>,<<Index>>,<<Add>>]
 
   public static int canMergeAfterBCE1() {
@@ -279,12 +437,12 @@
   /// CHECK:                                    If
   //  -------------- Loop
   /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
-  /// CHECK-DAG:         <<Address1:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address1>>,<<Index>>]
-  /// CHECK-DAG:         <<Address2:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address2>>,<<Index1>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
-  /// CHECK:             <<Address3:l\d+>>      Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:             <<Address3:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK:                                    ArraySet [<<Address3>>,<<Index1>>,<<Add>>]
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE2() GVN_after_arch (after)
@@ -295,7 +453,7 @@
   /// CHECK:                                    If
   //  -------------- Loop
   /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
-  /// CHECK-DAG:         <<Address:l\d+>>       Arm64IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
   /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address>>,<<Index>>]
   /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address>>,<<Index1>>]
   /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
@@ -304,8 +462,55 @@
   // There should be only one intermediate address computation in the loop.
 
   /// CHECK-START-ARM64: int Main.canMergeAfterBCE2() GVN_after_arch (after)
-  /// CHECK:                                    Arm64IntermediateAddress
-  /// CHECK-NOT:                                Arm64IntermediateAddress
+  /// CHECK:                                    IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
+
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() instruction_simplifier_arm (before)
+  /// CHECK:             <<Const1:i\d+>>        IntConstant 1
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Array>>,<<Index>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Array>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:                                    ArraySet [<<Array>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<Address1:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address1>>,<<Index>>]
+  /// CHECK-DAG:         <<Address2:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address2>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:             <<Address3:l\d+>>      IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK:                                    ArraySet [<<Address3>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() GVN_after_arch (after)
+  /// CHECK-DAG:         <<Const1:i\d+>>        IntConstant 1
+  /// CHECK-DAG:         <<DataOffset:i\d+>>    IntConstant 12
+  /// CHECK:             <<Array:l\d+>>         NewArray
+  /// CHECK:             <<Index:i\d+>>         Phi
+  /// CHECK:                                    If
+  //  -------------- Loop
+  /// CHECK-DAG:         <<Index1:i\d+>>        Add [<<Index>>,<<Const1>>]
+  /// CHECK-DAG:         <<Address:l\d+>>       IntermediateAddress [<<Array>>,<<DataOffset>>]
+  /// CHECK-DAG:         <<ArrayGetI:i\d+>>     ArrayGet [<<Address>>,<<Index>>]
+  /// CHECK-DAG:         <<ArrayGetI1:i\d+>>    ArrayGet [<<Address>>,<<Index1>>]
+  /// CHECK:             <<Add:i\d+>>           Add [<<ArrayGetI>>,<<ArrayGetI1>>]
+  /// CHECK:                                    ArraySet [<<Address>>,<<Index1>>,<<Add>>]
+
+  /// CHECK-START-ARM:   int Main.canMergeAfterBCE2() GVN_after_arch (after)
+  /// CHECK:                                    IntermediateAddress
+  /// CHECK-NOT:                                IntermediateAddress
 
   public static int canMergeAfterBCE2() {
     int[] array = {0, 1, 2, 3};
@@ -315,6 +520,37 @@
     return array[array.length - 1];
   }
 
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (before)
+  /// CHECK-DAG:         <<Array1:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array2:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array3:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                ArrayGet [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array2>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array3>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (after)
+  /// CHECK-DAG:         <<Array1:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array2:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Array3:l\d+>>        NewArray
+  /// CHECK-DAG:         <<Index:i\d+>>         Phi
+  /// CHECK-DAG:                                ArrayGet [<<Array1>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array2>>,<<Index>>]
+  /// CHECK-DAG:                                ArrayGet [<<Array3>>,<<Index>>]
+
+  /// CHECK-START-ARM: int Main.checkLongFloatDouble() instruction_simplifier_arm (after)
+  /// CHECK-NOT:                                IntermediateAddress
+  public static int checkLongFloatDouble() {
+    long[] array_long = {0, 1, 2, 3};
+    float[] array_float = {(float)0.0, (float)1.0, (float)2.0, (float)3.0};
+    double[] array_double = {0.0, 1.0, 2.0, 3.0};
+    double s = 0.0;
+
+    for (int i = 0; i < 4; i++) {
+      s += (double)array_long[i] + (double)array_float[i] + array_double[i];
+    }
+    return (int)s;
+  }
 
   public static void main(String[] args) {
     int[] array = {123, 456, 789};
@@ -337,5 +573,7 @@
 
     assertIntEquals(4, canMergeAfterBCE1());
     assertIntEquals(6, canMergeAfterBCE2());
+
+    assertIntEquals(18, checkLongFloatDouble());
   }
 }
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index dd6b6f3..8f8b667 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -527,7 +527,7 @@
 # Tests that should fail in the read barrier configuration with the Optimizing compiler (AOT).
 # 484: Baker's fast path based read barrier compiler instrumentation generates code containing
 #      more parallel moves on x86, thus some Checker assertions may fail.
-# 527: On ARM64, the read barrier instrumentation does not support the HArm64IntermediateAddress
+# 527: On ARM64 and ARM, the read barrier instrumentation does not support the HIntermediateAddress
 #      instruction yet (b/26601270).
 # 537: Expects an array copy to be intrinsified on x86-64, but calling-on-slowpath intrinsics are
 #      not yet handled in the read barrier configuration.