Merge "Fix CC DCHECK failure in 152-gc-and-run-finalization."
diff --git a/Android.mk b/Android.mk
index b2716cd..cf3a9e7 100644
--- a/Android.mk
+++ b/Android.mk
@@ -435,8 +435,9 @@
 # Phony target for only building what go/lem requires on target.
 .PHONY: build-art-target-golem
 # Also include libartbenchmark, we always include it when running golem.
+# libstdc++ is needed when building for ART_TARGET_LINUX.
 ART_TARGET_SHARED_LIBRARY_BENCHMARK := $(TARGET_OUT_SHARED_LIBRARIES)/libartbenchmark.so
-build-art-target-golem: dex2oat dalvikvm patchoat linker \
+build-art-target-golem: dex2oat dalvikvm patchoat linker libstdc++ \
                         $(TARGET_OUT)/etc/public.libraries.txt \
                         $(ART_TARGET_DEX_DEPENDENCIES) \
                         $(ART_TARGET_SHARED_LIBRARY_DEPENDENCIES) \
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index a2bab80..e62bdb5 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -1188,13 +1188,12 @@
   CHECK_NE(image_classes_->size(), 0U);
 }
 
-static void MaybeAddToImageClasses(Handle<mirror::Class> c,
+static void MaybeAddToImageClasses(Thread* self,
+                                   ObjPtr<mirror::Class> klass,
                                    std::unordered_set<std::string>* image_classes)
     REQUIRES_SHARED(Locks::mutator_lock_) {
-  Thread* self = Thread::Current();
+  DCHECK_EQ(self, Thread::Current());
   StackHandleScope<1> hs(self);
-  // Make a copy of the handle so that we don't clobber it doing Assign.
-  MutableHandle<mirror::Class> klass(hs.NewHandle(c.Get()));
   std::string temp;
   const PointerSize pointer_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
   while (!klass->IsObjectClass()) {
@@ -1205,19 +1204,16 @@
       break;
     }
     VLOG(compiler) << "Adding " << descriptor << " to image classes";
-    for (size_t i = 0; i < klass->NumDirectInterfaces(); ++i) {
-      StackHandleScope<1> hs2(self);
-      // May cause thread suspension.
-      MaybeAddToImageClasses(hs2.NewHandle(mirror::Class::GetDirectInterface(self, klass, i)),
-                             image_classes);
+    for (size_t i = 0, num_interfaces = klass->NumDirectInterfaces(); i != num_interfaces; ++i) {
+      ObjPtr<mirror::Class> interface = mirror::Class::GetDirectInterface(self, klass, i);
+      DCHECK(interface != nullptr);
+      MaybeAddToImageClasses(self, interface, image_classes);
     }
-    for (auto& m : c->GetVirtualMethods(pointer_size)) {
-      StackHandleScope<1> hs2(self);
-      MaybeAddToImageClasses(hs2.NewHandle(m.GetDeclaringClass()), image_classes);
+    for (auto& m : klass->GetVirtualMethods(pointer_size)) {
+      MaybeAddToImageClasses(self, m.GetDeclaringClass(), image_classes);
     }
     if (klass->IsArrayClass()) {
-      StackHandleScope<1> hs2(self);
-      MaybeAddToImageClasses(hs2.NewHandle(klass->GetComponentType()), image_classes);
+      MaybeAddToImageClasses(self, klass->GetComponentType(), image_classes);
     }
     klass.Assign(klass->GetSuperClass());
   }
@@ -1268,8 +1264,10 @@
     for (Handle<mirror::Class> klass_root : image_classes_) {
       VisitClinitClassesObject(klass_root.Get());
     }
+    Thread* self = Thread::Current();
+    ScopedAssertNoThreadSuspension ants(__FUNCTION__);
     for (Handle<mirror::Class> h_klass : to_insert_) {
-      MaybeAddToImageClasses(h_klass, image_class_descriptors_);
+      MaybeAddToImageClasses(self, h_klass.Get(), image_class_descriptors_);
     }
   }
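
The refactoring above trades Handle-based arguments for raw ObjPtr<> values and a
caller-supplied Thread*. That is only safe while no thread suspension can occur,
which is why the caller now wraps the loop in ScopedAssertNoThreadSuspension. A
minimal sketch of the pattern, assuming ART's ObjPtr/mirror headers are in scope
(WalkSuperClasses is a hypothetical name, not ART code):

    static void WalkSuperClasses(Thread* self,
                                 ObjPtr<mirror::Class> klass,
                                 std::unordered_set<std::string>* out)
        REQUIRES_SHARED(Locks::mutator_lock_) {
      DCHECK_EQ(self, Thread::Current());  // Thread passed in; no TLS re-fetch.
      std::string temp;
      while (klass != nullptr && !klass->IsObjectClass()) {
        out->insert(klass->GetDescriptor(&temp));
        // Chasing raw class pointers is safe only because the caller asserts
        // no suspension: the GC cannot move these objects mid-walk.
        klass = klass->GetSuperClass();
      }
    }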
 
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 0fb2a8b..a47e711 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -838,21 +838,73 @@
   return true;
 }
 
-class ImageWriter::NonImageClassesVisitor : public ClassVisitor {
+class ImageWriter::PruneClassesVisitor : public ClassVisitor {
  public:
-  explicit NonImageClassesVisitor(ImageWriter* image_writer) : image_writer_(image_writer) {}
+  PruneClassesVisitor(ImageWriter* image_writer, ObjPtr<mirror::ClassLoader> class_loader)
+      : image_writer_(image_writer),
+        class_loader_(class_loader),
+        classes_to_prune_(),
+        defined_class_count_(0u) { }
 
-  bool operator()(ObjPtr<Class> klass) OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_) {
+  bool operator()(ObjPtr<mirror::Class> klass) OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_) {
     if (!image_writer_->KeepClass(klass.Ptr())) {
       classes_to_prune_.insert(klass.Ptr());
+      if (klass->GetClassLoader() == class_loader_) {
+        ++defined_class_count_;
+      }
     }
     return true;
   }
 
-  std::unordered_set<mirror::Class*> classes_to_prune_;
+  size_t Prune() REQUIRES_SHARED(Locks::mutator_lock_) {
+    ClassTable* class_table =
+        Runtime::Current()->GetClassLinker()->ClassTableForClassLoader(class_loader_);
+    for (mirror::Class* klass : classes_to_prune_) {
+      std::string storage;
+      const char* descriptor = klass->GetDescriptor(&storage);
+      bool result = class_table->Remove(descriptor);
+      DCHECK(result);
+      DCHECK(!class_table->Remove(descriptor)) << descriptor;
+    }
+    return defined_class_count_;
+  }
+
+ private:
   ImageWriter* const image_writer_;
+  const ObjPtr<mirror::ClassLoader> class_loader_;
+  std::unordered_set<mirror::Class*> classes_to_prune_;
+  size_t defined_class_count_;
 };
 
+class ImageWriter::PruneClassLoaderClassesVisitor : public ClassLoaderVisitor {
+ public:
+  explicit PruneClassLoaderClassesVisitor(ImageWriter* image_writer)
+      : image_writer_(image_writer), removed_class_count_(0) {}
+
+  virtual void Visit(ObjPtr<mirror::ClassLoader> class_loader) OVERRIDE
+      REQUIRES_SHARED(Locks::mutator_lock_) {
+    PruneClassesVisitor classes_visitor(image_writer_, class_loader);
+    ClassTable* class_table =
+        Runtime::Current()->GetClassLinker()->ClassTableForClassLoader(class_loader);
+    class_table->Visit(classes_visitor);
+    removed_class_count_ += classes_visitor.Prune();
+  }
+
+  size_t GetRemovedClassCount() const {
+    return removed_class_count_;
+  }
+
+ private:
+  ImageWriter* const image_writer_;
+  size_t removed_class_count_;
+};
+
+void ImageWriter::VisitClassLoaders(ClassLoaderVisitor* visitor) {
+  WriterMutexLock mu(Thread::Current(), *Locks::classlinker_classes_lock_);
+  visitor->Visit(nullptr);  // Visit boot class loader.
+  Runtime::Current()->GetClassLinker()->VisitClassLoaders(visitor);
+}
+
 void ImageWriter::PruneNonImageClasses() {
   Runtime* runtime = Runtime::Current();
   ClassLinker* class_linker = runtime->GetClassLinker();
@@ -862,21 +914,11 @@
   // path dex caches.
   class_linker->ClearClassTableStrongRoots();
 
-  // Make a list of classes we would like to prune.
-  NonImageClassesVisitor visitor(this);
-  class_linker->VisitClasses(&visitor);
-
   // Remove the undesired classes from the class roots.
-  VLOG(compiler) << "Pruning " << visitor.classes_to_prune_.size() << " classes";
-  for (mirror::Class* klass : visitor.classes_to_prune_) {
-    std::string temp;
-    const char* name = klass->GetDescriptor(&temp);
-    VLOG(compiler) << "Pruning class " << name;
-    if (!compile_app_image_) {
-      DCHECK(IsBootClassLoaderClass(klass));
-    }
-    bool result = class_linker->RemoveClass(name, klass->GetClassLoader());
-    DCHECK(result);
+  {
+    PruneClassLoaderClassesVisitor class_loader_visitor(this);
+    VisitClassLoaders(&class_loader_visitor);
+    VLOG(compiler) << "Pruned " << class_loader_visitor.GetRemovedClassCount() << " classes";
   }
 
   // Clear references to removed classes from the DexCaches.
@@ -1508,8 +1550,10 @@
     }
     // Calculate the size of the class table.
     ReaderMutexLock mu(self, *Locks::classlinker_classes_lock_);
-    DCHECK_EQ(image_info.class_table_->NumZygoteClasses(), 0u);
-    if (image_info.class_table_->NumNonZygoteClasses() != 0u) {
+    CHECK_EQ(class_loaders_.size(), compile_app_image_ ? 1u : 0u);
+    mirror::ClassLoader* class_loader = compile_app_image_ ? *class_loaders_.begin() : nullptr;
+    DCHECK_EQ(image_info.class_table_->NumZygoteClasses(class_loader), 0u);
+    if (image_info.class_table_->NumNonZygoteClasses(class_loader) != 0u) {
       image_info.class_table_bytes_ += image_info.class_table_->WriteToMemory(nullptr);
     }
   }
@@ -1854,8 +1898,10 @@
     // above comment for intern tables.
     ClassTable temp_class_table;
     temp_class_table.ReadFromMemory(class_table_memory_ptr);
-    CHECK_EQ(temp_class_table.NumZygoteClasses(), table->NumNonZygoteClasses() +
-             table->NumZygoteClasses());
+    CHECK_EQ(class_loaders_.size(), compile_app_image_ ? 1u : 0u);
+    mirror::ClassLoader* class_loader = compile_app_image_ ? *class_loaders_.begin() : nullptr;
+    CHECK_EQ(temp_class_table.NumZygoteClasses(class_loader),
+             table->NumNonZygoteClasses(class_loader) + table->NumZygoteClasses(class_loader));
     UnbufferedRootVisitor visitor(&root_visitor, RootInfo(kRootUnknown));
     temp_class_table.VisitRoots(visitor);
   }
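
The two new visitors implement a two-phase prune: PruneClassesVisitor only
records doomed classes while ClassTable::Visit() iterates, and Prune() removes
them afterwards, since the table must not be mutated during iteration. A rough
sketch of one per-loader pass under those assumptions (CollectAndPrune is a
hypothetical wrapper, not ART code):

    static size_t CollectAndPrune(ImageWriter* writer,
                                  ObjPtr<mirror::ClassLoader> loader)
        REQUIRES_SHARED(Locks::mutator_lock_) {
      ClassTable* table =
          Runtime::Current()->GetClassLinker()->ClassTableForClassLoader(loader);
      PruneClassesVisitor visitor(writer, loader);
      table->Visit(visitor);   // Phase 1: collect, never mutate.
      return visitor.Prune();  // Phase 2: remove by descriptor.
    }

Note that VisitClassLoaders() passes nullptr for the boot class loader, so
ClassTableForClassLoader(nullptr) resolves to the boot class table and boot
classes are pruned through the same path.
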
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 24fad46..c537483 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -50,6 +50,7 @@
 }  // namespace space
 }  // namespace gc
 
+class ClassLoaderVisitor;
 class ClassTable;
 
 static constexpr int kInvalidFd = -1;
@@ -373,6 +374,9 @@
   void ComputeLazyFieldsForImageClasses()
       REQUIRES_SHARED(Locks::mutator_lock_);
 
+  // Visit all class loaders.
+  void VisitClassLoaders(ClassLoaderVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Remove unwanted classes from various roots.
   void PruneNonImageClasses() REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -588,7 +592,8 @@
   class FixupVisitor;
   class GetRootsVisitor;
   class NativeLocationVisitor;
-  class NonImageClassesVisitor;
+  class PruneClassesVisitor;
+  class PruneClassLoaderClassesVisitor;
   class VisitReferencesVisitor;
 
   DISALLOW_COPY_AND_ASSIGN(ImageWriter);
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index a205800..2710ae9 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -327,7 +327,7 @@
     0xC0, 0xFF, 0xBD, 0x27, 0x3C, 0x00, 0xBF, 0xAF, 0x38, 0x00, 0xBE, 0xAF,
     0x34, 0x00, 0xB7, 0xAF, 0x30, 0x00, 0xB6, 0xAF, 0x2C, 0x00, 0xB5, 0xAF,
     0x28, 0x00, 0xB4, 0xAF, 0x24, 0x00, 0xB3, 0xAF, 0x20, 0x00, 0xB2, 0xAF,
-    0x00, 0x00, 0xA4, 0xAF, 0x44, 0x00, 0xA5, 0xAF, 0x48, 0x00, 0xAC, 0xE7,
+    0x00, 0x00, 0xA4, 0xAF, 0x44, 0x00, 0xA5, 0xAF, 0x48, 0x00, 0xA8, 0xE7,
     0x4C, 0x00, 0xA6, 0xAF, 0x50, 0x00, 0xA7, 0xAF, 0xE0, 0xFF, 0xBD, 0x27,
     0x20, 0x00, 0xBD, 0x27, 0x20, 0x00, 0xB2, 0x8F, 0x24, 0x00, 0xB3, 0x8F,
     0x28, 0x00, 0xB4, 0x8F, 0x2C, 0x00, 0xB5, 0x8F, 0x30, 0x00, 0xB6, 0x8F,
@@ -361,7 +361,7 @@
 // 0x00000024: .cfi_offset: r18 at cfa-32
 // 0x00000024: sw r4, +0(r29)
 // 0x00000028: sw r5, +68(r29)
-// 0x0000002c: swc1 f12, +72(r29)
+// 0x0000002c: swc1 f8, +72(r29)
 // 0x00000030: sw r6, +76(r29)
 // 0x00000034: sw r7, +80(r29)
 // 0x00000038: addiu r29, r29, -32
diff --git a/compiler/jni/quick/mips/calling_convention_mips.cc b/compiler/jni/quick/mips/calling_convention_mips.cc
index e6948ec..0e0716e 100644
--- a/compiler/jni/quick/mips/calling_convention_mips.cc
+++ b/compiler/jni/quick/mips/calling_convention_mips.cc
@@ -23,6 +23,10 @@
 namespace art {
 namespace mips {
 
+//
+// JNI calling convention constants.
+//
+
 // Up to how many float-like (float, double) args can be enregistered in floating-point registers.
 // The rest of the args must go in integer registers or on the stack.
 constexpr size_t kMaxFloatOrDoubleRegisterArguments = 2u;
@@ -30,9 +34,17 @@
 // enregistered. The rest of the args must go on the stack.
 constexpr size_t kMaxIntLikeRegisterArguments = 4u;
 
-static const Register kCoreArgumentRegisters[] = { A0, A1, A2, A3 };
-static const FRegister kFArgumentRegisters[] = { F12, F14 };
-static const DRegister kDArgumentRegisters[] = { D6, D7 };
+static const Register kJniCoreArgumentRegisters[] = { A0, A1, A2, A3 };
+static const FRegister kJniFArgumentRegisters[] = { F12, F14 };
+static const DRegister kJniDArgumentRegisters[] = { D6, D7 };
+
+//
+// Managed calling convention constants.
+//
+
+static const Register kManagedCoreArgumentRegisters[] = { A0, A1, A2, A3, T0, T1 };
+static const FRegister kManagedFArgumentRegisters[] = { F8, F10, F12, F14, F16, F18 };
+static const DRegister kManagedDArgumentRegisters[] = { D4, D5, D6, D7, D8, D9 };
 
 static constexpr ManagedRegister kCalleeSaveRegisters[] = {
     // Core registers.
@@ -133,30 +145,30 @@
     for (ResetIterator(FrameOffset(0)); HasNext(); Next()) {
       if (IsCurrentParamAFloatOrDouble()) {
         if (IsCurrentParamADouble()) {
-          if (fpr_index < arraysize(kDArgumentRegisters)) {
+          if (fpr_index < arraysize(kManagedDArgumentRegisters)) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromDRegister(kDArgumentRegisters[fpr_index++]));
+                MipsManagedRegister::FromDRegister(kManagedDArgumentRegisters[fpr_index++]));
           } else {
             entry_spills_.push_back(ManagedRegister::NoRegister(), 8);
           }
         } else {
-          if (fpr_index < arraysize(kFArgumentRegisters)) {
+          if (fpr_index < arraysize(kManagedFArgumentRegisters)) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromFRegister(kFArgumentRegisters[fpr_index++]));
+                MipsManagedRegister::FromFRegister(kManagedFArgumentRegisters[fpr_index++]));
           } else {
             entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           }
         }
       } else {
         if (IsCurrentParamALong() && !IsCurrentParamAReference()) {
-          if (gpr_index == 1) {
-            // Don't use a1-a2 as a register pair, move to a2-a3 instead.
+          if (gpr_index == 1 || gpr_index == 3) {
+            // Don't use A1-A2 or A3-T0 as a register pair; move to A2-A3 or T0-T1 instead.
             gpr_index++;
           }
-          if (gpr_index < arraysize(kCoreArgumentRegisters) - 1) {
+          if (gpr_index < arraysize(kManagedCoreArgumentRegisters) - 1) {
             entry_spills_.push_back(
-                MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gpr_index++]));
-          } else if (gpr_index == arraysize(kCoreArgumentRegisters) - 1) {
+                MipsManagedRegister::FromCoreRegister(kManagedCoreArgumentRegisters[gpr_index++]));
+          } else if (gpr_index == arraysize(kManagedCoreArgumentRegisters) - 1) {
             gpr_index++;
             entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
           } else {
@@ -164,9 +176,9 @@
           }
         }
 
-        if (gpr_index < arraysize(kCoreArgumentRegisters)) {
+        if (gpr_index < arraysize(kManagedCoreArgumentRegisters)) {
           entry_spills_.push_back(
-            MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[gpr_index++]));
+              MipsManagedRegister::FromCoreRegister(kManagedCoreArgumentRegisters[gpr_index++]));
         } else {
           entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
         }
@@ -175,6 +187,7 @@
   }
   return entry_spills_;
 }
+
 // JNI calling convention
 
 MipsJniCallingConvention::MipsJniCallingConvention(bool is_static,
@@ -285,7 +298,7 @@
   //  | FLOAT | INT | DOUBLE  |
   //  |  F12  | A1  | A2 | A3 |
   // (c) first two arguments are floating-point (float, double)
-  //  | FLAOT | (PAD) | DOUBLE |  INT  |
+  //  | FLOAT | (PAD) | DOUBLE |  INT  |
   //  |  F12  |       |  F14   | SP+16 |
   // (d) first two arguments are floating-point (double, float)
   //  | DOUBLE | FLOAT | INT |
@@ -404,9 +417,9 @@
   if (use_fp_arg_registers_ && (itr_args_ < kMaxFloatOrDoubleRegisterArguments)) {
     if (IsCurrentParamAFloatOrDouble()) {
       if (IsCurrentParamADouble()) {
-        return MipsManagedRegister::FromDRegister(kDArgumentRegisters[itr_args_]);
+        return MipsManagedRegister::FromDRegister(kJniDArgumentRegisters[itr_args_]);
       } else {
-        return MipsManagedRegister::FromFRegister(kFArgumentRegisters[itr_args_]);
+        return MipsManagedRegister::FromFRegister(kJniFArgumentRegisters[itr_args_]);
       }
     }
   }
@@ -420,7 +433,7 @@
       return MipsManagedRegister::FromRegisterPair(A2_A3);
     }
   } else {
-    return MipsManagedRegister::FromCoreRegister(kCoreArgumentRegisters[itr_slots_]);
+    return MipsManagedRegister::FromCoreRegister(kJniCoreArgumentRegisters[itr_slots_]);
   }
 }
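
The new pair-alignment rule is worth illustrating in isolation: a 64-bit
argument must start in an even core-register slot, so starting indices 1 and 3
are skipped. A standalone toy model of the check (not ART code):

    #include <cstddef>

    // Managed core argument registers after this change: A0 A1 A2 A3 T0 T1.
    // Returns the slot where a long may start, mirroring the
    // `gpr_index == 1 || gpr_index == 3` test in the diff above.
    size_t AlignLongStart(size_t gpr_index) {
      if (gpr_index == 1 || gpr_index == 3) {
        ++gpr_index;  // A1-A2 becomes A2-A3; A3-T0 becomes T0-T1.
      }
      return gpr_index;
    }

    // Example: a long following one word-sized argument starts at
    // AlignLongStart(1) == 2, i.e. the A2-A3 pair, never straddling A1-A2.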
 
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index fa6a522..402eeee 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1402,6 +1402,14 @@
     entry.second = index;
     ++index;
   }
+  for (auto& entry : jit_class_roots_) {
+    // Update the `roots` with the class, and replace the address temporarily
+    // stored to the index in the table.
+    uint64_t address = entry.second;
+    roots->Set(index, reinterpret_cast<StackReference<mirror::Class>*>(address)->AsMirrorPtr());
+    entry.second = index;
+    ++index;
+  }
   EmitJitRootPatches(code, roots_data);
 }
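
EmitJitRoots now lays the table out with all string roots first and class roots
appended after them; for a class entry, `entry.second` initially holds the
address of a StackReference<mirror::Class> and is overwritten with the final
table index. A layout sketch inferred from the two loops (the helper below is
hypothetical):

    // Roots table after EmitJitRoots, with S = jit_string_roots_.size()
    // and C = jit_class_roots_.size():
    //
    //   roots[0]  ..  roots[S - 1]      : string roots
    //   roots[S]  ..  roots[S + C - 1]  : class roots
    //
    uint32_t ClassRootIndex(size_t num_string_roots, size_t class_ordinal) {
      return static_cast<uint32_t>(num_string_roots + class_ordinal);
    }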
 
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index 4b11e7c..2e2c3c0 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -34,6 +34,7 @@
 #include "stack_map_stream.h"
 #include "string_reference.h"
 #include "utils/label.h"
+#include "utils/type_reference.h"
 
 namespace art {
 
@@ -343,7 +344,7 @@
   void BuildStackMaps(MemoryRegion region, const DexFile::CodeItem& code_item);
   size_t ComputeStackMapsSize();
   size_t GetNumberOfJitRoots() const {
-    return jit_string_roots_.size();
+    return jit_string_roots_.size() + jit_class_roots_.size();
   }
 
   // Fills the `literals` array with literals collected during code generation.
@@ -611,6 +612,8 @@
         block_order_(nullptr),
         jit_string_roots_(StringReferenceValueComparator(),
                           graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        jit_class_roots_(TypeReferenceValueComparator(),
+                         graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         disasm_info_(nullptr),
         stats_(stats),
         graph_(graph),
@@ -681,6 +684,7 @@
   virtual void EmitJitRootPatches(uint8_t* code ATTRIBUTE_UNUSED,
                                   const uint8_t* roots_data ATTRIBUTE_UNUSED) {
     DCHECK_EQ(jit_string_roots_.size(), 0u);
+    DCHECK_EQ(jit_class_roots_.size(), 0u);
   }
 
   // Frame size required for this method.
@@ -711,7 +715,12 @@
   // Maps a StringReference (dex_file, string_index) to the index in the literal table.
   // Entries are initially added with a 0 index, and `EmitJitRoots` will compute all the
   // indices.
-  ArenaSafeMap<StringReference, size_t, StringReferenceValueComparator> jit_string_roots_;
+  ArenaSafeMap<StringReference, uint32_t, StringReferenceValueComparator> jit_string_roots_;
+
+  // Maps a TypeReference (dex_file, type_index) to the index in the literal table.
+  // Entries are initially added with a pointer in the handle zone, and `EmitJitRoots`
+  // will compute all the indices.
+  ArenaSafeMap<TypeReference, uint64_t, TypeReferenceValueComparator> jit_class_roots_;
 
   DisassemblyInformation* disasm_info_;
 
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index ed6eef1..8104613 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -1216,7 +1216,9 @@
       boot_image_address_patches_(std::less<uint32_t>(),
                                   graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
-                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_class_patches_(TypeReferenceValueComparator(),
+                         graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Always save the LR register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(LR));
 }
@@ -5712,8 +5714,7 @@
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
       break;
-    case HLoadClass::LoadKind::kDexCacheAddress:
-      DCHECK(Runtime::Current()->UseJitCompilation());
+    case HLoadClass::LoadKind::kJitTableAddress:
       break;
     case HLoadClass::LoadKind::kDexCachePcRelative:
       DCHECK(!Runtime::Current()->UseJitCompilation());
@@ -5814,22 +5815,12 @@
       __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address));
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(cls->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
-      // 16-bit LDR immediate has a 5-bit offset multiplied by the size and that gives
-      // a 128B range. To try and reduce the number of literals if we load multiple types,
-      // simply split the dex cache address to a 128B aligned base loaded from a literal
-      // and the remaining offset embedded in the load.
-      static_assert(sizeof(GcRoot<mirror::Class>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(cls->GetAddress(), 4u);
-      constexpr size_t offset_bits = /* encoded bits */ 5 + /* scale */ 2;
-      uint32_t base_address = address & ~MaxInt<uint32_t>(offset_bits);
-      uint32_t offset = address & MaxInt<uint32_t>(offset_bits);
-      __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address));
-      // /* GcRoot<mirror::Class> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(cls, out_loc, out, offset, read_barrier_option);
-      generate_null_check = !cls->IsInDexCache();
+    case HLoadClass::LoadKind::kJitTableAddress: {
+      __ LoadLiteral(out, codegen_->DeduplicateJitClassLiteral(cls->GetDexFile(),
+                                                               cls->GetTypeIndex(),
+                                                               cls->GetAddress()));
+      // /* GcRoot<mirror::Class> */ out = *out
+      GenerateGcRootFieldLoad(cls, out_loc, out, /* offset */ 0, kCompilerReadBarrierOption);
       break;
     }
     case HLoadClass::LoadKind::kDexCachePcRelative: {
@@ -7379,10 +7370,6 @@
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
 }
 
-Literal* CodeGeneratorARM::DeduplicateDexCacheAddressLiteral(uint32_t address) {
-  return DeduplicateUint32Literal(address, &uint32_literals_);
-}
-
 Literal* CodeGeneratorARM::DeduplicateJitStringLiteral(const DexFile& dex_file,
                                                        dex::StringIndex string_index) {
   jit_string_roots_.Overwrite(StringReference(&dex_file, string_index), /* placeholder */ 0u);
@@ -7391,6 +7378,15 @@
       [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
 }
 
+Literal* CodeGeneratorARM::DeduplicateJitClassLiteral(const DexFile& dex_file,
+                                                      dex::TypeIndex type_index,
+                                                      uint64_t address) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, type_index), address);
+  return jit_class_patches_.GetOrCreate(
+      TypeReference(&dex_file, type_index),
+      [this]() { return __ NewLiteral<uint32_t>(/* placeholder */ 0u); });
+}
+
 template <LinkerPatch (*Factory)(size_t, const DexFile*, uint32_t, uint32_t)>
 inline void CodeGeneratorARM::EmitPcRelativeLinkerPatches(
     const ArenaDeque<PcRelativePatchInfo>& infos,
@@ -7707,18 +7703,28 @@
   }
 }
 
+static void PatchJitRootUse(uint8_t* code,
+                            const uint8_t* roots_data,
+                            Literal* literal,
+                            uint64_t index_in_table) {
+  DCHECK(literal->GetLabel()->IsBound());
+  uint32_t literal_offset = literal->GetLabel()->Position();
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  uint8_t* data = code + literal_offset;
+  reinterpret_cast<uint32_t*>(data)[0] = dchecked_integral_cast<uint32_t>(address);
+}
+
 void CodeGeneratorARM::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
     const auto& it = jit_string_roots_.find(entry.first);
     DCHECK(it != jit_string_roots_.end());
-    size_t index_in_table = it->second;
-    Literal* literal = entry.second;
-    DCHECK(literal->GetLabel()->IsBound());
-    uint32_t literal_offset = literal->GetLabel()->Position();
-    uintptr_t address =
-        reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
-    uint8_t* data = code + literal_offset;
-    reinterpret_cast<uint32_t*>(data)[0] = dchecked_integral_cast<uint32_t>(address);
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
+  }
+  for (const auto& entry : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(entry.first);
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
   }
 }
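
The factored-out PatchJitRootUse back-patches each 32-bit literal with the
absolute address of the corresponding root slot, so a kJitTableAddress load
becomes two dependent loads at runtime. A sketch of the effective sequence and
the slot arithmetic (RootSlotAddress is a hypothetical restatement of the
computation above):

    //   ldr out, <literal>   ; out = &roots_data[index]  (value patched in)
    //   ldr out, [out]       ; out = the GcRoot entry -> mirror::Class
    uintptr_t RootSlotAddress(const uint8_t* roots_data, uint64_t index_in_table) {
      return reinterpret_cast<uintptr_t>(roots_data) +
             index_in_table * sizeof(GcRoot<mirror::Object>);  // 4-byte slots.
    }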
 
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 8230512..605169d 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -489,8 +489,10 @@
                                              dex::StringIndex string_index);
   Literal* DeduplicateBootImageTypeLiteral(const DexFile& dex_file, dex::TypeIndex type_index);
   Literal* DeduplicateBootImageAddressLiteral(uint32_t address);
-  Literal* DeduplicateDexCacheAddressLiteral(uint32_t address);
   Literal* DeduplicateJitStringLiteral(const DexFile& dex_file, dex::StringIndex string_index);
+  Literal* DeduplicateJitClassLiteral(const DexFile& dex_file,
+                                      dex::TypeIndex type_index,
+                                      uint64_t address);
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
@@ -599,9 +601,9 @@
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           Literal*,
                                           StringReferenceValueComparator>;
-  using BootTypeToLiteralMap = ArenaSafeMap<TypeReference,
-                                            Literal*,
-                                            TypeReferenceValueComparator>;
+  using TypeToLiteralMap = ArenaSafeMap<TypeReference,
+                                        Literal*,
+                                        TypeReferenceValueComparator>;
 
   Literal* DeduplicateUint32Literal(uint32_t value, Uint32ToLiteralMap* map);
   Literal* DeduplicateMethodLiteral(MethodReference target_method, MethodToLiteralMap* map);
@@ -638,7 +640,7 @@
   // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
   ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
   // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  BootTypeToLiteralMap boot_image_type_patches_;
+  TypeToLiteralMap boot_image_type_patches_;
   // PC-relative type patch info.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // Deduplication map for patchable boot image addresses.
@@ -646,6 +648,8 @@
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
+  // Patches for class literals in JIT compiled code.
+  TypeToLiteralMap jit_class_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM);
 };
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 6eebd69..5cff303 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -1162,7 +1162,9 @@
       boot_image_address_patches_(std::less<uint32_t>(),
                                   graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(StringReferenceValueComparator(),
-                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+                          graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_class_patches_(TypeReferenceValueComparator(),
+                         graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Save the link register (containing the return address) to mimic Quick.
   AddAllocatedRegister(LocationFrom(lr));
 }
@@ -4169,11 +4171,6 @@
   return DeduplicateUint32Literal(dchecked_integral_cast<uint32_t>(address), map);
 }
 
-vixl::aarch64::Literal<uint64_t>* CodeGeneratorARM64::DeduplicateDexCacheAddressLiteral(
-    uint64_t address) {
-  return DeduplicateUint64Literal(address);
-}
-
 vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateJitStringLiteral(
     const DexFile& dex_file, dex::StringIndex string_index) {
   jit_string_roots_.Overwrite(StringReference(&dex_file, string_index), /* placeholder */ 0u);
@@ -4182,6 +4179,14 @@
       [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); });
 }
 
+vixl::aarch64::Literal<uint32_t>* CodeGeneratorARM64::DeduplicateJitClassLiteral(
+    const DexFile& dex_file, dex::TypeIndex type_index, uint64_t address) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, type_index), address);
+  return jit_class_patches_.GetOrCreate(
+      TypeReference(&dex_file, type_index),
+      [this]() { return __ CreateLiteralDestroyedWithPool<uint32_t>(/* placeholder */ 0u); });
+}
+
 void CodeGeneratorARM64::EmitAdrpPlaceholder(vixl::aarch64::Label* fixup_label,
                                              vixl::aarch64::Register reg) {
   DCHECK(reg.IsX());
@@ -4359,7 +4364,7 @@
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
       break;
-    case HLoadClass::LoadKind::kDexCacheAddress:
+    case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kDexCachePcRelative:
@@ -4452,26 +4457,16 @@
       __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(cls->GetAddress()));
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(cls->GetAddress(), 0u);
-      // LDR immediate has a 12-bit offset multiplied by the size and for 32-bit loads
-      // that gives a 16KiB range. To try and reduce the number of literals if we load
-      // multiple types, simply split the dex cache address to a 16KiB aligned base
-      // loaded from a literal and the remaining offset embedded in the load.
-      static_assert(sizeof(GcRoot<mirror::Class>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(cls->GetAddress(), 4u);
-      constexpr size_t offset_bits = /* encoded bits */ 12 + /* scale */ 2;
-      uint64_t base_address = cls->GetAddress() & ~MaxInt<uint64_t>(offset_bits);
-      uint32_t offset = cls->GetAddress() & MaxInt<uint64_t>(offset_bits);
-      __ Ldr(out.X(), codegen_->DeduplicateDexCacheAddressLiteral(base_address));
-      // /* GcRoot<mirror::Class> */ out = *(base_address + offset)
+    case HLoadClass::LoadKind::kJitTableAddress: {
+      __ Ldr(out, codegen_->DeduplicateJitClassLiteral(cls->GetDexFile(),
+                                                       cls->GetTypeIndex(),
+                                                       cls->GetAddress()));
       GenerateGcRootFieldLoad(cls,
                               out_loc,
                               out.X(),
-                              offset,
+                              /* offset */ 0,
                               /* fixup_label */ nullptr,
-                              read_barrier_option);
-      generate_null_check = !cls->IsInDexCache();
+                              kCompilerReadBarrierOption);
       break;
     }
     case HLoadClass::LoadKind::kDexCachePcRelative: {
@@ -5782,17 +5777,27 @@
   }
 }
 
+static void PatchJitRootUse(uint8_t* code,
+                            const uint8_t* roots_data,
+                            vixl::aarch64::Literal<uint32_t>* literal,
+                            uint64_t index_in_table) {
+  uint32_t literal_offset = literal->GetOffset();
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  uint8_t* data = code + literal_offset;
+  reinterpret_cast<uint32_t*>(data)[0] = dchecked_integral_cast<uint32_t>(address);
+}
+
 void CodeGeneratorARM64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const auto& entry : jit_string_patches_) {
     const auto& it = jit_string_roots_.find(entry.first);
     DCHECK(it != jit_string_roots_.end());
-    size_t index_in_table = it->second;
-    vixl::aarch64::Literal<uint32_t>* literal = entry.second;
-    uint32_t literal_offset = literal->GetOffset();
-    uintptr_t address =
-        reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
-    uint8_t* data = code + literal_offset;
-    reinterpret_cast<uint32_t*>(data)[0] = dchecked_integral_cast<uint32_t>(address);
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
+  }
+  for (const auto& entry : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(entry.first);
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, entry.second, it->second);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 868c8b0..85b6f9f 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -566,9 +566,11 @@
   vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageTypeLiteral(const DexFile& dex_file,
                                                                     dex::TypeIndex type_index);
   vixl::aarch64::Literal<uint32_t>* DeduplicateBootImageAddressLiteral(uint64_t address);
-  vixl::aarch64::Literal<uint64_t>* DeduplicateDexCacheAddressLiteral(uint64_t address);
   vixl::aarch64::Literal<uint32_t>* DeduplicateJitStringLiteral(const DexFile& dex_file,
                                                                 dex::StringIndex string_index);
+  vixl::aarch64::Literal<uint32_t>* DeduplicateJitClassLiteral(const DexFile& dex_file,
+                                                               dex::TypeIndex type_index,
+                                                               uint64_t address);
 
   void EmitAdrpPlaceholder(vixl::aarch64::Label* fixup_label, vixl::aarch64::Register reg);
   void EmitAddPlaceholder(vixl::aarch64::Label* fixup_label,
@@ -682,9 +684,9 @@
   using StringToLiteralMap = ArenaSafeMap<StringReference,
                                           vixl::aarch64::Literal<uint32_t>*,
                                           StringReferenceValueComparator>;
-  using BootTypeToLiteralMap = ArenaSafeMap<TypeReference,
-                                            vixl::aarch64::Literal<uint32_t>*,
-                                            TypeReferenceValueComparator>;
+  using TypeToLiteralMap = ArenaSafeMap<TypeReference,
+                                        vixl::aarch64::Literal<uint32_t>*,
+                                        TypeReferenceValueComparator>;
 
   vixl::aarch64::Literal<uint32_t>* DeduplicateUint32Literal(uint32_t value,
                                                              Uint32ToLiteralMap* map);
@@ -733,8 +735,7 @@
 
   // Deduplication map for 32-bit literals, used for non-patchable boot image addresses.
   Uint32ToLiteralMap uint32_literals_;
-  // Deduplication map for 64-bit literals, used for non-patchable method address, method code
-  // or string dex cache address.
+  // Deduplication map for 64-bit literals, used for non-patchable method address or method code.
   Uint64ToLiteralMap uint64_literals_;
   // Method patch info, map MethodReference to a literal for method address and method code.
   MethodToLiteralMap method_patches_;
@@ -749,7 +750,7 @@
   // PC-relative String patch info; type depends on configuration (app .bss or boot image PIC).
   ArenaDeque<PcRelativePatchInfo> pc_relative_string_patches_;
   // Deduplication map for boot type literals for kBootImageLinkTimeAddress.
-  BootTypeToLiteralMap boot_image_type_patches_;
+  TypeToLiteralMap boot_image_type_patches_;
   // PC-relative type patch info.
   ArenaDeque<PcRelativePatchInfo> pc_relative_type_patches_;
   // Deduplication map for patchable boot image addresses.
@@ -757,6 +758,8 @@
 
   // Patches for string literals in JIT compiled code.
   StringToLiteralMap jit_string_patches_;
+  // Patches for class literals in JIT compiled code.
+  TypeToLiteralMap jit_class_patches_;
 
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorARM64);
 };
diff --git a/compiler/optimizing/code_generator_arm_vixl.cc b/compiler/optimizing/code_generator_arm_vixl.cc
index 4b24ac3..2c6df38 100644
--- a/compiler/optimizing/code_generator_arm_vixl.cc
+++ b/compiler/optimizing/code_generator_arm_vixl.cc
@@ -613,6 +613,509 @@
   DISALLOW_COPY_AND_ASSIGN(ArraySetSlowPathARMVIXL);
 };
 
+// Slow path marking an object reference `ref` during a read
+// barrier. The field `obj.field` in the object `obj` holding this
+// reference does not get updated by this slow path after marking (see
+// ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL below for that).
+//
+// This means that after the execution of this slow path, `ref` will
+// always be up-to-date, but `obj.field` may not; i.e., after the
+// flip, `ref` will be a to-space reference, but `obj.field` will
+// probably still be a from-space reference (unless it gets updated by
+// another thread, or if another thread installed another object
+// reference (different from `ref`) in `obj.field`).
+class ReadBarrierMarkSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  ReadBarrierMarkSlowPathARMVIXL(HInstruction* instruction,
+                                 Location ref,
+                                 Location entrypoint = Location::NoLocation())
+      : SlowPathCodeARMVIXL(instruction), ref_(ref), entrypoint_(entrypoint) {
+    DCHECK(kEmitCompilerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierMarkSlowPathARMVIXL"; }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register ref_reg = RegisterFrom(ref_);
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsArraySet() ||
+           instruction_->IsLoadClass() ||
+           instruction_->IsLoadString() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier marking slow path: "
+        << instruction_->DebugName();
+    // The read barrier instrumentation of object ArrayGet
+    // instructions does not support the HIntermediateAddress
+    // instruction.
+    DCHECK(!(instruction_->IsArrayGet() &&
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
+
+    __ Bind(GetEntryLabel());
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    DCHECK(!ref_reg.Is(sp));
+    DCHECK(!ref_reg.Is(lr));
+    DCHECK(!ref_reg.Is(pc));
+    // IP is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary, it cannot be the entry point's input/output.
+    DCHECK(!ref_reg.Is(ip));
+    DCHECK(ref_reg.IsRegister()) << ref_reg;
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in R0):
+    //
+    //   R0 <- ref
+    //   R0 <- ReadBarrierMark(R0)
+    //   ref <- R0
+    //
+    // we just use rX (the register containing `ref`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    if (entrypoint_.IsValid()) {
+      arm_codegen->ValidateInvokeRuntimeWithoutRecordingPcInfo(instruction_, this);
+      __ Blx(RegisterFrom(entrypoint_));
+    } else {
+      int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
+      // This runtime call does not require a stack map.
+      arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    }
+    __ B(GetExitLabel());
+  }
+
+ private:
+  // The location (register) of the marked object reference.
+  const Location ref_;
+
+  // The location of the entrypoint if already loaded.
+  const Location entrypoint_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathARMVIXL);
+};
+
+// Slow path marking an object reference `ref` during a read barrier,
+// and if needed, atomically updating the field `obj.field` in the
+// object `obj` holding this reference after marking (contrary to
+// ReadBarrierMarkSlowPathARMVIXL above, which never tries to update
+// `obj.field`).
+//
+// This means that after the execution of this slow path, both `ref`
+// and `obj.field` will be up-to-date; i.e., after the flip, both will
+// hold the same to-space reference (unless another thread installed
+// another object reference (different from `ref`) in `obj.field`).
+class ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(HInstruction* instruction,
+                                               Location ref,
+                                               vixl32::Register obj,
+                                               Location field_offset,
+                                               vixl32::Register temp1,
+                                               vixl32::Register temp2)
+      : SlowPathCodeARMVIXL(instruction),
+        ref_(ref),
+        obj_(obj),
+        field_offset_(field_offset),
+        temp1_(temp1),
+        temp2_(temp2) {
+    DCHECK(kEmitCompilerReadBarrier);
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL";
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register ref_reg = RegisterFrom(ref_);
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(ref_reg.GetCode())) << ref_reg;
+    // This slow path is only used by the UnsafeCASObject intrinsic.
+    DCHECK((instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier marking and field updating slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kUnsafeCASObject);
+    DCHECK(field_offset_.IsRegisterPair()) << field_offset_;
+
+    __ Bind(GetEntryLabel());
+
+    // Save the old reference.
+    // Note that we cannot use IP to save the old reference, as IP is
+    // used internally by the ReadBarrierMarkRegX entry point, and we
+    // need the old reference after the call to that entry point.
+    DCHECK(!temp1_.Is(ip));
+    __ Mov(temp1_, ref_reg);
+
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    DCHECK(!ref_reg.Is(sp));
+    DCHECK(!ref_reg.Is(lr));
+    DCHECK(!ref_reg.Is(pc));
+    // IP is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary, it cannot be the entry point's input/output.
+    DCHECK(!ref_reg.Is(ip));
+    DCHECK(ref_reg.IsRegister()) << ref_reg;
+    // "Compact" slow path, saving two moves.
+    //
+    // Instead of using the standard runtime calling convention (input
+    // and output in R0):
+    //
+    //   R0 <- ref
+    //   R0 <- ReadBarrierMark(R0)
+    //   ref <- R0
+    //
+    // we just use rX (the register containing `ref`) as input and output
+    // of a dedicated entrypoint:
+    //
+    //   rX <- ReadBarrierMarkRegX(rX)
+    //
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(ref_reg.GetCode());
+    // This runtime call does not require a stack map.
+    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+
+    // If the new reference is different from the old reference,
+    // update the field in the holder (`*(obj_ + field_offset_)`).
+    //
+    // Note that this field could also hold a different object, if
+    // another thread had concurrently changed it. In that case, the
+    // LDREX/SUBS/ITNE sequence of instructions in the compare-and-set
+    // (CAS) operation below would abort the CAS, leaving the field
+    // as-is.
+    vixl32::Label done;
+    __ Cmp(temp1_, ref_reg);
+    __ B(eq, &done);
+
+    // Update the holder's field atomically.  This may fail if the
+    // mutator updates before us, but that is OK.  This is achieved
+    // using a strong compare-and-set (CAS) operation with relaxed
+    // memory synchronization ordering, where the expected value is
+    // the old reference and the desired value is the new reference.
+
+    UseScratchRegisterScope temps(arm_codegen->GetVIXLAssembler());
+    // Convenience aliases.
+    vixl32::Register base = obj_;
+    // The UnsafeCASObject intrinsic uses a register pair as field
+    // offset ("long offset"), of which only the low part contains
+    // data.
+    vixl32::Register offset = LowRegisterFrom(field_offset_);
+    vixl32::Register expected = temp1_;
+    vixl32::Register value = ref_reg;
+    vixl32::Register tmp_ptr = temps.Acquire();       // Pointer to actual memory.
+    vixl32::Register tmp = temp2_;                    // Value in memory.
+
+    __ Add(tmp_ptr, base, offset);
+
+    if (kPoisonHeapReferences) {
+      arm_codegen->GetAssembler()->PoisonHeapReference(expected);
+      if (value.Is(expected)) {
+        // Do not poison `value`, as it is the same register as
+        // `expected`, which has just been poisoned.
+      } else {
+        arm_codegen->GetAssembler()->PoisonHeapReference(value);
+      }
+    }
+
+    // do {
+    //   tmp = [r_ptr] - expected;
+    // } while (tmp == 0 && failure([r_ptr] <- r_new_value));
+
+    vixl32::Label loop_head, exit_loop;
+    __ Bind(&loop_head);
+
+    __ Ldrex(tmp, MemOperand(tmp_ptr));
+
+    __ Subs(tmp, tmp, expected);
+
+    {
+      AssemblerAccurateScope aas(arm_codegen->GetVIXLAssembler(),
+                                 2 * kMaxInstructionSizeInBytes,
+                                 CodeBufferCheckScope::kMaximumSize);
+
+      __ it(ne);
+      __ clrex(ne);
+    }
+
+    __ B(ne, &exit_loop);
+
+    __ Strex(tmp, value, MemOperand(tmp_ptr));
+    __ Cmp(tmp, 1);
+    __ B(eq, &loop_head);
+
+    __ Bind(&exit_loop);
+
+    if (kPoisonHeapReferences) {
+      arm_codegen->GetAssembler()->UnpoisonHeapReference(expected);
+      if (value.Is(expected)) {
+        // Do not unpoison `value`, as it is the same register as
+        // `expected`, which has just been unpoisoned.
+      } else {
+        arm_codegen->GetAssembler()->UnpoisonHeapReference(value);
+      }
+    }
+
+    __ Bind(&done);
+    __ B(GetExitLabel());
+  }
+
+ private:
+  // The location (register) of the marked object reference.
+  const Location ref_;
+  // The register containing the object holding the marked object reference field.
+  const vixl32::Register obj_;
+  // The location of the offset of the marked reference field within `obj_`.
+  Location field_offset_;
+
+  const vixl32::Register temp1_;
+  const vixl32::Register temp2_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL);
+};
+
+// Slow path generating a read barrier for a heap reference.
+class ReadBarrierForHeapReferenceSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  ReadBarrierForHeapReferenceSlowPathARMVIXL(HInstruction* instruction,
+                                             Location out,
+                                             Location ref,
+                                             Location obj,
+                                             uint32_t offset,
+                                             Location index)
+      : SlowPathCodeARMVIXL(instruction),
+        out_(out),
+        ref_(ref),
+        obj_(obj),
+        offset_(offset),
+        index_(index) {
+    DCHECK(kEmitCompilerReadBarrier);
+    // If `obj` is equal to `out` or `ref`, it means the initial object
+    // has been overwritten by (or after) the heap object reference load
+    // to be instrumented, e.g.:
+    //
+    //   __ LoadFromOffset(kLoadWord, out, out, offset);
+    //   codegen_->GenerateReadBarrierSlow(instruction, out_loc, out_loc, out_loc, offset);
+    //
+    // In that case, we have lost the information about the original
+    // object, and the emitted read barrier cannot work properly.
+    DCHECK(!obj.Equals(out)) << "obj=" << obj << " out=" << out;
+    DCHECK(!obj.Equals(ref)) << "obj=" << obj << " ref=" << ref;
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register reg_out = RegisterFrom(out_);
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out.GetCode()));
+    DCHECK(instruction_->IsInstanceFieldGet() ||
+           instruction_->IsStaticFieldGet() ||
+           instruction_->IsArrayGet() ||
+           instruction_->IsInstanceOf() ||
+           instruction_->IsCheckCast() ||
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()))
+        << "Unexpected instruction in read barrier for heap reference slow path: "
+        << instruction_->DebugName();
+    // The read barrier instrumentation of object ArrayGet
+    // instructions does not support the HIntermediateAddress
+    // instruction.
+    DCHECK(!(instruction_->IsArrayGet() &&
+             instruction_->AsArrayGet()->GetArray()->IsIntermediateAddress()));
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    // We may have to change the index's value, but as `index_` is a
+    // constant member (like other "inputs" of this slow path),
+    // introduce a copy of it, `index`.
+    Location index = index_;
+    if (index_.IsValid()) {
+      // Handle `index_` for HArrayGet and UnsafeGetObject/UnsafeGetObjectVolatile intrinsics.
+      if (instruction_->IsArrayGet()) {
+        // Compute the actual memory offset and store it in `index`.
+        vixl32::Register index_reg = RegisterFrom(index_);
+        DCHECK(locations->GetLiveRegisters()->ContainsCoreRegister(index_reg.GetCode()));
+        if (codegen->IsCoreCalleeSaveRegister(index_reg.GetCode())) {
+          // We are about to change the value of `index_reg` (see the
+          // `Lsl` and `Add` macro-assembler calls below), but it has
+          // not been saved by the previous call to
+          // art::SlowPathCode::SaveLiveRegisters, as it is a
+          // callee-save register --
+          // art::SlowPathCode::SaveLiveRegisters does not consider
+          // callee-save registers, as it has been designed with the
+          // assumption that callee-save registers are supposed to be
+          // handled by the called function.  So, as a callee-save
+          // register, `index_reg` _would_ eventually be saved onto
+          // the stack, but it would be too late: we would have
+          // changed its value earlier.  Therefore, we manually save
+          // it here into another freely available register,
+          // `free_reg`, chosen of course among the caller-save
+          // registers (as a callee-save `free_reg` register would
+          // exhibit the same problem).
+          //
+          // Note we could have requested a temporary register from
+          // the register allocator instead; but we prefer not to, as
+          // this is a slow path, and we know we can find a
+          // caller-save register that is available.
+          vixl32::Register free_reg = FindAvailableCallerSaveRegister(codegen);
+          __ Mov(free_reg, index_reg);
+          index_reg = free_reg;
+          index = LocationFrom(index_reg);
+        } else {
+          // The initial register stored in `index_` has already been
+          // saved in the call to art::SlowPathCode::SaveLiveRegisters
+          // (as it is not a callee-save register), so we can freely
+          // use it.
+        }
+        // Shifting the index value contained in `index_reg` by the scale
+        // factor (2) cannot overflow in practice, as the runtime is
+        // unable to allocate object arrays with a size larger than
+        // 2^26 - 1 (that is, 2^28 - 4 bytes).
+        __ Lsl(index_reg, index_reg, TIMES_4);
+        static_assert(
+            sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+            "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+        __ Add(index_reg, index_reg, offset_);
+      } else {
+        // In the case of the UnsafeGetObject/UnsafeGetObjectVolatile
+        // intrinsics, `index_` is not shifted by a scale factor of 2
+        // (as in the case of ArrayGet), as it is actually an offset
+        // to an object field within an object.
+        DCHECK(instruction_->IsInvoke()) << instruction_->DebugName();
+        DCHECK(instruction_->GetLocations()->Intrinsified());
+        DCHECK((instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObject) ||
+               (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
+            << instruction_->AsInvoke()->GetIntrinsic();
+        DCHECK_EQ(offset_, 0U);
+        DCHECK(index_.IsRegisterPair());
+        // UnsafeGet's offset location is a register pair, the low
+        // part contains the correct offset.
+        index = index_.ToLow();
+      }
+    }
+
+    // We're moving two or three locations to locations that could
+    // overlap, so we need a parallel move resolver.
+    InvokeRuntimeCallingConventionARMVIXL calling_convention;
+    HParallelMove parallel_move(codegen->GetGraph()->GetArena());
+    parallel_move.AddMove(ref_,
+                          LocationFrom(calling_convention.GetRegisterAt(0)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    parallel_move.AddMove(obj_,
+                          LocationFrom(calling_convention.GetRegisterAt(1)),
+                          Primitive::kPrimNot,
+                          nullptr);
+    if (index.IsValid()) {
+      parallel_move.AddMove(index,
+                            LocationFrom(calling_convention.GetRegisterAt(2)),
+                            Primitive::kPrimInt,
+                            nullptr);
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+    } else {
+      codegen->GetMoveResolver()->EmitNativeCode(&parallel_move);
+      __ Mov(calling_convention.GetRegisterAt(2), offset_);
+    }
+    arm_codegen->InvokeRuntime(kQuickReadBarrierSlow, instruction_, instruction_->GetDexPc(), this);
+    CheckEntrypointTypes<
+        kQuickReadBarrierSlow, mirror::Object*, mirror::Object*, mirror::Object*, uint32_t>();
+    arm_codegen->Move32(out_, LocationFrom(r0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE {
+    return "ReadBarrierForHeapReferenceSlowPathARMVIXL";
+  }
+
+ private:
+  vixl32::Register FindAvailableCallerSaveRegister(CodeGenerator* codegen) {
+    uint32_t ref = RegisterFrom(ref_).GetCode();
+    uint32_t obj = RegisterFrom(obj_).GetCode();
+    for (uint32_t i = 0, e = codegen->GetNumberOfCoreRegisters(); i < e; ++i) {
+      if (i != ref && i != obj && !codegen->IsCoreCalleeSaveRegister(i)) {
+        return vixl32::Register(i);
+      }
+    }
+    // We shall never fail to find a free caller-save register, as
+    // there are more than two core caller-save registers on ARM
+    // (meaning it is possible to find one which is different from
+    // `ref` and `obj`).
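+    // (Illustration, assuming the AAPCS core caller-save set
+    // {r0, r1, r2, r3, r12}: even in the worst case of `ref` == r0
+    // and `obj` == r1, the scan above still returns r2.)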
+    DCHECK_GT(codegen->GetNumberOfCoreCallerSaveRegisters(), 2u);
+    LOG(FATAL) << "Could not find a free caller-save register";
+    UNREACHABLE();
+  }
+
+  const Location out_;
+  const Location ref_;
+  const Location obj_;
+  const uint32_t offset_;
+  // An additional location containing an index to an array.
+  // Only used for HArrayGet and the UnsafeGetObject &
+  // UnsafeGetObjectVolatile intrinsics.
+  const Location index_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForHeapReferenceSlowPathARMVIXL);
+};
+
+// Slow path generating a read barrier for a GC root.
+class ReadBarrierForRootSlowPathARMVIXL : public SlowPathCodeARMVIXL {
+ public:
+  ReadBarrierForRootSlowPathARMVIXL(HInstruction* instruction, Location out, Location root)
+      : SlowPathCodeARMVIXL(instruction), out_(out), root_(root) {
+    DCHECK(kEmitCompilerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    LocationSummary* locations = instruction_->GetLocations();
+    vixl32::Register reg_out = RegisterFrom(out_);
+    DCHECK(locations->CanCall());
+    DCHECK(!locations->GetLiveRegisters()->ContainsCoreRegister(reg_out.GetCode()));
+    DCHECK(instruction_->IsLoadClass() || instruction_->IsLoadString())
+        << "Unexpected instruction in read barrier for GC root slow path: "
+        << instruction_->DebugName();
+
+    __ Bind(GetEntryLabel());
+    SaveLiveRegisters(codegen, locations);
+
+    InvokeRuntimeCallingConventionARMVIXL calling_convention;
+    CodeGeneratorARMVIXL* arm_codegen = down_cast<CodeGeneratorARMVIXL*>(codegen);
+    arm_codegen->Move32(LocationFrom(calling_convention.GetRegisterAt(0)), root_);
+    arm_codegen->InvokeRuntime(kQuickReadBarrierForRootSlow,
+                               instruction_,
+                               instruction_->GetDexPc(),
+                               this);
+    CheckEntrypointTypes<kQuickReadBarrierForRootSlow, mirror::Object*, GcRoot<mirror::Object>*>();
+    arm_codegen->Move32(out_, LocationFrom(r0));
+
+    RestoreLiveRegisters(codegen, locations);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierForRootSlowPathARMVIXL"; }
+
+ private:
+  const Location out_;
+  const Location root_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierForRootSlowPathARMVIXL);
+};
 
 inline vixl32::Condition ARMCondition(IfCondition cond) {
   switch (cond) {
@@ -897,10 +1400,25 @@
     GetAssembler()->cfi().AdjustCFAOffset(kArmWordSize * POPCOUNT(fpu_spill_mask_));
     GetAssembler()->cfi().RelOffsetForMany(DWARFReg(s0), 0, fpu_spill_mask_, kArmWordSize);
   }
+
+  if (GetGraph()->HasShouldDeoptimizeFlag()) {
+    UseScratchRegisterScope temps(GetVIXLAssembler());
+    vixl32::Register temp = temps.Acquire();
+    // Initialize should_deoptimize flag to 0.
+    __ Mov(temp, 0);
+    GetAssembler()->StoreToOffset(kStoreWord, temp, sp, -kShouldDeoptimizeFlagSize);
+  }
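+  // (Illustrative note: at this point sp still points just below the
+  // register spills, so the store above places the flag immediately
+  // under them; the `Sub(sp, sp, adjust)` below then extends the
+  // frame over it.)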
+
   int adjust = GetFrameSize() - FrameEntrySpillSize();
   __ Sub(sp, sp, adjust);
   GetAssembler()->cfi().AdjustCFAOffset(adjust);
-  GetAssembler()->StoreToOffset(kStoreWord, kMethodRegister, sp, 0);
+
+  // Save the current method if we need it. Note that we do not
+  // do this in HCurrentMethod, as the instruction might have been removed
+  // in the SSA graph.
+  if (RequiresCurrentMethod()) {
+    GetAssembler()->StoreToOffset(kStoreWord, kMethodRegister, sp, 0);
+  }
 }
 
 void CodeGeneratorARMVIXL::GenerateFrameExit() {
@@ -3103,8 +3621,7 @@
 
     __ And(shift_right, RegisterFrom(rhs), 0x1F);
     __ Lsrs(shift_left, RegisterFrom(rhs), 6);
-    // TODO(VIXL): Check that flags are kept after "vixl32::LeaveFlags" enabled.
-    __ Rsb(shift_left, shift_right, Operand::From(kArmBitsPerWord));
+    __ Rsb(LeaveFlags, shift_left, shift_right, Operand::From(kArmBitsPerWord));
     __ B(cc, &shift_by_32_plus_shift_right);
 
     // out_reg_hi = (reg_hi << shift_left) | (reg_lo >> shift_right).
@@ -4007,7 +4524,14 @@
     case Primitive::kPrimNot: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        TODO_VIXL32(FATAL);
+        Location temp_loc = locations->GetTemp(0);
+        // Note that a potential implicit null check is handled in this
+        // CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier call.
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+        if (is_volatile) {
+          codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
+        }
       } else {
         GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(out), base, offset);
         codegen_->MaybeRecordImplicitNullCheck(instruction);
@@ -4334,7 +4858,7 @@
                                                        LocationSummary::kCallOnSlowPath :
                                                        LocationSummary::kNoCall);
   if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    TODO_VIXL32(FATAL);
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
   locations->SetInAt(0, Location::RequiresRegister());
   locations->SetInAt(1, Location::RegisterOrConstant(instruction->InputAt(1)));
@@ -4358,7 +4882,6 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitArrayGet(HArrayGet* instruction) {
-  UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
   LocationSummary* locations = instruction->GetLocations();
   Location obj_loc = locations->InAt(0);
   vixl32::Register obj = InputRegisterAt(instruction, 0);
@@ -4370,8 +4893,6 @@
                                         instruction->IsStringCharAt();
   HInstruction* array_instr = instruction->GetArray();
   bool has_intermediate_address = array_instr->IsIntermediateAddress();
-  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
-  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -4412,6 +4933,7 @@
           GetAssembler()->LoadFromOffset(load_type, RegisterFrom(out_loc), obj, full_offset);
         }
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
 
         if (has_intermediate_address) {
@@ -4440,19 +4962,27 @@
         } else {
           codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, RegisterFrom(index));
         }
-        temps.Release(temp);
       }
       break;
     }
 
     case Primitive::kPrimNot: {
+      // The read barrier instrumentation of object ArrayGet
+      // instructions does not support the HIntermediateAddress
+      // instruction.
+      DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
+
       static_assert(
           sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
           "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        TODO_VIXL32(FATAL);
+        Location temp = locations->GetTemp(0);
+        // Note that a potential implicit null check is handled in this
+        // CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier call.
+        codegen_->GenerateArrayLoadWithBakerReadBarrier(
+            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
       } else {
         vixl32::Register out = OutputRegister(instruction);
         if (index.IsConstant()) {
@@ -4470,6 +5000,7 @@
           // reference, if heap poisoning is enabled).
           codegen_->MaybeGenerateReadBarrierSlow(instruction, out_loc, out_loc, obj_loc, offset);
         } else {
+          UseScratchRegisterScope temps(GetVIXLAssembler());
           vixl32::Register temp = temps.Acquire();
 
           if (has_intermediate_address) {
@@ -4485,7 +5016,7 @@
             __ Add(temp, obj, data_offset);
           }
           codegen_->LoadFromShiftedRegOffset(type, out_loc, temp, RegisterFrom(index));
-          temps.Release(temp);
+          temps.Close();
           // TODO(VIXL): Use a scope to ensure that we record the pc position immediately after the
           // load instruction. Practically, everything is fine because the helper and VIXL, at the
           // time of writing, do generate the load instruction last.
@@ -4506,10 +5037,10 @@
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         GetAssembler()->LoadFromOffset(kLoadWordPair, LowRegisterFrom(out_loc), obj, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, obj, Operand(RegisterFrom(index), vixl32::LSL, TIMES_8));
         GetAssembler()->LoadFromOffset(kLoadWordPair, LowRegisterFrom(out_loc), temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4520,10 +5051,10 @@
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
         GetAssembler()->LoadSFromOffset(out, obj, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, obj, Operand(RegisterFrom(index), vixl32::LSL, TIMES_4));
         GetAssembler()->LoadSFromOffset(out, temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4533,10 +5064,10 @@
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         GetAssembler()->LoadDFromOffset(DRegisterFrom(out_loc), obj, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, obj, Operand(RegisterFrom(index), vixl32::LSL, TIMES_8));
         GetAssembler()->LoadDFromOffset(DRegisterFrom(out_loc), temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4584,7 +5115,6 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::VisitArraySet(HArraySet* instruction) {
-  UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
   LocationSummary* locations = instruction->GetLocations();
   vixl32::Register array = InputRegisterAt(instruction, 0);
   Location index = locations->InAt(1);
@@ -4597,8 +5127,6 @@
   Location value_loc = locations->InAt(2);
   HInstruction* array_instr = instruction->GetArray();
   bool has_intermediate_address = array_instr->IsIntermediateAddress();
-  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
-  DCHECK(!(has_intermediate_address && kEmitCompilerReadBarrier));
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
@@ -4613,6 +5141,7 @@
         StoreOperandType store_type = GetStoreOperandType(value_type);
         GetAssembler()->StoreToOffset(store_type, RegisterFrom(value_loc), array, full_offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
 
         if (has_intermediate_address) {
@@ -4628,7 +5157,6 @@
           __ Add(temp, array, data_offset);
         }
         codegen_->StoreToShiftedRegOffset(value_type, value_loc, temp, RegisterFrom(index));
-        temps.Release(temp);
       }
       break;
     }
@@ -4647,10 +5175,10 @@
           GetAssembler()->StoreToOffset(kStoreWord, value, array, offset);
         } else {
           DCHECK(index.IsRegister()) << index;
+          UseScratchRegisterScope temps(GetVIXLAssembler());
           vixl32::Register temp = temps.Acquire();
           __ Add(temp, array, data_offset);
           codegen_->StoreToShiftedRegOffset(value_type, value_loc, temp, RegisterFrom(index));
-          temps.Release(temp);
         }
         // TODO(VIXL): Use a scope to ensure we record the pc info immediately after the preceding
         // store instruction.
@@ -4683,10 +5211,10 @@
             GetAssembler()->StoreToOffset(kStoreWord, value, array, offset);
           } else {
             DCHECK(index.IsRegister()) << index;
+            UseScratchRegisterScope temps(GetVIXLAssembler());
             vixl32::Register temp = temps.Acquire();
             __ Add(temp, array, data_offset);
             codegen_->StoreToShiftedRegOffset(value_type, value_loc, temp, RegisterFrom(index));
-            temps.Release(temp);
           }
           // TODO(VIXL): Use a scope to ensure we record the pc info immediately after the preceding
           // store instruction.
@@ -4758,13 +5286,13 @@
       } else {
         DCHECK(index.IsRegister()) << index;
 
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, array, data_offset);
         codegen_->StoreToShiftedRegOffset(value_type,
                                           LocationFrom(source),
                                           temp,
                                           RegisterFrom(index));
-        temps.Release(temp);
       }
 
       if (!may_need_runtime_call_for_type_check) {
@@ -4793,10 +5321,10 @@
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         GetAssembler()->StoreToOffset(kStoreWordPair, LowRegisterFrom(value), array, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, array, Operand(RegisterFrom(index), vixl32::LSL, TIMES_8));
         GetAssembler()->StoreToOffset(kStoreWordPair, LowRegisterFrom(value), temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4808,10 +5336,10 @@
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
         GetAssembler()->StoreSToOffset(SRegisterFrom(value), array, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, array, Operand(RegisterFrom(index), vixl32::LSL, TIMES_4));
         GetAssembler()->StoreSToOffset(SRegisterFrom(value), temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4823,10 +5351,10 @@
         size_t offset = (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
         GetAssembler()->StoreDToOffset(DRegisterFrom(value), array, offset);
       } else {
+        UseScratchRegisterScope temps(GetVIXLAssembler());
         vixl32::Register temp = temps.Acquire();
         __ Add(temp, array, Operand(RegisterFrom(index), vixl32::LSL, TIMES_8));
         GetAssembler()->StoreDToOffset(DRegisterFrom(value), temp, data_offset);
-        temps.Release(temp);
       }
       break;
     }
@@ -4869,8 +5397,6 @@
 }
 
 void LocationsBuilderARMVIXL::VisitIntermediateAddress(HIntermediateAddress* instruction) {
-  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
-  DCHECK(!kEmitCompilerReadBarrier);
   LocationSummary* locations =
       new (GetGraph()->GetArena()) LocationSummary(instruction, LocationSummary::kNoCall);
 
@@ -4884,9 +5410,6 @@
   vixl32::Register first = InputRegisterAt(instruction, 0);
   Location second = instruction->GetLocations()->InAt(1);
 
-  // The read barrier instrumentation does not support the HIntermediateAddress instruction yet.
-  DCHECK(!kEmitCompilerReadBarrier);
-
   if (second.IsRegister()) {
     __ Add(out, first, RegisterFrom(second));
   } else {
@@ -4978,7 +5501,7 @@
     DCHECK_EQ(slow_path->GetSuccessor(), successor);
   }
 
-  UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
+  UseScratchRegisterScope temps(GetVIXLAssembler());
   vixl32::Register temp = temps.Acquire();
   GetAssembler()->LoadFromOffset(
       kLoadUnsignedHalfword, temp, tr, Thread::ThreadFlagsOffset<kArmPointerSize>().Int32Value());
@@ -5253,7 +5776,7 @@
     case HLoadClass::LoadKind::kBootImageAddress:
       // TODO(VIXL): Enable it back when literal pools are fixed in VIXL.
       return HLoadClass::LoadKind::kDexCacheViaMethod;
-    case HLoadClass::LoadKind::kDexCacheAddress:
+    case HLoadClass::LoadKind::kJitTableAddress:
       // TODO(VIXL): Enable it back when literal pools are fixed in VIXL.
       return HLoadClass::LoadKind::kDexCacheViaMethod;
     case HLoadClass::LoadKind::kDexCachePcRelative:
@@ -5289,7 +5812,7 @@
       : LocationSummary::kNoCall;
   LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind);
   if (kUseBakerReadBarrier && requires_read_barrier && !cls->NeedsEnvironment()) {
-      TODO_VIXL32(FATAL);
+    locations->SetCustomSlowPathCallerSaves(RegisterSet::Empty());  // No caller-save registers.
   }
 
   HLoadClass::LoadKind load_kind = cls->GetLoadKind();
@@ -5345,7 +5868,7 @@
       TODO_VIXL32(FATAL);
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
+    case HLoadClass::LoadKind::kJitTableAddress: {
       TODO_VIXL32(FATAL);
       break;
     }
@@ -6249,9 +6772,9 @@
     return;
   }
   __ Adds(out_low, first_low, value_low);
-  if (GetAssembler()->ShifterOperandCanHold(ADC, value_high, kCcKeep)) {
+  if (GetAssembler()->ShifterOperandCanHold(ADC, value_high, kCcDontCare)) {
     __ Adc(out_high, first_high, value_high);
-  } else if (GetAssembler()->ShifterOperandCanHold(SBC, ~value_high, kCcKeep)) {
+  } else if (GetAssembler()->ShifterOperandCanHold(SBC, ~value_high, kCcDontCare)) {
     __ Sbc(out_high, first_high, ~value_high);
   } else {
     LOG(FATAL) << "Unexpected constant " << value_high;
@@ -6336,14 +6859,30 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::GenerateReferenceLoadOneRegister(
-    HInstruction* instruction ATTRIBUTE_UNUSED,
+    HInstruction* instruction,
     Location out,
     uint32_t offset,
-    Location maybe_temp ATTRIBUTE_UNUSED,
-    ReadBarrierOption read_barrier_option ATTRIBUTE_UNUSED) {
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   vixl32::Register out_reg = RegisterFrom(out);
-  if (kEmitCompilerReadBarrier) {
-    TODO_VIXL32(FATAL);
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
+    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+    if (kUseBakerReadBarrier) {
+      // Load with fast path based Baker's read barrier.
+      // /* HeapReference<Object> */ out = *(out + offset)
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          instruction, out, out_reg, offset, maybe_temp, /* needs_null_check */ false);
+    } else {
+      // Load with slow path based read barrier.
+      // Save the value of `out` into `maybe_temp` before overwriting it
+      // in the following move operation, as we will need it for the
+      // read barrier below.
+      __ Mov(RegisterFrom(maybe_temp), out_reg);
+      // /* HeapReference<Object> */ out = *(out + offset)
+      GetAssembler()->LoadFromOffset(kLoadWord, out_reg, out_reg, offset);
+      codegen_->GenerateReadBarrierSlow(instruction, out, out, maybe_temp, offset);
+    }
   } else {
     // Plain load with no read barrier.
     // /* HeapReference<Object> */ out = *(out + offset)
@@ -6353,16 +6892,28 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::GenerateReferenceLoadTwoRegisters(
-    HInstruction* instruction ATTRIBUTE_UNUSED,
+    HInstruction* instruction,
     Location out,
     Location obj,
     uint32_t offset,
-    Location maybe_temp ATTRIBUTE_UNUSED,
-    ReadBarrierOption read_barrier_option ATTRIBUTE_UNUSED) {
+    Location maybe_temp,
+    ReadBarrierOption read_barrier_option) {
   vixl32::Register out_reg = RegisterFrom(out);
   vixl32::Register obj_reg = RegisterFrom(obj);
-  if (kEmitCompilerReadBarrier) {
-    TODO_VIXL32(FATAL);
+  if (read_barrier_option == kWithReadBarrier) {
+    CHECK(kEmitCompilerReadBarrier);
+    if (kUseBakerReadBarrier) {
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
+      // Load with fast path based Baker's read barrier.
+      // /* HeapReference<Object> */ out = *(obj + offset)
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          instruction, out, obj_reg, offset, maybe_temp, /* needs_null_check */ false);
+    } else {
+      // Load with slow path based read barrier.
+      // /* HeapReference<Object> */ out = *(obj + offset)
+      GetAssembler()->LoadFromOffset(kLoadWord, out_reg, obj_reg, offset);
+      codegen_->GenerateReadBarrierSlow(instruction, out, out, obj, offset);
+    }
   } else {
     // Plain load with no read barrier.
     // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6372,14 +6923,61 @@
 }
 
 void InstructionCodeGeneratorARMVIXL::GenerateGcRootFieldLoad(
-    HInstruction* instruction ATTRIBUTE_UNUSED,
+    HInstruction* instruction,
     Location root,
     vixl32::Register obj,
     uint32_t offset,
     ReadBarrierOption read_barrier_option) {
   vixl32::Register root_reg = RegisterFrom(root);
   if (read_barrier_option == kWithReadBarrier) {
-    TODO_VIXL32(FATAL);
+    DCHECK(kEmitCompilerReadBarrier);
+    if (kUseBakerReadBarrier) {
+      // Fast path implementation of art::ReadBarrier::BarrierForRoot when
+      // Baker's read barriers are used:
+      //
+      //   root = obj.field;
+      //   temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      //   if (temp != null) {
+      //     root = temp(root)
+      //   }
+
+      // /* GcRoot<mirror::Object> */ root = *(obj + offset)
+      GetAssembler()->LoadFromOffset(kLoadWord, root_reg, obj, offset);
+      static_assert(
+          sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(GcRoot<mirror::Object>),
+          "art::mirror::CompressedReference<mirror::Object> and art::GcRoot<mirror::Object> "
+          "have different sizes.");
+      static_assert(sizeof(mirror::CompressedReference<mirror::Object>) == sizeof(int32_t),
+                    "art::mirror::CompressedReference<mirror::Object> and int32_t "
+                    "have different sizes.");
+
+      // Slow path marking the GC root `root`.
+      Location temp = LocationFrom(lr);
+      SlowPathCodeARMVIXL* slow_path =
+          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(
+              instruction,
+              root,
+              /*entrypoint*/ temp);
+      codegen_->AddSlowPath(slow_path);
+
+      // temp = Thread::Current()->pReadBarrierMarkReg ## root.reg()
+      const int32_t entry_point_offset =
+          CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(root.reg());
+      // Loading the entrypoint does not require a load acquire since it is only changed when
+      // threads are suspended or running a checkpoint.
+      GetAssembler()->LoadFromOffset(kLoadWord, RegisterFrom(temp), tr, entry_point_offset);
+      // The entrypoint is null when the GC is not marking; this saves one load compared to
+      // checking GetIsGcMarking.
+      __ CompareAndBranchIfNonZero(RegisterFrom(temp), slow_path->GetEntryLabel());
+      __ Bind(slow_path->GetExitLabel());
+    } else {
+      // GC root loaded through a slow path for read barriers other
+      // than Baker's.
+      // /* GcRoot<mirror::Object>* */ root = obj + offset
+      __ Add(root_reg, obj, offset);
+      // /* mirror::Object* */ root = root->Read()
+      codegen_->GenerateReadBarrierForRootSlow(instruction, root, root);
+    }
   } else {
     // Plain GC root load with no read barrier.
     // /* GcRoot<mirror::Object> */ root = *(obj + offset)
@@ -6389,53 +6987,217 @@
   }
 }
 
-void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(
-    HInstruction* instruction ATTRIBUTE_UNUSED,
-    Location ref ATTRIBUTE_UNUSED,
-    vixl::aarch32::Register obj ATTRIBUTE_UNUSED,
-    uint32_t offset ATTRIBUTE_UNUSED,
-    Location temp ATTRIBUTE_UNUSED,
-    bool needs_null_check ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void CodeGeneratorARMVIXL::GenerateFieldLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                                 Location ref,
+                                                                 vixl32::Register obj,
+                                                                 uint32_t offset,
+                                                                 Location temp,
+                                                                 bool needs_null_check) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  // /* HeapReference<Object> */ ref = *(obj + offset)
+  Location no_index = Location::NoLocation();
+  ScaleFactor no_scale_factor = TIMES_1;
+  GenerateReferenceLoadWithBakerReadBarrier(
+      instruction, ref, obj, offset, no_index, no_scale_factor, temp, needs_null_check);
 }
 
-void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(
-    HInstruction* instruction ATTRIBUTE_UNUSED,
-    Location ref ATTRIBUTE_UNUSED,
-    vixl::aarch32::Register obj ATTRIBUTE_UNUSED,
-    uint32_t offset ATTRIBUTE_UNUSED,
-    Location index ATTRIBUTE_UNUSED,
-    ScaleFactor scale_factor ATTRIBUTE_UNUSED,
-    Location temp ATTRIBUTE_UNUSED,
-    bool needs_null_check ATTRIBUTE_UNUSED,
-    bool always_update_field ATTRIBUTE_UNUSED,
-    vixl::aarch32::Register* temp2 ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void CodeGeneratorARMVIXL::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                                 Location ref,
+                                                                 vixl32::Register obj,
+                                                                 uint32_t data_offset,
+                                                                 Location index,
+                                                                 Location temp,
+                                                                 bool needs_null_check) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  static_assert(
+      sizeof(mirror::HeapReference<mirror::Object>) == sizeof(int32_t),
+      "art::mirror::HeapReference<art::mirror::Object> and int32_t have different sizes.");
+  // /* HeapReference<Object> */ ref =
+  //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
+  ScaleFactor scale_factor = TIMES_4;
+  GenerateReferenceLoadWithBakerReadBarrier(
+      instruction, ref, obj, data_offset, index, scale_factor, temp, needs_null_check);
 }
 
-void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction ATTRIBUTE_UNUSED,
-                                                   Location out ATTRIBUTE_UNUSED,
-                                                   Location ref ATTRIBUTE_UNUSED,
-                                                   Location obj ATTRIBUTE_UNUSED,
-                                                   uint32_t offset ATTRIBUTE_UNUSED,
-                                                   Location index ATTRIBUTE_UNUSED) {
-  TODO_VIXL32(FATAL);
+void CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
+                                                                     Location ref,
+                                                                     vixl32::Register obj,
+                                                                     uint32_t offset,
+                                                                     Location index,
+                                                                     ScaleFactor scale_factor,
+                                                                     Location temp,
+                                                                     bool needs_null_check,
+                                                                     bool always_update_field,
+                                                                     vixl32::Register* temp2) {
+  DCHECK(kEmitCompilerReadBarrier);
+  DCHECK(kUseBakerReadBarrier);
+
+  // In slow path based read barriers, the read barrier call is
+  // inserted after the original load. However, in fast path based
+  // Baker's read barriers, we need to perform the load of
+  // mirror::Object::monitor_ *before* the original reference load.
+  // This load-load ordering is required by the read barrier.
+  // The fast path/slow path (for Baker's algorithm) should look like:
+  //
+  //   uint32_t rb_state = Lockword(obj->monitor_).ReadBarrierState();
+  //   lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+  //   HeapReference<Object> ref = *src;  // Original reference load.
+  //   bool is_gray = (rb_state == ReadBarrier::GrayState());
+  //   if (is_gray) {
+  //     ref = ReadBarrier::Mark(ref);  // Performed by runtime entrypoint slow path.
+  //   }
+  //
+  // Note: the original implementation in ReadBarrier::Barrier is
+  // slightly more complex as it performs additional checks that we do
+  // not do here for performance reasons.
+
+  vixl32::Register ref_reg = RegisterFrom(ref);
+  vixl32::Register temp_reg = RegisterFrom(temp);
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
+
+  // /* int32_t */ monitor = obj->monitor_
+  GetAssembler()->LoadFromOffset(kLoadWord, temp_reg, obj, monitor_offset);
+  if (needs_null_check) {
+    MaybeRecordImplicitNullCheck(instruction);
+  }
+  // /* LockWord */ lock_word = LockWord(monitor)
+  static_assert(sizeof(LockWord) == sizeof(int32_t),
+                "art::LockWord and int32_t have different sizes.");
+
+  // Introduce a dependency on the lock_word including the rb_state,
+  // which shall prevent load-load reordering without using
+  // a memory barrier (which would be more expensive).
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp_reg`.
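+  // (Sketch of the emitted instruction, for illustration:
+  //    add obj, obj, temp_reg, LSR #32
+  //  on AArch32 a shift of #32 yields 0, which is why `obj` is
+  //  unchanged while still gaining a dependency on `temp_reg`.)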
+  __ Add(obj, obj, Operand(temp_reg, ShiftType::LSR, 32));
+
+  // The actual reference load.
+  if (index.IsValid()) {
+    // Load types involving an "index": ArrayGet,
+    // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
+    // intrinsics.
+    // /* HeapReference<Object> */ ref = *(obj + offset + (index << scale_factor))
+    if (index.IsConstant()) {
+      size_t computed_offset =
+          (Int32ConstantFrom(index) << scale_factor) + offset;
+      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, computed_offset);
+    } else {
+      // Handle the special case of the
+      // UnsafeGetObject/UnsafeGetObjectVolatile and UnsafeCASObject
+      // intrinsics, which use a register pair as index ("long
+      // offset"), of which only the low part contains data.
+      vixl32::Register index_reg = index.IsRegisterPair()
+          ? LowRegisterFrom(index)
+          : RegisterFrom(index);
+      UseScratchRegisterScope temps(GetVIXLAssembler());
+      const vixl32::Register temp3 = temps.Acquire();
+      __ Add(temp3, obj, Operand(index_reg, ShiftType::LSL, scale_factor));
+      GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, temp3, offset);
+    }
+  } else {
+    // /* HeapReference<Object> */ ref = *(obj + offset)
+    GetAssembler()->LoadFromOffset(kLoadWord, ref_reg, obj, offset);
+  }
+
+  // Object* ref = ref_addr->AsMirrorPtr()
+  GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
+
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCodeARMVIXL* slow_path;
+  if (always_update_field) {
+    DCHECK(temp2 != nullptr);
+    // ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL only supports address
+    // of the form `obj + field_offset`, where `obj` is a register and
+    // `field_offset` is a register pair (of which only the lower half
+    // is used). Thus `offset` and `scale_factor` above are expected
+    // to be zero (offset 0 and TIMES_1) in this code path.
+    DCHECK_EQ(offset, 0u);
+    DCHECK_EQ(scale_factor, ScaleFactor::TIMES_1);
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkAndUpdateFieldSlowPathARMVIXL(
+        instruction, ref, obj, /* field_offset */ index, temp_reg, *temp2);
+  } else {
+    slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARMVIXL(instruction, ref);
+  }
+  AddSlowPath(slow_path);
+
+  // if (rb_state == ReadBarrier::GrayState())
+  //   ref = ReadBarrier::Mark(ref);
+  // Given the numeric representation, it's enough to check the low bit of the
+  // rb_state. We do that by shifting the bit out of the lock word with LSRS,
+  // which can be a 16-bit instruction, unlike the TST immediate.
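+  // (Worked example, assuming the current lock word layout with
+  // rb_state in bit 28: the `Lsrs` below shifts right by 29, so bit 28
+  // is the last bit shifted out and lands in the carry flag; the
+  // `B(cs, ...)` that follows branches exactly when the object is
+  // gray.)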
+  static_assert(ReadBarrier::WhiteState() == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::GrayState() == 1, "Expecting gray to have value 1");
+  __ Lsrs(temp_reg, temp_reg, LockWord::kReadBarrierStateShift + 1);
+  __ B(cs, slow_path->GetEntryLabel());  // Carry flag is the last bit shifted out by LSRS.
+  __ Bind(slow_path->GetExitLabel());
 }
 
-void CodeGeneratorARMVIXL::MaybeGenerateReadBarrierSlow(HInstruction* instruction ATTRIBUTE_UNUSED,
+void CodeGeneratorARMVIXL::GenerateReadBarrierSlow(HInstruction* instruction,
+                                                   Location out,
+                                                   Location ref,
+                                                   Location obj,
+                                                   uint32_t offset,
+                                                   Location index) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Insert a slow path based read barrier *after* the reference load.
+  //
+  // If heap poisoning is enabled, the unpoisoning of the loaded
+  // reference will be carried out by the runtime within the slow
+  // path.
+  //
+  // Note that `ref` currently does not get unpoisoned (when heap
+  // poisoning is enabled), which is alright as the `ref` argument is
+  // not used by the artReadBarrierSlow entry point.
+  //
+  // TODO: Unpoison `ref` when it is used by artReadBarrierSlow.
+  SlowPathCodeARMVIXL* slow_path = new (GetGraph()->GetArena())
+      ReadBarrierForHeapReferenceSlowPathARMVIXL(instruction, out, ref, obj, offset, index);
+  AddSlowPath(slow_path);
+
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
+void CodeGeneratorARMVIXL::MaybeGenerateReadBarrierSlow(HInstruction* instruction,
                                                         Location out,
-                                                        Location ref ATTRIBUTE_UNUSED,
-                                                        Location obj ATTRIBUTE_UNUSED,
-                                                        uint32_t offset ATTRIBUTE_UNUSED,
-                                                        Location index ATTRIBUTE_UNUSED) {
+                                                        Location ref,
+                                                        Location obj,
+                                                        uint32_t offset,
+                                                        Location index) {
   if (kEmitCompilerReadBarrier) {
+    // Baker's read barriers shall be handled by the fast path
+    // (CodeGeneratorARMVIXL::GenerateReferenceLoadWithBakerReadBarrier).
     DCHECK(!kUseBakerReadBarrier);
-    TODO_VIXL32(FATAL);
+    // If heap poisoning is enabled, unpoisoning will be taken care of
+    // by the runtime within the slow path.
+    GenerateReadBarrierSlow(instruction, out, ref, obj, offset, index);
   } else if (kPoisonHeapReferences) {
     GetAssembler()->UnpoisonHeapReference(RegisterFrom(out));
   }
 }
 
+void CodeGeneratorARMVIXL::GenerateReadBarrierForRootSlow(HInstruction* instruction,
+                                                          Location out,
+                                                          Location root) {
+  DCHECK(kEmitCompilerReadBarrier);
+
+  // Insert a slow path based read barrier *after* the GC root load.
+  //
+  // Note that GC roots are not affected by heap poisoning, so we do
+  // not need to do anything special here.
+  SlowPathCodeARMVIXL* slow_path =
+      new (GetGraph()->GetArena()) ReadBarrierForRootSlowPathARMVIXL(instruction, out, root);
+  AddSlowPath(slow_path);
+
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
+}
+
 // Check if the desired_dispatch_info is supported. If it is, return it,
 // otherwise return a fall-back info that should be used instead.
 HInvokeStaticOrDirect::DispatchInfo CodeGeneratorARMVIXL::GetSupportedInvokeStaticOrDirectDispatch(
@@ -6805,7 +7567,7 @@
   if (num_entries <= kPackedSwitchCompareJumpThreshold ||
       !codegen_->GetAssembler()->GetVIXLAssembler()->IsUsingT32()) {
     // Create a series of compare/jumps.
-    UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
+    UseScratchRegisterScope temps(GetVIXLAssembler());
     vixl32::Register temp_reg = temps.Acquire();
     // Note: It is fine for the AddConstantSetFlags() below to use the IP register to temporarily
     // store the immediate, because IP is used as the destination register. For the other
@@ -6853,7 +7615,7 @@
     __ Cmp(key_reg, num_entries - 1);
     __ B(hi, codegen_->GetLabelOf(default_block));
 
-    UseScratchRegisterScope temps(GetAssembler()->GetVIXLAssembler());
+    UseScratchRegisterScope temps(GetVIXLAssembler());
     vixl32::Register jump_offset = temps.Acquire();
 
     // Load jump offset from the table.
diff --git a/compiler/optimizing/code_generator_arm_vixl.h b/compiler/optimizing/code_generator_arm_vixl.h
index b7ba8dd..5ec3da4 100644
--- a/compiler/optimizing/code_generator_arm_vixl.h
+++ b/compiler/optimizing/code_generator_arm_vixl.h
@@ -576,7 +576,15 @@
                                              uint32_t offset,
                                              Location temp,
                                              bool needs_null_check);
-
+  // Fast path implementation of ReadBarrier::Barrier for a heap
+  // reference array load when Baker's read barriers are used.
+  void GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
+                                             Location ref,
+                                             vixl::aarch32::Register obj,
+                                             uint32_t data_offset,
+                                             Location index,
+                                             Location temp,
+                                             bool needs_null_check);
   // Factored implementation, used by GenerateFieldLoadWithBakerReadBarrier,
   // GenerateArrayLoadWithBakerReadBarrier and some intrinsics.
   //
@@ -634,6 +642,19 @@
                                     uint32_t offset,
                                     Location index = Location::NoLocation());
 
+  // Generate a read barrier for a GC root within `instruction` using
+  // a slow path.
+  //
+  // A read barrier for an object reference GC root is implemented as
+  // a call to the artReadBarrierForRootSlow runtime entry point,
+  // which is passed the value in location `root`:
+  //
+  //   mirror::Object* artReadBarrierForRootSlow(GcRoot<mirror::Object>* root);
+  //
+  // The `out` location contains the value returned by
+  // artReadBarrierForRootSlow.
+  void GenerateReadBarrierForRootSlow(HInstruction* instruction, Location out, Location root);
+
   void GenerateNop() OVERRIDE;
 
   void GenerateImplicitNullCheck(HNullCheck* instruction) OVERRIDE;
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 61dabfa..cae4161 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -99,8 +99,9 @@
       uint32_t gp_index = gp_index_;
       gp_index_ += 2;
       if (gp_index + 1 < calling_convention.GetNumberOfRegisters()) {
-        if (calling_convention.GetRegisterAt(gp_index) == A1) {
-          gp_index_++;  // Skip A1, and use A2_A3 instead.
+        Register reg = calling_convention.GetRegisterAt(gp_index);
+        if (reg == A1 || reg == A3) {
+          gp_index_++;  // Skip A1(A3), and use A2_A3(T0_T1) instead.
           gp_index++;
         }
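+        // (Illustration of the assumed pairing rule: a 64-bit argument
+        // must occupy an aligned register pair, so A1 is skipped in
+        // favour of A2_A3 and A3 in favour of T0_T1.)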
         Register low_even = calling_convention.GetRegisterAt(gp_index);
@@ -5095,9 +5096,9 @@
 
 void LocationsBuilderMIPS::VisitInvokeInterface(HInvokeInterface* invoke) {
   HandleInvoke(invoke);
-  // The register T0 is required to be used for the hidden argument in
+  // The register T7 is required to be used for the hidden argument in
   // art_quick_imt_conflict_trampoline, so add the hidden argument.
-  invoke->GetLocations()->AddTemp(Location::RegisterLocation(T0));
+  invoke->GetLocations()->AddTemp(Location::RegisterLocation(T7));
 }
 
 void InstructionCodeGeneratorMIPS::VisitInvokeInterface(HInvokeInterface* invoke) {
@@ -5250,9 +5251,9 @@
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
       break;
-    case HLoadClass::LoadKind::kDexCacheAddress:
+    case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
-      fallback_load = false;
+      fallback_load = true;
       break;
     case HLoadClass::LoadKind::kDexCachePcRelative:
       DCHECK(!Runtime::Current()->UseJitCompilation());
@@ -5613,17 +5614,8 @@
                      codegen_->DeduplicateBootImageAddressLiteral(address));
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(cls->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
-      static_assert(sizeof(GcRoot<mirror::Class>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(cls->GetAddress(), 4u);
-      int16_t offset = Low16Bits(address);
-      uint32_t base_address = address - offset;  // This accounts for offset sign extension.
-      __ Lui(out, High16Bits(base_address));
-      // /* GcRoot<mirror::Class> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(cls, out_loc, out, offset);
-      generate_null_check = !cls->IsInDexCache();
+    case HLoadClass::LoadKind::kJitTableAddress: {
+      LOG(FATAL) << "Unimplemented";
       break;
     }
     case HLoadClass::LoadKind::kDexCachePcRelative: {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 2273e52..f03f29c 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -32,11 +32,11 @@
 // InvokeDexCallingConvention registers
 
 static constexpr Register kParameterCoreRegisters[] =
-    { A1, A2, A3 };
+    { A1, A2, A3, T0, T1 };
 static constexpr size_t kParameterCoreRegistersLength = arraysize(kParameterCoreRegisters);
 
 static constexpr FRegister kParameterFpuRegisters[] =
-    { F12, F14 };
+    { F8, F10, F12, F14, F16, F18 };
 static constexpr size_t kParameterFpuRegistersLength = arraysize(kParameterFpuRegisters);
 
 
@@ -48,7 +48,7 @@
     arraysize(kRuntimeParameterCoreRegisters);
 
 static constexpr FRegister kRuntimeParameterFpuRegisters[] =
-    { F12, F14};
+    { F12, F14 };
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
 
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index d6e92cc..8612a67 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1013,6 +1013,7 @@
       string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       constant_area_start_(-1),
       fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
       method_address_offset_(-1) {
@@ -6034,7 +6035,7 @@
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
       break;
-    case HLoadClass::LoadKind::kDexCacheAddress:
+    case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK(Runtime::Current()->UseJitCompilation());
       break;
     case HLoadClass::LoadKind::kDexCacheViaMethod:
@@ -6073,6 +6074,16 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
+Label* CodeGeneratorX86::NewJitRootClassPatch(const DexFile& dex_file,
+                                              dex::TypeIndex dex_index,
+                                              uint64_t address) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, dex_index), address);
+  // Add a patch entry and return the label.
+  jit_class_patches_.emplace_back(dex_file, dex_index.index_);
+  PatchInfo<Label>* info = &jit_class_patches_.back();
+  return &info->label;
+}
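+// (Flow sketch, illustrative: VisitLoadClass below binds the returned
+// label at a dummy 32-bit immediate for kJitTableAddress loads;
+// EmitJitRootPatches later rewrites that immediate with the address of
+// the class root in the JIT root table.)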
+
 void InstructionCodeGeneratorX86::VisitLoadClass(HLoadClass* cls) {
   LocationSummary* locations = cls->GetLocations();
   if (cls->NeedsAccessCheck()) {
@@ -6124,16 +6135,12 @@
       codegen_->RecordSimplePatch();
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(cls->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(cls->GetAddress());
+    case HLoadClass::LoadKind::kJitTableAddress: {
+      Address address = Address::Absolute(CodeGeneratorX86::kDummy32BitOffset);
+      Label* fixup_label = codegen_->NewJitRootClassPatch(
+          cls->GetDexFile(), cls->GetTypeIndex(), cls->GetAddress());
       // /* GcRoot<mirror::Class> */ out = *address
-      GenerateGcRootFieldLoad(cls,
-                              out_loc,
-                              Address::Absolute(address),
-                              /* fixup_label */ nullptr,
-                              read_barrier_option);
-      generate_null_check = !cls->IsInDexCache();
+      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, kCompilerReadBarrierOption);
       break;
     }
     case HLoadClass::LoadKind::kDexCachePcRelative: {
@@ -7770,18 +7777,31 @@
   }
 }
 
+void CodeGeneratorX86::PatchJitRootUse(uint8_t* code,
+                                       const uint8_t* roots_data,
+                                       const PatchInfo<Label>& info,
+                                       uint64_t index_in_table) const {
+  uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
+  reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
+     dchecked_integral_cast<uint32_t>(address);
+}
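+// (Illustrative summary: for each recorded patch, the root's index in
+// the table is looked up and the 32-bit dummy immediate at
+// `info.label` is overwritten with
+//   roots_data + index_in_table * sizeof(GcRoot<mirror::Object>),
+// i.e. the address from which the compiled code loads the root.)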
+
 void CodeGeneratorX86::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const PatchInfo<Label>& info : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(StringReference(&info.dex_file,
-                                                            dex::StringIndex(info.index)));
+    const auto& it = jit_string_roots_.find(
+        StringReference(&info.dex_file, dex::StringIndex(info.index)));
     DCHECK(it != jit_string_roots_.end());
-    size_t index_in_table = it->second;
-    uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-    uintptr_t address =
-        reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
-    typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
-    reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
-       dchecked_integral_cast<uint32_t>(address);
+    PatchJitRootUse(code, roots_data, info, it->second);
+  }
+
+  for (const PatchInfo<Label>& info : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(
+        TypeReference(&info.dex_file, dex::TypeIndex(info.index)));
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, info, it->second);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 2ae3670..c44da97 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -416,12 +416,17 @@
   Label* NewStringBssEntryPatch(HLoadString* load_string);
   Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset);
   Label* NewJitRootStringPatch(const DexFile& dex_file, dex::StringIndex dex_index);
+  Label* NewJitRootClassPatch(const DexFile& dex_file, dex::TypeIndex dex_index, uint64_t address);
 
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
   // Emit linker patches.
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
+  void PatchJitRootUse(uint8_t* code,
+                       const uint8_t* roots_data,
+                       const PatchInfo<Label>& info,
+                       uint64_t index_in_table) const;
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   // Emit a write barrier.
@@ -623,6 +628,9 @@
   // Patches for string root accesses in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_string_patches_;
 
+  // Patches for class root accesses in JIT compiled code.
+  ArenaDeque<PatchInfo<Label>> jit_class_patches_;
+
   // Offset to the start of the constant area in the assembled code.
   // Used for fixups to the constant area.
   int32_t constant_area_start_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 4474dec..7dfc736 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -1260,7 +1260,8 @@
         string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         type_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         fixups_to_jump_tables_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
-        jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
+        jit_string_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        jit_class_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -5460,8 +5461,7 @@
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
       break;
-    case HLoadClass::LoadKind::kDexCacheAddress:
-      DCHECK(Runtime::Current()->UseJitCompilation());
+    case HLoadClass::LoadKind::kJitTableAddress:
       break;
     case HLoadClass::LoadKind::kDexCachePcRelative:
       DCHECK(!Runtime::Current()->UseJitCompilation());
@@ -5500,6 +5500,16 @@
   locations->SetOut(Location::RequiresRegister());
 }
 
+Label* CodeGeneratorX86_64::NewJitRootClassPatch(const DexFile& dex_file,
+                                                 dex::TypeIndex dex_index,
+                                                 uint64_t address) {
+  jit_class_roots_.Overwrite(TypeReference(&dex_file, dex_index), address);
+  // Add a patch entry and return the label.
+  jit_class_patches_.emplace_back(dex_file, dex_index.index_);
+  PatchInfo<Label>* info = &jit_class_patches_.back();
+  return &info->label;
+}
+
 void InstructionCodeGeneratorX86_64::VisitLoadClass(HLoadClass* cls) {
   LocationSummary* locations = cls->GetLocations();
   if (cls->NeedsAccessCheck()) {
@@ -5543,26 +5553,13 @@
       codegen_->RecordSimplePatch();
       break;
     }
-    case HLoadClass::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(cls->GetAddress(), 0u);
+    case HLoadClass::LoadKind::kJitTableAddress: {
+      Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset,
+                                          /* no_rip */ true);
+      Label* fixup_label =
+          codegen_->NewJitRootClassPatch(cls->GetDexFile(), cls->GetTypeIndex(), cls->GetAddress());
       // /* GcRoot<mirror::Class> */ out = *address
-      if (IsUint<32>(cls->GetAddress())) {
-        Address address = Address::Absolute(cls->GetAddress(), /* no_rip */ true);
-        GenerateGcRootFieldLoad(cls,
-                                out_loc,
-                                address,
-                                /* fixup_label */ nullptr,
-                                read_barrier_option);
-      } else {
-        // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address).
-        __ movq(out, Immediate(cls->GetAddress()));
-        GenerateGcRootFieldLoad(cls,
-                                out_loc,
-                                Address(out, 0),
-                                /* fixup_label */ nullptr,
-                                read_barrier_option);
-      }
-      generate_null_check = !cls->IsInDexCache();
+      GenerateGcRootFieldLoad(cls, out_loc, address, fixup_label, kCompilerReadBarrierOption);
       break;
     }
     case HLoadClass::LoadKind::kDexCachePcRelative: {
@@ -7127,18 +7124,31 @@
   }
 }
 
+void CodeGeneratorX86_64::PatchJitRootUse(uint8_t* code,
+                                          const uint8_t* roots_data,
+                                          const PatchInfo<Label>& info,
+                                          uint64_t index_in_table) const {
+  uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
+  uintptr_t address =
+      reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
+  typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
+  reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
+     dchecked_integral_cast<uint32_t>(address);
+}
+
 void CodeGeneratorX86_64::EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) {
   for (const PatchInfo<Label>& info : jit_string_patches_) {
-    const auto& it = jit_string_roots_.find(StringReference(&info.dex_file,
-                                                            dex::StringIndex(info.index)));
+    const auto& it = jit_string_roots_.find(
+        StringReference(&info.dex_file, dex::StringIndex(info.index)));
     DCHECK(it != jit_string_roots_.end());
-    size_t index_in_table = it->second;
-    uint32_t code_offset = info.label.Position() - kLabelPositionToLiteralOffsetAdjustment;
-    uintptr_t address =
-        reinterpret_cast<uintptr_t>(roots_data) + index_in_table * sizeof(GcRoot<mirror::Object>);
-    typedef __attribute__((__aligned__(1))) uint32_t unaligned_uint32_t;
-    reinterpret_cast<unaligned_uint32_t*>(code + code_offset)[0] =
-       dchecked_integral_cast<uint32_t>(address);
+    PatchJitRootUse(code, roots_data, info, it->second);
+  }
+
+  for (const PatchInfo<Label>& info : jit_class_patches_) {
+    const auto& it = jit_class_roots_.find(
+        TypeReference(&info.dex_file, dex::TypeIndex(info.index)));
+    DCHECK(it != jit_class_roots_.end());
+    PatchJitRootUse(code, roots_data, info, it->second);
   }
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 2f41f73..391a23b 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -413,11 +413,17 @@
   Label* NewStringBssEntryPatch(HLoadString* load_string);
   Label* NewPcRelativeDexCacheArrayPatch(const DexFile& dex_file, uint32_t element_offset);
   Label* NewJitRootStringPatch(const DexFile& dex_file, dex::StringIndex dex_index);
+  Label* NewJitRootClassPatch(const DexFile& dex_file, dex::TypeIndex dex_index, uint64_t address);
 
   void MoveFromReturnRegister(Location trg, Primitive::Type type) OVERRIDE;
 
   void EmitLinkerPatches(ArenaVector<LinkerPatch>* linker_patches) OVERRIDE;
 
+  void PatchJitRootUse(uint8_t* code,
+                       const uint8_t* roots_data,
+                       const PatchInfo<Label>& info,
+                       uint64_t index_in_table) const;
+
   void EmitJitRootPatches(uint8_t* code, const uint8_t* roots_data) OVERRIDE;
 
   const X86_64InstructionSetFeatures& GetInstructionSetFeatures() const {
@@ -608,6 +614,9 @@
   // Patches for string literals in JIT compiled code.
   ArenaDeque<PatchInfo<Label>> jit_string_patches_;
 
+  // Patches for class literals in JIT compiled code.
+  ArenaDeque<PatchInfo<Label>> jit_class_patches_;
+
   DISALLOW_COPY_AND_ASSIGN(CodeGeneratorX86_64);
 };
 
diff --git a/compiler/optimizing/emit_swap_mips_test.cc b/compiler/optimizing/emit_swap_mips_test.cc
index 9dc53e6..0d4e1c5 100644
--- a/compiler/optimizing/emit_swap_mips_test.cc
+++ b/compiler/optimizing/emit_swap_mips_test.cc
@@ -154,54 +154,54 @@
 TEST_F(EmitSwapMipsTest, TwoFpuRegistersFloat) {
   moves_->AddMove(
       Location::FpuRegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimFloat,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::FpuRegisterLocation(4),
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
-      "mov.s $f8, $f6\n"
-      "mov.s $f6, $f4\n"
-      "mov.s $f4, $f8\n";
+      "mov.s $f6, $f2\n"
+      "mov.s $f2, $f4\n"
+      "mov.s $f4, $f6\n";
   DriverWrapper(moves_, expected, "TwoFpuRegistersFloat");
 }
 
 TEST_F(EmitSwapMipsTest, TwoFpuRegistersDouble) {
   moves_->AddMove(
       Location::FpuRegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimDouble,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::FpuRegisterLocation(4),
       Primitive::kPrimDouble,
       nullptr);
   const char* expected =
-      "mov.d $f8, $f6\n"
-      "mov.d $f6, $f4\n"
-      "mov.d $f4, $f8\n";
+      "mov.d $f6, $f2\n"
+      "mov.d $f2, $f4\n"
+      "mov.d $f4, $f6\n";
   DriverWrapper(moves_, expected, "TwoFpuRegistersDouble");
 }
 
 TEST_F(EmitSwapMipsTest, RegisterAndFpuRegister) {
   moves_->AddMove(
       Location::RegisterLocation(4),
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Primitive::kPrimFloat,
       nullptr);
   moves_->AddMove(
-      Location::FpuRegisterLocation(6),
+      Location::FpuRegisterLocation(2),
       Location::RegisterLocation(4),
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
       "or $t8, $a0, $zero\n"
-      "mfc1 $a0, $f6\n"
-      "mtc1 $t8, $f6\n";
+      "mfc1 $a0, $f2\n"
+      "mtc1 $t8, $f2\n";
   DriverWrapper(moves_, expected, "RegisterAndFpuRegister");
 }
 
@@ -327,9 +327,9 @@
       Primitive::kPrimFloat,
       nullptr);
   const char* expected =
-      "mov.s $f8, $f4\n"
+      "mov.s $f6, $f4\n"
       "lwc1 $f4, 48($sp)\n"
-      "swc1 $f8, 48($sp)\n";
+      "swc1 $f6, 48($sp)\n";
   DriverWrapper(moves_, expected, "FpuRegisterAndStackSlot");
 }
 
@@ -345,9 +345,9 @@
       Primitive::kPrimDouble,
       nullptr);
   const char* expected =
-      "mov.d $f8, $f4\n"
+      "mov.d $f6, $f4\n"
       "ldc1 $f4, 48($sp)\n"
-      "sdc1 $f8, 48($sp)\n";
+      "sdc1 $f6, 48($sp)\n";
   DriverWrapper(moves_, expected, "FpuRegisterAndDoubleStackSlot");
 }
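The register renumbering above tracks a change in which FPU register the move resolver uses as scratch; every expected sequence remains an instance of the three-move swap through a temporary. A standalone model of that pattern (illustrative only; SwapViaTemp and the register names in the comments are stand-ins):

// Swap two values through an explicit temporary, mirroring the mov.s/mov.d
// sequences the tests expect.
void SwapViaTemp(float& a, float& b, float& temp) {
  temp = a;  // mov.s $fTMP, $fA
  a = b;     // mov.s $fA,   $fB
  b = temp;  // mov.s $fB,   $fTMP
}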
 
diff --git a/compiler/optimizing/induction_var_analysis.cc b/compiler/optimizing/induction_var_analysis.cc
index 1561502..c240c67 100644
--- a/compiler/optimizing/induction_var_analysis.cc
+++ b/compiler/optimizing/induction_var_analysis.cc
@@ -296,21 +296,22 @@
       update = SolveAddSub(
           loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kSub, true);
     } else if (instruction->IsMul()) {
-      update = SolveGeo(
+      update = SolveOp(
           loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kMul);
     } else if (instruction->IsDiv()) {
-      update = SolveGeo(
+      update = SolveOp(
           loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kDiv);
     } else if (instruction->IsRem()) {
-      update = SolveGeo(
-          loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kNop);
+      update = SolveOp(
+          loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kRem);
     } else if (instruction->IsShl()) {
       HInstruction* mulc = GetMultConstantForShift(loop, instruction);
       if (mulc != nullptr) {
-        update = SolveGeo(loop, phi, instruction, instruction->InputAt(0), mulc, kMul);
+        update = SolveOp(loop, phi, instruction, instruction->InputAt(0), mulc, kMul);
       }
     } else if (instruction->IsXor()) {
-      update = SolveXor(loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1));
+      update = SolveOp(
+          loop, phi, instruction, instruction->InputAt(0), instruction->InputAt(1), kXor);
     } else if (instruction->IsEqual()) {
       update = SolveTest(loop, phi, instruction, 0);
     } else if (instruction->IsNotEqual()) {
@@ -334,6 +335,7 @@
         FALLTHROUGH_INTENDED;
       case kPolynomial:
       case kGeometric:
+      case kWrapAround:
         // Classify first phi and then the rest of the cycle "on-demand".
         // Statements are scanned in order.
         AssignInfo(loop, phi, induction);
@@ -402,14 +404,15 @@
                                                                             InductionOp op) {
   // Transfer over an addition or subtraction: any invariant, linear, polynomial, geometric,
   // wrap-around, or periodic can be combined with an invariant to yield a similar result.
-  // Even two linear inputs can be combined. Other combinations fail.
+  // Two linear or two polynomial inputs can be combined too. Other combinations fail.
   if (a != nullptr && b != nullptr) {
     type_ = Narrowest(type_, Narrowest(a->type, b->type));
     if (a->induction_class == kInvariant && b->induction_class == kInvariant) {
       return CreateInvariantOp(op, a, b);
-    } else if (a->induction_class == kLinear && b->induction_class == kLinear) {
-      return CreateInduction(kLinear,
-                             kNop,
+    } else if ((a->induction_class == kLinear && b->induction_class == kLinear) ||
+               (a->induction_class == kPolynomial && b->induction_class == kPolynomial)) {
+      return CreateInduction(a->induction_class,
+                             a->operation,
                              TransferAddSub(a->op_a, b->op_a, op),
                              TransferAddSub(a->op_b, b->op_b, op),
                              /*fetch*/ nullptr,
@@ -555,18 +558,33 @@
                                                                          HInstruction* y,
                                                                          InductionOp op,
                                                                          bool is_first_call) {
-  // Solve within a cycle over an addition or subtraction: adding or subtracting an
-  // invariant value, seeded from phi, keeps adding to the stride of the linear induction.
+  // Solve within a cycle over an addition or subtraction.
   InductionInfo* b = LookupInfo(loop, y);
-  if (b != nullptr && b->induction_class == kInvariant) {
-    if (x == entry_phi) {
-      return (op == kAdd) ? b : CreateInvariantOp(kNeg, nullptr, b);
-    }
-    auto it = cycle_.find(x);
-    if (it != cycle_.end()) {
-      InductionInfo* a = it->second;
-      if (a->induction_class == kInvariant) {
-        return CreateInvariantOp(op, a, b);
+  if (b != nullptr) {
+    if (b->induction_class == kInvariant) {
+      // Adding or subtracting an invariant value, seeded from phi,
+      // keeps adding to the stride of the linear induction.
+      if (x == entry_phi) {
+        return (op == kAdd) ? b : CreateInvariantOp(kNeg, nullptr, b);
+      }
+      auto it = cycle_.find(x);
+      if (it != cycle_.end()) {
+        InductionInfo* a = it->second;
+        if (a->induction_class == kInvariant) {
+          return CreateInvariantOp(op, a, b);
+        }
+      }
+    } else if (op == kAdd && b->induction_class == kLinear) {
+      // Solve within a tight cycle that adds a term already classified as a linear
+      // induction, yielding a polynomial induction k = k + i (represented as a sum
+      // over linear terms).
+      if (x == entry_phi && entry_phi->InputCount() == 2 && instruction == entry_phi->InputAt(1)) {
+        InductionInfo* initial = LookupInfo(loop, entry_phi->InputAt(0));
+        return CreateInduction(kPolynomial,
+                               kNop,
+                               b,
+                               initial,
+                               /*fetch*/ nullptr,
+                               type_);
       }
     }
   }
@@ -593,58 +611,63 @@
       }
     }
   }
-
   return nullptr;
 }
 
-HInductionVarAnalysis::InductionInfo* HInductionVarAnalysis::SolveGeo(HLoopInformation* loop,
+HInductionVarAnalysis::InductionInfo* HInductionVarAnalysis::SolveOp(HLoopInformation* loop,
                                                                       HInstruction* entry_phi,
                                                                       HInstruction* instruction,
                                                                       HInstruction* x,
                                                                       HInstruction* y,
                                                                       InductionOp op) {
-  // Solve within a tight cycle that is formed by exactly two instructions, one phi and
-  // one update, for a geometric induction of the form k = k * c.
-  if (x == entry_phi && entry_phi->InputCount() == 2 && instruction == entry_phi->InputAt(1)) {
-    InductionInfo* initial = LookupInfo(loop, entry_phi->InputAt(0));
-    InductionInfo* b = LookupInfo(loop, y);
-    if (b != nullptr && b->induction_class == kInvariant && b->operation == kFetch) {
-      return CreateInduction(kGeometric,
-                             op,
-                             initial,
-                             CreateConstant(0, type_),
-                             b->fetch,
-                             type_);
-    }
-  }
-  return nullptr;
-}
-
-HInductionVarAnalysis::InductionInfo* HInductionVarAnalysis::SolveXor(HLoopInformation* loop,
-                                                                      HInstruction* entry_phi,
-                                                                      HInstruction* instruction,
-                                                                      HInstruction* x,
-                                                                      HInstruction* y) {
-  // Solve within a tight cycle on periodic idiom of the form x = c ^ x or x = x ^ c.
+  // Solve within a tight cycle for a binary operation k = k op c or, for some op, k = c op k.
   if (entry_phi->InputCount() == 2 && instruction == entry_phi->InputAt(1)) {
-    InductionInfo* initial = LookupInfo(loop, entry_phi->InputAt(0));
-    InductionInfo* a = LookupInfo(loop, x);
-    if (a != nullptr && a->induction_class == kInvariant && entry_phi == y) {
-      return CreateInduction(kPeriodic,
-                             kNop,
-                             CreateInvariantOp(kXor, a, initial),
-                             initial,
-                             /*fetch*/ nullptr,
-                             type_);
-    }
+    InductionInfo* c = nullptr;
     InductionInfo* b = LookupInfo(loop, y);
     if (b != nullptr && b->induction_class == kInvariant && entry_phi == x) {
-      return CreateInduction(kPeriodic,
-                             kNop,
-                             CreateInvariantOp(kXor, initial, b),
-                             initial,
-                             /*fetch*/ nullptr,
-                             type_);
+      c = b;
+    } else if (op != kDiv && op != kRem) {
+      InductionInfo* a = LookupInfo(loop, x);
+      if (a != nullptr && a->induction_class == kInvariant && entry_phi == y) {
+        c = a;
+      }
+    }
+    // Found suitable operand left or right?
+    if (c != nullptr) {
+      InductionInfo* initial = LookupInfo(loop, entry_phi->InputAt(0));
+      switch (op) {
+        case kMul:
+        case kDiv:
+          // Restrict base of geometric induction to direct fetch.
+          if (c->operation == kFetch) {
+            return CreateInduction(kGeometric,
+                                   op,
+                                   initial,
+                                   CreateConstant(0, type_),
+                                   c->fetch,
+                                   type_);
+          }
+          break;
+        case kRem:
+          // Idiomatic MOD wrap-around induction.
+          return CreateInduction(kWrapAround,
+                                 kNop,
+                                 initial,
+                                 CreateInvariantOp(kRem, initial, c),
+                                 /*fetch*/ nullptr,
+                                 type_);
+        case kXor:
+          // Idiomatic XOR periodic induction.
+          return CreateInduction(kPeriodic,
+                                 kNop,
+                                 CreateInvariantOp(kXor, initial, c),
+                                 initial,
+                                 /*fetch*/ nullptr,
+                                 type_);
+        default:
+          CHECK(false) << op;
+          break;
+      }
     }
   }
   return nullptr;
@@ -659,9 +682,9 @@
   HInstruction* x = instruction->InputAt(0);
   HInstruction* y = instruction->InputAt(1);
   if (IsExact(LookupInfo(loop, x), &value) && value == opposite_value) {
-    return SolveXor(loop, entry_phi, instruction, graph_->GetIntConstant(1), y);
+    return SolveOp(loop, entry_phi, instruction, graph_->GetIntConstant(1), y, kXor);
   } else if (IsExact(LookupInfo(loop, y), &value) && value == opposite_value) {
-    return SolveXor(loop, entry_phi, instruction, x, graph_->GetIntConstant(1));
+    return SolveOp(loop, entry_phi, instruction, x, graph_->GetIntConstant(1), kXor);
   }
   return nullptr;
 }
@@ -1018,7 +1041,7 @@
 HInstruction* HInductionVarAnalysis::GetMultConstantForShift(HLoopInformation* loop,
                                                              HInstruction* instruction) {
   // Obtain the constant needed to treat shift as equivalent multiplication. This yields an
-  // existing instruction if the constants is already there. Otherwise, this has a side effect
+  // existing instruction if the constant is already there. Otherwise, this has a side effect
   // on the HIR. The restriction on the shift factor avoids generating a negative constant
   // (viz. 1 << 31 and 1L << 63 set the sign bit). The code assumes that generalization for
   // shift factors outside [0,32) and [0,64) ranges is done by earlier simplification.
@@ -1102,6 +1125,7 @@
         case kNeg:   inv += " - ";  break;
         case kMul:   inv += " * ";  break;
         case kDiv:   inv += " / ";  break;
+        case kRem:   inv += " % ";  break;
         case kXor:   inv += " ^ ";  break;
         case kLT:    inv += " < ";  break;
         case kLE:    inv += " <= "; break;
@@ -1118,32 +1142,30 @@
       return inv;
     } else {
       if (info->induction_class == kLinear) {
+        DCHECK(info->operation == kNop);
         return "(" + InductionToString(info->op_a) + " * i + " +
                      InductionToString(info->op_b) + "):" +
                      Primitive::PrettyDescriptor(info->type);
       } else if (info->induction_class == kPolynomial) {
-        return "poly( sum_i(" + InductionToString(info->op_a) + ") + " +
+        DCHECK(info->operation == kNop);
+        return "poly(sum_lt(" + InductionToString(info->op_a) + ") + " +
                                 InductionToString(info->op_b) + "):" +
                                 Primitive::PrettyDescriptor(info->type);
       } else if (info->induction_class == kGeometric) {
+        DCHECK(info->operation == kMul || info->operation == kDiv);
         DCHECK(info->fetch != nullptr);
-        if (info->operation == kNop) {
-         return "geo(" + InductionToString(info->op_a) + " mod_i " +
-                         FetchToString(info->fetch) + " + " +
-                         InductionToString(info->op_b) + "):" +
-                         Primitive::PrettyDescriptor(info->type);
-        } else {
-         return "geo(" + InductionToString(info->op_a) + " * " +
-                         FetchToString(info->fetch) +
-                         (info->operation == kMul ? " ^ i + " : " ^ -i + ") +
-                         InductionToString(info->op_b) + "):" +
-                         Primitive::PrettyDescriptor(info->type);
-        }
+        return "geo(" + InductionToString(info->op_a) + " * " +
+                        FetchToString(info->fetch) +
+                        (info->operation == kMul ? " ^ i + " : " ^ -i + ") +
+                        InductionToString(info->op_b) + "):" +
+                        Primitive::PrettyDescriptor(info->type);
       } else if (info->induction_class == kWrapAround) {
+        DCHECK(info->operation == kNop);
         return "wrap(" + InductionToString(info->op_a) + ", " +
                          InductionToString(info->op_b) + "):" +
                          Primitive::PrettyDescriptor(info->type);
       } else if (info->induction_class == kPeriodic) {
+        DCHECK(info->operation == kNop);
         return "periodic(" + InductionToString(info->op_a) + ", " +
                              InductionToString(info->op_b) + "):" +
                              Primitive::PrettyDescriptor(info->type);
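To make the new classifications concrete, a loop of the following shape (hypothetical example, written as C++ for illustration) now picks up both added classes; the comments give roughly the strings InductionToString produces, minus the type suffixes:

int Example() {
  int k = 1;
  int m = 100;
  for (int i = 0; i < 100; i++) {
    k = k + i;  // polynomial: poly(sum_lt((1) * i + (0)) + (1))
    m = m % 7;  // wrap-around: wrap((100), ((100) % (7)))
  }
  return k + m;
}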
diff --git a/compiler/optimizing/induction_var_analysis.h b/compiler/optimizing/induction_var_analysis.h
index 94afc71..4720f2d 100644
--- a/compiler/optimizing/induction_var_analysis.h
+++ b/compiler/optimizing/induction_var_analysis.h
@@ -65,6 +65,7 @@
     kNeg,
     kMul,
     kDiv,
+    kRem,
     kXor,
     kFetch,
     // Trip-counts.
@@ -85,10 +86,10 @@
    *         op: a + b, a - b, -b, a * b, a / b, a % b, a ^ b, fetch
    *   (2) linear:
    *         nop: a * i + b
-   *   (3) polynomial:                        // TODO: coming soon
-   *         nop: sum_i(a) + b, for linear a
+   *   (3) polynomial:
+   *         nop: sum_lt(a) + b, for linear a
    *   (4) geometric:
-   *         op: a * fetch^i + b, a * fetch^-i + b, a mod_i fetch + b
+   *         op: a * fetch^i + b, a * fetch^-i + b
    *   (5) wrap-around
    *         nop: a, then defined by b
    *   (6) periodic
@@ -177,17 +178,12 @@
                              HInstruction* y,
                              InductionOp op,
                              bool is_first_call);  // possibly swaps x and y to try again
-  InductionInfo* SolveGeo(HLoopInformation* loop,
-                          HInstruction* entry_phi,
-                          HInstruction* instruction,
-                          HInstruction* x,
-                          HInstruction* y,
-                          InductionOp op);
-  InductionInfo* SolveXor(HLoopInformation* loop,
-                          HInstruction* entry_phi,
-                          HInstruction* instruction,
-                          HInstruction* x,
-                          HInstruction* y);
+  InductionInfo* SolveOp(HLoopInformation* loop,
+                         HInstruction* entry_phi,
+                         HInstruction* instruction,
+                         HInstruction* x,
+                         HInstruction* y,
+                         InductionOp op);
   InductionInfo* SolveTest(HLoopInformation* loop,
                            HInstruction* entry_phi,
                            HInstruction* instruction,
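The two removed entry points collapse into one; a condensed model of what the unified SolveOp accepts for a tight cycle (Classify and both enums are illustrative names, not the real routine):

enum class Op { kMul, kDiv, kRem, kXor };
enum class Class { kGeometric, kWrapAround, kPeriodic, kNone };

// Classify k = k op c, or c op k when the operation tolerates the phi on
// the right-hand side (division and remainder do not).
Class Classify(Op op, bool phi_on_left) {
  if (!phi_on_left && (op == Op::kDiv || op == Op::kRem)) {
    return Class::kNone;
  }
  switch (op) {
    case Op::kMul:
    case Op::kDiv: return Class::kGeometric;   // k = k * c, k = k / c
    case Op::kRem: return Class::kWrapAround;  // k = k % c
    case Op::kXor: return Class::kPeriodic;    // k = k ^ c
  }
  return Class::kNone;
}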
diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc
index 2199c8e..2d182f6 100644
--- a/compiler/optimizing/induction_var_analysis_test.cc
+++ b/compiler/optimizing/induction_var_analysis_test.cc
@@ -85,6 +85,7 @@
     constant0_ = graph_->GetIntConstant(0);
     constant1_ = graph_->GetIntConstant(1);
     constant2_ = graph_->GetIntConstant(2);
+    constant7_ = graph_->GetIntConstant(7);
     constant100_ = graph_->GetIntConstant(100);
     float_constant0_ = graph_->GetFloatConstant(0.0f);
     return_->AddInstruction(new (&allocator_) HReturnVoid());
@@ -193,6 +194,7 @@
   HInstruction* constant0_;
   HInstruction* constant1_;
   HInstruction* constant2_;
+  HInstruction* constant7_;
   HInstruction* constant100_;
   HInstruction* float_constant0_;
 
@@ -378,6 +380,135 @@
   EXPECT_TRUE(HaveSameInduction(store->InputAt(1), inc2));
 }
 
+TEST_F(InductionVarAnalysisTest, AddLinear) {
+  // Setup:
+  // for (int i = 0; i < 100; i++) {
+  //   t1 = i + i;
+  //   t2 = 7 + i;
+  //   t3 = t1 + t2;
+  // }
+  BuildLoopNest(1);
+
+  HInstruction* add1 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, basic_[0], basic_[0]), 0);
+  HInstruction* add2 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, constant7_, basic_[0]), 0);
+  HInstruction* add3 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, add1, add2), 0);
+  PerformInductionVarAnalysis();
+
+  EXPECT_STREQ("((1) * i + (0)):PrimInt", GetInductionInfo(basic_[0], 0).c_str());
+  EXPECT_STREQ("(((1) + (1)) * i + (0)):PrimInt", GetInductionInfo(add1, 0).c_str());
+  EXPECT_STREQ("((1) * i + (7)):PrimInt", GetInductionInfo(add2, 0).c_str());
+  EXPECT_STREQ("((((1) + (1)) + (1)) * i + (7)):PrimInt", GetInductionInfo(add3, 0).c_str());
+}
+
+TEST_F(InductionVarAnalysisTest, FindPolynomialInduction) {
+  // Setup:
+  // k = 1;
+  // for (int i = 0; i < 100; i++) {
+  //   t = i * 2;
+  //   t = 100 + t;
+  //   k = t + k;  // polynomial
+  // }
+  BuildLoopNest(1);
+  HPhi* k_header = InsertLoopPhi(0, 0);
+  k_header->AddInput(constant1_);
+
+  HInstruction* mul = InsertInstruction(
+      new (&allocator_) HMul(Primitive::kPrimInt, basic_[0], constant2_), 0);
+  HInstruction* add = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, constant100_, mul), 0);
+  HInstruction* pol = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, add, k_header), 0);
+  k_header->AddInput(pol);
+  PerformInductionVarAnalysis();
+
+  // Note that only the phi in the cycle and the base linear induction are classified.
+  EXPECT_STREQ("poly(sum_lt(((2) * i + (100)):PrimInt) + (1)):PrimInt",
+               GetInductionInfo(k_header, 0).c_str());
+  EXPECT_STREQ("((2) * i + (100)):PrimInt", GetInductionInfo(add, 0).c_str());
+  EXPECT_STREQ("", GetInductionInfo(pol, 0).c_str());
+}
+
+TEST_F(InductionVarAnalysisTest, FindPolynomialInductionAndDerived) {
+  // Setup:
+  // k = 1;
+  // for (int i = 0; i < 100; i++) {
+  //   t = k + 100;
+  //   t = k - 1;
+  //   t = -t;
+  //   t = k * 2;
+  //   t = k << 2;
+  //   k = k + i;  // polynomial
+  // }
+  BuildLoopNest(1);
+  HPhi* k_header = InsertLoopPhi(0, 0);
+  k_header->AddInput(constant1_);
+
+  HInstruction* add = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, k_header, constant100_), 0);
+  HInstruction* sub = InsertInstruction(
+      new (&allocator_) HSub(Primitive::kPrimInt, k_header, constant1_), 0);
+  HInstruction* neg = InsertInstruction(
+      new (&allocator_) HNeg(Primitive::kPrimInt, sub), 0);
+  HInstruction* mul = InsertInstruction(
+      new (&allocator_) HMul(Primitive::kPrimInt, k_header, constant2_), 0);
+  HInstruction* shl = InsertInstruction(
+      new (&allocator_) HShl(Primitive::kPrimInt, k_header, constant2_), 0);
+  HInstruction* pol = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, k_header, basic_[0]), 0);
+  k_header->AddInput(pol);
+  PerformInductionVarAnalysis();
+
+  // Note that only the phi in the cycle and the derived instructions are classified.
+  EXPECT_STREQ("poly(sum_lt(((1) * i + (0)):PrimInt) + (1)):PrimInt",
+               GetInductionInfo(k_header, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt(((1) * i + (0)):PrimInt) + ((1) + (100))):PrimInt",
+               GetInductionInfo(add, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt(((1) * i + (0)):PrimInt) + ((1) - (1))):PrimInt",
+               GetInductionInfo(sub, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt((( - (1)) * i + (0)):PrimInt) + ((1) - (1))):PrimInt",
+               GetInductionInfo(neg, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt(((2) * i + (0)):PrimInt) + (2)):PrimInt",
+               GetInductionInfo(mul, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt(((4) * i + (0)):PrimInt) + (4)):PrimInt",
+               GetInductionInfo(shl, 0).c_str());
+  EXPECT_STREQ("", GetInductionInfo(pol, 0).c_str());
+}
+
+TEST_F(InductionVarAnalysisTest, AddPolynomial) {
+  // Setup:
+  // k = 7;
+  // for (int i = 0; i < 100; i++) {
+  //   t = k + k;
+  //   t = t + k;
+  //   k = k + i;
+  // }
+  BuildLoopNest(1);
+  HPhi* k_header = InsertLoopPhi(0, 0);
+  k_header->AddInput(constant7_);
+
+  HInstruction* add1 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, k_header, k_header), 0);
+  HInstruction* add2 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, add1, k_header), 0);
+  HInstruction* add3 = InsertInstruction(
+      new (&allocator_) HAdd(Primitive::kPrimInt, k_header, basic_[0]), 0);
+  k_header->AddInput(add3);
+  PerformInductionVarAnalysis();
+
+  // Note that only the phi in the cycle and the additively derived instructions are classified.
+  EXPECT_STREQ("poly(sum_lt(((1) * i + (0)):PrimInt) + (7)):PrimInt",
+               GetInductionInfo(k_header, 0).c_str());
+  EXPECT_STREQ("poly(sum_lt((((1) + (1)) * i + (0)):PrimInt) + ((7) + (7))):PrimInt",
+               GetInductionInfo(add1, 0).c_str());
+  EXPECT_STREQ(
+      "poly(sum_lt(((((1) + (1)) + (1)) * i + (0)):PrimInt) + (((7) + (7)) + (7))):PrimInt",
+      GetInductionInfo(add2, 0).c_str());
+  EXPECT_STREQ("", GetInductionInfo(add3, 0).c_str());
+}
+
 TEST_F(InductionVarAnalysisTest, FindGeometricMulInduction) {
   // Setup:
   // k = 1;
@@ -481,20 +612,20 @@
   EXPECT_STREQ("", GetInductionInfo(div, 0).c_str());
 }
 
-TEST_F(InductionVarAnalysisTest, FindGeometricRemInductionAndDerived) {
+TEST_F(InductionVarAnalysisTest, FindRemWrapAroundInductionAndDerived) {
   // Setup:
-  // k = 1;
+  // k = 100;
   // for (int i = 0; i < 100; i++) {
   //   t = k + 100;
   //   t = k - 1;
   //   t = -t
   //   t = k * 2;
   //   t = k << 2;
-  //   k = k % 100;  // geometric (% 100)
+  //   k = k % 7;
   // }
   BuildLoopNest(1);
   HPhi* k_header = InsertLoopPhi(0, 0);
-  k_header->AddInput(constant1_);
+  k_header->AddInput(constant100_);
 
   HInstruction* add = InsertInstruction(
       new (&allocator_) HAdd(Primitive::kPrimInt, k_header, constant100_), 0);
@@ -507,17 +638,17 @@
   HInstruction* shl = InsertInstruction(
       new (&allocator_) HShl(Primitive::kPrimInt, k_header, constant2_), 0);
   HInstruction* rem = InsertInstruction(
-      new (&allocator_) HRem(Primitive::kPrimInt, k_header, constant100_, kNoDexPc), 0);
+      new (&allocator_) HRem(Primitive::kPrimInt, k_header, constant7_, kNoDexPc), 0);
   k_header->AddInput(rem);
   PerformInductionVarAnalysis();
 
-  // Note, only the phi in the cycle and direct additive derived are classified.
-  EXPECT_STREQ("geo((1) mod_i 100 + (0)):PrimInt", GetInductionInfo(k_header, 0).c_str());
-  EXPECT_STREQ("geo((1) mod_i 100 + (100)):PrimInt", GetInductionInfo(add, 0).c_str());
-  EXPECT_STREQ("geo((1) mod_i 100 + ((0) - (1))):PrimInt", GetInductionInfo(sub, 0).c_str());
-  EXPECT_STREQ("", GetInductionInfo(neg, 0).c_str());
-  EXPECT_STREQ("", GetInductionInfo(mul, 0).c_str());
-  EXPECT_STREQ("", GetInductionInfo(shl, 0).c_str());
+  // Note that only the phi in the cycle and the derived instructions are classified.
+  EXPECT_STREQ("wrap((100), ((100) % (7))):PrimInt", GetInductionInfo(k_header, 0).c_str());
+  EXPECT_STREQ("wrap(((100) + (100)), (((100) % (7)) + (100))):PrimInt",
+               GetInductionInfo(add, 0).c_str());
+  EXPECT_STREQ("wrap(((100) - (1)), (((100) % (7)) - (1))):PrimInt",
+               GetInductionInfo(sub, 0).c_str());
+  EXPECT_STREQ("wrap(( - ((100) - (1))), ( - (((100) % (7)) - (1)))):PrimInt",
+               GetInductionInfo(neg, 0).c_str());
+  EXPECT_STREQ("wrap(((100) * (2)), (((100) % (7)) * (2))):PrimInt",
+               GetInductionInfo(mul, 0).c_str());
+  EXPECT_STREQ("wrap(((100) * (4)), (((100) % (7)) * (4))):PrimInt",
+               GetInductionInfo(shl, 0).c_str());
   EXPECT_STREQ("", GetInductionInfo(rem, 0).c_str());
 }
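The expected strings in the rewritten test fold by hand; a few compile-time checks of the same arithmetic (standalone; C++ % truncates toward zero exactly like Java's):

static_assert(100 % 7 == 2, "wrapped value of k_header");
static_assert((100 % 7) + 100 == 102, "derived add");
static_assert((100 % 7) - 1 == 1, "derived sub");
static_assert(-((100 % 7) - 1) == -1, "derived neg");
static_assert((100 % 7) * 2 == 4, "derived mul");
static_assert((100 % 7) * 4 == 8, "derived shl");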
 
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 75619a3..e665551 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -460,6 +460,8 @@
   if (info != nullptr) {
     if (info->induction_class == HInductionVarAnalysis::kLinear) {
       return IsConstant(info->op_a, kExact, stride_value);
+    } else if (info->induction_class == HInductionVarAnalysis::kPolynomial) {
+      return NeedsTripCount(info->op_a, stride_value);
     } else if (info->induction_class == HInductionVarAnalysis::kWrapAround) {
       return NeedsTripCount(info->op_b, stride_value);
     }
@@ -492,7 +494,7 @@
                                                       bool in_body,
                                                       bool is_min) const {
   DCHECK(info != nullptr);
-  DCHECK(info->induction_class == HInductionVarAnalysis::kLinear);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kLinear);
   // Detect common situation where an offset inside the trip-count cancels out during range
   // analysis (finding max a * (TC - 1) + OFFSET for a == 1 and TC = UPPER - OFFSET or finding
   // min a * (TC - 1) + OFFSET for a == -1 and TC = OFFSET - UPPER) to avoid losing information
@@ -539,25 +541,49 @@
                   GetVal(info->op_b, trip, in_body, is_min));
 }
 
+InductionVarRange::Value InductionVarRange::GetPolynomial(HInductionVarAnalysis::InductionInfo* info,
+                                                          HInductionVarAnalysis::InductionInfo* trip,
+                                                          bool in_body,
+                                                          bool is_min) const {
+  DCHECK(info != nullptr);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kPolynomial);
+  int64_t a = 0;
+  int64_t b = 0;
+  if (IsConstant(info->op_a->op_a, kExact, &a) && CanLongValueFitIntoInt(a) && a >= 0 &&
+      IsConstant(info->op_a->op_b, kExact, &b) && CanLongValueFitIntoInt(b) && b >= 0) {
+    // Evaluate bounds on sum_i=0^m-1(a * i + b) + c with a,b >= 0 for known
+    // maximum index value m as a * (m * (m-1)) / 2 + b * m + c.
+    Value c = GetVal(info->op_b, trip, in_body, is_min);
+    if (is_min) {
+      return c;
+    } else {
+      Value m = GetVal(trip, trip, in_body, is_min);
+      Value t = DivValue(MulValue(m, SubValue(m, Value(1))), Value(2));
+      Value x = MulValue(Value(a), t);
+      Value y = MulValue(Value(b), m);
+      return AddValue(AddValue(x, y), c);
+    }
+  }
+  return Value();
+}
+
 InductionVarRange::Value InductionVarRange::GetGeometric(HInductionVarAnalysis::InductionInfo* info,
                                                          HInductionVarAnalysis::InductionInfo* trip,
                                                          bool in_body,
                                                          bool is_min) const {
   DCHECK(info != nullptr);
-  DCHECK(info->induction_class == HInductionVarAnalysis::kGeometric);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kGeometric);
   int64_t a = 0;
   int64_t f = 0;
   if (IsConstant(info->op_a, kExact, &a) &&
       CanLongValueFitIntoInt(a) &&
-      IsIntAndGet(info->fetch, &f) &&
-      CanLongValueFitIntoInt(f) && f >= 1) {
-    // Conservative bounds on a * f^-i + b with f >= 1 can be computed without trip count.
-    // Same for mod. All other forms would require a much more elaborate evaluation.
+      IsIntAndGet(info->fetch, &f) && f >= 1) {
+    // Conservative bounds on a * f^-i + b with f >= 1 can be computed without
+    // trip count. Other forms would require a much more elaborate evaluation.
     const bool is_min_a = a >= 0 ? is_min : !is_min;
     if (info->operation == HInductionVarAnalysis::kDiv) {
-      return AddValue(Value(is_min_a ? 0 : a), GetVal(info->op_b, trip, in_body, is_min));
-    } else if (info->operation == HInductionVarAnalysis::kNop) {
-      return AddValue(Value(is_min_a ? (a % f) : a), GetVal(info->op_b, trip, in_body, is_min));
+      Value b = GetVal(info->op_b, trip, in_body, is_min);
+      return is_min_a ? b : AddValue(Value(a), b);
     }
   }
   return Value();
@@ -572,7 +598,7 @@
   if (chase_hint_ == nullptr && in_body && trip != nullptr && instruction == trip->op_a->fetch) {
     if (is_min) {
       return Value(1);
-    } else if (!IsUnsafeTripCount(trip)) {
+    } else if (!instruction->IsConstant() && !IsUnsafeTripCount(trip)) {
       return Value(std::numeric_limits<int32_t>::max());
     }
   }
@@ -650,6 +676,8 @@
             return GetMul(info->op_a, info->op_b, trip, in_body, is_min);
           case HInductionVarAnalysis::kDiv:
             return GetDiv(info->op_a, info->op_b, trip, in_body, is_min);
+          case HInductionVarAnalysis::kRem:
+            return GetRem(info->op_a, info->op_b);
           case HInductionVarAnalysis::kXor:
             return GetXor(info->op_a, info->op_b);
           case HInductionVarAnalysis::kFetch:
@@ -675,7 +703,7 @@
       case HInductionVarAnalysis::kLinear:
         return CorrectForType(GetLinear(info, trip, in_body, is_min), info->type);
       case HInductionVarAnalysis::kPolynomial:
-        break;
+        return GetPolynomial(info, trip, in_body, is_min);
       case HInductionVarAnalysis::kGeometric:
         return GetGeometric(info, trip, in_body, is_min);
       case HInductionVarAnalysis::kWrapAround:
@@ -757,6 +785,21 @@
   return Value();
 }
 
+InductionVarRange::Value InductionVarRange::GetRem(
+    HInductionVarAnalysis::InductionInfo* info1,
+    HInductionVarAnalysis::InductionInfo* info2) const {
+  int64_t v1 = 0;
+  int64_t v2 = 0;
+  // Only accept exact values.
+  if (IsConstant(info1, kExact, &v1) && IsConstant(info2, kExact, &v2) && v2 != 0) {
+    int64_t value = v1 % v2;
+    if (CanLongValueFitIntoInt(value)) {
+      return Value(static_cast<int32_t>(value));
+    }
+  }
+  return Value();
+}
+
 InductionVarRange::Value InductionVarRange::GetXor(
     HInductionVarAnalysis::InductionInfo* info1,
     HInductionVarAnalysis::InductionInfo* info2) const {
@@ -898,8 +941,12 @@
           upper = nullptr;
         }
         break;
+      case HInductionVarAnalysis::kPolynomial:
+        return GenerateLastValuePolynomial(info, trip, graph, block, lower);
       case HInductionVarAnalysis::kGeometric:
         return GenerateLastValueGeometric(info, trip, graph, block, lower);
+      case HInductionVarAnalysis::kWrapAround:
+        return GenerateLastValueWrapAround(info, trip, graph, block, lower);
       case HInductionVarAnalysis::kPeriodic:
         return GenerateLastValuePeriodic(info, trip, graph, block, lower, needs_taken_test);
       default:
@@ -925,13 +972,43 @@
       GenerateCode(info, trip, graph, block, upper, in_body, /* is_min */ false);
 }
 
+bool InductionVarRange::GenerateLastValuePolynomial(HInductionVarAnalysis::InductionInfo* info,
+                                                    HInductionVarAnalysis::InductionInfo* trip,
+                                                    HGraph* graph,
+                                                    HBasicBlock* block,
+                                                    /*out*/HInstruction** result) const {
+  DCHECK(info != nullptr);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kPolynomial);
+  // Detect known coefficients and trip count (always taken).
+  int64_t a = 0;
+  int64_t b = 0;
+  int64_t m = 0;
+  if (IsConstant(info->op_a->op_a, kExact, &a) && a >= 0 &&
+      IsConstant(info->op_a->op_b, kExact, &b) && b >= 0 &&
+      IsConstant(trip->op_a, kExact, &m) && m >= 1) {
+    // Evaluate bounds on sum_i=0^m-1(a * i + b) + c with a,b >= 0 for known
+    // maximum index value m as a * (m * (m-1)) / 2 + b * m + c.
+    // TODO: generalize
+    HInstruction* c_instr = nullptr;
+    if (GenerateCode(info->op_b, nullptr, graph, block, graph ? &c_instr : nullptr, false, false)) {
+      if (graph != nullptr) {
+        int64_t sum = a * ((m * (m - 1)) / 2) + b * m;
+        *result = Insert(block, new (graph->GetArena()) HAdd(info->type,
+                                                             graph->GetIntConstant(sum), c_instr));
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
 bool InductionVarRange::GenerateLastValueGeometric(HInductionVarAnalysis::InductionInfo* info,
                                                    HInductionVarAnalysis::InductionInfo* trip,
                                                    HGraph* graph,
                                                    HBasicBlock* block,
                                                    /*out*/HInstruction** result) const {
   DCHECK(info != nullptr);
-  DCHECK(info->induction_class == HInductionVarAnalysis::kGeometric);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kGeometric);
   // Detect known base and trip count (always taken).
   int64_t f = 0;
   int64_t t = 0;
@@ -940,15 +1017,6 @@
     HInstruction* opb = nullptr;
     if (GenerateCode(info->op_a, nullptr, graph, block, &opa, false, false) &&
         GenerateCode(info->op_b, nullptr, graph, block, &opb, false, false)) {
-      // Generate a % f + b.
-      if (info->operation == HInductionVarAnalysis::kNop) {
-        if (graph != nullptr) {
-          HInstruction* rem =
-              Insert(block, new (graph->GetArena()) HRem(info->type, opa, info->fetch, kNoDexPc));
-          *result = Insert(block, new (graph->GetArena()) HAdd(info->type, rem, opb));
-        }
-        return true;
-      }
       // Compute f ^ t.
       int64_t fpowt = IntPow(f, t);
       if (graph != nullptr) {
@@ -980,6 +1048,28 @@
   return false;
 }
 
+bool InductionVarRange::GenerateLastValueWrapAround(HInductionVarAnalysis::InductionInfo* info,
+                                                    HInductionVarAnalysis::InductionInfo* trip,
+                                                    HGraph* graph,
+                                                    HBasicBlock* block,
+                                                    /*out*/HInstruction** result) const {
+  DCHECK(info != nullptr);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kWrapAround);
+  // Count depth.
+  int32_t depth = 0;
+  for (; info->induction_class == HInductionVarAnalysis::kWrapAround;
+       info = info->op_b, ++depth) {}
+  // Handle wrap(x, wrap(.., y)) if the trip count is large enough to pass all
+  // wrap-around levels and reach the invariant y at the end.
+  // TODO: generalize
+  int64_t t = 0;
+  if (info->induction_class == HInductionVarAnalysis::kInvariant &&
+      IsConstant(trip->op_a, kExact, &t) && t >= depth &&
+      GenerateCode(info, nullptr, graph, block, result, false, false)) {
+    return true;
+  }
+  return false;
+}
+
 bool InductionVarRange::GenerateLastValuePeriodic(HInductionVarAnalysis::InductionInfo* info,
                                                   HInductionVarAnalysis::InductionInfo* trip,
                                                   HGraph* graph,
@@ -987,17 +1077,18 @@
                                                   /*out*/HInstruction** result,
                                                   /*out*/bool* needs_taken_test) const {
   DCHECK(info != nullptr);
-  DCHECK(info->induction_class == HInductionVarAnalysis::kPeriodic);
+  DCHECK_EQ(info->induction_class, HInductionVarAnalysis::kPeriodic);
   // Count period.
   int32_t period = 1;
   for (HInductionVarAnalysis::InductionInfo* p = info;
        p->induction_class == HInductionVarAnalysis::kPeriodic;
        p = p->op_b, ++period) {}
   // Handle periodic(x, y) case for restricted types.
+  // TODO: generalize
   if (period != 2 ||
       trip->op_a->type != Primitive::kPrimInt ||
       (info->type != Primitive::kPrimInt && info->type != Primitive::kPrimBoolean)) {
-    return false;  // TODO: easy to generalize
+    return false;
   }
   HInstruction* x_instr = nullptr;
   HInstruction* y_instr = nullptr;
@@ -1058,6 +1149,7 @@
         // invariants, some effort is made to keep this parameter consistent).
         switch (info->operation) {
           case HInductionVarAnalysis::kAdd:
+          case HInductionVarAnalysis::kRem:  // no proper is_min for second arg
           case HInductionVarAnalysis::kXor:  // no proper is_min for second arg
           case HInductionVarAnalysis::kLT:
           case HInductionVarAnalysis::kLE:
@@ -1070,6 +1162,8 @@
                 switch (info->operation) {
                   case HInductionVarAnalysis::kAdd:
                     operation = new (graph->GetArena()) HAdd(type, opa, opb); break;
+                  case HInductionVarAnalysis::kRem:
+                    operation = new (graph->GetArena()) HRem(type, opa, opb, kNoDexPc); break;
                   case HInductionVarAnalysis::kXor:
                     operation = new (graph->GetArena()) HXor(type, opa, opb); break;
                   case HInductionVarAnalysis::kLT:
diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h
index f7360e8..ba14847 100644
--- a/compiler/optimizing/induction_var_range.h
+++ b/compiler/optimizing/induction_var_range.h
@@ -190,6 +190,10 @@
                   HInductionVarAnalysis::InductionInfo* trip,
                   bool in_body,
                   bool is_min) const;
+  Value GetPolynomial(HInductionVarAnalysis::InductionInfo* info,
+                      HInductionVarAnalysis::InductionInfo* trip,
+                      bool in_body,
+                      bool is_min) const;
   Value GetGeometric(HInductionVarAnalysis::InductionInfo* info,
                      HInductionVarAnalysis::InductionInfo* trip,
                      bool in_body,
@@ -212,6 +216,8 @@
                HInductionVarAnalysis::InductionInfo* trip,
                bool in_body,
                bool is_min) const;
+  Value GetRem(HInductionVarAnalysis::InductionInfo* info1,
+               HInductionVarAnalysis::InductionInfo* info2) const;
   Value GetXor(HInductionVarAnalysis::InductionInfo* info1,
                HInductionVarAnalysis::InductionInfo* info2) const;
 
@@ -249,12 +255,24 @@
                                 /*out*/ bool* needs_finite_test,
                                 /*out*/ bool* needs_taken_test) const;
 
+  bool GenerateLastValuePolynomial(HInductionVarAnalysis::InductionInfo* info,
+                                   HInductionVarAnalysis::InductionInfo* trip,
+                                   HGraph* graph,
+                                   HBasicBlock* block,
+                                   /*out*/HInstruction** result) const;
+
   bool GenerateLastValueGeometric(HInductionVarAnalysis::InductionInfo* info,
                                   HInductionVarAnalysis::InductionInfo* trip,
                                   HGraph* graph,
                                   HBasicBlock* block,
                                   /*out*/HInstruction** result) const;
 
+  bool GenerateLastValueWrapAround(HInductionVarAnalysis::InductionInfo* info,
+                                   HInductionVarAnalysis::InductionInfo* trip,
+                                   HGraph* graph,
+                                   HBasicBlock* block,
+                                   /*out*/HInstruction** result) const;
+
   bool GenerateLastValuePeriodic(HInductionVarAnalysis::InductionInfo* info,
                                  HInductionVarAnalysis::InductionInfo* trip,
                                  HGraph* graph,
diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc
index 510841e..aa3e1aa 100644
--- a/compiler/optimizing/induction_var_range_test.cc
+++ b/compiler/optimizing/induction_var_range_test.cc
@@ -135,6 +135,8 @@
       case 'n': op = HInductionVarAnalysis::kNeg; break;
       case '*': op = HInductionVarAnalysis::kMul; break;
       case '/': op = HInductionVarAnalysis::kDiv; break;
+      case '%': op = HInductionVarAnalysis::kRem; break;
+      case '^': op = HInductionVarAnalysis::kXor; break;
       case '<': op = HInductionVarAnalysis::kLT;  break;
       default:  op = HInductionVarAnalysis::kNop; break;
     }
@@ -178,12 +180,21 @@
                                  Primitive::kPrimInt);
   }
 
+  /** Constructs a polynomial sum(a * i + b) + c induction. */
+  HInductionVarAnalysis::InductionInfo* CreatePolynomial(int32_t a, int32_t b, int32_t c) {
+    return iva_->CreateInduction(HInductionVarAnalysis::kPolynomial,
+                                 HInductionVarAnalysis::kNop,
+                                 CreateLinear(a, b),
+                                 CreateConst(c),
+                                 nullptr,
+                                 Primitive::kPrimInt);
+  }
+
   /** Constructs a geometric a * f^i + b induction. */
   HInductionVarAnalysis::InductionInfo* CreateGeometric(int32_t a, int32_t b, int32_t f, char op) {
     return iva_->CreateInduction(HInductionVarAnalysis::kGeometric,
-                                 op == '*' ? HInductionVarAnalysis::kMul :
-                                 op == '/' ? HInductionVarAnalysis::kDiv :
-                                             HInductionVarAnalysis::kNop,
+                                 op == '*' ? HInductionVarAnalysis::kMul
+                                           : HInductionVarAnalysis::kDiv,
                                  CreateConst(a),
                                  CreateConst(b),
                                  graph_->GetIntConstant(f),
@@ -200,7 +211,7 @@
                                  Primitive::kPrimInt);
   }
 
-  /** Constructs a wrap-around induction consisting of a constant, followed info */
+  /** Constructs a wrap-around induction consisting of a constant, followed by info. */
   HInductionVarAnalysis::InductionInfo* CreateWrapAround(
       int32_t initial,
       HInductionVarAnalysis::InductionInfo* info) {
@@ -256,6 +267,16 @@
     return range_.GetDiv(info1, info2, nullptr, /* in_body */ true, is_min);
   }
 
+  Value GetRem(HInductionVarAnalysis::InductionInfo* info1,
+               HInductionVarAnalysis::InductionInfo* info2) {
+    return range_.GetRem(info1, info2);
+  }
+
+  Value GetXor(HInductionVarAnalysis::InductionInfo* info1,
+               HInductionVarAnalysis::InductionInfo* info2) {
+    return range_.GetXor(info1, info2);
+  }
+
   bool IsExact(HInductionVarAnalysis::InductionInfo* info, int64_t* value) {
     return range_.IsConstant(info, InductionVarRange::kExact, value);
   }
@@ -438,6 +459,27 @@
   ExpectEqual(Value(20), GetMax(CreateWrapAround(20, -1, 10), nullptr));
 }
 
+TEST_F(InductionVarRangeTest, GetMinMaxPolynomial) {
+  ExpectEqual(Value(7), GetMin(CreatePolynomial(3, 5, 7), nullptr));
+  ExpectEqual(Value(), GetMax(CreatePolynomial(3, 5, 7), nullptr));
+  ExpectEqual(Value(7), GetMin(CreatePolynomial(3, 5, 7), CreateTripCount(5, true, true)));
+  ExpectEqual(Value(45), GetMax(CreatePolynomial(3, 5, 7), CreateTripCount(5, true, true)));
+  ExpectEqual(Value(7), GetMin(CreatePolynomial(3, 5, 7), CreateTripCount(10, true, true)));
+  ExpectEqual(Value(160), GetMax(CreatePolynomial(3, 5, 7), CreateTripCount(10, true, true)));
+  ExpectEqual(Value(-7), GetMin(CreatePolynomial(11, 13, -7),
+                                CreateTripCount(5, true, true)));
+  ExpectEqual(Value(111), GetMax(CreatePolynomial(11, 13, -7),
+                                 CreateTripCount(5, true, true)));
+  ExpectEqual(Value(-7), GetMin(CreatePolynomial(11, 13, -7),
+                                CreateTripCount(10, true, true)));
+  ExpectEqual(Value(506), GetMax(CreatePolynomial(11, 13, -7),
+                                 CreateTripCount(10, true, true)));
+  ExpectEqual(Value(), GetMin(CreatePolynomial(-3, 5, 7), CreateTripCount(10, true, true)));
+  ExpectEqual(Value(), GetMax(CreatePolynomial(-3, 5, 7), CreateTripCount(10, true, true)));
+  ExpectEqual(Value(), GetMin(CreatePolynomial(3, -5, 7), CreateTripCount(10, true, true)));
+  ExpectEqual(Value(), GetMax(CreatePolynomial(3, -5, 7), CreateTripCount(10, true, true)));
+}
+
 TEST_F(InductionVarRangeTest, GetMinMaxGeometricMul) {
   ExpectEqual(Value(), GetMin(CreateGeometric(1, 1, 1, '*'), nullptr));
   ExpectEqual(Value(), GetMax(CreateGeometric(1, 1, 1, '*'), nullptr));
@@ -454,17 +496,6 @@
   ExpectEqual(Value(-5), GetMax(CreateGeometric(-11, -5, 3, '/'), nullptr));
 }
 
-TEST_F(InductionVarRangeTest, GetMinMaxGeometricRem) {
-  ExpectEqual(Value(7), GetMin(CreateGeometric(11, 5, 3, '%'), nullptr));
-  ExpectEqual(Value(16), GetMax(CreateGeometric(11, 5, 3, '%'), nullptr));
-  ExpectEqual(Value(-3), GetMin(CreateGeometric(11, -5, 3, '%'), nullptr));
-  ExpectEqual(Value(6), GetMax(CreateGeometric(11, -5, 3, '%'), nullptr));
-  ExpectEqual(Value(-6), GetMin(CreateGeometric(-11, 5, 3, '%'), nullptr));
-  ExpectEqual(Value(3), GetMax(CreateGeometric(-11, 5, 3, '%'), nullptr));
-  ExpectEqual(Value(-16), GetMin(CreateGeometric(-11, -5, 3, '%'), nullptr));
-  ExpectEqual(Value(-7), GetMax(CreateGeometric(-11, -5, 3, '%'), nullptr));
-}
-
 TEST_F(InductionVarRangeTest, GetMinMaxPeriodic) {
   ExpectEqual(Value(-2), GetMin(CreateRange(-2, 99), nullptr));
   ExpectEqual(Value(99), GetMax(CreateRange(-2, 99), nullptr));
@@ -530,6 +561,46 @@
   ExpectEqual(Value(), GetDiv(CreateRange(-1, 1), CreateRange(-1, 1), false));
 }
 
+TEST_F(InductionVarRangeTest, GetMinMaxRem) {
+  ExpectEqual(Value(), GetMin(CreateInvariant('%', CreateConst(2), CreateRange(10, 20)), nullptr));
+  ExpectEqual(Value(), GetMax(CreateInvariant('%', CreateConst(2), CreateRange(10, 20)), nullptr));
+  ExpectEqual(Value(), GetMin(CreateInvariant('%', CreateRange(10, 20), CreateConst(2)), nullptr));
+  ExpectEqual(Value(), GetMax(CreateInvariant('%', CreateRange(10, 20), CreateConst(2)), nullptr));
+  ExpectEqual(Value(2), GetMin(CreateInvariant('%', CreateConst(2), CreateConst(5)), nullptr));
+  ExpectEqual(Value(2), GetMax(CreateInvariant('%', CreateConst(2), CreateConst(5)), nullptr));
+  ExpectEqual(Value(1), GetMin(CreateInvariant('%', CreateConst(11), CreateConst(5)), nullptr));
+  ExpectEqual(Value(1), GetMax(CreateInvariant('%', CreateConst(11), CreateConst(5)), nullptr));
+}
+
+TEST_F(InductionVarRangeTest, GetRem) {
+  ExpectEqual(Value(0), GetRem(CreateConst(1), CreateConst(1)));
+  ExpectEqual(Value(2), GetRem(CreateConst(2), CreateConst(5)));
+  ExpectEqual(Value(1), GetRem(CreateConst(11), CreateConst(5)));
+  ExpectEqual(Value(-2), GetRem(CreateConst(-2), CreateConst(5)));
+  ExpectEqual(Value(-1), GetRem(CreateConst(-11), CreateConst(5)));
+  ExpectEqual(Value(2), GetRem(CreateConst(2), CreateConst(-5)));
+  ExpectEqual(Value(1), GetRem(CreateConst(11), CreateConst(-5)));
+  ExpectEqual(Value(-2), GetRem(CreateConst(-2), CreateConst(-5)));
+  ExpectEqual(Value(-1), GetRem(CreateConst(-11), CreateConst(-5)));
+  ExpectEqual(Value(), GetRem(CreateConst(1), CreateConst(0)));
+}
+
+TEST_F(InductionVarRangeTest, GetMinMaxXor) {
+  ExpectEqual(Value(), GetMin(CreateInvariant('^', CreateConst(2), CreateRange(10, 20)), nullptr));
+  ExpectEqual(Value(), GetMax(CreateInvariant('^', CreateConst(2), CreateRange(10, 20)), nullptr));
+  ExpectEqual(Value(), GetMin(CreateInvariant('^', CreateRange(10, 20), CreateConst(2)), nullptr));
+  ExpectEqual(Value(), GetMax(CreateInvariant('^', CreateRange(10, 20), CreateConst(2)), nullptr));
+  ExpectEqual(Value(3), GetMin(CreateInvariant('^', CreateConst(1), CreateConst(2)), nullptr));
+  ExpectEqual(Value(3), GetMax(CreateInvariant('^', CreateConst(1), CreateConst(2)), nullptr));
+}
+
+TEST_F(InductionVarRangeTest, GetXor) {
+  ExpectEqual(Value(0), GetXor(CreateConst(1), CreateConst(1)));
+  ExpectEqual(Value(3), GetXor(CreateConst(1), CreateConst(2)));
+  ExpectEqual(Value(-2), GetXor(CreateConst(1), CreateConst(-1)));
+  ExpectEqual(Value(0), GetXor(CreateConst(-1), CreateConst(-1)));
+}
+
 TEST_F(InductionVarRangeTest, AddValue) {
   ExpectEqual(Value(110), AddValue(Value(10), Value(100)));
   ExpectEqual(Value(-5), AddValue(Value(x_, 1, -4), Value(x_, -1, -1)));
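The GetRem expectations above encode truncated division, in which the remainder takes the sign of the dividend and the divisor's sign is irrelevant; a couple of standalone checks (valid C++11, matching Java semantics):

static_assert(-11 % 5 == -1, "remainder takes the dividend's sign");
static_assert(11 % -5 == 1, "divisor sign does not matter");
static_assert(-2 % -5 == -2, "small dividend passes through unchanged");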
diff --git a/compiler/optimizing/inliner.cc b/compiler/optimizing/inliner.cc
index 8d93867..fe4662a 100644
--- a/compiler/optimizing/inliner.cc
+++ b/compiler/optimizing/inliner.cc
@@ -1444,7 +1444,7 @@
   // optimization that could lead to a HDeoptimize. The following optimizations do not.
   HDeadCodeElimination dce(callee_graph, stats_, "dead_code_elimination$inliner");
   HConstantFolding fold(callee_graph, "constant_folding$inliner");
-  HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_);
+  HSharpening sharpening(callee_graph, codegen_, dex_compilation_unit, compiler_driver_, handles_);
   InstructionSimplifier simplify(callee_graph, stats_);
   IntrinsicsRecognizer intrinsics(callee_graph, stats_);
 
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index f4616e3..9d73e29 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -64,7 +64,8 @@
       top_loop_(nullptr),
       last_loop_(nullptr),
       iset_(nullptr),
-      induction_simplication_count_(0) {
+      induction_simplication_count_(0),
+      simplified_(false) {
 }
 
 void HLoopOptimization::Run() {
@@ -169,9 +170,15 @@
     if (current_induction_simplification_count != induction_simplication_count_) {
       induction_range_.ReVisit(node->loop_info);
     }
-    SimplifyBlocks(node);
-    SimplifyInduction(node);
-    SimplifyBlocks(node);
+    // Repeat simplifications until no more changes occur. Note that since
+    // each simplification consists of eliminating code (without introducing
+    // new code), this process is always finite.
+    do {
+      simplified_ = false;
+      SimplifyBlocks(node);
+      SimplifyInduction(node);
+    } while (simplified_);
+    // Remove inner loops when empty.
     if (node->inner == nullptr) {
       RemoveIfEmptyInnerLoop(node);
     }
@@ -198,63 +205,57 @@
       for (HInstruction* i : *iset_) {
         RemoveFromCycle(i);
       }
+      simplified_ = true;
       induction_simplication_count_++;
     }
   }
 }
 
 void HLoopOptimization::SimplifyBlocks(LoopNode* node) {
-  // Repeat the block simplifications until no more changes occur. Note that since
-  // each simplification consists of eliminating code (without introducing new code),
-  // this process is always finite.
-  bool changed;
-  do {
-    changed = false;
-    // Iterate over all basic blocks in the loop-body.
-    for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
-      HBasicBlock* block = it.Current();
-      // Remove dead instructions from the loop-body.
-      for (HBackwardInstructionIterator i(block->GetInstructions()); !i.Done(); i.Advance()) {
-        HInstruction* instruction = i.Current();
-        if (instruction->IsDeadAndRemovable()) {
-          changed = true;
-          block->RemoveInstruction(instruction);
-        }
+  // Iterate over all basic blocks in the loop-body.
+  for (HBlocksInLoopIterator it(*node->loop_info); !it.Done(); it.Advance()) {
+    HBasicBlock* block = it.Current();
+    // Remove dead instructions from the loop-body.
+    for (HBackwardInstructionIterator i(block->GetInstructions()); !i.Done(); i.Advance()) {
+      HInstruction* instruction = i.Current();
+      if (instruction->IsDeadAndRemovable()) {
+        simplified_ = true;
+        block->RemoveInstruction(instruction);
       }
-      // Remove trivial control flow blocks from the loop-body.
-      HBasicBlock* succ = nullptr;
-      if (IsGotoBlock(block, &succ) && succ->GetPredecessors().size() == 1) {
-        // Trivial goto block can be removed.
-        HBasicBlock* pred = block->GetSinglePredecessor();
-        changed = true;
-        pred->ReplaceSuccessor(block, succ);
-        block->RemoveDominatedBlock(succ);
-        block->DisconnectAndDelete();
-        pred->AddDominatedBlock(succ);
-        succ->SetDominator(pred);
-      } else if (block->GetSuccessors().size() == 2) {
-        // Trivial if block can be bypassed to either branch.
-        HBasicBlock* succ0 = block->GetSuccessors()[0];
-        HBasicBlock* succ1 = block->GetSuccessors()[1];
-        HBasicBlock* meet0 = nullptr;
-        HBasicBlock* meet1 = nullptr;
-        if (succ0 != succ1 &&
-            IsGotoBlock(succ0, &meet0) &&
-            IsGotoBlock(succ1, &meet1) &&
-            meet0 == meet1 &&  // meets again
-            meet0 != block &&  // no self-loop
-            meet0->GetPhis().IsEmpty()) {  // not used for merging
-          changed = true;
-          succ0->DisconnectAndDelete();
-          if (block->Dominates(meet0)) {
-            block->RemoveDominatedBlock(meet0);
-            succ1->AddDominatedBlock(meet0);
-            meet0->SetDominator(succ1);
-          }
+    }
+    // Remove trivial control flow blocks from the loop-body.
+    HBasicBlock* succ = nullptr;
+    if (IsGotoBlock(block, &succ) && succ->GetPredecessors().size() == 1) {
+      // Trivial goto block can be removed.
+      HBasicBlock* pred = block->GetSinglePredecessor();
+      simplified_ = true;
+      pred->ReplaceSuccessor(block, succ);
+      block->RemoveDominatedBlock(succ);
+      block->DisconnectAndDelete();
+      pred->AddDominatedBlock(succ);
+      succ->SetDominator(pred);
+    } else if (block->GetSuccessors().size() == 2) {
+      // Trivial if block can be bypassed to either branch.
+      HBasicBlock* succ0 = block->GetSuccessors()[0];
+      HBasicBlock* succ1 = block->GetSuccessors()[1];
+      HBasicBlock* meet0 = nullptr;
+      HBasicBlock* meet1 = nullptr;
+      if (succ0 != succ1 &&
+          IsGotoBlock(succ0, &meet0) &&
+          IsGotoBlock(succ1, &meet1) &&
+          meet0 == meet1 &&  // meets again
+          meet0 != block &&  // no self-loop
+          meet0->GetPhis().IsEmpty()) {  // not used for merging
+        simplified_ = true;
+        succ0->DisconnectAndDelete();
+        if (block->Dominates(meet0)) {
+          block->RemoveDominatedBlock(meet0);
+          succ1->AddDominatedBlock(meet0);
+          meet0->SetDominator(succ1);
         }
       }
     }
-  } while (changed);
+  }
 }
 
 void HLoopOptimization::RemoveIfEmptyInnerLoop(LoopNode* node) {
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 3391bef..0f05b24 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -95,6 +95,9 @@
   // when the induction of inner loops has changed.
   int32_t induction_simplication_count_;
 
+  // Flag that tracks whether any simplification has occurred.
+  bool simplified_;
+
   friend class LoopOptimizationTest;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
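
The net effect in loop_optimization is a classic fixed-point driver: each Simplify* routine records progress in the shared simplified_ flag, and the caller in Run() repeats until a full pass makes no change. A minimal sketch of the pattern (not the ART code itself):

    class SimplifierSketch {
     public:
      void Run() {
        // Every simplification only deletes IR, so the graph shrinks
        // monotonically and the loop must terminate.
        do {
          simplified_ = false;
          SimplifyBlocks();
          SimplifyInduction();
        } while (simplified_);
      }

     private:
      void SimplifyBlocks() { /* sets simplified_ = true on any removal */ }
      void SimplifyInduction() { /* sets simplified_ = true on any removal */ }
      bool simplified_ = false;
    };

Hoisting the flag into a member also lets a change made by SimplifyInduction() re-trigger SimplifyBlocks() on the next iteration, which the old per-function loop could not do.
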
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index 594255c..925d4f1 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -2487,8 +2487,8 @@
       return os << "BootImageLinkTimePcRelative";
     case HLoadClass::LoadKind::kBootImageAddress:
       return os << "BootImageAddress";
-    case HLoadClass::LoadKind::kDexCacheAddress:
-      return os << "DexCacheAddress";
+    case HLoadClass::LoadKind::kJitTableAddress:
+      return os << "JitTableAddress";
     case HLoadClass::LoadKind::kDexCachePcRelative:
       return os << "DexCachePcRelative";
     case HLoadClass::LoadKind::kDexCacheViaMethod:
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index e3f4d8f..659cdda 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -5493,9 +5493,8 @@
     // GetIncludePatchInformation().
     kBootImageAddress,
 
-    // Load from the resolved types array at an absolute address.
-    // Used for classes outside the boot image referenced by JIT-compiled code.
-    kDexCacheAddress,
+    // Load from the root table associated with the JIT compiled method.
+    kJitTableAddress,
 
     // Load from resolved types array in the dex cache using a PC-relative load.
     // Used for classes outside boot image when we know that we can access
@@ -5588,7 +5587,6 @@
            NeedsAccessCheck();
   }
 
-
   bool CanThrow() const OVERRIDE {
     return CanCallRuntime();
   }
@@ -5613,7 +5611,9 @@
     return load_data_.address;
   }
 
-  bool NeedsDexCacheOfDeclaringClass() const OVERRIDE { return !IsReferrersClass(); }
+  bool NeedsDexCacheOfDeclaringClass() const OVERRIDE {
+    return !IsReferrersClass();
+  }
 
   static SideEffects SideEffectsForArchRuntimeCalls() {
     return SideEffects::CanTriggerGC();
@@ -5672,7 +5672,8 @@
   }
 
   static bool HasAddress(LoadKind load_kind) {
-    return load_kind == LoadKind::kBootImageAddress || load_kind == LoadKind::kDexCacheAddress;
+    return load_kind == LoadKind::kBootImageAddress ||
+        load_kind == LoadKind::kJitTableAddress;
   }
 
   static bool HasDexCacheReference(LoadKind load_kind) {
@@ -5691,7 +5692,7 @@
 
   union {
     uint32_t dex_cache_element_index;   // Only for dex cache reference.
-    uint64_t address;  // Up to 64-bit, needed for kDexCacheAddress on 64-bit targets.
+    uint64_t address;  // Up to 64-bit, needed for kJitTableAddress on 64-bit targets.
   } load_data_;
 
   ReferenceTypeInfo loaded_class_rti_;
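
Conceptually, the kDexCacheAddress -> kJitTableAddress rename moves JIT-compiled code off raw dex-cache element addresses: the class is instead published in the root table associated with the compiled method, where the GC can find and visit it, and the generated code loads it from that table. A rough sketch of the difference (jit_root_table and slot are hypothetical names for illustration):

    // Old: read a resolved-types slot at a hard-coded absolute address.
    // New: read a slot the JIT owns and reports to the GC as a root.
    ObjPtr<mirror::Class> klass = jit_root_table[slot];
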
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc
index 013e110..0e02311 100644
--- a/compiler/optimizing/optimizing_cfi_test.cc
+++ b/compiler/optimizing/optimizing_cfi_test.cc
@@ -24,12 +24,22 @@
 #include "optimizing/code_generator.h"
 #include "optimizing/optimizing_unit_test.h"
 #include "utils/assembler.h"
+#ifdef ART_USE_VIXL_ARM_BACKEND
+#include "utils/arm/assembler_arm_vixl.h"
+#else
 #include "utils/arm/assembler_thumb2.h"
+#endif
 #include "utils/mips/assembler_mips.h"
 #include "utils/mips64/assembler_mips64.h"
 
 #include "optimizing/optimizing_cfi_test_expected.inc"
 
+#ifdef ART_USE_VIXL_ARM_BACKEND
+namespace vixl32 = vixl::aarch32;
+
+using vixl32::r0;
+#endif
+
 namespace art {
 
 // Run the tests only on host.
@@ -158,8 +168,7 @@
     TestImpl(isa, #isa, expected_asm, expected_cfi);          \
   }
 
-// TODO(VIXL): Support this test for the VIXL backend.
-#if defined(ART_ENABLE_CODEGEN_arm) && !defined(ART_USE_VIXL_ARM_BACKEND)
+#ifdef ART_ENABLE_CODEGEN_arm
 TEST_ISA(kThumb2)
 #endif
 #ifdef ART_ENABLE_CODEGEN_arm64
@@ -178,8 +187,7 @@
 TEST_ISA(kMips64)
 #endif
 
-// TODO(VIXL): Support this test for the VIXL backend.
-#if defined(ART_ENABLE_CODEGEN_arm) && !defined(ART_USE_VIXL_ARM_BACKEND)
+#ifdef ART_ENABLE_CODEGEN_arm
 TEST_F(OptimizingCFITest, kThumb2Adjust) {
   std::vector<uint8_t> expected_asm(
       expected_asm_kThumb2_adjust,
@@ -188,6 +196,16 @@
       expected_cfi_kThumb2_adjust,
       expected_cfi_kThumb2_adjust + arraysize(expected_cfi_kThumb2_adjust));
   SetUpFrame(kThumb2);
+#ifdef ART_USE_VIXL_ARM_BACKEND
+#define __ down_cast<arm::ArmVIXLAssembler*>(GetCodeGenerator() \
+    ->GetAssembler())->GetVIXLAssembler()->
+  vixl32::Label target;
+  __ CompareAndBranchIfZero(r0, &target);
+  // Push the target out of range of CBZ.
+  for (size_t i = 0; i != 65; ++i) {
+    __ Ldr(r0, vixl32::MemOperand(r0));
+  }
+#else
 #define __ down_cast<arm::Thumb2Assembler*>(GetCodeGenerator()->GetAssembler())->
   Label target;
   __ CompareAndBranchIfZero(arm::R0, &target);
@@ -195,6 +213,7 @@
   for (size_t i = 0; i != 65; ++i) {
     __ ldr(arm::R0, arm::Address(arm::R0));
   }
+#endif
   __ Bind(&target);
 #undef __
   Finish();
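
The loop count of 65 is sized to defeat the short branch on both paths: a 16-bit CBZ can only reach forward about 126 bytes, and each 16-bit "ldr r0, [r0]" occupies 2 bytes, so 65 of them span 130 bytes and push the target just out of range. The Thumb2 backend reacts with its emit-short-then-fixup path, while VIXL (as noted in the expected output below) emits a 32-bit beq up front, which is why the two backends need different expected byte sequences.
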
diff --git a/compiler/optimizing/optimizing_cfi_test_expected.inc b/compiler/optimizing/optimizing_cfi_test_expected.inc
index f735dc8..82670c3 100644
--- a/compiler/optimizing/optimizing_cfi_test_expected.inc
+++ b/compiler/optimizing/optimizing_cfi_test_expected.inc
@@ -223,8 +223,16 @@
 // 0x00000040: .cfi_def_cfa_offset: 64
 
 static constexpr uint8_t expected_asm_kThumb2_adjust[] = {
+#ifdef ART_USE_VIXL_ARM_BACKEND
+    // VIXL emits an extra 2 bytes here, using a 32-bit beq, because it does not
+    // optimistically emit a 16-bit branch and then fix it up for out-of-range
+    // targets the way the current assembler does.
+    0x60, 0xB5, 0x2D, 0xED, 0x02, 0x8A, 0x8B, 0xB0, 0x00, 0x28, 0x00, 0xF0,
+    0x41, 0x80, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68,
+#else
     0x60, 0xB5, 0x2D, 0xED, 0x02, 0x8A, 0x8B, 0xB0, 0x00, 0x28,
     0x40, 0xD0, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68,
+#endif
     0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68,
     0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68,
     0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68, 0x00, 0x68,
@@ -239,7 +247,11 @@
 };
 static constexpr uint8_t expected_cfi_kThumb2_adjust[] = {
     0x42, 0x0E, 0x0C, 0x85, 0x03, 0x86, 0x02, 0x8E, 0x01, 0x44, 0x0E, 0x14,
+#ifdef ART_USE_VIXL_ARM_BACKEND
+    0x05, 0x50, 0x05, 0x05, 0x51, 0x04, 0x42, 0x0E, 0x40, 0x02, 0x88, 0x0A,
+#else
     0x05, 0x50, 0x05, 0x05, 0x51, 0x04, 0x42, 0x0E, 0x40, 0x02, 0x86, 0x0A,
+#endif
     0x42, 0x0E, 0x14, 0x44, 0x0E, 0x0C, 0x06, 0x50, 0x06, 0x51, 0x42, 0x0B,
     0x0E, 0x40,
 };
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 8ea2b06..64c87dc 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -375,7 +375,8 @@
                             const DexFile& dex_file,
                             Handle<mirror::DexCache> dex_cache,
                             ArtMethod* method,
-                            bool osr) const;
+                            bool osr,
+                            VariableSizedHandleScope* handles) const;
 
   void MaybeRunInliner(HGraph* graph,
                        CodeGenerator* codegen,
@@ -495,7 +496,7 @@
                                 number_of_dex_registers,
                                 /* depth */ 0);
   } else if (opt_name == HSharpening::kSharpeningPassName) {
-    return new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver);
+    return new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver, handles);
   } else if (opt_name == HSelectGenerator::kSelectGeneratorPassName) {
     return new (arena) HSelectGenerator(graph, stats);
   } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
@@ -767,7 +768,8 @@
   HInductionVarAnalysis* induction = new (arena) HInductionVarAnalysis(graph);
   BoundsCheckElimination* bce = new (arena) BoundsCheckElimination(graph, *side_effects, induction);
   HLoopOptimization* loop = new (arena) HLoopOptimization(graph, induction);
-  HSharpening* sharpening = new (arena) HSharpening(graph, codegen, dex_compilation_unit, driver);
+  HSharpening* sharpening = new (arena) HSharpening(
+      graph, codegen, dex_compilation_unit, driver, handles);
   InstructionSimplifier* simplify2 = new (arena) InstructionSimplifier(
       graph, stats, "instruction_simplifier$after_inlining");
   InstructionSimplifier* simplify3 = new (arena) InstructionSimplifier(
@@ -866,7 +868,8 @@
                                               const DexFile& dex_file,
                                               Handle<mirror::DexCache> dex_cache,
                                               ArtMethod* method,
-                                              bool osr) const {
+                                              bool osr,
+                                              VariableSizedHandleScope* handles) const {
   MaybeRecordStat(MethodCompilationStat::kAttemptCompilation);
   CompilerDriver* compiler_driver = GetCompilerDriver();
   InstructionSet instruction_set = compiler_driver->GetInstructionSet();
@@ -976,64 +979,56 @@
                              compiler_driver,
                              dump_mutex_);
 
-  VLOG(compiler) << "Building " << pass_observer.GetMethodName();
-
   {
-    ScopedObjectAccess soa(Thread::Current());
-    VariableSizedHandleScope handles(soa.Self());
-    // Do not hold `mutator_lock_` between optimizations.
-    ScopedThreadSuspension sts(soa.Self(), kNative);
-
-    {
-      PassScope scope(HGraphBuilder::kBuilderPassName, &pass_observer);
-      HGraphBuilder builder(graph,
-                            &dex_compilation_unit,
-                            &dex_compilation_unit,
-                            &dex_file,
-                            *code_item,
-                            compiler_driver,
-                            compilation_stats_.get(),
-                            interpreter_metadata,
-                            dex_cache,
-                            &handles);
-      GraphAnalysisResult result = builder.BuildGraph();
-      if (result != kAnalysisSuccess) {
-        switch (result) {
-          case kAnalysisSkipped:
-            MaybeRecordStat(MethodCompilationStat::kNotCompiledSkipped);
-            break;
-          case kAnalysisInvalidBytecode:
-            MaybeRecordStat(MethodCompilationStat::kNotCompiledInvalidBytecode);
-            break;
-          case kAnalysisFailThrowCatchLoop:
-            MaybeRecordStat(MethodCompilationStat::kNotCompiledThrowCatchLoop);
-            break;
-          case kAnalysisFailAmbiguousArrayOp:
-            MaybeRecordStat(MethodCompilationStat::kNotCompiledAmbiguousArrayOp);
-            break;
-          case kAnalysisSuccess:
-            UNREACHABLE();
-        }
-        pass_observer.SetGraphInBadState();
-        return nullptr;
+    VLOG(compiler) << "Building " << pass_observer.GetMethodName();
+    PassScope scope(HGraphBuilder::kBuilderPassName, &pass_observer);
+    HGraphBuilder builder(graph,
+                          &dex_compilation_unit,
+                          &dex_compilation_unit,
+                          &dex_file,
+                          *code_item,
+                          compiler_driver,
+                          compilation_stats_.get(),
+                          interpreter_metadata,
+                          dex_cache,
+                          handles);
+    GraphAnalysisResult result = builder.BuildGraph();
+    if (result != kAnalysisSuccess) {
+      switch (result) {
+        case kAnalysisSkipped:
+          MaybeRecordStat(MethodCompilationStat::kNotCompiledSkipped);
+          break;
+        case kAnalysisInvalidBytecode:
+          MaybeRecordStat(MethodCompilationStat::kNotCompiledInvalidBytecode);
+          break;
+        case kAnalysisFailThrowCatchLoop:
+          MaybeRecordStat(MethodCompilationStat::kNotCompiledThrowCatchLoop);
+          break;
+        case kAnalysisFailAmbiguousArrayOp:
+          MaybeRecordStat(MethodCompilationStat::kNotCompiledAmbiguousArrayOp);
+          break;
+        case kAnalysisSuccess:
+          UNREACHABLE();
       }
+      pass_observer.SetGraphInBadState();
+      return nullptr;
     }
-
-    RunOptimizations(graph,
-                     codegen.get(),
-                     compiler_driver,
-                     dex_compilation_unit,
-                     &pass_observer,
-                     &handles);
-
-    RegisterAllocator::Strategy regalloc_strategy =
-      compiler_options.GetRegisterAllocationStrategy();
-    AllocateRegisters(graph, codegen.get(), &pass_observer, regalloc_strategy);
-
-    codegen->Compile(code_allocator);
-    pass_observer.DumpDisassembly();
   }
 
+  RunOptimizations(graph,
+                   codegen.get(),
+                   compiler_driver,
+                   dex_compilation_unit,
+                   &pass_observer,
+                   handles);
+
+  RegisterAllocator::Strategy regalloc_strategy =
+    compiler_options.GetRegisterAllocationStrategy();
+  AllocateRegisters(graph, codegen.get(), &pass_observer, regalloc_strategy);
+
+  codegen->Compile(code_allocator);
+  pass_observer.DumpDisassembly();
+
   return codegen.release();
 }
 
@@ -1055,19 +1050,27 @@
             verified_method->GetEncounteredVerificationFailures())) {
     ArenaAllocator arena(Runtime::Current()->GetArenaPool());
     CodeVectorAllocator code_allocator(&arena);
-    std::unique_ptr<CodeGenerator> codegen(
-        TryCompile(&arena,
-                   &code_allocator,
-                   code_item,
-                   access_flags,
-                   invoke_type,
-                   class_def_idx,
-                   method_idx,
-                   jclass_loader,
-                   dex_file,
-                   dex_cache,
-                   nullptr,
-                   /* osr */ false));
+    std::unique_ptr<CodeGenerator> codegen;
+    {
+      ScopedObjectAccess soa(Thread::Current());
+      VariableSizedHandleScope handles(soa.Self());
+      // Go to native so that we don't block GC during compilation.
+      ScopedThreadSuspension sts(soa.Self(), kNative);
+      codegen.reset(
+          TryCompile(&arena,
+                     &code_allocator,
+                     code_item,
+                     access_flags,
+                     invoke_type,
+                     class_def_idx,
+                     method_idx,
+                     jclass_loader,
+                     dex_file,
+                     dex_cache,
+                     nullptr,
+                     /* osr */ false,
+                     &handles));
+    }
     if (codegen.get() != nullptr) {
       MaybeRecordStat(MethodCompilationStat::kCompiled);
       method = Emit(&arena, &code_allocator, codegen.get(), compiler_driver, code_item);
@@ -1138,6 +1141,8 @@
 
   ArenaAllocator arena(Runtime::Current()->GetJitArenaPool());
   CodeVectorAllocator code_allocator(&arena);
+  VariableSizedHandleScope handles(self);
+
   std::unique_ptr<CodeGenerator> codegen;
   {
     // Go to native so that we don't block GC during compilation.
@@ -1154,7 +1159,8 @@
                    *dex_file,
                    dex_cache,
                    method,
-                   osr));
+                   osr,
+                   &handles));
     if (codegen.get() == nullptr) {
       return false;
     }
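
The restructuring pins down a subtle lifetime requirement: the handle scope must now outlive TryCompile(), because HSharpening stores the addresses of handle slots (see the sharpening change below), so the scope is created by each caller and threaded down. In outline, the AOT path becomes (a condensed restatement of the code above, not new behavior):

    ScopedObjectAccess soa(Thread::Current());
    VariableSizedHandleScope handles(soa.Self());     // owns handles across the compile
    ScopedThreadSuspension sts(soa.Self(), kNative);  // don't block GC while compiling
    codegen.reset(TryCompile(/* ... */, &handles));
    // `handles` (and any class roots sharpening created) stays valid until here.
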
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index daf160a..bbbb1a1 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -151,7 +151,7 @@
 
   bool is_in_dex_cache = false;
   bool is_in_boot_image = false;
-  HLoadClass::LoadKind desired_load_kind;
+  HLoadClass::LoadKind desired_load_kind = static_cast<HLoadClass::LoadKind>(-1);
   uint64_t address = 0u;  // Class or dex cache element address.
   {
     ScopedObjectAccess soa(Thread::Current());
@@ -190,18 +190,19 @@
           // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
           desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
           address = reinterpret_cast64<uint64_t>(klass);
+        } else if (is_in_dex_cache) {
+          desired_load_kind = HLoadClass::LoadKind::kJitTableAddress;
+          // We store in the address field the location of the stack reference maintained
+          // by the handle. We do this now so that code generation does not need to figure
+          // out which class loader to use.
+          address = reinterpret_cast<uint64_t>(handles_->NewHandle(klass).GetReference());
         } else {
-          // Note: If the class is not in the dex cache or isn't initialized, the
-          // instruction needs environment and will not be inlined across dex files.
-          // Within a dex file, the slow-path helper loads the correct class and
-          // inlined frames are used correctly for OOM stack trace.
-          // TODO: Write a test for this. Bug: 29416588
-          desired_load_kind = HLoadClass::LoadKind::kDexCacheAddress;
-          void* dex_cache_element_address = &dex_cache->GetResolvedTypes()[type_index.index_];
-          address = reinterpret_cast64<uint64_t>(dex_cache_element_address);
+          // Class not loaded yet. Fall back to the dex cache.
+          // TODO(ngeoffray): Generate HDeoptimize instead.
+          desired_load_kind = HLoadClass::LoadKind::kDexCacheViaMethod;
         }
-        // AOT app compilation. Check if the class is in the boot image.
       } else if (is_in_boot_image && !codegen_->GetCompilerOptions().GetCompilePic()) {
+        // AOT app compilation. Check if the class is in the boot image.
         desired_load_kind = HLoadClass::LoadKind::kBootImageAddress;
         address = reinterpret_cast64<uint64_t>(klass);
       } else {
@@ -215,6 +216,7 @@
       }
     }
   }
+  DCHECK_NE(desired_load_kind, static_cast<HLoadClass::LoadKind>(-1));
 
   if (is_in_boot_image) {
     load_class->MarkInBootImage();
@@ -245,7 +247,7 @@
       load_class->SetLoadKindWithTypeReference(load_kind, dex_file, type_index);
       break;
     case HLoadClass::LoadKind::kBootImageAddress:
-    case HLoadClass::LoadKind::kDexCacheAddress:
+    case HLoadClass::LoadKind::kJitTableAddress:
       DCHECK_NE(address, 0u);
       load_class->SetLoadKindWithAddress(load_kind, address);
       break;
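
The sentinel initialization at the top of the function pairs with the new DCHECK at the bottom: desired_load_kind starts at an impossible enum value so that any branch that forgets to assign it trips the debug check instead of silently consuming an uninitialized value. The pattern in isolation (a sketch, assuming -1 maps to no real enumerator):

    HLoadClass::LoadKind kind = static_cast<HLoadClass::LoadKind>(-1);
    // ... every branch is expected to assign `kind` ...
    DCHECK_NE(kind, static_cast<HLoadClass::LoadKind>(-1));  // catches a missed path
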
diff --git a/compiler/optimizing/sharpening.h b/compiler/optimizing/sharpening.h
index d35ae66..7418954 100644
--- a/compiler/optimizing/sharpening.h
+++ b/compiler/optimizing/sharpening.h
@@ -35,11 +35,13 @@
   HSharpening(HGraph* graph,
               CodeGenerator* codegen,
               const DexCompilationUnit& compilation_unit,
-              CompilerDriver* compiler_driver)
+              CompilerDriver* compiler_driver,
+              VariableSizedHandleScope* handles)
       : HOptimization(graph, kSharpeningPassName),
         codegen_(codegen),
         compilation_unit_(compilation_unit),
-        compiler_driver_(compiler_driver) { }
+        compiler_driver_(compiler_driver),
+        handles_(handles) { }
 
   void Run() OVERRIDE;
 
@@ -53,6 +55,7 @@
   CodeGenerator* codegen_;
   const DexCompilationUnit& compilation_unit_;
   CompilerDriver* compiler_driver_;
+  VariableSizedHandleScope* handles_;
 };
 
 }  // namespace art
diff --git a/compiler/utils/arm/assembler_arm_vixl.cc b/compiler/utils/arm/assembler_arm_vixl.cc
index c35c393..1614d04 100644
--- a/compiler/utils/arm/assembler_arm_vixl.cc
+++ b/compiler/utils/arm/assembler_arm_vixl.cc
@@ -455,5 +455,27 @@
   B(ne, label);
 }
 
+void ArmVIXLMacroAssembler::B(vixl32::Label* label) {
+  if (!label->IsBound()) {
+    // Try to use 16-bit T2 encoding of B instruction.
+    DCHECK(OutsideITBlock());
+    AssemblerAccurateScope ass(this,
+                               kMaxInstructionSizeInBytes,
+                               CodeBufferCheckScope::kMaximumSize);
+    b(al, Narrow, label);
+    AddBranchLabel(label);
+    return;
+  }
+  MacroAssembler::B(label);
+}
+
+void ArmVIXLMacroAssembler::B(vixl32::Condition cond, vixl32::Label* label) {
+  // To further reduce the Bcc encoding size and use the 16-bit T1 encoding,
+  // we could pass this function a hint, i.e. far_target=false.
+  // By default this function uses 'EncodingSizeType::Best', which generates the 32-bit T3 encoding.
+  MacroAssembler::B(cond, label);
+}
+
 }  // namespace arm
 }  // namespace art
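
The asymmetry between the two B() overloads is deliberate: for a not-yet-bound (forward) label, the code optimistically emits the 16-bit T2 encoding of B, whose roughly +/-2KB reach covers typical intra-method branches, inside an AssemblerAccurateScope so the instruction size is pinned, and registers the label for later fixup. Once a label is bound the distance is known, so the call simply defers to VIXL's MacroAssembler::B, which can pick the best encoding itself.
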
diff --git a/compiler/utils/arm/assembler_arm_vixl.h b/compiler/utils/arm/assembler_arm_vixl.h
index b4a4abc..17cf106 100644
--- a/compiler/utils/arm/assembler_arm_vixl.h
+++ b/compiler/utils/arm/assembler_arm_vixl.h
@@ -54,6 +54,77 @@
   void CompareAndBranchIfNonZero(vixl32::Register rn,
                                  vixl32::Label* label,
                                  bool is_far_target = true);
+
+  // In T32 some of the instructions (add, mov, etc) outside an IT block
+  // have only 32-bit encodings. But there are 16-bit flag setting
+  // versions of these instructions (adds, movs, etc). In most of the
+  // cases in ART we don't care if the instructions keep flags or not;
+  // thus we can benefit from smaller code size.
+  // VIXL will never generate flag setting versions (for example, adds
+  // for Add macro instruction) unless vixl32::DontCare option is
+  // explicitly specified. That's why we introduce wrappers to use
+  // DontCare option by default.
+#define WITH_FLAGS_DONT_CARE_RD_RN_OP(func_name) \
+  void (func_name)(vixl32::Register rd, vixl32::Register rn, const vixl32::Operand& operand) { \
+    MacroAssembler::func_name(vixl32::DontCare, rd, rn, operand); \
+  } \
+  using MacroAssembler::func_name
+
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Adc);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Sub);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Sbc);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Rsb);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Rsc);
+
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Eor);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Orr);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Orn);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(And);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Bic);
+
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Asr);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Lsr);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Lsl);
+  WITH_FLAGS_DONT_CARE_RD_RN_OP(Ror);
+
+#undef WITH_FLAGS_DONT_CARE_RD_RN_OP
+
+#define WITH_FLAGS_DONT_CARE_RD_OP(func_name) \
+  void (func_name)(vixl32::Register rd, const vixl32::Operand& operand) { \
+    MacroAssembler::func_name(vixl32::DontCare, rd, operand); \
+  } \
+  using MacroAssembler::func_name
+
+  WITH_FLAGS_DONT_CARE_RD_OP(Mvn);
+  WITH_FLAGS_DONT_CARE_RD_OP(Mov);
+
+#undef WITH_FLAGS_DONT_CARE_RD_OP
+
+  // The following two functions don't fall into the above categories. Overload them separately.
+  void Rrx(vixl32::Register rd, vixl32::Register rn) {
+    MacroAssembler::Rrx(vixl32::DontCare, rd, rn);
+  }
+  using MacroAssembler::Rrx;
+
+  void Mul(vixl32::Register rd, vixl32::Register rn, vixl32::Register rm) {
+    MacroAssembler::Mul(vixl32::DontCare, rd, rn, rm);
+  }
+  using MacroAssembler::Mul;
+
+  // TODO: Remove when MacroAssembler::Add(FlagsUpdate, Condition, Register, Register, Operand)
+  // makes the right decision about 16-bit encodings.
+  void Add(vixl32::Register rd, vixl32::Register rn, const vixl32::Operand& operand) {
+    if (rd.Is(rn)) {
+      MacroAssembler::Add(rd, rn, operand);
+    } else {
+      MacroAssembler::Add(vixl32::DontCare, rd, rn, operand);
+    }
+  }
+  using MacroAssembler::Add;
+
+  // These interfaces try to use 16-bit T2 encoding of B instruction.
+  void B(vixl32::Label* label);
+  void B(vixl32::Condition cond, vixl32::Label* label);
 };
 
 class ArmVIXLAssembler FINAL : public Assembler {
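
With these wrappers in scope, call sites keep the ordinary three-operand form and still get the short encodings where they are legal. A usage sketch (assuming the usual register aliases are in scope):

    __ Add(r0, r0, r1);  // may emit 16-bit "adds" outside an IT block
    __ Mov(r1, 0x2a);    // may emit 16-bit "movs"
    __ Mul(r0, r0, r1);  // may emit 16-bit "muls" when operands allow

Flags are clobbered in exchange for code size, which the comment above deems acceptable because ART mostly does not care whether these instructions set flags.
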
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index b29974c..3dcad6a 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -3252,6 +3252,9 @@
       CHECK_EQ(kMipsDoublewordSize, size) << dst;
       LoadDFromOffset(dst.AsFRegister(), src_register, src_offset);
     }
+  } else if (dst.IsDRegister()) {
+    CHECK_EQ(kMipsDoublewordSize, size) << dst;
+    LoadDFromOffset(dst.AsOverlappingDRegisterLow(), src_register, src_offset);
   }
 }
 
@@ -3396,6 +3399,9 @@
       CHECK_EQ(kMipsDoublewordSize, size);
       StoreDToOffset(src.AsFRegister(), SP, dest.Int32Value());
     }
+  } else if (src.IsDRegister()) {
+    CHECK_EQ(kMipsDoublewordSize, size);
+    StoreDToOffset(src.AsOverlappingDRegisterLow(), SP, dest.Int32Value());
   }
 }
 
diff --git a/dex2oat/dex2oat.cc b/dex2oat/dex2oat.cc
index 264be99..8fb4040 100644
--- a/dex2oat/dex2oat.cc
+++ b/dex2oat/dex2oat.cc
@@ -518,6 +518,7 @@
       runtime_(nullptr),
       thread_count_(sysconf(_SC_NPROCESSORS_CONF)),
       start_ns_(NanoTime()),
+      start_cputime_ns_(ProcessCpuNanoTime()),
       oat_fd_(-1),
       input_vdex_fd_(-1),
       output_vdex_fd_(-1),
@@ -2595,7 +2596,9 @@
     // Note: when creation of a runtime fails, e.g., when trying to compile an app but when there
     //       is no image, there won't be a Runtime::Current().
     // Note: driver creation can fail when loading an invalid dex file.
-    LOG(INFO) << "dex2oat took " << PrettyDuration(NanoTime() - start_ns_)
+    LOG(INFO) << "dex2oat took "
+              << PrettyDuration(NanoTime() - start_ns_)
+              << " (" << PrettyDuration(ProcessCpuNanoTime() - start_cputime_ns_) << " cpu)"
               << " (threads: " << thread_count_ << ") "
               << ((Runtime::Current() != nullptr && driver_ != nullptr) ?
                   driver_->GetMemoryUsageString(kIsDebugBuild || VLOG_IS_ON(compiler)) :
@@ -2643,6 +2646,7 @@
 
   size_t thread_count_;
   uint64_t start_ns_;
+  uint64_t start_cputime_ns_;
   std::unique_ptr<WatchDog> watchdog_;
   std::vector<std::unique_ptr<File>> oat_files_;
   std::vector<std::unique_ptr<File>> vdex_files_;
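
ProcessCpuNanoTime() reports per-process CPU time, so the new log line separates wall-clock duration from actual compute and makes the effect of the thread count visible. On Linux such a helper presumably boils down to something like this (a sketch, not ART's exact implementation):

    #include <cstdint>
    #include <ctime>

    uint64_t ProcessCpuNanoTimeSketch() {
      timespec ts;
      clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);  // CPU time across all threads
      return static_cast<uint64_t>(ts.tv_sec) * UINT64_C(1000000000) + ts.tv_nsec;
    }
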
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 135b074..7437774 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -21,7 +21,7 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVES 96
 #define FRAME_SIZE_SAVE_REFS_ONLY 48
-#define FRAME_SIZE_SAVE_REFS_AND_ARGS 80
+#define FRAME_SIZE_SAVE_REFS_AND_ARGS 112
 #define FRAME_SIZE_SAVE_EVERYTHING 256
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
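
The jump from 80 to 112 bytes is exactly the cost of the enlarged spill set documented in quick_entrypoints_mips.S below: 14 GPRs ($a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra) and 12 FPRs ($f8-$f19) at 4 bytes each, plus one word of padding and the Method* slot. As an illustrative compile-time check:

    static_assert((14 + 12) * 4 + 4 /* padding */ + 4 /* ArtMethod* */ == 112,
                  "SaveRefsAndArgs frame size must match the assembly layout");
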
diff --git a/runtime/arch/mips/context_mips.cc b/runtime/arch/mips/context_mips.cc
index 375a03a..98ed5e6 100644
--- a/runtime/arch/mips/context_mips.cc
+++ b/runtime/arch/mips/context_mips.cc
@@ -75,11 +75,21 @@
   gprs_[A1] = nullptr;
   gprs_[A2] = nullptr;
   gprs_[A3] = nullptr;
+  gprs_[T0] = nullptr;
+  gprs_[T1] = nullptr;
 
+  fprs_[F8] = nullptr;
+  fprs_[F9] = nullptr;
+  fprs_[F10] = nullptr;
+  fprs_[F11] = nullptr;
   fprs_[F12] = nullptr;
   fprs_[F13] = nullptr;
   fprs_[F14] = nullptr;
   fprs_[F15] = nullptr;
+  fprs_[F16] = nullptr;
+  fprs_[F17] = nullptr;
+  fprs_[F18] = nullptr;
+  fprs_[F19] = nullptr;
 }
 
 extern "C" NO_RETURN void art_quick_do_long_jump(uint32_t*, uint32_t*);
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index 34e34b4..3e8cdc9 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -167,50 +167,60 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs).
-     * callee-save: $a1-$a3, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      */
 .macro SETUP_SAVE_REFS_AND_ARGS_FRAME_REGISTERS_ONLY
-    addiu  $sp, $sp, -80
-    .cfi_adjust_cfa_offset 80
+    addiu  $sp, $sp, -112
+    .cfi_adjust_cfa_offset 112
 
     // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 80)
+#if (FRAME_SIZE_SAVE_REFS_AND_ARGS != 112)
 #error "FRAME_SIZE_SAVE_REFS_AND_ARGS(MIPS) size not as expected."
 #endif
 
-    sw     $ra, 76($sp)
-    .cfi_rel_offset 31, 76
-    sw     $s8, 72($sp)
-    .cfi_rel_offset 30, 72
-    sw     $gp, 68($sp)
-    .cfi_rel_offset 28, 68
-    sw     $s7, 64($sp)
-    .cfi_rel_offset 23, 64
-    sw     $s6, 60($sp)
-    .cfi_rel_offset 22, 60
-    sw     $s5, 56($sp)
-    .cfi_rel_offset 21, 56
-    sw     $s4, 52($sp)
-    .cfi_rel_offset 20, 52
-    sw     $s3, 48($sp)
-    .cfi_rel_offset 19, 48
-    sw     $s2, 44($sp)
-    .cfi_rel_offset 18, 44
-    sw     $a3, 40($sp)
-    .cfi_rel_offset 7, 40
-    sw     $a2, 36($sp)
-    .cfi_rel_offset 6, 36
-    sw     $a1, 32($sp)
-    .cfi_rel_offset 5, 32
-    SDu $f14, $f15, 24, $sp, $t0
-    SDu $f12, $f13, 16, $sp, $t0
+    sw     $ra, 108($sp)
+    .cfi_rel_offset 31, 108
+    sw     $s8, 104($sp)
+    .cfi_rel_offset 30, 104
+    sw     $gp, 100($sp)
+    .cfi_rel_offset 28, 100
+    sw     $s7, 96($sp)
+    .cfi_rel_offset 23, 96
+    sw     $s6, 92($sp)
+    .cfi_rel_offset 22, 92
+    sw     $s5, 88($sp)
+    .cfi_rel_offset 21, 88
+    sw     $s4, 84($sp)
+    .cfi_rel_offset 20, 84
+    sw     $s3, 80($sp)
+    .cfi_rel_offset 19, 80
+    sw     $s2, 76($sp)
+    .cfi_rel_offset 18, 76
+    sw     $t1, 72($sp)
+    .cfi_rel_offset 9, 72
+    sw     $t0, 68($sp)
+    .cfi_rel_offset 8, 68
+    sw     $a3, 64($sp)
+    .cfi_rel_offset 7, 64
+    sw     $a2, 60($sp)
+    .cfi_rel_offset 6, 60
+    sw     $a1, 56($sp)
+    .cfi_rel_offset 5, 56
+    SDu $f18, $f19, 48, $sp, $t8
+    SDu $f16, $f17, 40, $sp, $t8
+    SDu $f14, $f15, 32, $sp, $t8
+    SDu $f12, $f13, 24, $sp, $t8
+    SDu $f10, $f11, 16, $sp, $t8
+    SDu $f8, $f9, 8, $sp, $t8
     # bottom will hold Method*
 .endm
 
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs). Restoration assumes non-moving GC.
-     * callee-save: $a1-$a3, $f12-$f15, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      * Clobbers $t0 and $sp
      * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
      * Reserves FRAME_SIZE_SAVE_REFS_AND_ARGS + ARG_SLOT_SIZE bytes on the stack
@@ -229,7 +239,8 @@
     /*
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kSaveRefsAndArgs). Restoration assumes non-moving GC.
-     * callee-save: $a1-$a3, $f12-$f15, $s2-$s8 + $gp + $ra, 12 total + 3 words padding + method*
+     * callee-save: $a1-$a3, $t0-$t1, $s2-$s8, $gp, $ra, $f8-$f19
+     *              (26 total + 1 word padding + method*)
      * Clobbers $sp
      * Use $a0 as the Method* and loads it into bottom of stack.
      * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
@@ -246,34 +257,42 @@
 .macro RESTORE_SAVE_REFS_AND_ARGS_FRAME
     addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
     .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
-    lw     $ra, 76($sp)
+    lw     $ra, 108($sp)
     .cfi_restore 31
-    lw     $s8, 72($sp)
+    lw     $s8, 104($sp)
     .cfi_restore 30
-    lw     $gp, 68($sp)
+    lw     $gp, 100($sp)
     .cfi_restore 28
-    lw     $s7, 64($sp)
+    lw     $s7, 96($sp)
     .cfi_restore 23
-    lw     $s6, 60($sp)
+    lw     $s6, 92($sp)
     .cfi_restore 22
-    lw     $s5, 56($sp)
+    lw     $s5, 88($sp)
     .cfi_restore 21
-    lw     $s4, 52($sp)
+    lw     $s4, 84($sp)
     .cfi_restore 20
-    lw     $s3, 48($sp)
+    lw     $s3, 80($sp)
     .cfi_restore 19
-    lw     $s2, 44($sp)
+    lw     $s2, 76($sp)
     .cfi_restore 18
-    lw     $a3, 40($sp)
+    lw     $t1, 72($sp)
+    .cfi_restore 9
+    lw     $t0, 68($sp)
+    .cfi_restore 8
+    lw     $a3, 64($sp)
     .cfi_restore 7
-    lw     $a2, 36($sp)
+    lw     $a2, 60($sp)
     .cfi_restore 6
-    lw     $a1, 32($sp)
+    lw     $a1, 56($sp)
     .cfi_restore 5
-    LDu $f14, $f15, 24, $sp, $t1
-    LDu $f12, $f13, 16, $sp, $t1
-    addiu  $sp, $sp, 80           # pop frame
-    .cfi_adjust_cfa_offset -80
+    LDu $f18, $f19, 48, $sp, $t8
+    LDu $f16, $f17, 40, $sp, $t8
+    LDu $f14, $f15, 32, $sp, $t8
+    LDu $f12, $f13, 24, $sp, $t8
+    LDu $f10, $f11, 16, $sp, $t8
+    LDu $f8, $f9, 8, $sp, $t8
+    addiu  $sp, $sp, 112          # pop frame
+    .cfi_adjust_cfa_offset -112
 .endm
 
     /*
@@ -824,30 +843,56 @@
 INVOKE_TRAMPOLINE art_quick_invoke_super_trampoline_with_access_check, artInvokeSuperTrampolineWithAccessCheck
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
-.macro LOAD_WORD_TO_REG reg, next_arg, index, label
+// Each of the following macros occupies 16 bytes (at most four instructions, padded via .balign).
+// They are used to build indexable "tables" of code.
+
+.macro LOAD_WORD_TO_REG reg, next_arg, index_reg, label
     lw    $\reg, -4($\next_arg)   # next_arg points to argument after the current one (offset is 4)
     b     \label
-    addiu $\index, 1
+    addiu $\index_reg, 16
+    .balign 16
 .endm
 
-.macro LOAD_LONG_TO_REG reg1, reg2, next_arg, index, label
+.macro LOAD_LONG_TO_REG reg1, reg2, next_arg, index_reg, next_index, label
     lw    $\reg1, -8($\next_arg)  # next_arg points to argument after the current one (offset is 8)
     lw    $\reg2, -4($\next_arg)
     b     \label
-    li    $\index, 4              # long can be loaded only to a2_a3 pair so index will be always 4
+    li    $\index_reg, \next_index
+    .balign 16
 .endm
 
-.macro LOAD_FLOAT_TO_REG reg, next_arg, index, label
+.macro LOAD_FLOAT_TO_REG reg, next_arg, index_reg, label
     lwc1  $\reg, -4($\next_arg)   # next_arg points to argument after the current one (offset is 4)
     b     \label
-    addiu $\index, 1
+    addiu $\index_reg, 16
+    .balign 16
 .endm
 
-.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index, tmp, label
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+// LDu expands into 3 instructions for 64-bit FPU, so index_reg cannot be updated here.
+.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index_reg, tmp, label
+    .set reorder                                # force use of the branch delay slot
     LDu  $\reg1, $\reg2, -8, $\next_arg, $\tmp  # next_arg points to argument after the current one
                                                 # (offset is 8)
     b     \label
-    addiu $\index, 1
+    .set noreorder
+    .balign 16
+.endm
+#else
+// LDu expands into 2 instructions for 32-bit FPU, so index_reg is updated here.
+.macro LOAD_DOUBLE_TO_REG reg1, reg2, next_arg, index_reg, tmp, label
+    LDu  $\reg1, $\reg2, -8, $\next_arg, $\tmp  # next_arg points to argument after the current one
+                                                # (offset is 8)
+    b     \label
+    addiu $\index_reg, 16
+    .balign 16
+.endm
+#endif
+
+.macro LOAD_END index_reg, next_index, label
+    b     \label
+    li    $\index_reg, \next_index
+    .balign 16
 .endm
 
 #define SPILL_SIZE    32
@@ -891,61 +936,63 @@
     lw    $gp, 16($fp)          # restore $gp
     lw    $a0, SPILL_SIZE($fp)  # restore ArtMethod*
     lw    $a1, 4($sp)           # a1 = this*
-    addiu $t0, $sp, 8           # t0 = pointer to the current argument (skip ArtMethod* and this*)
-    li    $t3, 2                # t3 = gpr_index = 2 (skip A0 and A1)
-    move  $t4, $zero            # t4 = fp_index = 0
-    lw    $t1, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
+    addiu $t8, $sp, 8           # t8 = pointer to the current argument (skip ArtMethod* and this*)
+    li    $t6, 0                # t6 = gpr_index = 0 (corresponds to A2; A0 and A1 are skipped)
+    li    $t7, 0                # t7 = fp_index = 0
+    lw    $t9, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
                                 # as the $fp is SPILL_SIZE bytes below the $sp on entry)
-    addiu $t1, 1                # t1 = shorty + 1 (skip 1 for return type)
+    addiu $t9, 1                # t9 = shorty + 1 (skip 1 for return type)
+
+    // Load the base addresses of tabInt ... tabDouble.
+    // We will use the register indices (gpr_index, fp_index) to branch.
+    // Note that the indices are scaled by 16, so they can be added to the bases directly.
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+    lapc  $t2, tabInt
+    lapc  $t3, tabLong
+    lapc  $t4, tabSingle
+    lapc  $t5, tabDouble
+#else
+    bltzal $zero, tabBase       # nal
+    addiu $t2, $ra, %lo(tabInt - tabBase)
+tabBase:
+    addiu $t3, $ra, %lo(tabLong - tabBase)
+    addiu $t4, $ra, %lo(tabSingle - tabBase)
+    addiu $t5, $ra, %lo(tabDouble - tabBase)
+#endif
+
 loop:
-    lbu   $t2, 0($t1)           # t2 = shorty[i]
-    beqz  $t2, loopEnd          # finish getting args when shorty[i] == '\0'
-    addiu $t1, 1
+    lbu   $ra, 0($t9)           # ra = shorty[i]
+    beqz  $ra, loopEnd          # finish getting args when shorty[i] == '\0'
+    addiu $t9, 1
 
-    li    $t9, 'J'              # put char 'J' into t9
-    beq   $t9, $t2, isLong      # branch if result type char == 'J'
-    li    $t9, 'D'              # put char 'D' into t9
-    beq   $t9, $t2, isDouble    # branch if result type char == 'D'
-    li    $t9, 'F'              # put char 'F' into t9
-    beq   $t9, $t2, isSingle    # branch if result type char == 'F'
-    addiu $t0, 4                # next_arg = curr_arg + 4 (in branch delay slot,
-                                # for both, int and single)
+    addiu $ra, -'J'
+    beqz  $ra, isLong           # branch if result type char == 'J'
+    addiu $ra, 'J' - 'D'
+    beqz  $ra, isDouble         # branch if result type char == 'D'
+    addiu $ra, 'D' - 'F'
+    beqz  $ra, isSingle         # branch if result type char == 'F'
 
-    li    $t5, 2                                   # skip a0 and a1 (ArtMethod* and this*)
-    bne   $t5, $t3, 1f                             # if (gpr_index == 2)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a2, t0, t3, loop              #   a2 = current argument, gpr_index++
-1:  bne   $t5, $t3, loop                           # else if (gpr_index == 3)
-    nop
-    LOAD_WORD_TO_REG a3, t0, t3, loop              #   a3 = current argument, gpr_index++
+    addu  $ra, $t2, $t6
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
 
 isLong:
-    addiu $t0, 8                                   # next_arg = curr_arg + 8
-    slti  $t5, $t3, 3
-    beqz  $t5, 2f                                  # if (gpr_index < 3)
-    nop
-    LOAD_LONG_TO_REG a2, a3, t0, t3, loop          #   a2_a3 = curr_arg, gpr_index = 4
-2:  b     loop                                     # else
-    li    $t3, 4                                   #   gpr_index = 4
-
-isDouble:
-    addiu $t0, 8                                   # next_arg = curr_arg + 8
-    li    $t5, 0
-    bne   $t5, $t4, 3f                             # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_DOUBLE_TO_REG f12, f13, t0, t4, t9, loop  #   f12_f13 = curr_arg, fp_index++
-3:  bne   $t5, $t4, loop                           # else if (fp_index == 1)
-    nop
-    LOAD_DOUBLE_TO_REG f14, f15, t0, t4, t9, loop  #   f14_f15 = curr_arg, fp_index++
+    addu  $ra, $t3, $t6
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 isSingle:
-    li    $t5, 0
-    bne   $t5, $t4, 4f                             # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_FLOAT_TO_REG f12, t0, t4, loop            #   f12 = curr_arg, fp_index++
-4:  bne   $t5, $t4, loop                           # else if (fp_index == 1)
-    nop
-    LOAD_FLOAT_TO_REG f14, t0, t4, loop            #   f14 = curr_arg, fp_index++
+    addu  $ra, $t4, $t7
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
+
+isDouble:
+    addu  $ra, $t5, $t7
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+    addiu $t7, 16               # fp_index += 16 didn't fit into LOAD_DOUBLE_TO_REG
+#endif
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 loopEnd:
     lw    $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)  # get pointer to the code
@@ -976,6 +1023,38 @@
     SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
     jalr  $zero, $ra
     nop
+
+    // Note that gpr_index is kept within the range of tabInt and tabLong
+    // and fp_index is kept within the range of tabSingle and tabDouble.
+    .balign 16
+tabInt:
+    LOAD_WORD_TO_REG a2, t8, t6, loop             # a2 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a3, t8, t6, loop             # a3 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t0, t8, t6, loop             # t0 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t1, t8, t6, loop             # t1 = current argument, gpr_index += 16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+tabLong:
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 2*16, loop   # a2_a3 = curr_arg, gpr_index = 2*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 4*16, loop   # t0_t1 = curr_arg, gpr_index = 4*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 4*16, loop   # t0_t1 = curr_arg, gpr_index = 4*16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+    LOAD_END t6, 4*16, loop                       # no more GPR args, gpr_index = 4*16
+tabSingle:
+    LOAD_FLOAT_TO_REG f8, t8, t7, loop            # f8 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f10, t8, t7, loop           # f10 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f12, t8, t7, loop           # f12 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f14, t8, t7, loop           # f14 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f16, t8, t7, loop           # f16 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f18, t8, t7, loop           # f18 = curr_arg, fp_index += 16
+    LOAD_END t7, 6*16, loop                       # no more FPR args, fp_index = 6*16
+tabDouble:
+    LOAD_DOUBLE_TO_REG f8, f9, t8, t7, ra, loop   # f8_f9 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f10, f11, t8, t7, ra, loop # f10_f11 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f12, f13, t8, t7, ra, loop # f12_f13 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f14, f15, t8, t7, ra, loop # f14_f15 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f16, f17, t8, t7, ra, loop # f16_f17 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f18, f19, t8, t7, ra, loop # f18_f19 = curr_arg; if FPU32, fp_index += 16
+    LOAD_END t7, 6*16, loop                       # no more FPR args, fp_index = 6*16
 END art_quick_invoke_stub
 
     /*
@@ -1016,64 +1095,63 @@
     addiu $sp, $sp, 16          # restore stack after memcpy
     lw    $gp, 16($fp)          # restore $gp
     lw    $a0, SPILL_SIZE($fp)  # restore ArtMethod*
-    addiu $t0, $sp, 4           # t0 = pointer to the current argument (skip ArtMethod*)
-    li    $t3, 1                # t3 = gpr_index = 1 (skip A0)
-    move  $t4, $zero            # t4 = fp_index = 0
-    lw    $t1, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
+    addiu $t8, $sp, 4           # t8 = pointer to the current argument (skip ArtMethod*)
+    li    $t6, 0                # t6 = gpr_index = 0 (corresponds to A1; A0 is skipped)
+    li    $t7, 0                # t7 = fp_index = 0
+    lw    $t9, 20 + SPILL_SIZE($fp)  # get shorty (20 is offset from the $sp on entry + SPILL_SIZE
                                 # as the $fp is SPILL_SIZE bytes below the $sp on entry)
-    addiu $t1, 1                # t1 = shorty + 1 (skip 1 for return type)
+    addiu $t9, 1                # t9 = shorty + 1 (skip 1 for return type)
+
+    // Load the base addresses of tabIntS ... tabDoubleS.
+    // We will use the register indices (gpr_index, fp_index) to branch.
+    // Note that the indices are scaled by 16, so they can be added to the bases directly.
+#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
+    lapc  $t2, tabIntS
+    lapc  $t3, tabLongS
+    lapc  $t4, tabSingleS
+    lapc  $t5, tabDoubleS
+#else
+    bltzal $zero, tabBaseS      # nal
+    addiu $t2, $ra, %lo(tabIntS - tabBaseS)
+tabBaseS:
+    addiu $t3, $ra, %lo(tabLongS - tabBaseS)
+    addiu $t4, $ra, %lo(tabSingleS - tabBaseS)
+    addiu $t5, $ra, %lo(tabDoubleS - tabBaseS)
+#endif
+
 loopS:
-    lbu   $t2, 0($t1)           # t2 = shorty[i]
-    beqz  $t2, loopEndS         # finish getting args when shorty[i] == '\0'
-    addiu $t1, 1
+    lbu   $ra, 0($t9)           # ra = shorty[i]
+    beqz  $ra, loopEndS         # finish getting args when shorty[i] == '\0'
+    addiu $t9, 1
 
-    li    $t9, 'J'              # put char 'J' into t9
-    beq   $t9, $t2, isLongS     # branch if result type char == 'J'
-    li    $t9, 'D'              # put char 'D' into t9
-    beq   $t9, $t2, isDoubleS   # branch if result type char == 'D'
-    li    $t9, 'F'              # put char 'F' into t9
-    beq   $t9, $t2, isSingleS   # branch if result type char == 'F'
-    addiu $t0, 4                # next_arg = curr_arg + 4 (in branch delay slot,
-                                # for both, int and single)
+    addiu $ra, -'J'
+    beqz  $ra, isLongS          # branch if result type char == 'J'
+    addiu $ra, 'J' - 'D'
+    beqz  $ra, isDoubleS        # branch if result type char == 'D'
+    addiu $ra, 'D' - 'F'
+    beqz  $ra, isSingleS        # branch if result type char == 'F'
 
-    li    $t5, 1                                    # skip a0 (ArtMethod*)
-    bne   $t5, $t3, 1f                              # if (gpr_index == 1)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a1, t0, t3, loopS              #   a1 = current argument, gpr_index++
-1:  bne   $t5, $t3, 2f                              # else if (gpr_index == 2)
-    addiu $t5, 1
-    LOAD_WORD_TO_REG a2, t0, t3, loopS              #   a2 = current argument, gpr_index++
-2:  bne   $t5, $t3, loopS                           # else if (gpr_index == 3)
-    nop
-    LOAD_WORD_TO_REG a3, t0, t3, loopS              #   a3 = current argument, gpr_index++
+    addu  $ra, $t2, $t6
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
 
 isLongS:
-    addiu $t0, 8                                    # next_arg = curr_arg + 8
-    slti  $t5, $t3, 3
-    beqz  $t5, 3f                                   # if (gpr_index < 3)
-    nop
-    LOAD_LONG_TO_REG a2, a3, t0, t3, loopS          #   a2_a3 = curr_arg, gpr_index = 4
-3:  b     loopS                                     # else
-    li    $t3, 4                                    #   gpr_index = 4
-
-isDoubleS:
-    addiu $t0, 8                                    # next_arg = curr_arg + 8
-    li    $t5, 0
-    bne   $t5, $t4, 4f                              # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_DOUBLE_TO_REG f12, f13, t0, t4, t9, loopS  #   f12_f13 = curr_arg, fp_index++
-4:  bne   $t5, $t4, loopS                           # else if (fp_index == 1)
-    nop
-    LOAD_DOUBLE_TO_REG f14, f15, t0, t4, t9, loopS  #   f14_f15 = curr_arg, fp_index++
+    addu  $ra, $t3, $t6
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 isSingleS:
-    li    $t5, 0
-    bne   $t5, $t4, 5f                              # if (fp_index == 0)
-    addiu $t5, 1
-    LOAD_FLOAT_TO_REG f12, t0, t4, loopS            #   f12 = curr_arg, fp_index++
-5:  bne   $t5, $t4, loopS                           # else if (fp_index == 1)
-    nop
-    LOAD_FLOAT_TO_REG f14, t0, t4, loopS            #   f14 = curr_arg, fp_index++
+    addu  $ra, $t4, $t7
+    jalr  $zero, $ra
+    addiu $t8, 4                # next_arg = curr_arg + 4
+
+isDoubleS:
+    addu  $ra, $t5, $t7
+#if defined(__mips_isa_rev) && __mips_isa_rev > 2
+    addiu $t7, 16               # fp_index += 16 didn't fit into LOAD_DOUBLE_TO_REG
+#endif
+    jalr  $zero, $ra
+    addiu $t8, 8                # next_arg = curr_arg + 8
 
 loopEndS:
     lw    $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)  # get pointer to the code
@@ -1104,6 +1182,40 @@
     SDu   $f0, $f1, 0, $t0, $t1 # store floating point result
     jalr  $zero, $ra
     nop
+
+    // Note that gpr_index is kept within the range of tabIntS and tabLongS
+    // and fp_index is kept within the range of tabSingleS and tabDoubleS.
+    .balign 16
+tabIntS:
+    LOAD_WORD_TO_REG a1, t8, t6, loopS             # a1 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a2, t8, t6, loopS             # a2 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG a3, t8, t6, loopS             # a3 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t0, t8, t6, loopS             # t0 = current argument, gpr_index += 16
+    LOAD_WORD_TO_REG t1, t8, t6, loopS             # t1 = current argument, gpr_index += 16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+tabLongS:
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 3*16, loopS   # a2_a3 = curr_arg, gpr_index = 3*16
+    LOAD_LONG_TO_REG a2, a3, t8, t6, 3*16, loopS   # a2_a3 = curr_arg, gpr_index = 3*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 5*16, loopS   # t0_t1 = curr_arg, gpr_index = 5*16
+    LOAD_LONG_TO_REG t0, t1, t8, t6, 5*16, loopS   # t0_t1 = curr_arg, gpr_index = 5*16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+    LOAD_END t6, 5*16, loopS                       # no more GPR args, gpr_index = 5*16
+tabSingleS:
+    LOAD_FLOAT_TO_REG f8, t8, t7, loopS            # f8 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f10, t8, t7, loopS           # f10 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f12, t8, t7, loopS           # f12 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f14, t8, t7, loopS           # f14 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f16, t8, t7, loopS           # f16 = curr_arg, fp_index += 16
+    LOAD_FLOAT_TO_REG f18, t8, t7, loopS           # f18 = curr_arg, fp_index += 16
+    LOAD_END t7, 6*16, loopS                       # no more FPR args, fp_index = 6*16
+tabDoubleS:
+    LOAD_DOUBLE_TO_REG f8, f9, t8, t7, ra, loopS   # f8_f9 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f10, f11, t8, t7, ra, loopS # f10_f11 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f12, f13, t8, t7, ra, loopS # f12_f13 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f14, f15, t8, t7, ra, loopS # f14_f15 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f16, f17, t8, t7, ra, loopS # f16_f17 = curr_arg; if FPU32, fp_index += 16
+    LOAD_DOUBLE_TO_REG f18, f19, t8, t7, ra, loopS # f18_f19 = curr_arg; if FPU32, fp_index += 16
+    LOAD_END t7, 6*16, loopS                       # no more FPR args, fp_index = 6*16
 END art_quick_invoke_static_stub
 
 #undef SPILL_SIZE
@@ -1886,9 +1998,9 @@
     la      $t9, artQuickProxyInvokeHandler
     jalr    $t9                         # (Method* proxy method, receiver, Thread*, SP)
     addiu   $a3, $sp, ARG_SLOT_SIZE     # pass $sp (remove arg slots)
-    lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
+    lw      $t7, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    bnez    $t0, 1f
+    bnez    $t7, 1f
     # don't care if $v0 and/or $v1 are modified, when exception branch taken
     MTD     $v0, $v1, $f0, $f1          # move float value to return value
     jalr    $zero, $ra
@@ -1900,25 +2012,25 @@
     /*
      * Called to resolve an imt conflict.
      * a0 is the conflict ArtMethod.
-     * t0 is a hidden argument that holds the target interface method's dex method index.
+     * t7 is a hidden argument that holds the target interface method's dex method index.
      *
-     * Note that this stub writes to a0, t0 and t1.
+     * Note that this stub writes to a0, t7 and t8.
      */
 ENTRY art_quick_imt_conflict_trampoline
-    lw      $t1, 0($sp)                                      # Load referrer.
-    lw      $t1, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t1) # Load dex cache methods array.
-    sll     $t0, $t0, POINTER_SIZE_SHIFT                     # Calculate offset.
-    addu    $t0, $t1, $t0                                    # Add offset to base.
-    lw      $t0, 0($t0)                                      # Load interface method.
+    lw      $t8, 0($sp)                                      # Load referrer.
+    lw      $t8, ART_METHOD_DEX_CACHE_METHODS_OFFSET_32($t8) # Load dex cache methods array.
+    sll     $t7, $t7, POINTER_SIZE_SHIFT                     # Calculate offset.
+    addu    $t7, $t8, $t7                                    # Add offset to base.
+    lw      $t7, 0($t7)                                      # Load interface method.
     lw      $a0, ART_METHOD_JNI_OFFSET_32($a0)               # Load ImtConflictTable.
 
 .Limt_table_iterate:
-    lw      $t1, 0($a0)                                      # Load next entry in ImtConflictTable.
+    lw      $t8, 0($a0)                                      # Load next entry in ImtConflictTable.
     # Branch if found.
-    beq     $t1, $t0, .Limt_table_found
+    beq     $t8, $t7, .Limt_table_found
     nop
     # If the entry is null, the interface method is not in the ImtConflictTable.
-    beqz    $t1, .Lconflict_trampoline
+    beqz    $t8, .Lconflict_trampoline
     nop
     # Iterate over the entries of the ImtConflictTable.
     b       .Limt_table_iterate
@@ -1928,7 +2040,7 @@
     # We successfully hit an entry in the table. Load the target method and jump to it.
     lw      $a0, __SIZEOF_POINTER__($a0)
     lw      $t9, ART_METHOD_QUICK_CODE_OFFSET_32($a0)
-    jr      $t9
+    jalr    $zero, $t9
     nop
 
 .Lconflict_trampoline:
@@ -1972,7 +2084,7 @@
     # The result of the call is:
     # v0: ptr to native code, 0 on error.
     # v1: ptr to the bottom of the used area of the alloca, can restore stack till here.
-    beq     $v0, $zero, 1f         # check entry error
+    beq     $v0, $zero, 2f         # check entry error
     move    $t9, $v0               # save the code ptr
     move    $sp, $v1               # release part of the alloca
 
@@ -1980,10 +2092,22 @@
     lw      $a0,   0($sp)
     lw      $a1,   4($sp)
     lw      $a2,   8($sp)
-
-    # Load FPRs the same as GPRs. Look at BuildNativeCallFrameStateMachine.
-    jalr    $t9                    # native call
     lw      $a3,  12($sp)
+
+    # artQuickGenericJniTrampoline sets bit 0 of the native code address to 1
+    # when the first two arguments are both single precision floats. This lets
+    # us extract them properly from the stack and load into floating point
+    # registers.
+    MTD     $a0, $a1, $f12, $f13
+    andi    $t0, $t9, 1
+    xor     $t9, $t9, $t0
+    bnez    $t0, 1f
+    mtc1    $a1, $f14
+    MTD     $a2, $a3, $f14, $f15
+
+1:
+    jalr    $t9                    # native call
+    nop
     addiu   $sp, $sp, 16           # remove arg slots
 
     move    $gp, $s3               # restore $gp from $s3
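
Before calling through $t9, the stub strips the flag it received in bit 0 of the code address. The same untagging step, sketched in C++ (the helper name is illustrative, not an ART function):

    #include <cstdint>

    // Mirrors the andi/xor/bnez sequence: extract bit 0, clear it from the
    // address, and report whether the "two leading floats" case applies.
    inline uintptr_t UntagNativeCode(uintptr_t tagged, bool* two_float_args) {
      const uintptr_t tag = tagged & 1u;  // andi $t0, $t9, 1
      *two_float_args = (tag != 0u);      // bnez $t0, 1f
      return tagged ^ tag;                // xor  $t9, $t9, $t0
    }
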
@@ -1999,18 +2123,18 @@
     s.d     $f0, 16($sp)           # pass result_f
 
     lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
-    bne     $t0, $zero, 1f         # check for pending exceptions
+    bne     $t0, $zero, 2f         # check for pending exceptions
 
     move    $sp, $s8               # tear down the alloca
 
-    # tear dpown the callee-save frame
+    # tear down the callee-save frame
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
 
     MTD     $v0, $v1, $f0, $f1     # move float value to return value
     jalr    $zero, $ra
     nop
 
-1:
+2:
     lw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)
     # This will create a new save-all frame, required by the runtime.
     DELIVER_PENDING_EXCEPTION
@@ -2023,9 +2147,9 @@
     la      $t9, artQuickToInterpreterBridge
     jalr    $t9                                 # (Method* method, Thread*, SP)
     addiu   $a2, $sp, ARG_SLOT_SIZE             # pass $sp (remove arg slots)
-    lw      $t0, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
+    lw      $t7, THREAD_EXCEPTION_OFFSET(rSELF) # load Thread::Current()->exception_
     RESTORE_SAVE_REFS_AND_ARGS_FRAME
-    bnez    $t0, 1f
+    bnez    $t7, 1f
     # don't care if $v0 and/or $v1 are modified, when exception branch taken
     MTD     $v0, $v1, $f0, $f1                  # move float value to return value
     jalr    $zero, $ra
diff --git a/runtime/arch/mips/quick_method_frame_info_mips.h b/runtime/arch/mips/quick_method_frame_info_mips.h
index 90e7b20..6f16352 100644
--- a/runtime/arch/mips/quick_method_frame_info_mips.h
+++ b/runtime/arch/mips/quick_method_frame_info_mips.h
@@ -26,12 +26,13 @@
 namespace mips {
 
 static constexpr uint32_t kMipsCalleeSaveAlwaysSpills =
-    (1 << art::mips::RA);
+    (1u << art::mips::RA);
 static constexpr uint32_t kMipsCalleeSaveRefSpills =
     (1 << art::mips::S2) | (1 << art::mips::S3) | (1 << art::mips::S4) | (1 << art::mips::S5) |
     (1 << art::mips::S6) | (1 << art::mips::S7) | (1 << art::mips::GP) | (1 << art::mips::FP);
 static constexpr uint32_t kMipsCalleeSaveArgSpills =
-    (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3);
+    (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3) | (1 << art::mips::T0) |
+    (1 << art::mips::T1);
 static constexpr uint32_t kMipsCalleeSaveAllSpills =
     (1 << art::mips::S0) | (1 << art::mips::S1);
 static constexpr uint32_t kMipsCalleeSaveEverythingSpills =
@@ -44,11 +45,13 @@
 static constexpr uint32_t kMipsCalleeSaveFpAlwaysSpills = 0;
 static constexpr uint32_t kMipsCalleeSaveFpRefSpills = 0;
 static constexpr uint32_t kMipsCalleeSaveFpArgSpills =
-    (1 << art::mips::F12) | (1 << art::mips::F13) | (1 << art::mips::F14) | (1 << art::mips::F15);
+    (1 << art::mips::F8) | (1 << art::mips::F9) | (1 << art::mips::F10) | (1 << art::mips::F11) |
+    (1 << art::mips::F12) | (1 << art::mips::F13) | (1 << art::mips::F14) | (1 << art::mips::F15) |
+    (1 << art::mips::F16) | (1 << art::mips::F17) | (1 << art::mips::F18) | (1 << art::mips::F19);
 static constexpr uint32_t kMipsCalleeSaveAllFPSpills =
     (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
     (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
-    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
+    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1u << art::mips::F31);
 static constexpr uint32_t kMipsCalleeSaveFpEverythingSpills =
     (1 << art::mips::F0) | (1 << art::mips::F1) | (1 << art::mips::F2) | (1 << art::mips::F3) |
     (1 << art::mips::F4) | (1 << art::mips::F5) | (1 << art::mips::F6) | (1 << art::mips::F7) |
@@ -57,7 +60,7 @@
     (1 << art::mips::F16) | (1 << art::mips::F17) | (1 << art::mips::F18) | (1 << art::mips::F19) |
     (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
     (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
-    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
+    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1u << art::mips::F31);
 
 constexpr uint32_t MipsCalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kMipsCalleeSaveAlwaysSpills | kMipsCalleeSaveRefSpills |
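
These masks feed directly into frame-size computation: each set bit costs one 4-byte slot on MIPS32. A hedged sketch of that derivation (the compiler builtin stands in for ART's POPCOUNT, and the 16-byte stack alignment is assumed):

    #include <cstdint>

    constexpr uint32_t RoundUp(uint32_t x, uint32_t n) {
      return ((x + n - 1u) / n) * n;
    }

    constexpr uint32_t MipsFrameSize(uint32_t core_spills, uint32_t fp_spills) {
      // One 4-byte slot per spilled register, plus one for the ArtMethod*,
      // rounded up to the assumed 16-byte stack alignment.
      return RoundUp((__builtin_popcount(core_spills) +
                      __builtin_popcount(fp_spills) + 1u) * 4u, 16u);
    }
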
diff --git a/runtime/arch/mips/registers_mips.h b/runtime/arch/mips/registers_mips.h
index ae01bd5..555f3f0 100644
--- a/runtime/arch/mips/registers_mips.h
+++ b/runtime/arch/mips/registers_mips.h
@@ -35,9 +35,9 @@
   A1   =  5,
   A2   =  6,
   A3   =  7,
-  T0   =  8,  // Temporaries.
+  T0   =  8,  // Two extra arguments / temporaries.
   T1   =  9,
-  T2   = 10,
+  T2   = 10,  // Temporaries.
   T3   = 11,
   T4   = 12,
   T5   = 13,
@@ -100,7 +100,7 @@
   F29 = 29,
   F30 = 30,
   F31 = 31,
-  FTMP = F8,  // scratch register
+  FTMP = F6,  // scratch register
   kNumberOfFRegisters = 32,
   kNoFRegister = -1,
 };
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 6665897..9e385f8 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -355,7 +355,7 @@
         "lw $a2, 8($sp)\n\t"
         "lw $t9, 12($sp)\n\t"
         "lw $s1, 16($sp)\n\t"
-        "lw $t0, 20($sp)\n\t"
+        "lw $t7, 20($sp)\n\t"
         "addiu $sp, $sp, 24\n\t"
 
         "jalr $t9\n\t"             // Call the stub.
diff --git a/runtime/base/time_utils.cc b/runtime/base/time_utils.cc
index 3e5bac8..57f198d 100644
--- a/runtime/base/time_utils.cc
+++ b/runtime/base/time_utils.cc
@@ -167,6 +167,17 @@
 #endif
 }
 
+uint64_t ProcessCpuNanoTime() {
+#if defined(__linux__)
+  timespec now;
+  clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &now);
+  return static_cast<uint64_t>(now.tv_sec) * UINT64_C(1000000000) + now.tv_nsec;
+#else
+  UNIMPLEMENTED(WARNING);
+  return -1;
+#endif
+}
+
 void NanoSleep(uint64_t ns) {
   timespec tm;
   tm.tv_sec = ns / MsToNs(1000);
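
A minimal usage sketch for the new ProcessCpuNanoTime() (the callback name is a placeholder, not an ART function):

    #include <cstdint>

    uint64_t MeasureProcessCpuMs(void (*do_work)()) {
      const uint64_t start = ProcessCpuNanoTime();
      do_work();
      return (ProcessCpuNanoTime() - start) / 1000000u;  // ns -> ms, as NsToMs() does
    }
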
diff --git a/runtime/base/time_utils.h b/runtime/base/time_utils.h
index 383b52f..dbb8bcd 100644
--- a/runtime/base/time_utils.h
+++ b/runtime/base/time_utils.h
@@ -62,6 +62,9 @@
 // Returns the thread-specific CPU-time clock in nanoseconds or -1 if unavailable.
 uint64_t ThreadCpuNanoTime();
 
+// Returns the process CPU-time clock in nanoseconds or -1 if unavailable.
+uint64_t ProcessCpuNanoTime();
+
 // Converts the given number of nanoseconds to milliseconds.
 static constexpr inline uint64_t NsToMs(uint64_t ns) {
   return ns / 1000 / 1000;
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 319991b..f3a5be2 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -629,13 +629,13 @@
   // Sanity check Class[] and Object[]'s interfaces. GetDirectInterface may cause thread
   // suspension.
   CHECK_EQ(java_lang_Cloneable.Get(),
-           mirror::Class::GetDirectInterface(self, class_array_class, 0));
+           mirror::Class::GetDirectInterface(self, class_array_class.Get(), 0));
   CHECK_EQ(java_io_Serializable.Get(),
-           mirror::Class::GetDirectInterface(self, class_array_class, 1));
+           mirror::Class::GetDirectInterface(self, class_array_class.Get(), 1));
   CHECK_EQ(java_lang_Cloneable.Get(),
-           mirror::Class::GetDirectInterface(self, object_array_class, 0));
+           mirror::Class::GetDirectInterface(self, object_array_class.Get(), 0));
   CHECK_EQ(java_io_Serializable.Get(),
-           mirror::Class::GetDirectInterface(self, object_array_class, 1));
+           mirror::Class::GetDirectInterface(self, object_array_class.Get(), 1));
 
   CHECK_EQ(object_array_string.Get(),
            FindSystemClass(self, GetClassRootDescriptor(kJavaLangStringArrayClass)));
@@ -1981,13 +1981,36 @@
   void Visit(ObjPtr<mirror::ClassLoader> class_loader)
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) OVERRIDE {
     ClassTable* const class_table = class_loader->GetClassTable();
-    if (!done_ && class_table != nullptr && !class_table->Visit(*visitor_)) {
-      // If the visitor ClassTable returns false it means that we don't need to continue.
-      done_ = true;
+    if (!done_ && class_table != nullptr) {
+      DefiningClassLoaderFilterVisitor visitor(class_loader, visitor_);
+      if (!class_table->Visit(visitor)) {
+        // If the visitor ClassTable returns false it means that we don't need to continue.
+        done_ = true;
+      }
     }
   }
 
  private:
+  // Class visitor that limits the class visits from a ClassTable to the classes with
+  // the provided defining class loader. This filter is used to avoid multiple visits
+  // of the same class which can be recorded for multiple initiating class loaders.
+  class DefiningClassLoaderFilterVisitor : public ClassVisitor {
+   public:
+    DefiningClassLoaderFilterVisitor(ObjPtr<mirror::ClassLoader> defining_class_loader,
+                                     ClassVisitor* visitor)
+        : defining_class_loader_(defining_class_loader), visitor_(visitor) { }
+
+    bool operator()(ObjPtr<mirror::Class> klass) OVERRIDE REQUIRES_SHARED(Locks::mutator_lock_) {
+      if (klass->GetClassLoader() != defining_class_loader_) {
+        return true;
+      }
+      return (*visitor_)(klass);
+    }
+
+    ObjPtr<mirror::ClassLoader> const defining_class_loader_;
+    ClassVisitor* const visitor_;
+  };
+
   ClassVisitor* const visitor_;
   // If done is true then we don't need to do any more visiting.
   bool done_;
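
The wrapper pattern above keeps the dedup policy out of ClassTable itself. A self-contained miniature, with plain ints standing in for Class and ClassLoader:

    #include <functional>
    #include <utility>
    #include <vector>

    using Entry = std::pair<int /*class*/, int /*defining_loader*/>;
    using Visitor = std::function<bool(int /*class*/)>;

    // Visit only entries whose defining loader matches, as
    // DefiningClassLoaderFilterVisitor does; returning true means "keep going".
    bool VisitFiltered(const std::vector<Entry>& table, int loader, const Visitor& v) {
      for (const Entry& e : table) {
        if (e.second != loader) {
          continue;  // recorded here only for an initiating loader; skip
        }
        if (!v(e.first)) {
          return false;
        }
      }
      return true;
    }
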
@@ -2474,56 +2497,109 @@
     }
   } else {
     ScopedObjectAccessUnchecked soa(self);
-    ObjPtr<mirror::Class> cp_klass;
-    if (FindClassInBaseDexClassLoader(soa, self, descriptor, hash, class_loader, &cp_klass)) {
-      // The chain was understood. So the value in cp_klass is either the class we were looking
-      // for, or not found.
-      if (cp_klass != nullptr) {
-        return cp_klass.Ptr();
-      }
-      // TODO: We handle the boot classpath loader in FindClassInBaseDexClassLoader. Try to unify
-      //       this and the branch above. TODO: throw the right exception here.
+    ObjPtr<mirror::Class> result_ptr;
+    bool descriptor_equals;
+    bool known_hierarchy =
+        FindClassInBaseDexClassLoader(soa, self, descriptor, hash, class_loader, &result_ptr);
+    if (result_ptr != nullptr) {
+      // The chain was understood and we found the class. We still need to add the class to
+      // the class table to protect against racy programs that try to redefine the path list,
+      // which would change the Class<?> returned for subsequent evaluation of const-class.
+      DCHECK(known_hierarchy);
+      DCHECK(result_ptr->DescriptorEquals(descriptor));
+      descriptor_equals = true;
+    } else {
+      // Either the chain wasn't understood or the class wasn't found.
+      //
+      // If the chain was understood but we did not find the class, let the Java-side
+      // rediscover all this and throw the exception with the right stack trace. Note that
+      // the Java-side could still succeed for racy programs if another thread is actively
+      // modifying the class loader's path list.
 
-      // We'll let the Java-side rediscover all this and throw the exception with the right stack
-      // trace.
-    }
-
-    if (Runtime::Current()->IsAotCompiler()) {
-      // Oops, compile-time, can't run actual class-loader code.
-      ObjPtr<mirror::Throwable> pre_allocated = Runtime::Current()->GetPreAllocatedNoClassDefFoundError();
-      self->SetException(pre_allocated);
-      return nullptr;
-    }
-
-    ScopedLocalRef<jobject> class_loader_object(soa.Env(),
-                                                soa.AddLocalReference<jobject>(class_loader.Get()));
-    std::string class_name_string(DescriptorToDot(descriptor));
-    ScopedLocalRef<jobject> result(soa.Env(), nullptr);
-    {
-      ScopedThreadStateChange tsc(self, kNative);
-      ScopedLocalRef<jobject> class_name_object(soa.Env(),
-                                                soa.Env()->NewStringUTF(class_name_string.c_str()));
-      if (class_name_object.get() == nullptr) {
-        DCHECK(self->IsExceptionPending());  // OOME.
+      if (Runtime::Current()->IsAotCompiler()) {
+        // Oops, compile-time, can't run actual class-loader code.
+        ObjPtr<mirror::Throwable> pre_allocated =
+            Runtime::Current()->GetPreAllocatedNoClassDefFoundError();
+        self->SetException(pre_allocated);
         return nullptr;
       }
-      CHECK(class_loader_object.get() != nullptr);
-      result.reset(soa.Env()->CallObjectMethod(class_loader_object.get(),
-                                               WellKnownClasses::java_lang_ClassLoader_loadClass,
-                                               class_name_object.get()));
+
+      ScopedLocalRef<jobject> class_loader_object(
+          soa.Env(), soa.AddLocalReference<jobject>(class_loader.Get()));
+      std::string class_name_string(DescriptorToDot(descriptor));
+      ScopedLocalRef<jobject> result(soa.Env(), nullptr);
+      {
+        ScopedThreadStateChange tsc(self, kNative);
+        ScopedLocalRef<jobject> class_name_object(
+            soa.Env(), soa.Env()->NewStringUTF(class_name_string.c_str()));
+        if (class_name_object.get() == nullptr) {
+          DCHECK(self->IsExceptionPending());  // OOME.
+          return nullptr;
+        }
+        CHECK(class_loader_object.get() != nullptr);
+        result.reset(soa.Env()->CallObjectMethod(class_loader_object.get(),
+                                                 WellKnownClasses::java_lang_ClassLoader_loadClass,
+                                                 class_name_object.get()));
+      }
+      if (self->IsExceptionPending()) {
+        // If the ClassLoader threw, pass that exception up.
+        // However, to comply with the RI behavior, first check if another thread succeeded.
+        result_ptr = LookupClass(self, descriptor, hash, class_loader.Get());
+        if (result_ptr != nullptr && !result_ptr->IsErroneous()) {
+          self->ClearException();
+          return EnsureResolved(self, descriptor, result_ptr);
+        }
+        return nullptr;
+      } else if (result.get() == nullptr) {
+        // broken loader - throw NPE to be compatible with Dalvik
+        ThrowNullPointerException(StringPrintf("ClassLoader.loadClass returned null for %s",
+                                               class_name_string.c_str()).c_str());
+        return nullptr;
+      }
+      result_ptr = soa.Decode<mirror::Class>(result.get());
+      // Check the name of the returned class.
+      descriptor_equals = result_ptr->DescriptorEquals(descriptor);
     }
-    if (self->IsExceptionPending()) {
-      // If the ClassLoader threw, pass that exception up.
-      return nullptr;
-    } else if (result.get() == nullptr) {
-      // broken loader - throw NPE to be compatible with Dalvik
-      ThrowNullPointerException(StringPrintf("ClassLoader.loadClass returned null for %s",
-                                             class_name_string.c_str()).c_str());
-      return nullptr;
-    } else {
-      // success, return mirror::Class*
-      return soa.Decode<mirror::Class>(result.get()).Ptr();
+
+    // Try to insert the class to the class table, checking for mismatch.
+    ObjPtr<mirror::Class> old;
+    {
+      WriterMutexLock mu(self, *Locks::classlinker_classes_lock_);
+      ClassTable* const class_table = InsertClassTableForClassLoader(class_loader.Get());
+      old = class_table->Lookup(descriptor, hash);
+      if (old == nullptr) {
+        old = result_ptr;  // For the comparison below, after releasing the lock.
+        if (descriptor_equals) {
+          class_table->InsertWithHash(result_ptr.Ptr(), hash);
+          Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(class_loader.Get());
+        }  // else throw below, after releasing the lock.
+      }
     }
+    if (UNLIKELY(old != result_ptr)) {
+      // Return `old` (even if `!descriptor_equals`) to mimic the RI behavior for parallel
+      // capable class loaders.  (All class loaders are considered parallel capable on Android.)
+      mirror::Class* loader_class = class_loader->GetClass();
+      const char* loader_class_name =
+          loader_class->GetDexFile().StringByTypeIdx(loader_class->GetDexTypeIndex());
+      LOG(WARNING) << "Initiating class loader of type " << DescriptorToDot(loader_class_name)
+          << " is not well-behaved; it returned a different Class for racing loadClass(\""
+          << DescriptorToDot(descriptor) << "\").";
+      return EnsureResolved(self, descriptor, old);
+    }
+    if (UNLIKELY(!descriptor_equals)) {
+      std::string result_storage;
+      const char* result_name = result_ptr->GetDescriptor(&result_storage);
+      std::string loader_storage;
+      const char* loader_class_name = class_loader->GetClass()->GetDescriptor(&loader_storage);
+      ThrowNoClassDefFoundError(
+          "Initiating class loader of type %s returned class %s instead of %s.",
+          DescriptorToDot(loader_class_name).c_str(),
+          DescriptorToDot(result_name).c_str(),
+          DescriptorToDot(descriptor).c_str());
+      return nullptr;
+    }
+    // success, return mirror::Class*
+    return result_ptr.Ptr();
   }
   UNREACHABLE();
 }
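
The new tail of FindClass boils down to a double-checked insert: look up under the lock, insert if absent, and adopt a racing winner's class otherwise. Reduced to a sketch (hypothetical Table type with a std::mutex standing in for classlinker_classes_lock_; the descriptor-mismatch throw is omitted):

    #include <cstddef>
    #include <mutex>

    template <typename Table, typename Class>
    Class* InsertOrAdopt(Table& table, const char* descriptor, size_t hash, Class* result) {
      Class* old;
      {
        std::lock_guard<std::mutex> lock(table.mutex);
        old = table.Lookup(descriptor, hash);
        if (old == nullptr) {
          table.InsertWithHash(result, hash);
          old = result;
        }
      }
      // If another thread won the race, return its class so that const-class
      // keeps evaluating to a single Class<?> for this loader.
      return old;
    }
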
@@ -3609,12 +3685,6 @@
   Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(klass);
 }
 
-bool ClassLinker::RemoveClass(const char* descriptor, ObjPtr<mirror::ClassLoader> class_loader) {
-  WriterMutexLock mu(Thread::Current(), *Locks::classlinker_classes_lock_);
-  ClassTable* const class_table = ClassTableForClassLoader(class_loader);
-  return class_table != nullptr && class_table->Remove(descriptor);
-}
-
 mirror::Class* ClassLinker::LookupClass(Thread* self,
                                         const char* descriptor,
                                         size_t hash,
@@ -3665,7 +3735,8 @@
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) OVERRIDE {
     ClassTable* const class_table = class_loader->GetClassTable();
     ObjPtr<mirror::Class> klass = class_table->Lookup(descriptor_, hash_);
-    if (klass != nullptr) {
+    // Add `klass` only if `class_loader` is its defining (not just initiating) class loader.
+    if (klass != nullptr && klass->GetClassLoader() == class_loader) {
       result_->push_back(klass);
     }
   }
@@ -3684,6 +3755,7 @@
   const size_t hash = ComputeModifiedUtf8Hash(descriptor);
   ObjPtr<mirror::Class> klass = boot_class_table_.Lookup(descriptor, hash);
   if (klass != nullptr) {
+    DCHECK(klass->GetClassLoader() == nullptr);
     result.push_back(klass);
   }
   LookupClassesVisitor visitor(descriptor, hash, &result);
@@ -4486,7 +4558,7 @@
       StackHandleScope<1> hs_iface(self);
       MutableHandle<mirror::Class> handle_scope_iface(hs_iface.NewHandle<mirror::Class>(nullptr));
       for (size_t i = 0; i < num_direct_interfaces; i++) {
-        handle_scope_iface.Assign(mirror::Class::GetDirectInterface(self, klass, i));
+        handle_scope_iface.Assign(mirror::Class::GetDirectInterface(self, klass.Get(), i));
         CHECK(handle_scope_iface.Get() != nullptr);
         CHECK(handle_scope_iface->IsInterface());
         if (handle_scope_iface->HasBeenRecursivelyInitialized()) {
@@ -4622,7 +4694,8 @@
     MutableHandle<mirror::Class> handle_super_iface(hs.NewHandle<mirror::Class>(nullptr));
     // First we initialize all of iface's super-interfaces recursively.
     for (size_t i = 0; i < num_direct_ifaces; i++) {
-      ObjPtr<mirror::Class> super_iface = mirror::Class::GetDirectInterface(self, iface, i);
+      ObjPtr<mirror::Class> super_iface = mirror::Class::GetDirectInterface(self, iface.Get(), i);
+      DCHECK(super_iface != nullptr);
       if (!super_iface->HasBeenRecursivelyInitialized()) {
         // Recursive step
         handle_super_iface.Assign(super_iface);
@@ -6383,7 +6456,7 @@
   for (size_t i = 0; i < num_interfaces; i++) {
     ObjPtr<mirror::Class> interface = have_interfaces
         ? interfaces->GetWithoutChecks(i)
-        : mirror::Class::GetDirectInterface(self, klass, i);
+        : mirror::Class::GetDirectInterface(self, klass.Get(), i);
     DCHECK(interface != nullptr);
     if (UNLIKELY(!interface->IsInterface())) {
       std::string temp;
@@ -6421,7 +6494,7 @@
     std::vector<mirror::Class*> to_add;
     for (size_t i = 0; i < num_interfaces; i++) {
       ObjPtr<mirror::Class> interface = have_interfaces ? interfaces->Get(i) :
-          mirror::Class::GetDirectInterface(self, klass, i);
+          mirror::Class::GetDirectInterface(self, klass.Get(), i);
       to_add.push_back(interface.Ptr());
     }
 
@@ -7796,16 +7869,14 @@
   }
   const DexFile::FieldId& field_id = dex_file.GetFieldId(field_idx);
   Thread* const self = Thread::Current();
-  StackHandleScope<1> hs(self);
-  Handle<mirror::Class> klass(
-      hs.NewHandle(ResolveType(dex_file, field_id.class_idx_, dex_cache, class_loader)));
-  if (klass.Get() == nullptr) {
+  ObjPtr<mirror::Class> klass = ResolveType(dex_file, field_id.class_idx_, dex_cache, class_loader);
+  if (klass == nullptr) {
     DCHECK(Thread::Current()->IsExceptionPending());
     return nullptr;
   }
 
   if (is_static) {
-    resolved = mirror::Class::FindStaticField(self, klass.Get(), dex_cache.Get(), field_idx);
+    resolved = mirror::Class::FindStaticField(self, klass, dex_cache.Get(), field_idx);
   } else {
     resolved = klass->FindInstanceField(dex_cache.Get(), field_idx);
   }
@@ -7819,7 +7890,7 @@
       resolved = klass->FindInstanceField(name, type);
     }
     if (resolved == nullptr) {
-      ThrowNoSuchFieldError(is_static ? "static " : "instance ", klass.Get(), type, name);
+      ThrowNoSuchFieldError(is_static ? "static " : "instance ", klass, type, name);
       return nullptr;
     }
   }
@@ -7839,10 +7910,8 @@
   }
   const DexFile::FieldId& field_id = dex_file.GetFieldId(field_idx);
   Thread* self = Thread::Current();
-  StackHandleScope<1> hs(self);
-  Handle<mirror::Class> klass(
-      hs.NewHandle(ResolveType(dex_file, field_id.class_idx_, dex_cache, class_loader)));
-  if (klass.Get() == nullptr) {
+  ObjPtr<mirror::Class> klass(ResolveType(dex_file, field_id.class_idx_, dex_cache, class_loader));
+  if (klass == nullptr) {
     DCHECK(Thread::Current()->IsExceptionPending());
     return nullptr;
   }
@@ -7854,7 +7923,7 @@
   if (resolved != nullptr) {
     dex_cache->SetResolvedField(field_idx, resolved, image_pointer_size_);
   } else {
-    ThrowNoSuchFieldError("", klass.Get(), type, name);
+    ThrowNoSuchFieldError("", klass, type, name);
   }
   return resolved;
 }
@@ -7972,8 +8041,8 @@
       REQUIRES_SHARED(Locks::classlinker_classes_lock_, Locks::mutator_lock_) OVERRIDE {
     ClassTable* const class_table = class_loader->GetClassTable();
     if (class_table != nullptr) {
-      num_zygote_classes += class_table->NumZygoteClasses();
-      num_non_zygote_classes += class_table->NumNonZygoteClasses();
+      num_zygote_classes += class_table->NumZygoteClasses(class_loader);
+      num_non_zygote_classes += class_table->NumNonZygoteClasses(class_loader);
     }
   }
 
@@ -7984,13 +8053,13 @@
 size_t ClassLinker::NumZygoteClasses() const {
   CountClassesVisitor visitor;
   VisitClassLoaders(&visitor);
-  return visitor.num_zygote_classes + boot_class_table_.NumZygoteClasses();
+  return visitor.num_zygote_classes + boot_class_table_.NumZygoteClasses(nullptr);
 }
 
 size_t ClassLinker::NumNonZygoteClasses() const {
   CountClassesVisitor visitor;
   VisitClassLoaders(&visitor);
-  return visitor.num_non_zygote_classes + boot_class_table_.NumNonZygoteClasses();
+  return visitor.num_non_zygote_classes + boot_class_table_.NumNonZygoteClasses(nullptr);
 }
 
 size_t ClassLinker::NumLoadedClasses() {
@@ -8078,7 +8147,7 @@
   ScopedObjectAccessUnchecked soa(self);
 
   // For now, create a libcore-level DexFile for each ART DexFile. This "explodes" multidex.
-  StackHandleScope<11> hs(self);
+  StackHandleScope<6> hs(self);
 
   ArtField* dex_elements_field =
       jni::DecodeArtField(WellKnownClasses::dalvik_system_DexPathList_dexElements);
@@ -8157,7 +8226,9 @@
   // Make a pretend boot-classpath.
   // TODO: Should we scan the image?
   ArtField* const parent_field =
-      mirror::Class::FindField(self, hs.NewHandle(h_path_class_loader->GetClass()), "parent",
+      mirror::Class::FindField(self,
+                               h_path_class_loader->GetClass(),
+                               "parent",
                                "Ljava/lang/ClassLoader;");
   DCHECK(parent_field != nullptr);
   ObjPtr<mirror::Object> boot_cl =
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index de1f0f0..9b25303 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -218,12 +218,6 @@
 
   mirror::Class* FindPrimitiveClass(char type) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // General class unloading is not supported, this is used to prune
-  // unwanted classes during image writing.
-  bool RemoveClass(const char* descriptor, ObjPtr<mirror::ClassLoader> class_loader)
-      REQUIRES(!Locks::classlinker_classes_lock_)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   void DumpForSigQuit(std::ostream& os) REQUIRES(!Locks::classlinker_classes_lock_);
 
   size_t NumLoadedClasses()
diff --git a/runtime/class_linker_test.cc b/runtime/class_linker_test.cc
index ddb9e59..862585a 100644
--- a/runtime/class_linker_test.cc
+++ b/runtime/class_linker_test.cc
@@ -215,10 +215,12 @@
     EXPECT_TRUE(array->ShouldHaveEmbeddedVTable());
     EXPECT_EQ(2, array->GetIfTableCount());
     ASSERT_TRUE(array->GetIfTable() != nullptr);
-    ObjPtr<mirror::Class> direct_interface0 = mirror::Class::GetDirectInterface(self, array, 0);
+    ObjPtr<mirror::Class> direct_interface0 =
+        mirror::Class::GetDirectInterface(self, array.Get(), 0);
     EXPECT_TRUE(direct_interface0 != nullptr);
     EXPECT_STREQ(direct_interface0->GetDescriptor(&temp), "Ljava/lang/Cloneable;");
-    ObjPtr<mirror::Class> direct_interface1 = mirror::Class::GetDirectInterface(self, array, 1);
+    ObjPtr<mirror::Class> direct_interface1 =
+        mirror::Class::GetDirectInterface(self, array.Get(), 1);
     EXPECT_STREQ(direct_interface1->GetDescriptor(&temp), "Ljava/io/Serializable;");
     ObjPtr<mirror::Class> array_ptr = array->GetComponentType();
     EXPECT_OBJ_PTR_EQ(class_linker_->FindArrayClass(self, &array_ptr), array.Get());
@@ -1019,48 +1021,48 @@
 
   EXPECT_EQ(9U, statics->NumStaticFields());
 
-  ArtField* s0 = mirror::Class::FindStaticField(soa.Self(), statics, "s0", "Z");
+  ArtField* s0 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s0", "Z");
   EXPECT_EQ(s0->GetTypeAsPrimitiveType(), Primitive::kPrimBoolean);
   EXPECT_EQ(true, s0->GetBoolean(statics.Get()));
   s0->SetBoolean<false>(statics.Get(), false);
 
-  ArtField* s1 = mirror::Class::FindStaticField(soa.Self(), statics, "s1", "B");
+  ArtField* s1 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s1", "B");
   EXPECT_EQ(s1->GetTypeAsPrimitiveType(), Primitive::kPrimByte);
   EXPECT_EQ(5, s1->GetByte(statics.Get()));
   s1->SetByte<false>(statics.Get(), 6);
 
-  ArtField* s2 = mirror::Class::FindStaticField(soa.Self(), statics, "s2", "C");
+  ArtField* s2 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s2", "C");
   EXPECT_EQ(s2->GetTypeAsPrimitiveType(), Primitive::kPrimChar);
   EXPECT_EQ('a', s2->GetChar(statics.Get()));
   s2->SetChar<false>(statics.Get(), 'b');
 
-  ArtField* s3 = mirror::Class::FindStaticField(soa.Self(), statics, "s3", "S");
+  ArtField* s3 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s3", "S");
   EXPECT_EQ(s3->GetTypeAsPrimitiveType(), Primitive::kPrimShort);
   EXPECT_EQ(-536, s3->GetShort(statics.Get()));
   s3->SetShort<false>(statics.Get(), -535);
 
-  ArtField* s4 = mirror::Class::FindStaticField(soa.Self(), statics, "s4", "I");
+  ArtField* s4 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s4", "I");
   EXPECT_EQ(s4->GetTypeAsPrimitiveType(), Primitive::kPrimInt);
   EXPECT_EQ(2000000000, s4->GetInt(statics.Get()));
   s4->SetInt<false>(statics.Get(), 2000000001);
 
-  ArtField* s5 = mirror::Class::FindStaticField(soa.Self(), statics, "s5", "J");
+  ArtField* s5 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s5", "J");
   EXPECT_EQ(s5->GetTypeAsPrimitiveType(), Primitive::kPrimLong);
   EXPECT_EQ(0x1234567890abcdefLL, s5->GetLong(statics.Get()));
   s5->SetLong<false>(statics.Get(), INT64_C(0x34567890abcdef12));
 
-  ArtField* s6 = mirror::Class::FindStaticField(soa.Self(), statics, "s6", "F");
+  ArtField* s6 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s6", "F");
   EXPECT_EQ(s6->GetTypeAsPrimitiveType(), Primitive::kPrimFloat);
   EXPECT_DOUBLE_EQ(0.5, s6->GetFloat(statics.Get()));
   s6->SetFloat<false>(statics.Get(), 0.75);
 
-  ArtField* s7 = mirror::Class::FindStaticField(soa.Self(), statics, "s7", "D");
+  ArtField* s7 = mirror::Class::FindStaticField(soa.Self(), statics.Get(), "s7", "D");
   EXPECT_EQ(s7->GetTypeAsPrimitiveType(), Primitive::kPrimDouble);
   EXPECT_DOUBLE_EQ(16777217.0, s7->GetDouble(statics.Get()));
   s7->SetDouble<false>(statics.Get(), 16777219);
 
-  ArtField* s8 = mirror::Class::FindStaticField(soa.Self(), statics, "s8",
-                                                        "Ljava/lang/String;");
+  ArtField* s8 = mirror::Class::FindStaticField(
+      soa.Self(), statics.Get(), "s8", "Ljava/lang/String;");
   EXPECT_EQ(s8->GetTypeAsPrimitiveType(), Primitive::kPrimNot);
   EXPECT_TRUE(s8->GetObject(statics.Get())->AsString()->Equals("android"));
   mirror::String* str_value = mirror::String::AllocFromModifiedUtf8(soa.Self(), "robot");
@@ -1131,10 +1133,14 @@
   EXPECT_EQ(Aj1, A->FindVirtualMethodForVirtualOrInterface(Jj1, kRuntimePointerSize));
   EXPECT_EQ(Aj2, A->FindVirtualMethodForVirtualOrInterface(Jj2, kRuntimePointerSize));
 
-  ArtField* Afoo = mirror::Class::FindStaticField(soa.Self(), A, "foo", "Ljava/lang/String;");
-  ArtField* Bfoo = mirror::Class::FindStaticField(soa.Self(), B, "foo", "Ljava/lang/String;");
-  ArtField* Jfoo = mirror::Class::FindStaticField(soa.Self(), J, "foo", "Ljava/lang/String;");
-  ArtField* Kfoo = mirror::Class::FindStaticField(soa.Self(), K, "foo", "Ljava/lang/String;");
+  ArtField* Afoo =
+      mirror::Class::FindStaticField(soa.Self(), A.Get(), "foo", "Ljava/lang/String;");
+  ArtField* Bfoo =
+      mirror::Class::FindStaticField(soa.Self(), B.Get(), "foo", "Ljava/lang/String;");
+  ArtField* Jfoo =
+      mirror::Class::FindStaticField(soa.Self(), J.Get(), "foo", "Ljava/lang/String;");
+  ArtField* Kfoo =
+      mirror::Class::FindStaticField(soa.Self(), K.Get(), "foo", "Ljava/lang/String;");
   ASSERT_TRUE(Afoo != nullptr);
   EXPECT_EQ(Afoo, Bfoo);
   EXPECT_EQ(Afoo, Jfoo);
diff --git a/runtime/class_table.cc b/runtime/class_table.cc
index 6eb7496..ec33e5e 100644
--- a/runtime/class_table.cc
+++ b/runtime/class_table.cc
@@ -84,18 +84,29 @@
 
 #pragma clang diagnostic pop  // http://b/31104323
 
-size_t ClassTable::NumZygoteClasses() const {
+size_t ClassTable::CountDefiningLoaderClasses(ObjPtr<mirror::ClassLoader> defining_loader,
+                                              const ClassSet& set) const {
+  size_t count = 0;
+  for (const TableSlot& root : set) {
+    if (root.Read()->GetClassLoader() == defining_loader) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+size_t ClassTable::NumZygoteClasses(ObjPtr<mirror::ClassLoader> defining_loader) const {
   ReaderMutexLock mu(Thread::Current(), lock_);
   size_t sum = 0;
   for (size_t i = 0; i < classes_.size() - 1; ++i) {
-    sum += classes_[i].Size();
+    sum += CountDefiningLoaderClasses(defining_loader, classes_[i]);
   }
   return sum;
 }
 
-size_t ClassTable::NumNonZygoteClasses() const {
+size_t ClassTable::NumNonZygoteClasses(ObjPtr<mirror::ClassLoader> defining_loader) const {
   ReaderMutexLock mu(Thread::Current(), lock_);
-  return classes_.back().Size();
+  return CountDefiningLoaderClasses(defining_loader, classes_.back());
 }
 
 mirror::Class* ClassTable::Lookup(const char* descriptor, size_t hash) {
@@ -104,7 +115,7 @@
   for (ClassSet& class_set : classes_) {
     auto it = class_set.FindWithHash(pair, hash);
     if (it != class_set.end()) {
-     return it->Read();
+      return it->Read();
     }
   }
   return nullptr;
@@ -150,7 +161,6 @@
     DCHECK(!a.Read()->DescriptorEquals(b.Read()->GetDescriptor(&temp)));
     return false;
   }
-  DCHECK_EQ(a.Read()->GetClassLoader(), b.Read()->GetClassLoader());
   std::string temp;
   return a.Read()->DescriptorEquals(b.Read()->GetDescriptor(&temp));
 }
diff --git a/runtime/class_table.h b/runtime/class_table.h
index fe0bbb3..104871f 100644
--- a/runtime/class_table.h
+++ b/runtime/class_table.h
@@ -142,10 +142,14 @@
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Returns the number of classes in previous snapshots.
-  size_t NumZygoteClasses() const REQUIRES(!lock_);
+  size_t NumZygoteClasses(ObjPtr<mirror::ClassLoader> defining_loader) const
+      REQUIRES(!lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Returns the number of classes in the latest snapshot.
-  size_t NumNonZygoteClasses() const REQUIRES(!lock_);
+  size_t NumNonZygoteClasses(ObjPtr<mirror::ClassLoader> defining_loader) const
+      REQUIRES(!lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Update a class in the table with the new class. Returns the existing class which was replaced.
   mirror::Class* UpdateClass(const char* descriptor, mirror::Class* new_klass, size_t hash)
@@ -231,6 +235,11 @@
  private:
   void InsertWithoutLocks(ObjPtr<mirror::Class> klass) NO_THREAD_SAFETY_ANALYSIS;
 
+  size_t CountDefiningLoaderClasses(ObjPtr<mirror::ClassLoader> defining_loader,
+                                    const ClassSet& set) const
+      REQUIRES(lock_)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // Return true if we inserted the oat file, false if it already exists.
   bool InsertOatFileLocked(const OatFile* oat_file)
       REQUIRES(lock_)
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index dc2ae2e..e339666 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1565,16 +1565,16 @@
 JDWP::JdwpError Dbg::OutputDeclaredInterfaces(JDWP::RefTypeId class_id, JDWP::ExpandBuf* pReply) {
   JDWP::JdwpError error;
   Thread* self = Thread::Current();
-  StackHandleScope<1> hs(self);
-  Handle<mirror::Class> c(hs.NewHandle(DecodeClass(class_id, &error)));
-  if (c.Get() == nullptr) {
+  ObjPtr<mirror::Class> c = DecodeClass(class_id, &error);
+  if (c == nullptr) {
     return error;
   }
   size_t interface_count = c->NumDirectInterfaces();
   expandBufAdd4BE(pReply, interface_count);
   for (size_t i = 0; i < interface_count; ++i) {
-    expandBufAddRefTypeId(pReply,
-                          gRegistry->AddRefType(mirror::Class::GetDirectInterface(self, c, i)));
+    ObjPtr<mirror::Class> interface = mirror::Class::GetDirectInterface(self, c, i);
+    DCHECK(interface != nullptr);
+    expandBufAddRefTypeId(pReply, gRegistry->AddRefType(interface));
   }
   return JDWP::ERR_NONE;
 }
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index fe82878..bf1d4ea 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -134,13 +134,23 @@
   // | Method*    | ---
   // | RA         |
   // | ...        |    callee saves
+  // | T1         |    arg5
+  // | T0         |    arg4
   // | A3         |    arg3
   // | A2         |    arg2
   // | A1         |    arg1
+  // | F19        |
+  // | F18        |    f_arg5
+  // | F17        |
+  // | F16        |    f_arg4
   // | F15        |
-  // | F14        |    f_arg1
+  // | F14        |    f_arg3
   // | F13        |
-  // | F12        |    f_arg0
+  // | F12        |    f_arg2
+  // | F11        |
+  // | F10        |    f_arg1
+  // | F9         |
+  // | F8         |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
@@ -148,14 +158,14 @@
   static constexpr bool kQuickSoftFloatAbi = false;
   static constexpr bool kQuickDoubleRegAlignedFloatBackFilled = false;
   static constexpr bool kQuickSkipOddFpRegisters = true;
-  static constexpr size_t kNumQuickGprArgs = 3;  // 3 arguments passed in GPRs.
-  static constexpr size_t kNumQuickFprArgs = 4;  // 2 arguments passed in FPRs. Floats can be passed
-                                                 // only in even numbered registers and each double
-                                                 // occupies two registers.
+  static constexpr size_t kNumQuickGprArgs = 5;   // 5 arguments passed in GPRs.
+  static constexpr size_t kNumQuickFprArgs = 12;  // 6 arguments passed in FPRs. Floats can be
+                                                  // passed only in even numbered registers and each
+                                                  // double occupies two registers.
   static constexpr bool kGprFprLockstep = false;
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 16;  // Offset of first FPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 32;  // Offset of first GPR arg.
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 76;  // Offset of return address.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 8;  // Offset of first FPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 56;  // Offset of first GPR arg.
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 108;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
     return gpr_index * GetBytesPerGprSpillLocation(kRuntimeISA);
   }
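
The three new offsets are consistent with the enlarged layout diagram above; the arithmetic, written as compile-time checks (4-byte slots on MIPS32):

    // Method* and padding occupy sp+0..7, then 12 FPR args, 5 GPR args,
    // 8 callee-save refs, and finally RA.
    static_assert(8 + 12 * 4 == 56, "F8..F19 end where A1 begins (Gpr1Offset)");
    static_assert(56 + 5 * 4 == 76, "A1..T1 end where the callee saves begin");
    static_assert(76 + 8 * 4 == 108, "S2..S7, GP, FP end at RA (LrOffset)");
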
@@ -187,7 +197,7 @@
   // | F12        |    f_arg0
   // |            |    padding
   // | A0/Method* |  <- sp
-  // NOTE: for Mip64, when A0 is skipped, F0 is also skipped.
+  // NOTE: for Mips64, when A0 is skipped, F12 is also skipped.
   static constexpr bool kSplitPairAcrossRegisterAndStack = false;
   static constexpr bool kAlignPairRegister = false;
   static constexpr bool kQuickSoftFloatAbi = false;
@@ -197,7 +207,7 @@
   static constexpr size_t kNumQuickFprArgs = 7;  // 7 arguments passed in FPRs.
   static constexpr bool kGprFprLockstep = true;
 
-  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F1).
+  static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Fpr1Offset = 24;  // Offset of first FPR arg (F13).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_Gpr1Offset = 80;  // Offset of first GPR arg (A1).
   static constexpr size_t kQuickCalleeSaveFrame_RefAndArgs_LrOffset = 200;  // Offset of return address.
   static size_t GprIndexToGprOffset(uint32_t gpr_index) {
@@ -501,10 +511,16 @@
         case Primitive::kPrimDouble:
         case Primitive::kPrimLong:
           if (kQuickSoftFloatAbi || (cur_type_ == Primitive::kPrimLong)) {
-            if (cur_type_ == Primitive::kPrimLong && kAlignPairRegister && gpr_index_ == 0) {
-              // Currently, this is only for ARM and MIPS, where the first available parameter
-              // register is R1 (on ARM) or A1 (on MIPS). So we skip it, and use R2 (on ARM) or
-              // A2 (on MIPS) instead.
+            if (cur_type_ == Primitive::kPrimLong &&
+#if defined(__mips__) && !defined(__LP64__)
+                (gpr_index_ == 0 || gpr_index_ == 2) &&
+#else
+                gpr_index_ == 0 &&
+#endif
+                kAlignPairRegister) {
+              // Currently, this is only for ARM and MIPS, where we align long parameters with
+              // even-numbered registers by skipping R1 (on ARM) or A1(A3) (on MIPS) and using
+              // R2 (on ARM) or A2(T0) (on MIPS) instead.
               IncGprIndex();
             }
             is_split_long_or_double_ = (GetBytesPerGprSpillLocation(kRuntimeISA) == 4) &&
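
With five argument GPRs, index 0 maps to A1 and index 2 to A3; a long starting there would straddle an odd/even pair, so the visitor bumps the index. The rule in isolation (illustrative helper, not ART code):

    // gpr_index 0..4 correspond to A1, A2, A3, T0, T1. Skip A1 or A3 so a
    // 64-bit value lands in the even-aligned A2/A3 or T0/T1 pair.
    inline int AlignLongGprIndex(int gpr_index) {
      return (gpr_index == 0 || gpr_index == 2) ? gpr_index + 1 : gpr_index;
    }
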
@@ -2086,6 +2102,41 @@
     // Note that the native code pointer will be automatically set by artFindNativeMethod().
   }
 
+#if defined(__mips__) && !defined(__LP64__)
+  // On MIPS32 if the first two arguments are floating-point, we need to know their types
+  // so that art_quick_generic_jni_trampoline can correctly extract them from the stack
+  // and load into floating-point registers.
+  // Possible arrangements of first two floating-point arguments on the stack (32-bit FPU
+  // view):
+  // (1)
+  //  |     DOUBLE    |     DOUBLE    | other args, if any
+  //  |  F12  |  F13  |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (2)
+  //  |     DOUBLE    | FLOAT | (PAD) | other args, if any
+  //  |  F12  |  F13  |  F14  |       |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (3)
+  //  | FLOAT | (PAD) |     DOUBLE    | other args, if any
+  //  |  F12  |       |  F14  |  F15  |
+  //  |  SP+0 |  SP+4 |  SP+8 | SP+12 | SP+16
+  // (4)
+  //  | FLOAT | FLOAT | other args, if any
+  //  |  F12  |  F14  |
+  //  |  SP+0 |  SP+4 | SP+8
+  // As you can see, only the last case (4) is special. In all others we can just
+  // load F12/F13 and F14/F15 in the same manner.
+  // Set bit 0 of the native code address to 1 in this case (valid code addresses
+  // are always a multiple of 4 on MIPS32, so we have 2 spare bits available).
+  if (nativeCode != nullptr &&
+      shorty != nullptr &&
+      shorty_len >= 3 &&
+      shorty[1] == 'F' &&
+      shorty[2] == 'F') {
+    nativeCode = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(nativeCode) | 1);
+  }
+#endif
+
   // Return native code addr(lo) and bottom of alloca address(hi).
   return GetTwoWordSuccessValue(reinterpret_cast<uintptr_t>(visitor.GetBottomOfUsedArea()),
                                 reinterpret_cast<uintptr_t>(nativeCode));
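
Isolated from the trampoline, the tagging decision is just a shorty test plus an OR into the low bit, which the 4-byte alignment of MIPS32 code guarantees is free (illustrative helper, not an ART function):

    #include <cstdint>

    inline void* MaybeTagNativeCode(void* code, const char* shorty, uint32_t shorty_len) {
      // shorty[0] is the return type; [1] and [2] are the first two arguments.
      if (code != nullptr && shorty != nullptr && shorty_len >= 3 &&
          shorty[1] == 'F' && shorty[2] == 'F') {
        return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(code) | 1u);
      }
      return code;
    }
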
diff --git a/runtime/gc_root.h b/runtime/gc_root.h
index b795409..79e80f1 100644
--- a/runtime/gc_root.h
+++ b/runtime/gc_root.h
@@ -86,6 +86,22 @@
   return os;
 }
 
+// Not all combinations of flags are valid. You may not visit all roots as well as the new roots
+// (no logical reason to do this). You also may not start logging new roots and stop logging new
+// roots (also no logical reason to do this).
+//
+// The precise flag ensures that more metadata is supplied. An example is vreg data for compiled
+// method frames.
+enum VisitRootFlags : uint8_t {
+  kVisitRootFlagAllRoots = 0x1,
+  kVisitRootFlagNewRoots = 0x2,
+  kVisitRootFlagStartLoggingNewRoots = 0x4,
+  kVisitRootFlagStopLoggingNewRoots = 0x8,
+  kVisitRootFlagClearRootLog = 0x10,
+  kVisitRootFlagClassLoader = 0x20,
+  kVisitRootFlagPrecise = 0x80,
+};
+
 class RootVisitor {
  public:
   virtual ~RootVisitor() { }
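
The two forbidden combinations can be written down as a checker; a sketch of the invariants the comment states (ART documents rather than enforces them here):

    #include <cstdint>

    inline bool AreVisitRootFlagsValid(uint8_t flags) {
      const bool all_and_new =
          (flags & kVisitRootFlagAllRoots) != 0 && (flags & kVisitRootFlagNewRoots) != 0;
      const bool start_and_stop =
          (flags & kVisitRootFlagStartLoggingNewRoots) != 0 &&
          (flags & kVisitRootFlagStopLoggingNewRoots) != 0;
      return !all_and_new && !start_and_stop;
    }
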
diff --git a/runtime/interpreter/mterp/arm/op_float_to_long.S b/runtime/interpreter/mterp/arm/op_float_to_long.S
index 1770ea0..42bd98d 100644
--- a/runtime/interpreter/mterp/arm/op_float_to_long.S
+++ b/runtime/interpreter/mterp/arm/op_float_to_long.S
@@ -20,7 +20,7 @@
     adds    r0, r0, r0                  @ sign bit to carry
     mov     r0, #0xffffffff             @ assume maxlong for lsw
     mov     r1, #0x7fffffff             @ assume maxlong for msw
-    adcs    r0, r0, #0                  @ convert maxlong to minlong if exp negative
+    adc     r0, r0, #0                  @ convert maxlong to minlong if exp negative
     adc     r1, r1, #0                  @ convert maxlong to minlong if exp negative
     bx      lr                          @ return
 f2l_maybeNaN:
diff --git a/runtime/interpreter/mterp/out/mterp_arm.S b/runtime/interpreter/mterp/out/mterp_arm.S
index c80d6b9..8916241 100644
--- a/runtime/interpreter/mterp/out/mterp_arm.S
+++ b/runtime/interpreter/mterp/out/mterp_arm.S
@@ -7425,7 +7425,7 @@
     adds    r0, r0, r0                  @ sign bit to carry
     mov     r0, #0xffffffff             @ assume maxlong for lsw
     mov     r1, #0x7fffffff             @ assume maxlong for msw
-    adcs    r0, r0, #0                  @ convert maxlong to minlong if exp negative
+    adc     r0, r0, #0                  @ convert maxlong to minlong if exp negative
     adc     r1, r1, #0                  @ convert maxlong to minlong if exp negative
     bx      lr                          @ return
 f2l_maybeNaN:
@@ -7457,7 +7457,7 @@
     adds    r1, r1, r1                  @ sign bit to carry
     mov     r0, #0xffffffff             @ assume maxlong for lsw
     mov     r1, #0x7fffffff             @ assume maxlong for msw
-    adcs    r0, r0, #0
+    adc     r0, r0, #0                  @ convert maxlong to minlong if exp negative
     adc     r1, r1, #0                  @ convert maxlong to minlong if exp negative
     bx      lr                          @ return
 d2l_maybeNaN:
@@ -12118,17 +12118,6 @@
     cmp     rPROFILE, #0
     bgt     MterpProfileActive                      @ if > 0, we may have some counts to report.
     ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
-    .cfi_restore r3
-    .cfi_restore r4
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r9
-    .cfi_restore r9
-    .cfi_restore r10
-    .cfi_restore fp
-    .cfi_restore pc
-    .cfi_adjust_cfa_offset -40
 
 MterpProfileActive:
     mov     rINST, r0                               @ stash return value
@@ -12140,17 +12129,6 @@
     bl      MterpAddHotnessBatch                    @ (method, shadow_frame, self)
     mov     r0, rINST                               @ restore return value
     ldmfd   sp!, {r3-r10,fp,pc}                     @ restore 10 regs and return
-    .cfi_restore r3
-    .cfi_restore r4
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r9
-    .cfi_restore r9
-    .cfi_restore r10
-    .cfi_restore fp
-    .cfi_restore pc
-    .cfi_adjust_cfa_offset -40
 
     END ExecuteMterpImpl
 
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 93f50ad..1b0ad83 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -297,10 +297,11 @@
     ObjPtr<mirror::Object> object = roots->Get(i);
     if (kIsDebugBuild) {
       // Ensure the string is strongly interned. b/32995596
-      CHECK(object->IsString());
-      ObjPtr<mirror::String> str = reinterpret_cast<mirror::String*>(object.Ptr());
-      ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
-      CHECK(class_linker->GetInternTable()->LookupStrong(Thread::Current(), str) != nullptr);
+      if (object->IsString()) {
+        ObjPtr<mirror::String> str = reinterpret_cast<mirror::String*>(object.Ptr());
+        ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+        CHECK(class_linker->GetInternTable()->LookupStrong(Thread::Current(), str) != nullptr);
+      }
     }
     gc_roots[i] = GcRoot<mirror::Object>(object);
   }
@@ -316,6 +317,31 @@
   return data - ComputeRootTableSize(roots);
 }
 
+// Helper for the GC to process a weak class in a JIT root table.
+static inline void ProcessWeakClass(GcRoot<mirror::Class>* root_ptr, IsMarkedVisitor* visitor)
+    REQUIRES_SHARED(Locks::mutator_lock_) {
+  // This does not need a read barrier because this is called by GC.
+  mirror::Class* cls = root_ptr->Read<kWithoutReadBarrier>();
+  if (cls != nullptr) {
+    DCHECK((cls->IsClass<kDefaultVerifyFlags, kWithoutReadBarrier>()));
+    // Look at the classloader of the class to know if it has been unloaded.
+    // This does not need a read barrier because this is called by GC.
+    mirror::Object* class_loader =
+        cls->GetClassLoader<kDefaultVerifyFlags, kWithoutReadBarrier>();
+    if (class_loader == nullptr || visitor->IsMarked(class_loader) != nullptr) {
+      // The class loader is live, update the entry if the class has moved.
+      mirror::Class* new_cls = down_cast<mirror::Class*>(visitor->IsMarked(cls));
+      // Note that new_object can be null for CMS and newly allocated objects.
+      if (new_cls != nullptr && new_cls != cls) {
+        *root_ptr = GcRoot<mirror::Class>(new_cls);
+      }
+    } else {
+      // The class loader is not live, clear the entry.
+      *root_ptr = GcRoot<mirror::Class>(nullptr);
+    }
+  }
+}
+
 void JitCodeCache::SweepRootTables(IsMarkedVisitor* visitor) {
   MutexLock mu(Thread::Current(), lock_);
   for (const auto& entry : method_code_map_) {
@@ -325,17 +351,22 @@
     for (uint32_t i = 0; i < number_of_roots; ++i) {
       // This does not need a read barrier because this is called by GC.
       mirror::Object* object = roots[i].Read<kWithoutReadBarrier>();
-      DCHECK(object != nullptr);
-      mirror::Object* new_object = visitor->IsMarked(object);
-      // We know the string is marked because it's a strongly-interned string that
-      // is always alive. The IsMarked implementation of the CMS collector returns
-      // null for newly allocated objects, but we know those haven't moved. Therefore,
-      // only update the entry if we get a different non-null string.
-      // TODO: Do not use IsMarked for j.l.Class, and adjust once we move this method
-      // out of the weak access/creation pause. b/32167580
-      if (new_object != nullptr && new_object != object) {
-        DCHECK(new_object->IsString());
-        roots[i] = GcRoot<mirror::Object>(new_object);
+      if (object == nullptr) {
+        // entry got deleted in a previous sweep.
+      } else if (object->IsString<kDefaultVerifyFlags, kWithoutReadBarrier>()) {
+        mirror::Object* new_object = visitor->IsMarked(object);
+        // We know the string is marked because it's a strongly-interned string that
+        // is always alive. The IsMarked implementation of the CMS collector returns
+        // null for newly allocated objects, but we know those haven't moved. Therefore,
+        // only update the entry if we get a different non-null string.
+        // TODO: Do not use IsMarked for j.l.Class, and adjust once we move this method
+        // out of the weak access/creation pause. b/32167580
+        if (new_object != nullptr && new_object != object) {
+          DCHECK(new_object->IsString());
+          roots[i] = GcRoot<mirror::Object>(new_object);
+        }
+      } else {
+        ProcessWeakClass(reinterpret_cast<GcRoot<mirror::Class>*>(&roots[i]), visitor);
       }
     }
   }
@@ -344,26 +375,7 @@
     for (size_t i = 0; i < info->number_of_inline_caches_; ++i) {
       InlineCache* cache = &info->cache_[i];
       for (size_t j = 0; j < InlineCache::kIndividualCacheSize; ++j) {
-        // This does not need a read barrier because this is called by GC.
-        mirror::Class* cls = cache->classes_[j].Read<kWithoutReadBarrier>();
-        if (cls != nullptr) {
-          // Look at the classloader of the class to know if it has been
-          // unloaded.
-          // This does not need a read barrier because this is called by GC.
-          mirror::Object* class_loader =
-              cls->GetClassLoader<kDefaultVerifyFlags, kWithoutReadBarrier>();
-          if (class_loader == nullptr || visitor->IsMarked(class_loader) != nullptr) {
-            // The class loader is live, update the entry if the class has moved.
-            mirror::Class* new_cls = down_cast<mirror::Class*>(visitor->IsMarked(cls));
-            // Note that new_object can be null for CMS and newly allocated objects.
-            if (new_cls != nullptr && new_cls != cls) {
-              cache->classes_[j] = GcRoot<mirror::Class>(new_cls);
-            }
-          } else {
-            // The class loader is not live, clear the entry.
-            cache->classes_[j] = GcRoot<mirror::Class>(nullptr);
-          }
-        }
+        ProcessWeakClass(&cache->classes_[j], visitor);
       }
     }
   }
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 01a2ad8..3c641b0 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -224,8 +224,8 @@
   }
   std::string temp;
   if (is_static) {
-    field = mirror::Class::FindStaticField(soa.Self(), c, name,
-                                           field_type->GetDescriptor(&temp));
+    field = mirror::Class::FindStaticField(
+        soa.Self(), c.Get(), name, field_type->GetDescriptor(&temp));
   } else {
     field = c->FindInstanceField(name, field_type->GetDescriptor(&temp));
   }
diff --git a/runtime/mirror/class.cc b/runtime/mirror/class.cc
index 0cfe29b..a862c97 100644
--- a/runtime/mirror/class.cc
+++ b/runtime/mirror/class.cc
@@ -276,7 +276,7 @@
   if (num_direct_interfaces > 0) {
     os << "  interfaces (" << num_direct_interfaces << "):\n";
     for (size_t i = 0; i < num_direct_interfaces; ++i) {
-      ObjPtr<Class> interface = GetDirectInterface(self, h_this, i);
+      ObjPtr<Class> interface = GetDirectInterface(self, h_this.Get(), i);
       if (interface == nullptr) {
         os << StringPrintf("    %2zd: nullptr!\n", i);
       } else {
@@ -799,24 +799,21 @@
 }
 
 ArtField* Class::FindStaticField(Thread* self,
-                                 Handle<Class> klass,
+                                 ObjPtr<Class> klass,
                                  const StringPiece& name,
                                  const StringPiece& type) {
   // Is the field in this class (or its interfaces), or any of its
   // superclasses (or their interfaces)?
-  for (ObjPtr<Class> k = klass.Get(); k != nullptr; k = k->GetSuperClass()) {
+  for (ObjPtr<Class> k = klass; k != nullptr; k = k->GetSuperClass()) {
     // Is the field in this class?
     ArtField* f = k->FindDeclaredStaticField(name, type);
     if (f != nullptr) {
       return f;
     }
-    // Wrap k incase it moves during GetDirectInterface.
-    StackHandleScope<1> hs(self);
-    HandleWrapperObjPtr<Class> h_k(hs.NewHandleWrapper(&k));
     // Is this field in any of this class' interfaces?
-    for (uint32_t i = 0; i < h_k->NumDirectInterfaces(); ++i) {
-      StackHandleScope<1> hs2(self);
-      Handle<Class> interface(hs2.NewHandle(GetDirectInterface(self, h_k, i)));
+    for (uint32_t i = 0, num_interfaces = k->NumDirectInterfaces(); i != num_interfaces; ++i) {
+      ObjPtr<Class> interface = GetDirectInterface(self, k, i);
+      DCHECK(interface != nullptr);
       f = FindStaticField(self, interface, name, type);
       if (f != nullptr) {
         return f;
@@ -839,11 +836,10 @@
-    // Though GetDirectInterface() should not cause thread suspension when called
-    // from here, it takes a Handle as an argument, so we need to wrap `k`.
+    // GetDirectInterface() should not cause thread suspension when called from here.
     ScopedAssertNoThreadSuspension ants(__FUNCTION__);
-    StackHandleScope<1> hs(self);
-    Handle<Class> h_k(hs.NewHandle(k));
     // Is this field in any of this class' interfaces?
-    for (uint32_t i = 0; i < h_k->NumDirectInterfaces(); ++i) {
-      ObjPtr<Class> interface = GetDirectInterface(self, h_k, i);
+    for (uint32_t i = 0, num_interfaces = k->NumDirectInterfaces(); i != num_interfaces; ++i) {
+      ObjPtr<Class> interface = GetDirectInterface(self, k, i);
+      DCHECK(interface != nullptr);
       f = FindStaticField(self, interface, dex_cache, dex_field_idx);
       if (f != nullptr) {
         return f;
@@ -854,11 +850,11 @@
 }
 
 ArtField* Class::FindField(Thread* self,
-                           Handle<Class> klass,
+                           ObjPtr<Class> klass,
                            const StringPiece& name,
                            const StringPiece& type) {
   // Find a field using the JLS field resolution order
-  for (ObjPtr<Class> k = klass.Get(); k != nullptr; k = k->GetSuperClass()) {
+  for (ObjPtr<Class> k = klass; k != nullptr; k = k->GetSuperClass()) {
     // Is the field in this class?
     ArtField* f = k->FindDeclaredInstanceField(name, type);
     if (f != nullptr) {
@@ -869,12 +865,10 @@
       return f;
     }
     // Is this field in any of this class' interfaces?
-    StackHandleScope<1> hs(self);
-    HandleWrapperObjPtr<Class> h_k(hs.NewHandleWrapper(&k));
-    for (uint32_t i = 0; i < h_k->NumDirectInterfaces(); ++i) {
-      StackHandleScope<1> hs2(self);
-      Handle<Class> interface(hs2.NewHandle(GetDirectInterface(self, h_k, i)));
-      f = interface->FindStaticField(self, interface, name, type);
+    for (uint32_t i = 0, num_interfaces = k->NumDirectInterfaces(); i != num_interfaces; ++i) {
+      ObjPtr<Class> interface = GetDirectInterface(self, k, i);
+      DCHECK(interface != nullptr);
+      f = FindStaticField(self, interface, name, type);
       if (f != nullptr) {
         return f;
       }
@@ -929,36 +923,46 @@
   return GetInterfaceTypeList()->GetTypeItem(idx).type_idx_;
 }
 
-ObjPtr<Class> Class::GetDirectInterface(Thread* self,
-                                        Handle<Class> klass,
-                                        uint32_t idx) {
-  DCHECK(klass.Get() != nullptr);
+ObjPtr<Class> Class::GetDirectInterface(Thread* self, ObjPtr<Class> klass, uint32_t idx) {
+  DCHECK(klass != nullptr);
   DCHECK(!klass->IsPrimitive());
   if (klass->IsArrayClass()) {
     ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+    // Use ClassLinker::LookupClass(); avoid poisoning ObjPtr<>s by ClassLinker::FindSystemClass().
+    ObjPtr<Class> interface;
     if (idx == 0) {
-      return class_linker->FindSystemClass(self, "Ljava/lang/Cloneable;");
+      interface = class_linker->LookupClass(self, "Ljava/lang/Cloneable;", nullptr);
     } else {
       DCHECK_EQ(1U, idx);
-      return class_linker->FindSystemClass(self, "Ljava/io/Serializable;");
+      interface = class_linker->LookupClass(self, "Ljava/io/Serializable;", nullptr);
     }
+    DCHECK(interface != nullptr);
+    return interface;
   } else if (klass->IsProxyClass()) {
-    ObjPtr<ObjectArray<Class>> interfaces = klass.Get()->GetInterfaces();
+    ObjPtr<ObjectArray<Class>> interfaces = klass->GetInterfaces();
     DCHECK(interfaces != nullptr);
     return interfaces->Get(idx);
   } else {
     dex::TypeIndex type_idx = klass->GetDirectInterfaceTypeIdx(idx);
     ObjPtr<Class> interface = klass->GetDexCache()->GetResolvedType(type_idx);
-    if (interface == nullptr) {
-      interface = Runtime::Current()->GetClassLinker()->ResolveType(klass->GetDexFile(),
-                                                                    type_idx,
-                                                                    klass.Get());
-      CHECK(interface != nullptr || self->IsExceptionPending());
-    }
     return interface;
   }
 }
 
+ObjPtr<Class> Class::ResolveDirectInterface(Thread* self, Handle<Class> klass, uint32_t idx) {
+  ObjPtr<Class> interface = GetDirectInterface(self, klass.Get(), idx);
+  if (interface == nullptr) {
+    DCHECK(!klass->IsArrayClass());
+    DCHECK(!klass->IsProxyClass());
+    dex::TypeIndex type_idx = klass->GetDirectInterfaceTypeIdx(idx);
+    interface = Runtime::Current()->GetClassLinker()->ResolveType(klass->GetDexFile(),
+                                                                  type_idx,
+                                                                  klass.Get());
+    CHECK(interface != nullptr || self->IsExceptionPending());
+  }
+  return interface;
+}
+
 ObjPtr<Class> Class::GetCommonSuperClass(Handle<Class> klass) {
   DCHECK(klass.Get() != nullptr);
   DCHECK(!klass->IsInterface());
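
For reference, the lookup order preserved by the rewritten FindStaticField()/FindField() above — declared fields first, then direct interfaces recursively, then the superclass chain — boils down to the following minimal sketch, using plain pointers and hypothetical stand-in types in place of ObjPtr<> and ArtField:

    #include <string>
    #include <vector>

    // Hypothetical stand-ins for mirror::Class and ArtField, for illustration only.
    struct Field { std::string name; };
    struct Klass {
      std::vector<Field> static_fields;
      std::vector<Klass*> direct_interfaces;
      Klass* super = nullptr;

      Field* FindDeclaredStaticField(const std::string& name) {
        for (Field& f : static_fields) {
          if (f.name == name) return &f;
        }
        return nullptr;
      }
    };

    // Same order as Class::FindStaticField() above: this class first, then its
    // direct interfaces (recursively), then up the superclass chain.
    Field* FindStaticField(Klass* klass, const std::string& name) {
      for (Klass* k = klass; k != nullptr; k = k->super) {
        if (Field* f = k->FindDeclaredStaticField(name)) {
          return f;
        }
        for (Klass* iface : k->direct_interfaces) {
          if (Field* f = FindStaticField(iface, name)) {
            return f;
          }
        }
      }
      return nullptr;
    }

    int main() {
      Klass iface;
      iface.static_fields.push_back({"CASE_INSENSITIVE_ORDER"});
      Klass klass;
      klass.direct_interfaces.push_back(&iface);
      // Found via the interface, not the class itself.
      return FindStaticField(&klass, "CASE_INSENSITIVE_ORDER") != nullptr ? 0 : 1;
    }
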
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 248c941..d7449c8 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -1087,7 +1087,9 @@
   ArtField* GetStaticField(uint32_t i) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Find a static or instance field using the JLS resolution order
-  static ArtField* FindField(Thread* self, Handle<Class> klass, const StringPiece& name,
+  static ArtField* FindField(Thread* self,
+                             ObjPtr<Class> klass,
+                             const StringPiece& name,
                              const StringPiece& type)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -1108,7 +1110,7 @@
 
   // Finds the given static field in this class or a superclass.
   static ArtField* FindStaticField(Thread* self,
-                                   Handle<Class> klass,
+                                   ObjPtr<Class> klass,
                                    const StringPiece& name,
                                    const StringPiece& type)
       REQUIRES_SHARED(Locks::mutator_lock_);
@@ -1204,9 +1206,15 @@
 
   dex::TypeIndex GetDirectInterfaceTypeIdx(uint32_t idx) REQUIRES_SHARED(Locks::mutator_lock_);
 
-  static ObjPtr<Class> GetDirectInterface(Thread* self,
-                                          Handle<Class> klass,
-                                          uint32_t idx)
+  // Get the direct interface of the `klass` at index `idx` if resolved, otherwise return null.
+  // If the caller expects the interface to be resolved, for example for a resolved `klass`,
+  // that assumption should be checked by `DCHECK(result != nullptr)`.
+  static ObjPtr<Class> GetDirectInterface(Thread* self, ObjPtr<Class> klass, uint32_t idx)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Resolve and get the direct interface of the `klass` at index `idx`.
+  // Returns null with a pending exception if the resolution fails.
+  static ObjPtr<Class> ResolveDirectInterface(Thread* self, Handle<Class> klass, uint32_t idx)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   const char* GetSourceFile() REQUIRES_SHARED(Locks::mutator_lock_);
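
The header split above follows a common pattern: a cheap, non-resolving accessor that may return null, plus a resolving wrapper for callers that need the slow path. A minimal sketch of that pattern, with a hypothetical string-keyed cache standing in for the dex cache:

    #include <map>
    #include <optional>
    #include <string>

    // Illustrative cache with the same split as GetDirectInterface() vs.
    // ResolveDirectInterface(); names and types are hypothetical.
    class TypeCache {
     public:
      // Cheap, non-resolving probe: an empty result just means "not resolved yet".
      std::optional<std::string> Lookup(int idx) const {
        auto it = cache_.find(idx);
        if (it == cache_.end()) {
          return std::nullopt;
        }
        return it->second;
      }

      // Resolving wrapper: fast path through Lookup(), slow path fills the cache.
      std::string Resolve(int idx) {
        if (std::optional<std::string> cached = Lookup(idx)) {
          return *cached;
        }
        std::string resolved = "type#" + std::to_string(idx);  // stand-in for real resolution
        cache_[idx] = resolved;
        return resolved;
      }

     private:
      std::map<int, std::string> cache_;
    };

    int main() {
      TypeCache cache;
      bool miss = !cache.Lookup(42).has_value();  // no side effects on a miss
      std::string t = cache.Resolve(42);          // resolves and caches
      bool hit = cache.Lookup(42).has_value();
      return (miss && hit && t == "type#42") ? 0 : 1;
    }
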
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index be8815a..b54e416 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -71,6 +71,8 @@
 }
 
 inline Class* DexCache::GetResolvedType(dex::TypeIndex type_idx) {
+  // It is theorized that a load acquire is not required since obtaining the resolved class will
+  // always have an address dependency or a lock.
   DCHECK_LT(type_idx.index_, NumResolvedTypes());
   return GetResolvedTypes()[type_idx.index_].Read();
 }
@@ -78,7 +80,11 @@
 inline void DexCache::SetResolvedType(dex::TypeIndex type_idx, ObjPtr<Class> resolved) {
   DCHECK_LT(type_idx.index_, NumResolvedTypes());  // NOTE: Unchecked, i.e. not throwing AIOOB.
   // TODO default transaction support.
-  GetResolvedTypes()[type_idx.index_] = GcRoot<Class>(resolved);
+  // Use a release store for SetResolvedType. This is done to prevent other threads from seeing a
+  // class but not necessarily seeing the loaded members like the static fields array.
+  // See b/32075261.
+  reinterpret_cast<Atomic<GcRoot<mirror::Class>>&>(GetResolvedTypes()[type_idx.index_]).
+      StoreRelease(GcRoot<Class>(resolved));
   // TODO: Fine-grained marking, so that we don't need to go through all arrays in full.
   Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(this);
 }
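
The release store above pairs with the dependency-ordered load noted in GetResolvedType(): a thread that observes the published class pointer is guaranteed to also observe the stores that initialized it. A minimal sketch of that publication pattern with std::atomic, where an acquire load stands in for the address dependency (the portable over-approximation):

    #include <atomic>
    #include <cassert>
    #include <thread>

    struct Klass {
      int static_fields_ready = 0;
    };

    std::atomic<Klass*> resolved{nullptr};

    void Publisher() {
      Klass* k = new Klass();      // leaked; this is only a sketch
      k->static_fields_ready = 1;  // initialize members first...
      resolved.store(k, std::memory_order_release);  // ...then publish with release
    }

    void Consumer() {
      // memory_order_consume would model the address dependency mentioned in
      // GetResolvedType(); compilers promote it to acquire, the safe portable choice.
      Klass* k = resolved.load(std::memory_order_acquire);
      if (k != nullptr) {
        // Guaranteed by the release/acquire pairing: members are visible.
        assert(k->static_fields_ready == 1);
      }
    }

    int main() {
      std::thread t1(Publisher);
      std::thread t2(Consumer);
      t1.join();
      t2.join();
    }
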
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 4b47f7f..a6f56ae 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -140,9 +140,9 @@
   Handle<mirror::Class> klass(hs.NewHandle(oa->GetClass()));
   ASSERT_EQ(2U, klass->NumDirectInterfaces());
   EXPECT_OBJ_PTR_EQ(class_linker_->FindSystemClass(soa.Self(), "Ljava/lang/Cloneable;"),
-                    mirror::Class::GetDirectInterface(soa.Self(), klass, 0));
+                    mirror::Class::GetDirectInterface(soa.Self(), klass.Get(), 0));
   EXPECT_OBJ_PTR_EQ(class_linker_->FindSystemClass(soa.Self(), "Ljava/io/Serializable;"),
-                    mirror::Class::GetDirectInterface(soa.Self(), klass, 1));
+                    mirror::Class::GetDirectInterface(soa.Self(), klass.Get(), 1));
 }
 
 TEST_F(ObjectTest, AllocArray) {
@@ -708,19 +708,19 @@
   // Wrong type.
   EXPECT_TRUE(c->FindDeclaredStaticField("CASE_INSENSITIVE_ORDER", "I") == nullptr);
   EXPECT_TRUE(mirror::Class::FindStaticField(
-      soa.Self(), c, "CASE_INSENSITIVE_ORDER", "I") == nullptr);
+      soa.Self(), c.Get(), "CASE_INSENSITIVE_ORDER", "I") == nullptr);
 
   // Wrong name.
   EXPECT_TRUE(c->FindDeclaredStaticField(
       "cASE_INSENSITIVE_ORDER", "Ljava/util/Comparator;") == nullptr);
   EXPECT_TRUE(
-      mirror::Class::FindStaticField(soa.Self(), c, "cASE_INSENSITIVE_ORDER",
-                                     "Ljava/util/Comparator;") == nullptr);
+      mirror::Class::FindStaticField(
+          soa.Self(), c.Get(), "cASE_INSENSITIVE_ORDER", "Ljava/util/Comparator;") == nullptr);
 
   // Right name and type.
   ArtField* f1 = c->FindDeclaredStaticField("CASE_INSENSITIVE_ORDER", "Ljava/util/Comparator;");
-  ArtField* f2 = mirror::Class::FindStaticField(soa.Self(), c, "CASE_INSENSITIVE_ORDER",
-                                                "Ljava/util/Comparator;");
+  ArtField* f2 = mirror::Class::FindStaticField(
+      soa.Self(), c.Get(), "CASE_INSENSITIVE_ORDER", "Ljava/util/Comparator;");
   EXPECT_TRUE(f1 != nullptr);
   EXPECT_TRUE(f2 != nullptr);
   EXPECT_EQ(f1, f2);
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index 6870fda..95516ac 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -305,8 +305,11 @@
 
 template<typename MemoryType>
 bool String::AllASCII(const MemoryType* const chars, const int length) {
+  static_assert(std::is_unsigned<MemoryType>::value, "Expecting unsigned MemoryType");
   for (int i = 0; i < length; ++i) {
-    if (chars[i] >= 0x80) {
+    // Valid ASCII characters are in range 1..0x7f. Zero is not considered ASCII
+    // because it would complicate the detection of ASCII strings in Modified-UTF8.
+    if ((chars[i] - 1u) >= 0x7fu) {
       return false;
     }
   }
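
The rewritten test folds two comparisons into one: in unsigned arithmetic, (chars[i] - 1u) >= 0x7fu rejects both zero (which wraps around to 0xffffffff) and everything at or above 0x80, leaving exactly the range 1..0x7f. A quick standalone check of that identity:

    #include <cassert>
    #include <cstdint>

    // Single unsigned comparison equivalent to (c != 0 && c < 0x80):
    // for c == 0, (c - 1u) wraps to 0xffffffff and fails the bound together
    // with everything at or above 0x80.
    inline bool IsAscii(uint16_t c) {
      return (c - 1u) < 0x7fu;
    }

    int main() {
      assert(!IsAscii(0x00));  // NUL excluded; Modified-UTF8 encodes it in two bytes
      assert(IsAscii(0x01));
      assert(IsAscii(0x7f));
      assert(!IsAscii(0x80));
      assert(!IsAscii(0xffff));
    }
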
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 642826c..3341f53 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -278,7 +278,7 @@
 
     uint32_t num_direct_interfaces = h_clazz->NumDirectInterfaces();
     for (uint32_t i = 0; i < num_direct_interfaces; i++) {
-      ObjPtr<mirror::Class> iface = mirror::Class::GetDirectInterface(self, h_clazz, i);
+      ObjPtr<mirror::Class> iface = mirror::Class::ResolveDirectInterface(self, h_clazz, i);
       if (UNLIKELY(iface == nullptr)) {
         self->AssertPendingException();
         return nullptr;
diff --git a/runtime/oat_file_assistant_test.cc b/runtime/oat_file_assistant_test.cc
index 5730cf2..94c12af 100644
--- a/runtime/oat_file_assistant_test.cc
+++ b/runtime/oat_file_assistant_test.cc
@@ -1297,7 +1297,7 @@
 
   for (std::pair<OatFileAssistant::DexOptNeeded, const char*> field : mapping) {
     ArtField* art_field = mirror::Class::FindStaticField(
-        soa.Self(), dexfile, field.second, "I");
+        soa.Self(), dexfile.Get(), field.second, "I");
     ASSERT_FALSE(art_field == nullptr);
     EXPECT_EQ(art_field->GetTypeAsPrimitiveType(), Primitive::kPrimInt);
     EXPECT_EQ(field.first, art_field->GetInt(dexfile.Get()));
diff --git a/runtime/obj_ptr.h b/runtime/obj_ptr.h
index d24c6fb..2da2ae5 100644
--- a/runtime/obj_ptr.h
+++ b/runtime/obj_ptr.h
@@ -51,27 +51,24 @@
       REQUIRES_SHARED(Locks::mutator_lock_)
       : reference_(0u) {}
 
-  template <typename Type>
+  template <typename Type,
+            typename = typename std::enable_if<std::is_base_of<MirrorType, Type>::value>::type>
   ALWAYS_INLINE ObjPtr(Type* ptr)  // NOLINT
       REQUIRES_SHARED(Locks::mutator_lock_)
       : reference_(Encode(static_cast<MirrorType*>(ptr))) {
-    static_assert(std::is_base_of<MirrorType, Type>::value,
-                  "Input type must be a subtype of the ObjPtr type");
   }
 
-  template <typename Type>
+  template <typename Type,
+            typename = typename std::enable_if<std::is_base_of<MirrorType, Type>::value>::type>
   ALWAYS_INLINE ObjPtr(const ObjPtr<Type, kPoison>& other)  // NOLINT
       REQUIRES_SHARED(Locks::mutator_lock_)
       : reference_(Encode(static_cast<MirrorType*>(other.Ptr()))) {
-    static_assert(std::is_base_of<MirrorType, Type>::value,
-                  "Input type must be a subtype of the ObjPtr type");
   }
 
-  template <typename Type>
+  template <typename Type,
+            typename = typename std::enable_if<std::is_base_of<MirrorType, Type>::value>::type>
   ALWAYS_INLINE ObjPtr& operator=(const ObjPtr<Type, kPoison>& other)
       REQUIRES_SHARED(Locks::mutator_lock_) {
-    static_assert(std::is_base_of<MirrorType, Type>::value,
-                  "Input type must be a subtype of the ObjPtr type");
     reference_ = Encode(static_cast<MirrorType*>(other.Ptr()));
     return *this;
   }
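
Moving the subtype check from a static_assert in the body to a defaulted enable_if parameter is not just cosmetic: with SFINAE the constructor drops out of overload resolution for non-subtypes, so type traits give the right answer instead of reporting true and failing only once the constructor is instantiated. A minimal sketch, using hypothetical Base/Derived types:

    #include <type_traits>

    struct Base {};
    struct Derived : Base {};
    struct Unrelated {};

    // Hypothetical simplified ObjPtr: the subtype requirement lives in a
    // defaulted template parameter instead of a static_assert in the body.
    template <typename MirrorType>
    class Ptr {
     public:
      template <typename Type,
                typename = typename std::enable_if<
                    std::is_base_of<MirrorType, Type>::value>::type>
      Ptr(Type* ptr) : ptr_(static_cast<MirrorType*>(ptr)) {}

     private:
      MirrorType* ptr_;
    };

    // With SFINAE the constructor is simply absent for non-subtypes, so the
    // traits below are answerable; a static_assert in the body would let
    // is_constructible report true and fail later on actual use.
    static_assert(std::is_constructible<Ptr<Base>, Derived*>::value, "subtype ok");
    static_assert(!std::is_constructible<Ptr<Base>, Unrelated*>::value, "rejected");

    int main() {}
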
diff --git a/runtime/openjdkjvmti/ti_heap.cc b/runtime/openjdkjvmti/ti_heap.cc
index 5e588a8..7b2521d 100644
--- a/runtime/openjdkjvmti/ti_heap.cc
+++ b/runtime/openjdkjvmti/ti_heap.cc
@@ -174,10 +174,11 @@
 class FollowReferencesHelper FINAL {
  public:
   FollowReferencesHelper(HeapUtil* h,
-                         art::ObjPtr<art::mirror::Object> initial_object ATTRIBUTE_UNUSED,
+                         art::ObjPtr<art::mirror::Object> initial_object,
                          const jvmtiHeapCallbacks* callbacks,
                          const void* user_data)
       : tag_table_(h->GetTags()),
+        initial_object_(initial_object),
         callbacks_(callbacks),
         user_data_(user_data),
         start_(0),
@@ -187,13 +188,23 @@
   void Init()
       REQUIRES_SHARED(art::Locks::mutator_lock_)
       REQUIRES(!*tag_table_->GetAllowDisallowLock()) {
-    CollectAndReportRootsVisitor carrv(this, tag_table_, &worklist_, &visited_);
-    art::Runtime::Current()->VisitRoots(&carrv);
-    art::Runtime::Current()->VisitImageRoots(&carrv);
-    stop_reports_ = carrv.IsStopReports();
+    if (initial_object_.IsNull()) {
+      CollectAndReportRootsVisitor carrv(this, tag_table_, &worklist_, &visited_);
 
-    if (stop_reports_) {
-      worklist_.clear();
+      // We need precise info (e.g., vregs).
+      constexpr art::VisitRootFlags kRootFlags = static_cast<art::VisitRootFlags>(
+          art::VisitRootFlags::kVisitRootFlagAllRoots | art::VisitRootFlags::kVisitRootFlagPrecise);
+      art::Runtime::Current()->VisitRoots(&carrv, kRootFlags);
+
+      art::Runtime::Current()->VisitImageRoots(&carrv);
+      stop_reports_ = carrv.IsStopReports();
+
+      if (stop_reports_) {
+        worklist_.clear();
+      }
+    } else {
+      visited_.insert(initial_object_.Ptr());
+      worklist_.push_back(initial_object_.Ptr());
     }
   }
 
@@ -316,7 +327,36 @@
         }
 
         case art::RootType::kRootJavaFrame:
+        {
+          uint32_t thread_id = info.GetThreadId();
+          ref_info->stack_local.thread_id = thread_id;
+
+          art::Thread* thread = FindThread(info);
+          if (thread != nullptr) {
+            art::mirror::Object* thread_obj;
+            if (thread->IsStillStarting()) {
+              thread_obj = nullptr;
+            } else {
+              thread_obj = thread->GetPeer();
+            }
+            if (thread_obj != nullptr) {
+              ref_info->stack_local.thread_tag = tag_table_->GetTagOrZero(thread_obj);
+            }
+          }
+
+          auto& java_info = static_cast<const art::JavaFrameRootInfo&>(info);
+          ref_info->stack_local.slot = static_cast<jint>(java_info.GetVReg());
+          const art::StackVisitor* visitor = java_info.GetVisitor();
+          ref_info->stack_local.location =
+              static_cast<jlocation>(visitor->GetDexPc(false /* abort_on_failure */));
+          ref_info->stack_local.depth = static_cast<jint>(visitor->GetFrameDepth());
+          art::ArtMethod* method = visitor->GetMethod();
+          if (method != nullptr) {
+            ref_info->stack_local.method = art::jni::EncodeArtMethod(method);
+          }
+
           return JVMTI_HEAP_REFERENCE_STACK_LOCAL;
+        }
 
         case art::RootType::kRootNativeStack:
         case art::RootType::kRootThreadBlock:
@@ -484,7 +524,7 @@
     art::Handle<art::mirror::Class> h_klass(hs.NewHandle<art::mirror::Class>(klass));
     for (size_t i = 0; i < h_klass->NumDirectInterfaces(); ++i) {
       art::ObjPtr<art::mirror::Class> inf_klass =
-          art::mirror::Class::GetDirectInterface(self, h_klass, i);
+          art::mirror::Class::ResolveDirectInterface(self, h_klass, i);
       if (inf_klass == nullptr) {
         // TODO: With a resolved class this should not happen...
         self->ClearException();
@@ -616,6 +656,7 @@
   }
 
   ObjectTagTable* tag_table_;
+  art::ObjPtr<art::mirror::Object> initial_object_;
   const jvmtiHeapCallbacks* callbacks_;
   const void* user_data_;
 
@@ -646,20 +687,28 @@
   }
 
   art::Thread* self = art::Thread::Current();
-  art::ScopedObjectAccess soa(self);      // Now we know we have the shared lock.
 
-  art::Runtime::Current()->GetHeap()->IncrementDisableMovingGC(self);
+  art::gc::Heap* heap = art::Runtime::Current()->GetHeap();
+  if (heap->IsGcConcurrentAndMoving()) {
+    // Need to take a heap dump while GC isn't running. See the
+    // comment in Heap::VisitObjects().
+    heap->IncrementDisableMovingGC(self);
+  }
   {
-    art::ObjPtr<art::mirror::Object> o_initial = soa.Decode<art::mirror::Object>(initial_object);
-
+    art::ScopedObjectAccess soa(self);      // Now we know we have the shared lock.
     art::ScopedThreadSuspension sts(self, art::kWaitingForVisitObjects);
     art::ScopedSuspendAll ssa("FollowReferences");
 
-    FollowReferencesHelper frh(this, o_initial, callbacks, user_data);
+    FollowReferencesHelper frh(this,
+                               self->DecodeJObject(initial_object),
+                               callbacks,
+                               user_data);
     frh.Init();
     frh.Work();
   }
-  art::Runtime::Current()->GetHeap()->DecrementDisableMovingGC(self);
+  if (heap->IsGcConcurrentAndMoving()) {
+    heap->DecrementDisableMovingGC(self);
+  }
 
   return ERR(NONE);
 }
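
Init() now seeds the traversal either from the runtime's roots or from the single initial object; the rest of the walk is an ordinary worklist BFS over a visited set. A minimal sketch of that shape, with a hypothetical Object type standing in for tagged heap objects:

    #include <deque>
    #include <unordered_set>
    #include <vector>

    struct Object { std::vector<Object*> references; };

    // Seed from the single initial object if given, otherwise from the root set,
    // then do the usual visited-checked worklist walk.
    void FollowReferences(const std::vector<Object*>& roots, Object* initial) {
      std::unordered_set<Object*> visited;
      std::deque<Object*> worklist;
      if (initial != nullptr) {
        visited.insert(initial);
        worklist.push_back(initial);
      } else {
        for (Object* root : roots) {
          if (visited.insert(root).second) {
            worklist.push_back(root);  // the real helper reports roots here
          }
        }
      }
      while (!worklist.empty()) {
        Object* obj = worklist.front();
        worklist.pop_front();
        for (Object* ref : obj->references) {
          if (ref != nullptr && visited.insert(ref).second) {
            worklist.push_back(ref);  // the real helper invokes the callback here
          }
        }
      }
    }

    int main() {
      Object a, b;
      a.references.push_back(&b);
      b.references.push_back(&a);       // cycle: the visited set stops the walk
      FollowReferences({&a}, nullptr);  // root-seeded traversal
      FollowReferences({}, &b);         // traversal from an initial object
    }
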
diff --git a/runtime/proxy_test.cc b/runtime/proxy_test.cc
index fd7e56d..1292a81 100644
--- a/runtime/proxy_test.cc
+++ b/runtime/proxy_test.cc
@@ -128,8 +128,8 @@
   ASSERT_TRUE(proxy_class->IsInitialized());
 
   EXPECT_EQ(2U, proxy_class->NumDirectInterfaces());  // Interfaces$I and Interfaces$J.
-  EXPECT_OBJ_PTR_EQ(I.Get(), mirror::Class::GetDirectInterface(soa.Self(), proxy_class, 0));
-  EXPECT_OBJ_PTR_EQ(J.Get(), mirror::Class::GetDirectInterface(soa.Self(), proxy_class, 1));
+  EXPECT_OBJ_PTR_EQ(I.Get(), mirror::Class::GetDirectInterface(soa.Self(), proxy_class.Get(), 0));
+  EXPECT_OBJ_PTR_EQ(J.Get(), mirror::Class::GetDirectInterface(soa.Self(), proxy_class.Get(), 1));
   std::string temp;
   const char* proxy_class_descriptor = proxy_class->GetDescriptor(&temp);
   EXPECT_STREQ("L$Proxy1234;", proxy_class_descriptor);
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 66bb803..14628f0 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1493,11 +1493,6 @@
 }
 
 void Runtime::DumpForSigQuit(std::ostream& os) {
-  // Dumping for SIGQIT may cause deadlocks if the the debugger is active. b/26118154
-  if (Dbg::IsDebuggerActive()) {
-    LOG(INFO) << "Skipping DumpForSigQuit due to active debugger";
-    return;
-  }
   GetClassLinker()->DumpForSigQuit(os);
   GetInternTable()->DumpForSigQuit(os);
   GetJavaVM()->DumpForSigQuit(os);
@@ -1700,13 +1695,13 @@
   VisitTransactionRoots(visitor);
 }
 
-void Runtime::VisitNonConcurrentRoots(RootVisitor* visitor) {
-  thread_list_->VisitRoots(visitor);
+void Runtime::VisitNonConcurrentRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  VisitThreadRoots(visitor, flags);
   VisitNonThreadRoots(visitor);
 }
 
-void Runtime::VisitThreadRoots(RootVisitor* visitor) {
-  thread_list_->VisitRoots(visitor);
+void Runtime::VisitThreadRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  thread_list_->VisitRoots(visitor, flags);
 }
 
 size_t Runtime::FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
@@ -1715,7 +1710,7 @@
 }
 
 void Runtime::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
-  VisitNonConcurrentRoots(visitor);
+  VisitNonConcurrentRoots(visitor, flags);
   VisitConcurrentRoots(visitor, flags);
 }
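
The flags threaded through VisitRoots() are plain enum bits; combining them needs a cast back to the enum type because operator| on enumerators yields int, as in the static_cast in FollowReferencesHelper::Init() above. A small sketch of that usage — the enum is removed from runtime.h in the next hunk, and its relocated definition (including the precise bit's value, assumed 0x80 here) is outside this diff:

    #include <cstdint>

    // Illustrative subset of VisitRootFlags; the 0x80 value for the precise
    // bit is an assumption, since the enum's new home is not shown in this diff.
    enum VisitRootFlags : uint8_t {
      kVisitRootFlagAllRoots = 0x1,
      kVisitRootFlagNewRoots = 0x2,
      kVisitRootFlagPrecise = 0x80,
    };

    // operator| on enumerators yields int, hence the cast back to the enum type.
    constexpr VisitRootFlags operator|(VisitRootFlags lhs, VisitRootFlags rhs) {
      return static_cast<VisitRootFlags>(static_cast<uint8_t>(lhs) |
                                         static_cast<uint8_t>(rhs));
    }

    constexpr bool HasFlag(VisitRootFlags flags, VisitRootFlags bit) {
      return (static_cast<uint8_t>(flags) & static_cast<uint8_t>(bit)) != 0u;
    }

    int main() {
      constexpr VisitRootFlags flags = kVisitRootFlagAllRoots | kVisitRootFlagPrecise;
      static_assert(HasFlag(flags, kVisitRootFlagPrecise), "precise bit is set");
      static_assert(!HasFlag(flags, kVisitRootFlagNewRoots), "new-roots bit is not");
      return 0;
    }
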
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index e6b3128..d40c631 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -100,18 +100,6 @@
 
 typedef std::vector<std::pair<std::string, const void*>> RuntimeOptions;
 
-// Not all combinations of flags are valid. You may not visit all roots as well as the new roots
-// (no logical reason to do this). You also may not start logging new roots and stop logging new
-// roots (also no logical reason to do this).
-enum VisitRootFlags : uint8_t {
-  kVisitRootFlagAllRoots = 0x1,
-  kVisitRootFlagNewRoots = 0x2,
-  kVisitRootFlagStartLoggingNewRoots = 0x4,
-  kVisitRootFlagStopLoggingNewRoots = 0x8,
-  kVisitRootFlagClearRootLog = 0x10,
-  kVisitRootFlagClassLoader = 0x20,
-};
-
 class Runtime {
  public:
   // Parse raw runtime options.
@@ -349,28 +337,16 @@
   void VisitTransactionRoots(RootVisitor* visitor)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Visit all of the thread roots.
-  void VisitThreadRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Flip thread roots from from-space refs to to-space refs.
   size_t FlipThreadRoots(Closure* thread_flip_visitor, Closure* flip_callback,
                          gc::collector::GarbageCollector* collector)
       REQUIRES(!Locks::mutator_lock_);
 
-  // Visit all other roots which must be done with mutators suspended.
-  void VisitNonConcurrentRoots(RootVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Sweep system weaks, the system weak is deleted if the visitor return null. Otherwise, the
   // system weak is updated to be the visitor's returned value.
   void SweepSystemWeaks(IsMarkedVisitor* visitor)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  // Constant roots are the roots which never change after the runtime is initialized, they only
-  // need to be visited once per GC cycle.
-  void VisitConstantRoots(RootVisitor* visitor)
-      REQUIRES_SHARED(Locks::mutator_lock_);
-
   // Returns a special method that calls into a trampoline for runtime method resolution
   ArtMethod* GetResolutionMethod();
 
@@ -702,6 +678,19 @@
 
   void MaybeSaveJitProfilingInfo();
 
+  // Visit all of the thread roots.
+  void VisitThreadRoots(RootVisitor* visitor, VisitRootFlags flags)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Visit all other roots which must be done with mutators suspended.
+  void VisitNonConcurrentRoots(RootVisitor* visitor, VisitRootFlags flags)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
+  // Constant roots are the roots which never change after the runtime is initialized, they only
+  // need to be visited once per GC cycle.
+  void VisitConstantRoots(RootVisitor* visitor)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
   // A pointer to the active runtime or null.
   static Runtime* instance_;
 
diff --git a/runtime/stack.cc b/runtime/stack.cc
index f20aa20..792da88 100644
--- a/runtime/stack.cc
+++ b/runtime/stack.cc
@@ -614,12 +614,6 @@
   return result;
 }
 
-static instrumentation::InstrumentationStackFrame& GetInstrumentationStackFrame(Thread* thread,
-                                                                                uint32_t depth) {
-  CHECK_LT(depth, thread->GetInstrumentationStack()->size());
-  return thread->GetInstrumentationStack()->at(depth);
-}
-
 static void AssertPcIsWithinQuickCode(ArtMethod* method, uintptr_t pc)
     REQUIRES_SHARED(Locks::mutator_lock_) {
   if (method->IsNative() || method->IsRuntimeMethod() || method->IsProxyMethod()) {
@@ -777,6 +771,7 @@
   return QuickMethodFrameInfo(frame_size, callee_info.CoreSpillMask(), callee_info.FpSpillMask());
 }
 
+template <StackVisitor::CountTransitions kCount>
 void StackVisitor::WalkStack(bool include_transitions) {
   DCHECK(thread_ == Thread::Current() || thread_->IsSuspended());
   CHECK_EQ(cur_depth_, 0U);
@@ -842,8 +837,9 @@
           // While profiling, the return pc is restored from the side stack, except when walking
           // the stack for an exception where the side stack will be unwound in VisitFrame.
           if (reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc()) == return_pc) {
+            CHECK_LT(instrumentation_stack_depth, thread_->GetInstrumentationStack()->size());
             const instrumentation::InstrumentationStackFrame& instrumentation_frame =
-                GetInstrumentationStackFrame(thread_, instrumentation_stack_depth);
+                thread_->GetInstrumentationStack()->at(instrumentation_stack_depth);
             instrumentation_stack_depth++;
             if (GetMethod() ==
                 Runtime::Current()->GetCalleeSaveMethod(Runtime::kSaveAllCalleeSaves)) {
@@ -907,13 +903,18 @@
         return;
       }
     }
-    cur_depth_++;
+    if (kCount == CountTransitions::kYes) {
+      cur_depth_++;
+    }
   }
   if (num_frames_ != 0) {
     CHECK_EQ(cur_depth_, num_frames_);
   }
 }
 
+template void StackVisitor::WalkStack<StackVisitor::CountTransitions::kYes>(bool);
+template void StackVisitor::WalkStack<StackVisitor::CountTransitions::kNo>(bool);
+
 void JavaFrameRootInfo::Describe(std::ostream& os) const {
   const StackVisitor* visitor = stack_visitor_;
   CHECK(visitor != nullptr);
diff --git a/runtime/stack.h b/runtime/stack.h
index d02e4b7..b1e99e5 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -595,6 +595,12 @@
   // Return 'true' if we should continue to visit more frames, 'false' to stop.
   virtual bool VisitFrame() REQUIRES_SHARED(Locks::mutator_lock_) = 0;
 
+  enum class CountTransitions {
+    kYes,
+    kNo,
+  };
+
+  template <CountTransitions kCount = CountTransitions::kYes>
   void WalkStack(bool include_transitions = false)
       REQUIRES_SHARED(Locks::mutator_lock_);
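
Templating WalkStack() on CountTransitions turns the depth-counting decision into a compile-time branch, while the two explicit instantiations in stack.cc keep the template body out of the header. The shape of that arrangement, condensed into one translation unit as a sketch:

    #include <cstddef>

    enum class CountTransitions { kYes, kNo };

    // In the header: declaration only, defaulting to the counting variant.
    template <CountTransitions kCount = CountTransitions::kYes>
    void WalkStack(bool include_transitions = false);

    // In the .cc: the definition...
    template <CountTransitions kCount>
    void WalkStack(bool include_transitions) {
      size_t depth = 0;
      // ... per-frame visiting elided ...
      if (kCount == CountTransitions::kYes) {  // constant-folded per instantiation
        ++depth;
      }
      (void)include_transitions;
      (void)depth;
    }

    // ...plus explicit instantiations of both variants, so no other
    // translation unit ever needs the template body.
    template void WalkStack<CountTransitions::kYes>(bool);
    template void WalkStack<CountTransitions::kNo>(bool);

    int main() {
      WalkStack(/*include_transitions=*/ false);  // kYes by default
      WalkStack<CountTransitions::kNo>(false);    // explicit variant
    }
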
 
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 17f5513..bc133d1 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -2802,7 +2802,7 @@
 }
 
 // RootVisitor parameters are: (const Object* obj, size_t vreg, const StackVisitor* visitor).
-template <typename RootVisitor>
+template <typename RootVisitor, bool kPrecise = false>
 class ReferenceMapVisitor : public StackVisitor {
  public:
   ReferenceMapVisitor(Thread* thread, Context* context, RootVisitor& visitor)
@@ -2889,7 +2889,9 @@
     }
   }
 
-  void VisitQuickFrame() REQUIRES_SHARED(Locks::mutator_lock_) {
+  template <typename T>
+  ALWAYS_INLINE
+  inline void VisitQuickFrameWithVregCallback() REQUIRES_SHARED(Locks::mutator_lock_) {
     ArtMethod** cur_quick_frame = GetCurrentQuickFrame();
     DCHECK(cur_quick_frame != nullptr);
     ArtMethod* m = *cur_quick_frame;
@@ -2906,6 +2908,9 @@
       CodeInfoEncoding encoding = code_info.ExtractEncoding();
       StackMap map = code_info.GetStackMapForNativePcOffset(native_pc_offset, encoding);
       DCHECK(map.IsValid());
+
+      T vreg_info(m, code_info, encoding, map, visitor_);
+
       // Visit stack entries that hold pointers.
       size_t number_of_bits = map.GetNumberOfStackMaskBits(encoding.stack_map_encoding);
       for (size_t i = 0; i < number_of_bits; ++i) {
@@ -2914,7 +2919,7 @@
           mirror::Object* ref = ref_addr->AsMirrorPtr();
           if (ref != nullptr) {
             mirror::Object* new_ref = ref;
-            visitor_(&new_ref, -1, this);
+            vreg_info.VisitStack(&new_ref, i, this);
             if (ref != new_ref) {
               ref_addr->Assign(new_ref);
             }
@@ -2935,13 +2940,119 @@
                        << "set in register_mask=" << register_mask << " at " << DescribeLocation();
           }
           if (*ref_addr != nullptr) {
-            visitor_(ref_addr, -1, this);
+            vreg_info.VisitRegister(ref_addr, i, this);
           }
         }
       }
     }
   }
 
+  void VisitQuickFrame() REQUIRES_SHARED(Locks::mutator_lock_) {
+    if (kPrecise) {
+      VisitQuickFramePrecise();
+    } else {
+      VisitQuickFrameNonPrecise();
+    }
+  }
+
+  void VisitQuickFrameNonPrecise() REQUIRES_SHARED(Locks::mutator_lock_) {
+    struct UndefinedVRegInfo {
+      UndefinedVRegInfo(ArtMethod* method ATTRIBUTE_UNUSED,
+                        const CodeInfo& code_info ATTRIBUTE_UNUSED,
+                        const CodeInfoEncoding& encoding ATTRIBUTE_UNUSED,
+                        const StackMap& map ATTRIBUTE_UNUSED,
+                        RootVisitor& _visitor)
+          : visitor(_visitor) {
+      }
+
+      ALWAYS_INLINE
+      void VisitStack(mirror::Object** ref,
+                      size_t stack_index ATTRIBUTE_UNUSED,
+                      const StackVisitor* stack_visitor)
+          REQUIRES_SHARED(Locks::mutator_lock_) {
+        visitor(ref, -1, stack_visitor);
+      }
+
+      ALWAYS_INLINE
+      void VisitRegister(mirror::Object** ref,
+                         size_t register_index ATTRIBUTE_UNUSED,
+                         const StackVisitor* stack_visitor)
+          REQUIRES_SHARED(Locks::mutator_lock_) {
+        visitor(ref, -1, stack_visitor);
+      }
+
+      RootVisitor& visitor;
+    };
+    VisitQuickFrameWithVregCallback<UndefinedVRegInfo>();
+  }
+
+  void VisitQuickFramePrecise() REQUIRES_SHARED(Locks::mutator_lock_) {
+    struct StackMapVRegInfo {
+      StackMapVRegInfo(ArtMethod* method,
+                       const CodeInfo& _code_info,
+                       const CodeInfoEncoding& _encoding,
+                       const StackMap& map,
+                       RootVisitor& _visitor)
+          : number_of_dex_registers(method->GetCodeItem()->registers_size_),
+            code_info(_code_info),
+            encoding(_encoding),
+            dex_register_map(code_info.GetDexRegisterMapOf(map,
+                                                           encoding,
+                                                           number_of_dex_registers)),
+            visitor(_visitor) {
+      }
+
+      // TODO: If necessary, we should consider caching a reverse map instead of the linear
+      //       lookups for each location.
+      void FindWithType(const size_t index,
+                        const DexRegisterLocation::Kind kind,
+                        mirror::Object** ref,
+                        const StackVisitor* stack_visitor)
+          REQUIRES_SHARED(Locks::mutator_lock_) {
+        bool found = false;
+        for (size_t dex_reg = 0; dex_reg != number_of_dex_registers; ++dex_reg) {
+          DexRegisterLocation location = dex_register_map.GetDexRegisterLocation(
+              dex_reg, number_of_dex_registers, code_info, encoding);
+          if (location.GetKind() == kind && static_cast<size_t>(location.GetValue()) == index) {
+            visitor(ref, dex_reg, stack_visitor);
+            found = true;
+          }
+        }
+
+        if (!found) {
+          // If nothing found, report with -1.
+          visitor(ref, -1, stack_visitor);
+        }
+      }
+
+      void VisitStack(mirror::Object** ref, size_t stack_index, const StackVisitor* stack_visitor)
+          REQUIRES_SHARED(Locks::mutator_lock_) {
+        const size_t stack_offset = stack_index * kFrameSlotSize;
+        FindWithType(stack_offset,
+                     DexRegisterLocation::Kind::kInStack,
+                     ref,
+                     stack_visitor);
+      }
+
+      void VisitRegister(mirror::Object** ref,
+                         size_t register_index,
+                         const StackVisitor* stack_visitor)
+          REQUIRES_SHARED(Locks::mutator_lock_) {
+        FindWithType(register_index,
+                     DexRegisterLocation::Kind::kInRegister,
+                     ref,
+                     stack_visitor);
+      }
+
+      size_t number_of_dex_registers;
+      const CodeInfo& code_info;
+      const CodeInfoEncoding& encoding;
+      DexRegisterMap dex_register_map;
+      RootVisitor& visitor;
+    };
+    VisitQuickFrameWithVregCallback<StackMapVRegInfo>();
+  }
+
   // Visitor for when we visit a root.
   RootVisitor& visitor_;
 };
@@ -2960,6 +3071,7 @@
   const uint32_t tid_;
 };
 
+template <bool kPrecise>
 void Thread::VisitRoots(RootVisitor* visitor) {
   const uint32_t thread_id = GetThreadId();
   visitor->VisitRootIfNonNull(&tlsPtr_.opeer, RootInfo(kRootThreadObject, thread_id));
@@ -2977,7 +3089,7 @@
   // Visit roots for deoptimization.
   if (tlsPtr_.stacked_shadow_frame_record != nullptr) {
     RootCallbackVisitor visitor_to_callback(visitor, thread_id);
-    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
+    ReferenceMapVisitor<RootCallbackVisitor, kPrecise> mapper(this, nullptr, visitor_to_callback);
     for (StackedShadowFrameRecord* record = tlsPtr_.stacked_shadow_frame_record;
          record != nullptr;
          record = record->GetLink()) {
@@ -3000,7 +3112,7 @@
   }
   if (tlsPtr_.frame_id_to_shadow_frame != nullptr) {
     RootCallbackVisitor visitor_to_callback(visitor, thread_id);
-    ReferenceMapVisitor<RootCallbackVisitor> mapper(this, nullptr, visitor_to_callback);
+    ReferenceMapVisitor<RootCallbackVisitor, kPrecise> mapper(this, nullptr, visitor_to_callback);
     for (FrameIdToShadowFrame* record = tlsPtr_.frame_id_to_shadow_frame;
          record != nullptr;
          record = record->GetNext()) {
@@ -3013,14 +3125,22 @@
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
   RootCallbackVisitor visitor_to_callback(visitor, thread_id);
-  ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context, visitor_to_callback);
-  mapper.WalkStack();
+  ReferenceMapVisitor<RootCallbackVisitor, kPrecise> mapper(this, context, visitor_to_callback);
+  mapper.template WalkStack<StackVisitor::CountTransitions::kNo>(false);
   ReleaseLongJumpContext(context);
   for (instrumentation::InstrumentationStackFrame& frame : *GetInstrumentationStack()) {
     visitor->VisitRootIfNonNull(&frame.this_object_, RootInfo(kRootVMInternal, thread_id));
   }
 }
 
+void Thread::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) {
+  if ((flags & VisitRootFlags::kVisitRootFlagPrecise) != 0) {
+    VisitRoots<true>(visitor);
+  } else {
+    VisitRoots<false>(visitor);
+  }
+}
+
 class VerifyRootVisitor : public SingleRootVisitor {
  public:
   void VisitRoot(mirror::Object* root, const RootInfo& info ATTRIBUTE_UNUSED)
diff --git a/runtime/thread.h b/runtime/thread.h
index b80fdc7..31cd0eb 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -549,7 +549,8 @@
     return tlsPtr_.frame_id_to_shadow_frame != nullptr;
   }
 
-  void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags = kVisitRootFlagAllRoots)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   ALWAYS_INLINE void VerifyStack() REQUIRES_SHARED(Locks::mutator_lock_);
 
@@ -1245,6 +1246,9 @@
   // Install the protected region for implicit stack checks.
   void InstallImplicitProtection();
 
+  template <bool kPrecise>
+  void VisitRoots(RootVisitor* visitor) REQUIRES_SHARED(Locks::mutator_lock_);
+
   static bool IsAotCompiler();
 
   // 32 bits of atomically changed state and flags. Keeping as 32 bits allows an atomic CAS to
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index a6bd83d..664eeb4 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -1395,10 +1395,10 @@
   }
 }
 
-void ThreadList::VisitRoots(RootVisitor* visitor) const {
+void ThreadList::VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const {
   MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
   for (const auto& thread : list_) {
-    thread->VisitRoots(visitor);
+    thread->VisitRoots(visitor, flags);
   }
 }
 
diff --git a/runtime/thread_list.h b/runtime/thread_list.h
index 1acabcb..658db00 100644
--- a/runtime/thread_list.h
+++ b/runtime/thread_list.h
@@ -155,7 +155,7 @@
                !Locks::thread_list_lock_,
                !Locks::thread_suspend_count_lock_);
 
-  void VisitRoots(RootVisitor* visitor) const
+  void VisitRoots(RootVisitor* visitor, VisitRootFlags flags) const
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   void VisitRootsForSuspendedThreads(RootVisitor* visitor)
diff --git a/runtime/utils.cc b/runtime/utils.cc
index 6ed54f7..66739a9 100644
--- a/runtime/utils.cc
+++ b/runtime/utils.cc
@@ -37,6 +37,7 @@
 #if defined(__APPLE__)
 #include "AvailabilityMacros.h"  // For MAC_OS_X_VERSION_MAX_ALLOWED
 #include <sys/syscall.h>
+#include <crt_externs.h>
 #endif
 
 #if defined(__linux__)
diff --git a/test/021-string2/expected.txt b/test/021-string2/expected.txt
index a9c6eb8..f269c7c 100644
--- a/test/021-string2/expected.txt
+++ b/test/021-string2/expected.txt
@@ -1,2 +1,6 @@
 Got expected npe
 OK
+ true true true true
+ true true true true
+ true true true true
+ true true true true
diff --git a/test/021-string2/src/Main.java b/test/021-string2/src/Main.java
index 51351e1..df0a3dd 100644
--- a/test/021-string2/src/Main.java
+++ b/test/021-string2/src/Main.java
@@ -92,6 +92,31 @@
 
         testCompareToAndEquals();
         testIndexOf();
+
+        String s0_0 = "\u0000";
+        String s0_1 = new String(s0_0);
+        String s0_2 = new String(new char[] { '\u0000' });
+        String s0_3 = s0_0 + "";
+        System.out.println(
+            " " + $noinline$equals(s0_0, s0_0) +
+            " " + $noinline$equals(s0_0, s0_1) +
+            " " + $noinline$equals(s0_0, s0_2) +
+            " " + $noinline$equals(s0_0, s0_3));
+        System.out.println(
+            " " + $noinline$equals(s0_1, s0_0) +
+            " " + $noinline$equals(s0_1, s0_1) +
+            " " + $noinline$equals(s0_1, s0_2) +
+            " " + $noinline$equals(s0_1, s0_3));
+        System.out.println(
+            " " + $noinline$equals(s0_2, s0_0) +
+            " " + $noinline$equals(s0_2, s0_1) +
+            " " + $noinline$equals(s0_2, s0_2) +
+            " " + $noinline$equals(s0_2, s0_3));
+        System.out.println(
+            " " + $noinline$equals(s0_3, s0_0) +
+            " " + $noinline$equals(s0_3, s0_1) +
+            " " + $noinline$equals(s0_3, s0_2) +
+            " " + $noinline$equals(s0_3, s0_3));
     }
 
     public static void testCompareToAndEquals() {
diff --git a/test/530-checker-loops4/src/Main.java b/test/530-checker-loops4/src/Main.java
index 2e19c88..7d3d7d9 100644
--- a/test/530-checker-loops4/src/Main.java
+++ b/test/530-checker-loops4/src/Main.java
@@ -88,11 +88,9 @@
   //
   /// CHECK-START: int Main.geo4(int) loop_optimization (after)
   /// CHECK-DAG: <<Par:i\d+>> ParameterValue        loop:none
-  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0         loop:none
   /// CHECK-DAG: <<Int:i\d+>> IntConstant 7         loop:none
   /// CHECK-DAG: <<Rem:i\d+>> Rem [<<Par>>,<<Int>>] loop:none
-  /// CHECK-DAG: <<Add:i\d+>> Add [<<Rem>>,<<Zer>>] loop:none
-  /// CHECK-DAG:              Return [<<Add>>]      loop:none
+  /// CHECK-DAG:              Return [<<Rem>>]      loop:none
   //
   /// CHECK-START: int Main.geo4(int) loop_optimization (after)
   /// CHECK-NOT: Phi
@@ -104,6 +102,17 @@
   }
 
   // TODO: someday?
+  //
+  /// CHECK-START: int Main.geo1BCE() BCE (before)
+  /// CHECK-DAG: BoundsCheck loop:none
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.geo1BCE() BCE (after)
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.geo1BCE() BCE (after)
+  /// CHECK-NOT: BoundsCheck loop:none
+  /// CHECK-NOT: Deoptimize
   public static int geo1BCE() {
     int[] x = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
                 11, 12, 13, 14, 15, 16, 17, 19, 19, 20,
@@ -118,6 +127,17 @@
   }
 
   // TODO: someday?
+  //
+  /// CHECK-START: int Main.geo2BCE() BCE (before)
+  /// CHECK-DAG: BoundsCheck loop:none
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.geo2BCE() BCE (after)
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.geo2BCE() BCE (after)
+  /// CHECK-NOT: BoundsCheck loop:none
+  /// CHECK-NOT: Deoptimize
   public static int geo2BCE() {
     int[] x = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
                 11, 12, 13, 14, 15, 16, 17, 19, 19, 20,
@@ -225,16 +245,20 @@
     return a;
   }
 
-  // TODO: Rem is already optimized away by the time the loop optimizer runs;
-  //       we could still optimize this case with last value on wrap-around!
+  // Even though Rem is already optimized away by the time induction analysis
+  // and the loop optimizer run, the loop is optimized with a trivial
+  // wrap-around induction just as the wrap-around for REM would.
   //
   /// CHECK-START: int Main.geoRemBlackHole(int) loop_optimization (before)
   /// CHECK-DAG: Phi loop:<<Loop:B\d+>>
   /// CHECK-DAG: Phi loop:<<Loop>>
   //
   /// CHECK-START: int Main.geoRemBlackHole(int) loop_optimization (after)
-  /// CHECK-DAG: Phi loop:<<Loop:B\d+>>
-  /// CHECK-DAG: Phi loop:<<Loop>>
+  /// CHECK-DAG: <<Zero:i\d+>> IntConstant 0
+  /// CHECK-DAG:               Return [<<Zero>>]
+  //
+  /// CHECK-START: int Main.geoRemBlackHole(int) loop_optimization (after)
+  /// CHECK-NOT: Phi
   public static int geoRemBlackHole(int a) {
     for (int i = 0; i < 100; i++) {
       a %= 1;
diff --git a/test/530-checker-loops5/expected.txt b/test/530-checker-loops5/expected.txt
new file mode 100644
index 0000000..b0aad4d
--- /dev/null
+++ b/test/530-checker-loops5/expected.txt
@@ -0,0 +1 @@
+passed
diff --git a/test/530-checker-loops5/info.txt b/test/530-checker-loops5/info.txt
new file mode 100644
index 0000000..15dbf37
--- /dev/null
+++ b/test/530-checker-loops5/info.txt
@@ -0,0 +1 @@
+Test on loop optimizations, in particular with polynomial induction.
diff --git a/test/530-checker-loops5/src/Main.java b/test/530-checker-loops5/src/Main.java
new file mode 100644
index 0000000..54b54d0
--- /dev/null
+++ b/test/530-checker-loops5/src/Main.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Test on loop optimizations, in particular around polynomial induction.
+//
+public class Main {
+
+  /// CHECK-START: int Main.poly1() loop_optimization (before)
+  /// CHECK-DAG: Phi loop:<<Loop:B\d+>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  //
+  /// CHECK-START: int Main.poly1() loop_optimization (after)
+  /// CHECK-DAG: <<Zer:i\d+>> IntConstant 0         loop:none
+  /// CHECK-DAG: <<Int:i\d+>> IntConstant 55        loop:none
+  /// CHECK-DAG: <<Add:i\d+>> Add [<<Int>>,<<Zer>>] loop:none
+  /// CHECK-DAG:              Return [<<Add>>]      loop:none
+  //
+  /// CHECK-START: int Main.poly1() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 55 loop:none
+  /// CHECK-DAG:               Return [<<Int>>]  loop:none
+  //
+  /// CHECK-START: int Main.poly1() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static int poly1() {
+    int a = 0;
+    for (int i = 0; i <= 10; i++) {
+      a += i;
+    }
+    return a;
+  }
+
+  // Multiplication in linear induction has been optimized earlier,
+  // but that does not stop the induction variable recognition
+  // and loop optimizer.
+  //
+  /// CHECK-START: int Main.poly2(int) loop_optimization (before)
+  /// CHECK-DAG: Phi loop:<<Loop:B\d+>>
+  /// CHECK-DAG: Shl loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  //
+  /// CHECK-START: int Main.poly2(int) loop_optimization (after)
+  /// CHECK-DAG: <<Par:i\d+>> ParameterValue        loop:none
+  /// CHECK-DAG: <<Int:i\d+>> IntConstant 185       loop:none
+  /// CHECK-DAG: <<Add:i\d+>> Add [<<Int>>,<<Par>>] loop:none
+  /// CHECK-DAG:              Return [<<Add>>]      loop:none
+  //
+  /// CHECK-START: int Main.poly2(int) loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static int poly2(int a) {
+    for (int i = 0; i < 10; i++) {
+      int k = 3 * i + 5;
+      a += k;
+    }
+    return a;
+  }
+
+  /// CHECK-START: int Main.poly3() loop_optimization (before)
+  /// CHECK-DAG: Phi loop:<<Loop:B\d+>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  /// CHECK-DAG: Add loop:<<Loop>>
+  //
+  /// CHECK-START: int Main.poly3() loop_optimization (after)
+  /// CHECK-DAG: <<Ini:i\d+>> IntConstant 12345       loop:none
+  /// CHECK-DAG: <<Int:i\d+>> IntConstant -2146736968 loop:none
+  /// CHECK-DAG: <<Add:i\d+>> Add [<<Int>>,<<Ini>>]   loop:none
+  /// CHECK-DAG:              Return [<<Add>>]        loop:none
+  //
+  /// CHECK-START: int Main.poly3() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant -2146724623 loop:none
+  /// CHECK-DAG:               Return [<<Int>>]        loop:none
+  //
+  /// CHECK-START: int Main.poly3() loop_optimization (after)
+  /// CHECK-NOT: Phi
+  public static int poly3() {
+    int a = 12345;
+    for (int i = 0; i <= 10; i++) {
+      a += (2147483646 * i + 67890);
+    }
+    return a;
+  }
+
+  /// CHECK-START: int Main.polyBCE1() BCE (before)
+  /// CHECK-DAG: BoundsCheck loop:none
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.polyBCE1() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK-NOT: Deoptimize
+  public static int polyBCE1() {
+    int[] x = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
+                11, 12, 13, 14, 15, 16, 17, 19, 19, 20,
+                21, 22 };
+    int a = 0;
+    int r = 0;
+    for (int i = 0; i < 8; i++) {
+      r += x[a];
+      a += i;
+    }
+    return r;
+  }
+
+  /// CHECK-START: int Main.polyBCE2() BCE (before)
+  /// CHECK-DAG: BoundsCheck loop:none
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.polyBCE2() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK-NOT: Deoptimize
+  public static int polyBCE2() {
+    int[] x = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
+                11, 12, 13, 14, 15, 16, 17, 19, 19, 20,
+                21, 22, 23, 24, 25, 26, 27 };
+    int a = 1;
+    int r = 0;
+    for (int i = 0; i < 6; i++) {
+      int k = 2 * i + 1;
+      r -= x[a];
+      a += k;
+    }
+    return r;
+  }
+
+  /// CHECK-START: int Main.polyBCE3() BCE (before)
+  /// CHECK-DAG: BoundsCheck loop:none
+  /// CHECK-DAG: BoundsCheck loop:{{B\d+}}
+  //
+  /// CHECK-START: int Main.polyBCE3() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  /// CHECK-NOT: Deoptimize
+  public static int polyBCE3() {
+    int[] x = {  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,
+                11, 12, 13, 14, 15, 16, 17, 19, 19, 20,
+                21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+                31, 32, 33, 34, 35, 36, 37, 38 };
+    int r = 0;
+    int a1 = 1;
+    int a2 = 2;
+    for (int i = 0; i < 5; i++) {
+      int t = a1 + a2;  // two polynomials combined into new polynomial
+      r -= x[t];
+      a1 += (3 * i + 1);
+      a2 += (2 * i);
+    }
+    return r;
+  }
+
+  //
+  // Verifier.
+  //
+
+  public static void main(String[] args) {
+    expectEquals(55, poly1());
+    expectEquals(185, poly2(0));
+    expectEquals(192, poly2(7));
+    expectEquals(-2146724623, poly3());
+    expectEquals(64, polyBCE1());
+    expectEquals(-68, polyBCE2());
+    expectEquals(-80, polyBCE3());
+
+    System.out.println("passed");
+  }
+
+  private static void expectEquals(int expected, int result) {
+    if (expected != result) {
+      throw new Error("Expected: " + expected + ", found: " + result);
+    }
+  }
+}
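
As a quick cross-check of poly3()'s constants above: the loop adds the sum over i = 0..10 of (2147483646 * i + 67890) to 12345 in wrapping 32-bit arithmetic, which yields the -2146736968 the loop optimizer materializes and the -2146724623 the simplifier folds it to. Reproduced with explicit unsigned wraparound:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Same arithmetic as poly3(), but in uint32_t so the overflow is
      // well-defined wraparound mod 2^32.
      uint32_t sum = 0;
      for (uint32_t i = 0; i <= 10; ++i) {
        sum += 2147483646u * i + 67890u;
      }
      assert(static_cast<int32_t>(sum) == -2146736968);           // the folded loop
      assert(static_cast<int32_t>(12345u + sum) == -2146724623);  // the final return
    }
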
diff --git a/test/538-checker-embed-constants/src/Main.java b/test/538-checker-embed-constants/src/Main.java
index 0329e63..4f34ec9 100644
--- a/test/538-checker-embed-constants/src/Main.java
+++ b/test/538-checker-embed-constants/src/Main.java
@@ -308,7 +308,7 @@
   }
 
   /// CHECK-START-ARM: long Main.shl32(long) disassembly (after)
-  /// CHECK-DAG:            mov {{r\d+}}, {{r\d+}}
+  /// CHECK-DAG:            mov{{s?}} {{r\d+}}, {{r\d+}}
   /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.shl32(long) disassembly (after)
@@ -377,7 +377,7 @@
 
   /// CHECK-START-ARM: long Main.shr32(long) disassembly (after)
   /// CHECK-DAG:            asr{{s?|\.w}} {{r\d+}}, <<high:r\d+>>, #31
-  /// CHECK-DAG:            mov {{r\d+}}, <<high>>
+  /// CHECK-DAG:            mov{{s?}} {{r\d+}}, <<high>>
 
   /// CHECK-START-ARM: long Main.shr32(long) disassembly (after)
   /// CHECK-NOT:            asr{{s?|\.w}} {{r\d+}}, {{r\d+}}, {{r\d+}}
@@ -445,7 +445,7 @@
   }
 
   /// CHECK-START-ARM: long Main.ushr32(long) disassembly (after)
-  /// CHECK-DAG:            mov {{r\d+}}, {{r\d+}}
+  /// CHECK-DAG:            mov{{s?}} {{r\d+}}, {{r\d+}}
   /// CHECK-DAG:            mov{{s?|\.w}} {{r\d+}}, #0
 
   /// CHECK-START-ARM: long Main.ushr32(long) disassembly (after)
diff --git a/test/618-checker-induction/src/Main.java b/test/618-checker-induction/src/Main.java
index f85479a..87a69b2 100644
--- a/test/618-checker-induction/src/Main.java
+++ b/test/618-checker-induction/src/Main.java
@@ -25,7 +25,7 @@
   /// CHECK-DAG: Phi loop:{{B\d+}} outer_loop:none
   //
   /// CHECK-START: void Main.deadSingleLoop() loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}} outer_loop:none
+  /// CHECK-NOT: Phi
   static void deadSingleLoop() {
     for (int i = 0; i < 4; i++) {
     }
@@ -35,7 +35,7 @@
   /// CHECK-DAG: Phi loop:{{B\d+}} outer_loop:none
   //
   /// CHECK-START: void Main.deadSingleLoop() loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}} outer_loop:none
+  /// CHECK-NOT: Phi
   static void deadSingleLoopN(int n) {
     for (int i = 0; i < n; i++) {
     }
@@ -56,7 +56,7 @@
   /// CHECK-DAG: Phi loop:{{B\d+}}      outer_loop:<<Loop>>
   //
   /// CHECK-START: void Main.deadNestedLoops() loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}}
+  /// CHECK-NOT: Phi
   static void deadNestedLoops() {
     for (int i = 0; i < 4; i++) {
       for (int j = 0; j < 4; j++) {
@@ -74,7 +74,7 @@
   /// CHECK-DAG: Phi loop:{{B\d+}}       outer_loop:none
   //
   /// CHECK-START: void Main.deadNestedAndFollowingLoops() loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}}
+  /// CHECK-NOT: Phi
   static void deadNestedAndFollowingLoops() {
     for (int i = 0; i < 4; i++) {
       for (int j = 0; j < 4; j++) {
@@ -96,7 +96,7 @@
   /// CHECK-DAG: Phi loop:{{B\d+}} outer_loop:none
   //
   /// CHECK-START: void Main.deadConditional(int) loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}}
+  /// CHECK-NOT: Phi
   public static void deadConditional(int n) {
     int k = 0;
     int m = 0;
@@ -116,7 +116,7 @@
   /// CHECK-DAG: Phi loop:<<Loop>>      outer_loop:none
   //
   /// CHECK-START: void Main.deadConditionalCycle(int) loop_optimization (after)
-  /// CHECK-NOT: Phi loop:{{B\d+}}
+  /// CHECK-NOT: Phi
   public static void deadConditionalCycle(int n) {
     int k = 0;
     int m = 0;
@@ -215,12 +215,11 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedFormInductionUp() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedFormInductionUp() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 12395
-  /// CHECK-DAG:               Return [<<Int>>] loop:none
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 12395 loop:none
+  /// CHECK-DAG:               Return [<<Int>>]  loop:none
   static int closedFormInductionUp() {
     int closed = 12345;
     for (int i = 0; i < 10; i++) {
@@ -235,8 +234,13 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: int Main.closedFormInductionInAndDown(int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
+  //
+  /// CHECK-START: int Main.closedFormInductionInAndDown(int) instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Par:i\d+>>  ParameterValue        loop:none
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant -50       loop:none
+  /// CHECK-DAG: <<Add:i\d+>>  Add [<<Int>>,<<Par>>] loop:none
+  /// CHECK-DAG:               Return [<<Add>>]      loop:none
   static int closedFormInductionInAndDown(int closed) {
     for (int i = 0; i < 10; i++) {
       closed -= 5;
@@ -252,12 +256,10 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedFormNested() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:loop{{B\d+}}
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedFormNested() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 100
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 100  loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   static int closedFormNested() {
     int closed = 0;
@@ -277,13 +279,11 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedFormNestedAlt() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:loop{{B\d+}}
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedFormNestedAlt() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 15082
-  /// CHECK-DAG:               Return [<<Int>>] loop:none
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 15082 loop:none
+  /// CHECK-DAG:               Return [<<Int>>]  loop:none
   static int closedFormNestedAlt() {
     int closed = 12345;
     for (int i = 0; i < 17; i++) {
@@ -360,11 +360,10 @@
   /// CHECK-DAG:              Return [<<Phi>>] loop:none
   //
   /// CHECK-START: int Main.mainIndexReturned() loop_optimization (after)
-  /// CHECK-NOT:              Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:              Return loop:none
+  /// CHECK-NOT:              Phi
   //
   /// CHECK-START: int Main.mainIndexReturned() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 10
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 10   loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   static int mainIndexReturned() {
     int i;
@@ -378,11 +377,10 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: int Main.periodicReturned9() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.periodicReturned9() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 1
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 1    loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   static int periodicReturned9() {
     int k = 0;
@@ -398,11 +396,10 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: int Main.periodicReturned10() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.periodicReturned10() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0    loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   static int periodicReturned10() {
     int k = 0;
@@ -412,7 +409,18 @@
     return k;
   }
 
-  // If ever replaced by closed form, last value should be correct!
+  /// CHECK-START: int Main.getSum21() loop_optimization (before)
+  /// CHECK-DAG: <<Phi1:i\d+>> Phi               loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>> Phi               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>> Phi               loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:               Return [<<Phi3>>] loop:none
+  //
+  /// CHECK-START: int Main.getSum21() loop_optimization (after)
+  /// CHECK-NOT:               Phi
+  //
+  /// CHECK-START: int Main.getSum21() instruction_simplifier$after_bce (after)
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 21   loop:none
+  /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static int getSum21() {
     int k = 0;
     int sum = 0;
@@ -436,8 +444,7 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: int Main.periodicReturnedN(int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   static int periodicReturnedN(int n) {
     int k = 0;
     for (int i = 0; i < n; i++) {
@@ -480,11 +487,10 @@
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
   /// CHECK-START: int Main.closedFeed() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedFeed() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 20
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 20   loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static int closedFeed() {
     int closed = 0;
@@ -505,11 +511,10 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedLargeUp() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedLargeUp() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant -10
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant -10  loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static int closedLargeUp() {
     int closed = 0;
@@ -525,11 +530,10 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedLargeDown() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.closedLargeDown() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 10
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 10   loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static int closedLargeDown() {
     int closed = 0;
@@ -548,11 +552,10 @@
   /// CHECK-DAG:               Return [<<Phi5>>] loop:none
   //
   /// CHECK-START: int Main.waterFall() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: int Main.waterFall() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 50
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 50   loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static int waterFall() {
     int i = 0;
@@ -570,11 +573,10 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom1() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom1() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0    loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static boolean periodicBoolIdiom1() {
     boolean x = true;
@@ -590,11 +592,10 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom2() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom2() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0    loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static boolean periodicBoolIdiom2() {
     boolean x = true;
@@ -610,11 +611,10 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom3() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom3() instruction_simplifier$after_bce (after)
-  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0
+  /// CHECK-DAG: <<Int:i\d+>>  IntConstant 0    loop:none
   /// CHECK-DAG:               Return [<<Int>>] loop:none
   private static boolean periodicBoolIdiom3() {
     boolean x = true;
@@ -630,8 +630,7 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom1N(boolean, int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   private static boolean periodicBoolIdiom1N(boolean x, int n) {
     for (int i = 0; i < n; i++) {
       x = !x;
@@ -645,8 +644,7 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom2N(boolean, int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   private static boolean periodicBoolIdiom2N(boolean x, int n) {
     for (int i = 0; i < n; i++) {
       x = (x != true);
@@ -660,8 +658,7 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: boolean Main.periodicBoolIdiom3N(boolean, int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
-  /// CHECK-DAG:               Return loop:none
+  /// CHECK-NOT:               Phi
   private static boolean periodicBoolIdiom3N(boolean x, int n) {
     for (int i = 0; i < n; i++) {
       x = (x == false);
diff --git a/test/626-const-class-linking/clear_dex_cache_types.cc b/test/626-const-class-linking/clear_dex_cache_types.cc
new file mode 100644
index 0000000..b035896
--- /dev/null
+++ b/test/626-const-class-linking/clear_dex_cache_types.cc
@@ -0,0 +1,65 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "jni.h"
+#include "object_lock.h"
+#include "scoped_thread_state_change-inl.h"
+
+namespace art {
+
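+// Clears every resolved type in the DexCache of the given class, forcing
+// subsequent const-class instructions to resolve their types again.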
+extern "C" JNIEXPORT void JNICALL Java_Main_nativeClearResolvedTypes(JNIEnv*, jclass, jclass cls) {
+  ScopedObjectAccess soa(Thread::Current());
+  mirror::DexCache* dex_cache = soa.Decode<mirror::Class>(cls)->GetDexCache();
+  for (size_t i = 0, num_types = dex_cache->NumResolvedTypes(); i != num_types; ++i) {
+    dex_cache->SetResolvedType(dex::TypeIndex(i), ObjPtr<mirror::Class>(nullptr));
+  }
+}
+
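+// Bumps a class from resolved to verified status without running the
+// verifier; used to avoid class loading during verification (see the
+// comment on Main.skipVerification()).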
+extern "C" JNIEXPORT void JNICALL Java_Main_nativeSkipVerification(JNIEnv*, jclass, jclass cls) {
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<1> hs(soa.Self());
+  Handle<mirror::Class> klass = hs.NewHandle(soa.Decode<mirror::Class>(cls));
+  mirror::Class::Status status = klass->GetStatus();
+  if (status == mirror::Class::kStatusResolved) {
+    ObjectLock<mirror::Class> lock(soa.Self(), klass);
+    klass->SetStatus(klass, mirror::Class::kStatusVerified, soa.Self());
+  } else {
+    LOG(ERROR) << klass->PrettyClass() << " has unexpected status: " << status;
+  }
+}
+
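+// Logs identity and defining-loader details for each class in the array;
+// called from Main.dumpResultStats() when an unexpected number of unique
+// classes is observed.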
+extern "C" JNIEXPORT void JNICALL Java_Main_nativeDumpClasses(JNIEnv*, jclass, jobjectArray array) {
+  ScopedObjectAccess soa(Thread::Current());
+  StackHandleScope<1> hs(soa.Self());
+  Handle<mirror::ObjectArray<mirror::Object>> classes =
+      hs.NewHandle(soa.Decode<mirror::ObjectArray<mirror::Object>>(array));
+  CHECK(classes.Get() != nullptr);
+  for (size_t i = 0, length = classes->GetLength(); i != length; ++i) {
+    CHECK(classes->Get(i) != nullptr) << i;
+    CHECK(classes->Get(i)->IsClass())
+        << i << " " << classes->Get(i)->GetClass()->PrettyDescriptor();
+    mirror::Class* as_class = classes->Get(i)->AsClass();
+    mirror::ClassLoader* loader = as_class->GetClassLoader();
+    LOG(ERROR) << "Class #" << i << ": " << as_class->PrettyDescriptor()
+        << " @" << static_cast<const void*>(as_class)
+        << " status:" << as_class->GetStatus()
+        << " definingLoader:" << static_cast<const void*>(loader)
+        << " definingLoaderClass:"
+        << (loader != nullptr ? loader->GetClass()->PrettyDescriptor() : "N/A");
+  }
+}
+
+}  // namespace art
diff --git a/test/626-const-class-linking/expected.txt b/test/626-const-class-linking/expected.txt
new file mode 100644
index 0000000..de1b815
--- /dev/null
+++ b/test/626-const-class-linking/expected.txt
@@ -0,0 +1,61 @@
+JNI_OnLoad called
+first: Helper1 class loader: DelegatingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: DelegatingLoader
+second: Test class loader: DefiningLoader
+testClearDexCache done
+first: Helper1 class loader: DelegatingLoader
+second: Test class loader: DefiningLoader
+first: Helper2 class loader: DelegatingLoader
+second: Test class loader: DefiningLoader
+testMultiDex done
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+total: 4
+  throwables: 0
+  classes: 4 (1 unique)
+testRacyLoader done
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyLoader
+second: Test class loader: DefiningLoader
+first: Helper3 class loader: RacyLoader
+second: Test3 class loader: DefiningLoader
+first: Helper3 class loader: RacyLoader
+second: Test3 class loader: DefiningLoader
+total: 4
+  throwables: 0
+  classes: 4 (2 unique)
+testRacyLoader2 done
+java.lang.NoClassDefFoundError: Initiating class loader of type MisbehavingLoader returned class Helper2 instead of Test.
+testMisbehavingLoader done
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+total: 4
+  throwables: 0
+  classes: 4 (1 unique)
+testRacyMisbehavingLoader done
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+first: Helper1 class loader: RacyMisbehavingLoader
+second: Test class loader: DefiningLoader
+total: 4
+  throwables: 0
+  classes: 4 (1 unique)
+testRacyMisbehavingLoader2 done
diff --git a/test/626-const-class-linking/info.txt b/test/626-const-class-linking/info.txt
new file mode 100644
index 0000000..9c19a46
--- /dev/null
+++ b/test/626-const-class-linking/info.txt
@@ -0,0 +1,3 @@
+Test that once a const-class instruction is linked, it keeps referring to
+the same class in the presence of custom class loaders, even after the dex
+cache type array has been cleared.
diff --git a/test/626-const-class-linking/multidex.jpp b/test/626-const-class-linking/multidex.jpp
new file mode 100644
index 0000000..c7a6648
--- /dev/null
+++ b/test/626-const-class-linking/multidex.jpp
@@ -0,0 +1,27 @@
+ClassPair:
+  @@com.android.jack.annotations.ForceInMainDex
+  class ClassPair
+DefiningLoader:
+  @@com.android.jack.annotations.ForceInMainDex
+  class DefiningLoader
+DelegatingLoader:
+  @@com.android.jack.annotations.ForceInMainDex
+  class DelegatingLoader
+Helper1:
+  @@com.android.jack.annotations.ForceInMainDex
+  class Helper1
+Main:
+  @@com.android.jack.annotations.ForceInMainDex
+  class Main
+MisbehavingLoader:
+  @@com.android.jack.annotations.ForceInMainDex
+  class MisbehavingLoader
+RacyLoader:
+  @@com.android.jack.annotations.ForceInMainDex
+  class RacyLoader
+RacyMisbehavingHelper:
+  @@com.android.jack.annotations.ForceInMainDex
+  class RacyMisbehavingHelper
+RacyMisbehavingLoader:
+  @@com.android.jack.annotations.ForceInMainDex
+  class RacyMisbehavingLoader
diff --git a/test/626-const-class-linking/src-multidex/Helper2.java b/test/626-const-class-linking/src-multidex/Helper2.java
new file mode 100644
index 0000000..5bb31ee
--- /dev/null
+++ b/test/626-const-class-linking/src-multidex/Helper2.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Helper2 {
+    public static ClassPair get() {
+        Class<?> helper2_class = Helper2.class;
+        Class<?> test_class = Test.class;
+        return new ClassPair(helper2_class, test_class);
+    }
+}
diff --git a/test/626-const-class-linking/src-multidex/Helper3.java b/test/626-const-class-linking/src-multidex/Helper3.java
new file mode 100644
index 0000000..af996de
--- /dev/null
+++ b/test/626-const-class-linking/src-multidex/Helper3.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Helper3 {
+    public static ClassPair get() {
+        Class<?> helper3_class = Helper3.class;
+        Class<?> test3_class = Test3.class;
+        return new ClassPair(helper3_class, test3_class);
+    }
+}
diff --git a/test/626-const-class-linking/src-multidex/Test.java b/test/626-const-class-linking/src-multidex/Test.java
new file mode 100644
index 0000000..1b0cc2a
--- /dev/null
+++ b/test/626-const-class-linking/src-multidex/Test.java
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Test {
+}
diff --git a/test/626-const-class-linking/src-multidex/Test3.java b/test/626-const-class-linking/src-multidex/Test3.java
new file mode 100644
index 0000000..c4b134d
--- /dev/null
+++ b/test/626-const-class-linking/src-multidex/Test3.java
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Test3 {
+}
diff --git a/test/626-const-class-linking/src/ClassPair.java b/test/626-const-class-linking/src/ClassPair.java
new file mode 100644
index 0000000..b07036c
--- /dev/null
+++ b/test/626-const-class-linking/src/ClassPair.java
@@ -0,0 +1,32 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class ClassPair {
+    public Class<?> first;
+    public Class<?> second;
+
+    public ClassPair(Class<?> first, Class<?> second) {
+        this.first = first;
+        this.second = second;
+    }
+
+    public void print() {
+        String first_loader_name = first.getClassLoader().getClass().getName();
+        System.out.println("first: " + first.getName() + " class loader: " + first_loader_name);
+        String second_loader_name = second.getClassLoader().getClass().getName();
+        System.out.println("second: " + second.getName() + " class loader: " + second_loader_name);
+    }
+}
diff --git a/test/626-const-class-linking/src/DefiningLoader.java b/test/626-const-class-linking/src/DefiningLoader.java
new file mode 100644
index 0000000..b17ab77
--- /dev/null
+++ b/test/626-const-class-linking/src/DefiningLoader.java
@@ -0,0 +1,239 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Method;
+import java.lang.reflect.InvocationTargetException;
+
+/**
+ * A class loader with atypical behavior: we try to load a private
+ * class implementation before asking the system or boot loader.  This
+ * is used to create multiple classes with identical names in a single VM.
+ *
+ * If DexFile is available, we use that; if not, we assume we're not in
+ * Dalvik and instantiate the class with defineClass().
+ *
+ * The location of the DEX files and class data is dependent upon the
+ * test framework.
+ */
+public class DefiningLoader extends ClassLoader {
+    static {
+        // For JVM, register as parallel capable.
+        // Android treats all class loaders as parallel capable and makes this a no-op.
+        registerAsParallelCapable();
+    }
+
+    /* this is where the .class files live */
+    static final String CLASS_PATH1 = "classes/";
+    static final String CLASS_PATH2 = "classes2/";
+
+    /* this is the DEX/Jar file */
+    static final String DEX_FILE = System.getenv("DEX_LOCATION") + "/626-const-class-linking.jar";
+
+    /* on Dalvik, this is a DexFile; otherwise, it's null */
+    private Class<?> mDexClass;
+
+    private Object mDexFile;
+
+    /**
+     * Construct DefiningLoader, grabbing a reference to the DexFile class
+     * if we're running under Dalvik.
+     */
+    public DefiningLoader(ClassLoader parent) {
+        super(parent);
+
+        try {
+            mDexClass = parent.loadClass("dalvik.system.DexFile");
+        } catch (ClassNotFoundException cnfe) {
+            // ignore -- not running Dalvik
+        }
+    }
+
+    /**
+     * Finds the class with the specified binary name.
+     *
+ * We search for a file in CLASS_PATH1/CLASS_PATH2 or pull an entry from DEX_FILE.
+     * If we don't find a match, we throw an exception.
+     */
+    protected Class<?> findClass(String name) throws ClassNotFoundException
+    {
+        if (mDexClass != null) {
+            return findClassDalvik(name);
+        } else {
+            return findClassNonDalvik(name);
+        }
+    }
+
+    /**
+     * Finds the class with the specified binary name, from a DEX file.
+     */
+    private Class<?> findClassDalvik(String name)
+        throws ClassNotFoundException {
+
+        if (mDexFile == null) {
+            synchronized (DefiningLoader.class) {
+                Constructor<?> ctor;
+                /*
+                 * Construct a DexFile object through reflection.
+                 */
+                try {
+                    ctor = mDexClass.getConstructor(String.class);
+                } catch (NoSuchMethodException nsme) {
+                    throw new ClassNotFoundException("getConstructor failed",
+                        nsme);
+                }
+
+                try {
+                    mDexFile = ctor.newInstance(DEX_FILE);
+                } catch (InstantiationException ie) {
+                    throw new ClassNotFoundException("newInstance failed", ie);
+                } catch (IllegalAccessException iae) {
+                    throw new ClassNotFoundException("newInstance failed", iae);
+                } catch (InvocationTargetException ite) {
+                    throw new ClassNotFoundException("newInstance failed", ite);
+                }
+            }
+        }
+
+        /*
+         * Call DexFile.loadClass(String, ClassLoader).
+         */
+        Method meth;
+
+        try {
+            meth = mDexClass.getMethod("loadClass", String.class, ClassLoader.class);
+        } catch (NoSuchMethodException nsme) {
+            throw new ClassNotFoundException("getMethod failed", nsme);
+        }
+
+        try {
+            return (Class<?>) meth.invoke(mDexFile, name, this);
+        } catch (IllegalAccessException iae) {
+            throw new ClassNotFoundException("loadClass failed", iae);
+        } catch (InvocationTargetException ite) {
+            throw new ClassNotFoundException("loadClass failed",
+                ite.getCause());
+        }
+    }
+
+    /**
+     * Finds the class with the specified binary name, from .class files.
+     */
+    private Class<?> findClassNonDalvik(String name)
+        throws ClassNotFoundException {
+
+        String[] pathNames = { CLASS_PATH1 + name + ".class", CLASS_PATH2 + name + ".class" };
+
+        String pathName = null;
+        RandomAccessFile raf = null;
+
+        for (String pn : pathNames) {
+            pathName = pn;
+            try {
+                //System.out.println("--- Defining: looking for " + pathName);
+                raf = new RandomAccessFile(new File(pathName), "r");
+                break;
+            } catch (FileNotFoundException fnfe) {
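+                // Not found at this path; try the next one.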
+            }
+        }
+        if (raf == null) {
+            throw new ClassNotFoundException("Not found: " + pathNames[0] + ":" + pathNames[1]);
+        }
+
+        /* read the entire file in */
+        byte[] fileData;
+        try {
+            fileData = new byte[(int) raf.length()];
+            raf.readFully(fileData);
+        } catch (IOException ioe) {
+            throw new ClassNotFoundException("Read error: " + pathName);
+        } finally {
+            try {
+                raf.close();
+            } catch (IOException ioe) {
+                // drop
+            }
+        }
+
+        /* create the class */
+        //System.out.println("--- Defining: defining " + name);
+        try {
+            return defineClass(name, fileData, 0, fileData.length);
+        } catch (Throwable th) {
+            throw new ClassNotFoundException("defineClass failed", th);
+        }
+    }
+
+    /**
+     * Load a class.
+     *
+     * Normally a class loader wouldn't override this, but we want our
+     * version of the class to take precedence over an already-loaded
+     * version.
+     *
+     * We still want the system classes (e.g. java.lang.Object) from the
+     * bootstrap class loader.
+     */
+    synchronized protected Class<?> loadClass(String name, boolean resolve)
+        throws ClassNotFoundException
+    {
+        Class<?> res;
+
+        /*
+         * 1. Invoke findLoadedClass(String) to check if the class has
+         * already been loaded.
+         *
+         * This doesn't change.
+         */
+        res = findLoadedClass(name);
+        if (res != null) {
+            // System.out.println("FancyLoader.loadClass: " + name + " already loaded");
+            if (resolve)
+                resolveClass(res);
+            return res;
+        }
+
+        /*
+         * 3. Invoke the findClass(String) method to find the class.
+         */
+        try {
+            res = findClass(name);
+            if (resolve)
+                resolveClass(res);
+            return res;
+        }
+        catch (ClassNotFoundException e) {
+            // we couldn't find it, so eat the exception and keep going
+        }
+
+        /*
+         * 2. Invoke the loadClass method on the parent class loader.  If
+         * the parent loader is null the class loader built-in to the
+         * virtual machine is used, instead.
+         *
+         * (Since we're not in java.lang, we can't actually invoke the
+         * parent's loadClass() method, but we passed our parent to the
+         * super-class which can take care of it for us.)
+         */
+        res = super.loadClass(name, resolve);   // returns class or throws
+        return res;
+    }
+}
diff --git a/test/626-const-class-linking/src/DelegatingLoader.java b/test/626-const-class-linking/src/DelegatingLoader.java
new file mode 100644
index 0000000..49955d4
--- /dev/null
+++ b/test/626-const-class-linking/src/DelegatingLoader.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class DelegatingLoader extends DefiningLoader {
+    private DefiningLoader defining_loader;
+
+    public DelegatingLoader(ClassLoader parent, DefiningLoader defining_loader) {
+        super(parent);
+        this.defining_loader = defining_loader;
+    }
+
+    public void resetDefiningLoader(DefiningLoader defining_loader) {
+        this.defining_loader = defining_loader;
+    }
+
+    protected Class<?> findClass(String name) throws ClassNotFoundException
+    {
+        if (name.equals("Test")) {
+            throw new Error("Unexpected DelegatingLoader.findClass(\"Test\")");
+        }
+        return super.findClass(name);
+    }
+
+    protected Class<?> loadClass(String name, boolean resolve)
+        throws ClassNotFoundException
+    {
+        if (name.equals("Test")) {
+            return defining_loader.loadClass(name, resolve);
+        }
+        return super.loadClass(name, resolve);
+    }
+}
diff --git a/test/626-const-class-linking/src/Helper1.java b/test/626-const-class-linking/src/Helper1.java
new file mode 100644
index 0000000..ff9cd1a
--- /dev/null
+++ b/test/626-const-class-linking/src/Helper1.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Helper1 {
+    public static ClassPair get() {
+        Class<?> helper1_class = Helper1.class;
+        Class<?> test_class = Test.class;
+        return new ClassPair(helper1_class, test_class);
+    }
+}
diff --git a/test/626-const-class-linking/src/Main.java b/test/626-const-class-linking/src/Main.java
new file mode 100644
index 0000000..0029428
--- /dev/null
+++ b/test/626-const-class-linking/src/Main.java
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.ref.WeakReference;
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.ArrayList;
+
+public class Main {
+    public static void main(String[] args) throws Exception {
+        try {
+            System.loadLibrary(args[0]);
+        } catch (UnsatisfiedLinkError ule) {
+            usingRI = true;
+            // Add expected JNI_OnLoad log line to match expected.txt.
+            System.out.println("JNI_OnLoad called");
+        }
+
+        testClearDexCache();
+        testMultiDex();
+        testRacyLoader();
+        testRacyLoader2();
+        testMisbehavingLoader();
+        testRacyMisbehavingLoader();
+        testRacyMisbehavingLoader2();
+    }
+
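+    // Check that Helper1.get() keeps returning the same Test class after the
+    // inner defining loader is swapped and the DexCache resolved types are
+    // cleared.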
+    private static void testClearDexCache() throws Exception {
+        DelegatingLoader delegating_loader = createDelegatingLoader();
+        Class<?> helper = delegating_loader.loadClass("Helper1");
+
+        WeakReference<Class<?>> weak_test1 = wrapHelperGet(helper);
+        changeInner(delegating_loader);
+        clearResolvedTypes(helper);
+        Runtime.getRuntime().gc();
+        WeakReference<Class<?>> weak_test2 = wrapHelperGet(helper);
+        Runtime.getRuntime().gc();
+
+        Class<?> test1 = weak_test1.get();
+        if (test1 == null) {
+            System.out.println("test1 disappeared");
+        }
+        Class<?> test2 = weak_test2.get();
+        if (test2 == null) {
+            System.out.println("test2 disappeared");
+        }
+        if (test1 != test2) {
+            System.out.println("test1 != test2");
+        }
+
+        System.out.println("testClearDexCache done");
+    }
+
+    private static void testMultiDex() throws Exception {
+        DelegatingLoader delegating_loader = createDelegatingLoader();
+
+        Class<?> helper1 = delegating_loader.loadClass("Helper1");
+        WeakReference<Class<?>> weak_test1 = wrapHelperGet(helper1);
+
+        changeInner(delegating_loader);
+
+        Class<?> helper2 = delegating_loader.loadClass("Helper2");
+        WeakReference<Class<?>> weak_test2 = wrapHelperGet(helper2);
+
+        Runtime.getRuntime().gc();
+
+        Class<?> test1 = weak_test1.get();
+        if (test1 == null) {
+            System.out.println("test1 disappeared");
+        }
+        Class<?> test2 = weak_test2.get();
+        if (test2 == null) {
+            System.out.println("test2 disappeared");
+        }
+        if (test1 != test2) {
+            System.out.println("test1 != test2");
+        }
+
+        System.out.println("testMultiDex done");
+    }
+
+    private static void testMisbehavingLoader() throws Exception {
+        ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+        DefiningLoader defining_loader = new DefiningLoader(system_loader);
+        MisbehavingLoader misbehaving_loader =
+            new MisbehavingLoader(system_loader, defining_loader);
+        Class<?> helper = misbehaving_loader.loadClass("Helper1");
+
+        try {
+            WeakReference<Class<?>> weak_test = wrapHelperGet(helper);
+        } catch (InvocationTargetException ite) {
+            String message = ite.getCause().getMessage();
+            if (usingRI && "Test".equals(message)) {
+              // Replace RI message with dalvik message to match expected.txt.
+              message = "Initiating class loader of type " +
+                  misbehaving_loader.getClass().getName() +
+                  " returned class Helper2 instead of Test.";
+            }
+            System.out.println(ite.getCause().getClass().getName() + ": " + message);
+        }
+        System.out.println("testMisbehavingLoader done");
+    }
+
+    private static void testRacyLoader() throws Exception {
+        final ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+
+        final Thread[] threads = new Thread[4];
+        final Object[] results = new Object[threads.length];
+
+        final RacyLoader racy_loader = new RacyLoader(system_loader, threads.length);
+        final Class<?> helper1 = racy_loader.loadClass("Helper1");
+        skipVerification(helper1);  // Avoid class loading during verification.
+
+        for (int i = 0; i != threads.length; ++i) {
+            final int my_index = i;
+            Thread t = new Thread() {
+                public void run() {
+                    try {
+                        Method get = helper1.getDeclaredMethod("get");
+                        results[my_index] = get.invoke(null);
+                    } catch (InvocationTargetException ite) {
+                        results[my_index] = ite.getCause();
+                    } catch (Throwable t) {
+                        results[my_index] = t;
+                    }
+                }
+            };
+            t.start();
+            threads[i] = t;
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        dumpResultStats(results, 1);
+        System.out.println("testRacyLoader done");
+    }
+
+    private static void testRacyLoader2() throws Exception {
+        final ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+
+        final Thread[] threads = new Thread[4];
+        final Object[] results = new Object[threads.length];
+
+        final RacyLoader racy_loader = new RacyLoader(system_loader, threads.length);
+        final Class<?> helper1 = racy_loader.loadClass("Helper1");
+        skipVerification(helper1);  // Avoid class loading during verification.
+        final Class<?> helper3 = racy_loader.loadClass("Helper3");
+        skipVerification(helper3);  // Avoid class loading during verification.
+
+        for (int i = 0; i != threads.length; ++i) {
+            final int my_index = i;
+            Thread t = new Thread() {
+                public void run() {
+                    try {
+                        Class<?> helper = (my_index < threads.length / 2) ? helper1 : helper3;
+                        Method get = helper.getDeclaredMethod("get");
+                        results[my_index] = get.invoke(null);
+                    } catch (InvocationTargetException ite) {
+                        results[my_index] = ite.getCause();
+                    } catch (Throwable t) {
+                        results[my_index] = t;
+                    }
+                }
+            };
+            t.start();
+            threads[i] = t;
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        dumpResultStats(results, 2);
+        System.out.println("testRacyLoader2 done");
+    }
+
+    private static void testRacyMisbehavingLoader() throws Exception {
+        final ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+
+        final Thread[] threads = new Thread[4];
+        final Object[] results = new Object[threads.length];
+
+        final RacyMisbehavingLoader racy_loader =
+            new RacyMisbehavingLoader(system_loader, threads.length, false);
+        final Class<?> helper1 = racy_loader.loadClass("RacyMisbehavingHelper");
+        skipVerification(helper1);  // Avoid class loading during verification.
+
+        for (int i = 0; i != threads.length; ++i) {
+            final int my_index = i;
+            Thread t = new Thread() {
+                public void run() {
+                    try {
+                        Method get = helper1.getDeclaredMethod("get");
+                        results[my_index] = get.invoke(null);
+                    } catch (InvocationTargetException ite) {
+                        results[my_index] = ite.getCause();
+                    } catch (Throwable t) {
+                        results[my_index] = t;
+                    }
+                }
+            };
+            t.start();
+            threads[i] = t;
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        dumpResultStats(results, 1);
+        System.out.println("testRacyMisbehavingLoader done");
+    }
+
+    private static void testRacyMisbehavingLoader2() throws Exception {
+        final ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+
+        final Thread[] threads = new Thread[4];
+        final Object[] results = new Object[threads.length];
+
+        final RacyMisbehavingLoader racy_loader =
+            new RacyMisbehavingLoader(system_loader, threads.length, true);
+        final Class<?> helper1 = racy_loader.loadClass("RacyMisbehavingHelper");
+        skipVerification(helper1);  // Avoid class loading during verification.
+
+        for (int i = 0; i != threads.length; ++i) {
+            final int my_index = i;
+            Thread t = new Thread() {
+                public void run() {
+                    try {
+                        Method get = helper1.getDeclaredMethod("get");
+                        results[my_index] = get.invoke(null);
+                    } catch (InvocationTargetException ite) {
+                        results[my_index] = ite.getCause();
+                    } catch (Throwable t) {
+                        results[my_index] = t;
+                    }
+                }
+            };
+            t.start();
+            threads[i] = t;
+        }
+        for (Thread t : threads) {
+            t.join();
+        }
+        dumpResultStats(results, 1);
+        System.out.println("testRacyMisbehavingLoader2 done");
+    }
+
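+    // Prints each result and counts how many distinct classes appear as the
+    // pair's second element; a correct run observes exactly expected_unique.
+    // On a mismatch, dumps the classes via the native helper for diagnosis.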
+    private static void dumpResultStats(Object[] results, int expected_unique) throws Exception {
+        int throwables = 0;
+        int classes = 0;
+        int unique_classes = 0;
+        for (int i = 0; i != results.length; ++i) {
+            Object r = results[i];
+            if (r instanceof Throwable) {
+                ++throwables;
+                System.out.println(((Throwable) r).getMessage());
+            } else if (isClassPair(r)) {
+                printPair(r);
+                Object ref = getSecond(r);
+                ++classes;
+                ++unique_classes;
+                for (int j = 0; j != i; ++j) {
+                    Object rj = results[j];
+                    if (isClassPair(results[j]) && getSecond(results[j]) == ref) {
+                        --unique_classes;
+                        break;
+                    }
+                }
+            }
+        }
+        System.out.println("total: " + results.length);
+        System.out.println("  throwables: " + throwables);
+        System.out.println("  classes: " + classes
+            + " (" + unique_classes + " unique)");
+        if (expected_unique != unique_classes) {
+            System.out.println("MISMATCH with expected_unique: " + expected_unique);
+            ArrayList<Class<?>> list = new ArrayList<Class<?>>();
+            for (int i = 0; i != results.length; ++i) {
+                Object r = results[i];
+                if (isClassPair(r)) {
+                    list.add(getSecond(r));
+                }
+            }
+            nativeDumpClasses(list.toArray());
+        }
+    }
+
+    private static DelegatingLoader createDelegatingLoader() {
+        ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+        DefiningLoader defining_loader = new DefiningLoader(system_loader);
+        return new DelegatingLoader(system_loader, defining_loader);
+    }
+
+    private static void changeInner(DelegatingLoader delegating_loader) {
+        ClassLoader system_loader = ClassLoader.getSystemClassLoader();
+        DefiningLoader defining_loader = new DefiningLoader(system_loader);
+        delegating_loader.resetDefiningLoader(defining_loader);
+    }
+
+    private static WeakReference<Class<?>> wrapHelperGet(Class<?> helper) throws Exception {
+        Method get = helper.getDeclaredMethod("get");
+        Object pair = get.invoke(null);
+        printPair(pair);
+        return new WeakReference<Class<?>>(getSecond(pair));
+    }
+
+    private static void printPair(Object pair) throws Exception {
+        Method print = pair.getClass().getDeclaredMethod("print");
+        print.invoke(pair);
+    }
+
+    private static Class<?> getSecond(Object pair) throws Exception {
+        Field second = pair.getClass().getDeclaredField("second");
+        return (Class<?>) second.get(pair);
+    }
+
+    private static boolean isClassPair(Object r) {
+        return r != null && r.getClass().getName().equals("ClassPair");
+    }
+
+    public static void clearResolvedTypes(Class<?> c) {
+        if (!usingRI) {
+            nativeClearResolvedTypes(c);
+        }
+    }
+
+    // Skip verification of a class on ART. Verification can cause classes to be loaded
+    // while holding a lock on the class being verified, and holding that lock can
+    // interfere with the intent of the "racy" tests: those tests wait in loadClass()
+    // for all tested threads to synchronize, and the threads cannot reach that point
+    // if they are blocked on the class lock in ClassLinker::InitializeClass(Helper1/Helper3).
+    public static void skipVerification(Class<?> c) {
+        if (!usingRI) {
+            nativeSkipVerification(c);
+        }
+    }
+
+    public static native void nativeClearResolvedTypes(Class<?> c);
+    public static native void nativeSkipVerification(Class<?> c);
+    public static native void nativeDumpClasses(Object[] array);
+
+    static boolean usingRI = false;
+}
diff --git a/test/626-const-class-linking/src/MisbehavingLoader.java b/test/626-const-class-linking/src/MisbehavingLoader.java
new file mode 100644
index 0000000..ca9783e
--- /dev/null
+++ b/test/626-const-class-linking/src/MisbehavingLoader.java
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Class loader that returns Helper2.class when asked to load "Test".
+public class MisbehavingLoader extends DefiningLoader {
+    private DefiningLoader defining_loader;
+
+    public MisbehavingLoader(ClassLoader parent, DefiningLoader defining_loader) {
+        super(parent);
+        this.defining_loader = defining_loader;
+    }
+
+    protected Class<?> findClass(String name) throws ClassNotFoundException
+    {
+        if (name.equals("Helper1") || name.equals("Helper2")) {
+            return super.findClass(name);
+        } else if (name.equals("Test")) {
+            throw new Error("Unexpected MisbehavingLoader.findClass(\"Test\")");
+        }
+        return super.findClass(name);
+    }
+
+    protected Class<?> loadClass(String name, boolean resolve)
+        throws ClassNotFoundException
+    {
+        if (name.equals("Helper1") || name.equals("Helper2")) {
+            return super.loadClass(name, resolve);
+        } else if (name.equals("Test")) {
+            // Ask for a different class.
+            return defining_loader.loadClass("Helper2", resolve);
+        }
+        return super.loadClass(name, resolve);
+    }
+}
diff --git a/test/626-const-class-linking/src/RacyLoader.java b/test/626-const-class-linking/src/RacyLoader.java
new file mode 100644
index 0000000..9c164a3
--- /dev/null
+++ b/test/626-const-class-linking/src/RacyLoader.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class RacyLoader extends DefiningLoader {
+    static {
+        // For JVM, register as parallel capable.
+        // Android treats all class loaders as parallel capable and makes this a no-op.
+        registerAsParallelCapable();
+    }
+
+    private Object lock = new Object();
+    private int index = 0;
+    private int count;
+
+    private DefiningLoader[] defining_loaders;
+
+    public RacyLoader(ClassLoader parent, int count) {
+        super(parent);
+        this.count = count;
+        defining_loaders = new DefiningLoader[2];
+        for (int i = 0; i != defining_loaders.length; ++i) {
+            defining_loaders[i] = new DefiningLoader(parent);
+        }
+    }
+
+    protected Class<?> findClass(String name) throws ClassNotFoundException
+    {
+        if (name.equals("Test") || name.equals("Test3")) {
+            throw new Error("Unexpected RacyLoader.findClass(\"" + name + "\")");
+        }
+        return super.findClass(name);
+    }
+
+    protected Class<?> loadClass(String name, boolean resolve)
+        throws ClassNotFoundException
+    {
+        if (name.equals("Test") || name.equals("Test3")) {
+            int my_index = syncWithOtherInstances(count);
+            Class<?> result = defining_loaders[my_index & 1].loadClass(name, resolve);
+            syncWithOtherInstances(2 * count);
+            return result;
+        }
+        return super.loadClass(name, resolve);
+    }
+
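+    // Simple barrier: blocks until this method has been called limit times in
+    // total, then wakes all waiters. Returns this caller's arrival index.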
+    private int syncWithOtherInstances(int limit) {
+        int my_index;
+        synchronized (lock) {
+            my_index = index;
+            ++index;
+            if (index != limit) {
+                do {
+                    try {
+                        lock.wait();
+                    } catch (InterruptedException ie) {
+                        throw new Error(ie);
+                    }
+                } while (index < limit);
+            } else {
+                lock.notifyAll();
+            }
+        }
+        return my_index;
+    }
+}
diff --git a/test/626-const-class-linking/src/RacyMisbehavingHelper.java b/test/626-const-class-linking/src/RacyMisbehavingHelper.java
new file mode 100644
index 0000000..4525278
--- /dev/null
+++ b/test/626-const-class-linking/src/RacyMisbehavingHelper.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.lang.reflect.Method;
+
+public class RacyMisbehavingHelper {
+    public static ClassPair get() {
+        Class<?> helper1_class = Helper1.class;
+        Class<?> test_class = Test.class;
+        try {
+            // After loading the correct class, allow loading the incorrect class.
+            ClassLoader loader = helper1_class.getClassLoader();
+            Method reportAfterLoading = loader.getClass().getDeclaredMethod("reportAfterLoading");
+            reportAfterLoading.invoke(loader);
+        } catch (Throwable t) {
+            t.printStackTrace();
+        }
+        return new ClassPair(helper1_class, test_class);
+    }
+}
diff --git a/test/626-const-class-linking/src/RacyMisbehavingLoader.java b/test/626-const-class-linking/src/RacyMisbehavingLoader.java
new file mode 100644
index 0000000..f5bcb4c
--- /dev/null
+++ b/test/626-const-class-linking/src/RacyMisbehavingLoader.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class RacyMisbehavingLoader extends DefiningLoader {
+    static {
+        // For JVM, register as parallel capable.
+        // Android treats all class loaders as parallel capable and makes this a no-op.
+        registerAsParallelCapable();
+    }
+
+    private Object lock = new Object();
+    private int index = 0;
+    private int count;
+    private boolean throw_error;
+
+    private DefiningLoader[] defining_loaders;
+
+    public RacyMisbehavingLoader(ClassLoader parent, int count, boolean throw_error) {
+        super(parent);
+        this.count = count;
+        this.throw_error = throw_error;
+        defining_loaders = new DefiningLoader[2];
+        for (int i = 0; i != defining_loaders.length; ++i) {
+            defining_loaders[i] = new DefiningLoader(parent);
+        }
+    }
+
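+    // Called via reflection from RacyMisbehavingHelper once the correct class
+    // has been loaded; releases the threads that are waiting in loadClass()
+    // to load the wrong class.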
+    public void reportAfterLoading() {
+        synchronized (lock) {
+            ++index;
+            if (index == 2 * count) {
+                lock.notifyAll();
+            }
+        }
+    }
+
+    protected Class<?> findClass(String name) throws ClassNotFoundException
+    {
+        if (name.equals("Test")) {
+            throw new Error("Unexpected RacyLoader.findClass(\"" + name + "\")");
+        }
+        return super.findClass(name);
+    }
+
+    protected Class<?> loadClass(String name, boolean resolve)
+        throws ClassNotFoundException
+    {
+        if (name.equals("Test")) {
+            int my_index = syncWithOtherInstances(count);
+            Class<?> result;
+            if ((my_index & 1) == 0) {
+              // Do not delay loading the correct class.
+              result = defining_loaders[my_index & 1].loadClass(name, resolve);
+            } else {
+              // Delay loading the wrong class.
+              syncWithOtherInstances(2 * count);
+              if (throw_error) {
+                throw new Error("RacyMisbehavingLoader throw_error=true");
+              }
+              result = defining_loaders[my_index & 1].loadClass("Test3", resolve);
+            }
+            return result;
+        }
+        return super.loadClass(name, resolve);
+    }
+
+    private int syncWithOtherInstances(int limit) {
+        int my_index;
+        synchronized (lock) {
+            my_index = index;
+            ++index;
+            if (index != limit) {
+                do {
+                    try {
+                        lock.wait();
+                    } catch (InterruptedException ie) {
+                        throw new Error(ie);
+                    }
+                } while (index < limit);
+            } else {
+                lock.notifyAll();
+            }
+        }
+        return my_index;
+    }
+}
diff --git a/test/913-heaps/expected.txt b/test/913-heaps/expected.txt
index 8002cfa..e5fa53f 100644
--- a/test/913-heaps/expected.txt
+++ b/test/913-heaps/expected.txt
@@ -1,7 +1,7 @@
 ---
 true true
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 3000@0 [size=132, length=-1]
+root@root --(stack-local[id=1,tag=3000,depth=2,method=doFollowReferencesTestNonRoot,vreg=11,location= 31])--> 1@1000 [size=16, length=-1]
+root@root --(stack-local[id=1,tag=3000,depth=3,method=doFollowReferencesTest,vreg=1,location= 28])--> 3000@0 [size=132, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
 0@0 --(array-element@0)--> 1@1000 [size=16, length=-1]
 1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
@@ -21,12 +21,6 @@
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
 ---
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 2@1000 [size=16, length=-1]
-root@root --(stack-local)--> 3000@0 [size=132, length=-1]
-root@root --(thread)--> 2@1000 [size=16, length=-1]
-root@root --(thread)--> 3000@0 [size=132, length=-1]
-0@0 --(array-element@0)--> 1@1000 [size=16, length=-1]
 1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
@@ -46,7 +40,9 @@
 ---
 root@root --(jni-global)--> 1@1000 [size=16, length=-1]
 root@root --(jni-local[id=1,tag=3000,depth=0,method=followReferences])--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
+root@root --(stack-local[id=1,tag=3000,depth=1,method=doFollowReferencesTestImpl,vreg=10,location= 6])--> 1@1000 [size=16, length=-1]
+root@root --(stack-local[id=1,tag=3000,depth=1,method=doFollowReferencesTestImpl,vreg=5,location= 6])--> 1@1000 [size=16, length=-1]
+root@root --(stack-local[id=1,tag=3000,depth=2,method=doFollowReferencesTestRoot,vreg=3,location= 18])--> 1@1000 [size=16, length=-1]
 root@root --(thread)--> 1@1000 [size=16, length=-1]
 root@root --(thread)--> 3000@0 [size=132, length=-1]
 1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
@@ -66,13 +62,6 @@
 5@1002 --(field@28)--> 1@1000 [size=16, length=-1]
 6@1000 --(class)--> 1000@0 [size=123, length=-1]
 ---
-root@root --(jni-global)--> 1@1000 [size=16, length=-1]
-root@root --(jni-local[id=1,tag=3000,depth=0,method=followReferences])--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 1@1000 [size=16, length=-1]
-root@root --(stack-local)--> 2@1000 [size=16, length=-1]
-root@root --(thread)--> 1@1000 [size=16, length=-1]
-root@root --(thread)--> 2@1000 [size=16, length=-1]
-root@root --(thread)--> 3000@0 [size=132, length=-1]
 1001@0 --(superclass)--> 1000@0 [size=123, length=-1]
 1002@0 --(interface)--> 2001@0 [size=132, length=-1]
 1002@0 --(superclass)--> 1001@0 [size=123, length=-1]
diff --git a/test/913-heaps/heaps.cc b/test/913-heaps/heaps.cc
index 340671d..7b00fcd 100644
--- a/test/913-heaps/heaps.cc
+++ b/test/913-heaps/heaps.cc
@@ -269,6 +269,42 @@
       jvmtiHeapReferenceInfo info_;
     };
 
+    class StackLocalElement : public Elem {
+     public:
+      StackLocalElement(const std::string& referrer,
+                        const std::string& referree,
+                        jlong size,
+                        jint length,
+                        const jvmtiHeapReferenceInfo* reference_info)
+          : Elem(referrer, referree, size, length) {
+        memcpy(&info_, reference_info, sizeof(jvmtiHeapReferenceInfo));
+      }
+
+     protected:
+      std::string PrintArrowType() const OVERRIDE {
+        char* name = nullptr;
+        if (info_.stack_local.method != nullptr) {
+          jvmti_env->GetMethodName(info_.stack_local.method, &name, nullptr, nullptr);
+        }
+        std::string ret = StringPrintf("stack-local[id=%" PRId64 ",tag=%" PRId64 ",depth=%d,"
+                                       "method=%s,vreg=%d,location=% " PRId64 "]",
+                                       info_.stack_local.thread_id,
+                                       info_.stack_local.thread_tag,
+                                       info_.stack_local.depth,
+                                       name == nullptr ? "<null>" : name,
+                                       info_.stack_local.slot,
+                                       info_.stack_local.location);
+        if (name != nullptr) {
+          jvmti_env->Deallocate(reinterpret_cast<unsigned char*>(name));
+        }
+
+        return ret;
+      }
+
+     private:
+      jvmtiHeapReferenceInfo info_;
+    };
+
     // For simple or unimplemented cases.
     class StringElement : public Elem {
      public:
@@ -380,11 +416,11 @@
                                                          length,
                                                          "monitor"));
         case JVMTI_HEAP_REFERENCE_STACK_LOCAL:
-          return std::unique_ptr<Elem>(new StringElement(referrer,
-                                                         referree,
-                                                         size,
-                                                         length,
-                                                         "stack-local"));
+          return std::unique_ptr<Elem>(new StackLocalElement(referrer,
+                                                             referree,
+                                                             size,
+                                                             length,
+                                                             reference_info));
         case JVMTI_HEAP_REFERENCE_JNI_LOCAL:
           return std::unique_ptr<Elem>(new JNILocalElement(referrer,
                                                            referree,
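
A note on the new stack-local annotations: the bracketed fields come straight from jvmtiHeapReferenceInfoStackLocal (thread id and tag, stack depth, method, dex register slot, and bytecode location). The space in the "location=% " PRId64 conversion is the standard printf space flag, which prints a blank in the sign position of non-negative numbers; that is why the expected.txt lines above read "location= 31" rather than "location=31". Java's Formatter supports the same flag, so the effect can be reproduced with a few lines (hypothetical demo class, not part of the test):

    public class SpaceFlagDemo {
        public static void main(String[] args) {
            // The ' ' flag reserves a sign position: blank for non-negative
            // values, '-' for negative ones.
            System.out.println(String.format("location=% d", 31));   // location= 31
            System.out.println(String.format("location=% d", -31));  // location=-31
        }
    }
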
diff --git a/test/913-heaps/src/Main.java b/test/913-heaps/src/Main.java
index a6ace9a..564596e 100644
--- a/test/913-heaps/src/Main.java
+++ b/test/913-heaps/src/Main.java
@@ -85,7 +85,7 @@
     v.add("0@0", "1@1000");  // tmpStorage[0] --(array-element)--> a.
 
     doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, null, v, null);
-    doFollowReferencesTestImpl(a.foo, Integer.MAX_VALUE, -1, null, v, "2@1000");
+    doFollowReferencesTestImpl(a.foo2, Integer.MAX_VALUE, -1, null, v, "3@1001");
 
     tmpStorage.clear();
   }
@@ -96,7 +96,7 @@
     A a = createTree(v);
 
     doFollowReferencesTestImpl(null, Integer.MAX_VALUE, -1, a, v, null);
-    doFollowReferencesTestImpl(a.foo, Integer.MAX_VALUE, -1, a, v, "2@1000");
+    doFollowReferencesTestImpl(a.foo2, Integer.MAX_VALUE, -1, a, v, "3@1001");
   }
 
   private static void doFollowReferencesTestImpl(A root, int stopAfter, int followSet,
diff --git a/test/956-methodhandles/src/Main.java b/test/956-methodhandles/src/Main.java
index ee9c436..17b56b4 100644
--- a/test/956-methodhandles/src/Main.java
+++ b/test/956-methodhandles/src/Main.java
@@ -1080,7 +1080,7 @@
       return result;
     }
     public static Long sumToReference(int... ints) {
-      System.err.println("Hi");
+      System.out.println("Hi");
       return new Long(sumToPrimitive(ints));
     }
     public static MethodHandles.Lookup lookup() {
@@ -1432,13 +1432,13 @@
     assertEquals(Long.valueOf(10l), (Long) mh.invoke(1, 2, 3, 4));
     try {
       // WrongMethodTypeException should be raised before invoke here.
-      System.err.print("Expect Hi here: ");
+      System.out.print("Expect Hi here: ");
       assertEquals(Long.valueOf(10l), (Byte) mh.invoke(1, 2, 3, 4));
       fail();
     } catch (ClassCastException e) {}
     try {
       // WrongMethodTypeException should be raised before invoke here.
-      System.err.println("Don't expect Hi now");
+      System.out.println("Don't expect Hi now");
       byte b = (byte) mh.invoke(1, 2, 3, 4);
       fail();
     } catch (WrongMethodTypeException e) {}
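
A likely reason for moving these prints from stderr to stdout is to keep the "Hi" markers in the same stream as the rest of the test output, so their presence and order can be checked deterministically against expected.txt. The surrounding comments also encode an ordering guarantee of MethodHandle.invoke worth spelling out: the call-site type, including the cast applied to the result, is matched against the handle's type (as if by asType) before the target executes. A reference cast such as (Byte) only installs a pending ClassCastException, so the target still runs and prints "Hi"; a primitive cast such as (byte) would need a narrowing long-to-byte conversion that asType rejects, so WrongMethodTypeException is thrown and "Hi" never appears. A self-contained sketch of the distinction (hypothetical class, not part of the test):

    import java.lang.invoke.MethodHandle;
    import java.lang.invoke.MethodHandles;
    import java.lang.invoke.MethodType;
    import java.lang.invoke.WrongMethodTypeException;

    public class InvokeCastDemo {
        public static Long greet() {
            System.out.println("Hi");  // only runs if the invocation proceeds
            return 42L;
        }

        public static void main(String[] args) throws Throwable {
            MethodHandle mh = MethodHandles.lookup().findStatic(
                InvokeCastDemo.class, "greet", MethodType.methodType(Long.class));

            try {
                // Reference cast: the target runs (prints "Hi"), then the
                // returned Long fails the cast to Byte.
                Byte b = (Byte) mh.invoke();
            } catch (ClassCastException expected) {}

            try {
                // Primitive cast: ()byte cannot be adapted from ()Long, so
                // this throws before the target runs -- no second "Hi".
                byte b = (byte) mh.invoke();
            } catch (WrongMethodTypeException expected) {}
        }
    }
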
diff --git a/test/Android.bp b/test/Android.bp
index 44c64c1..2625f56 100644
--- a/test/Android.bp
+++ b/test/Android.bp
@@ -314,6 +314,7 @@
         "595-profile-saving/profile-saving.cc",
         "596-app-images/app_images.cc",
         "597-deopt-new-string/deopt.cc",
+        "626-const-class-linking/clear_dex_cache_types.cc",
     ],
     shared_libs: [
         "libbacktrace",
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 28e1e60..b515130 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -440,9 +440,11 @@
   629-vdex-speed
 
 # This test fails without an image.
+# 964 often times out due to the large number of classes it tries to compile.
 TEST_ART_BROKEN_NO_IMAGE_RUN_TESTS := \
   137-cfi \
-  138-duplicate-classes-check
+  138-duplicate-classes-check \
+  964-default-iface-init
 
 ifneq (,$(filter no-dex2oat,$(PREBUILD_TYPES)))
   ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),no-dex2oat, \
@@ -512,9 +514,11 @@
 # Test 906 iterates the heap filtering with different options. No instances should be created
 # between those runs to be able to have precise checks.
 # Test 902 hits races with the JIT compiler. b/32821077
+# Test 626-const-class-linking can deadlock with JIT. b/33567581
 # Test 629 requires compilation.
 TEST_ART_BROKEN_JIT_RUN_TESTS := \
   137-cfi \
+  626-const-class-linking \
   629-vdex-speed \
   902-hello-transformation \
   904-object-allocation \
diff --git a/test/etc/default-build b/test/etc/default-build
index f2b5078..51ae175 100755
--- a/test/etc/default-build
+++ b/test/etc/default-build
@@ -273,8 +273,10 @@
 fi
 
 # Create a single jar with two dex files for multidex.
-if [ ${HAS_SRC_MULTIDEX} = "true" ] || [ ${HAS_SMALI_MULTIDEX} = "true" ]; then
-  zip $TEST_NAME.jar classes.dex classes2.dex
-elif [ ${NEED_DEX} = "true" ]; then
-  zip $TEST_NAME.jar classes.dex
+if [ ${NEED_DEX} = "true" ]; then
+  if [ ${HAS_SRC_MULTIDEX} = "true" ] || [ ${HAS_SMALI_MULTIDEX} = "true" ]; then
+    zip $TEST_NAME.jar classes.dex classes2.dex
+  else
+    zip $TEST_NAME.jar classes.dex
+  fi
 fi
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index bb3a3ad..f0abb44 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -316,7 +316,8 @@
 if [ "$USE_JVM" = "y" ]; then
   export LD_LIBRARY_PATH=${ANDROID_HOST_OUT}/lib64
   # Xmx is necessary since we don't pass down the ART flags to JVM.
-  cmdline="${JAVA} ${DEBUGGER_OPTS} ${JVM_VERIFY_ARG} -Xmx256m -classpath classes ${FLAGS} $MAIN $@ ${ARGS}"
+  # We pass the classes2 path whether it's used (src-multidex) or not.
+  cmdline="${JAVA} ${DEBUGGER_OPTS} ${JVM_VERIFY_ARG} -Xmx256m -classpath classes:classes2 ${FLAGS} $MAIN $@ ${ARGS}"
   if [ "$DEV_MODE" = "y" ]; then
     echo $cmdline
   fi
diff --git a/tools/run-jdwp-tests.sh b/tools/run-jdwp-tests.sh
index 74b0f16..6d5c74b 100755
--- a/tools/run-jdwp-tests.sh
+++ b/tools/run-jdwp-tests.sh
@@ -155,6 +155,7 @@
       --classpath $test_jack \
       --toolchain jack --language JN \
       --vm-arg -Xcompiler-option --vm-arg --debuggable \
+      --jack-arg -g \
       $test
 
 vogar_exit_status=$?