Merge "ARM: Purge Arm32Assembler."
diff --git a/build/Android.common_build.mk b/build/Android.common_build.mk
index 33c2a8e..b5d41d9 100644
--- a/build/Android.common_build.mk
+++ b/build/Android.common_build.mk
@@ -136,7 +136,6 @@
 # Base set of cflags used by all things ART.
 art_cflags += \
   -fno-rtti \
-  -std=gnu++11 \
   -ggdb3 \
   -Wall \
   -Werror \
@@ -152,24 +151,19 @@
 
 # The architectures the compiled tools are able to run on. Setting this to 'all' will cause all
 # architectures to be included.
-ART_TARGET_CODEGEN_ARCHS ?= all
+ART_TARGET_CODEGEN_ARCHS ?= svelte
 ART_HOST_CODEGEN_ARCHS ?= all
 
 ifeq ($(ART_TARGET_CODEGEN_ARCHS),all)
   ART_TARGET_CODEGEN_ARCHS := $(sort $(ART_TARGET_SUPPORTED_ARCH) $(ART_HOST_SUPPORTED_ARCH))
-  # We need to handle the fact that some compiler tests mix code from different architectures.
-  ART_TARGET_COMPILER_TESTS ?= true
 else
-  ART_TARGET_COMPILER_TESTS := false
   ifeq ($(ART_TARGET_CODEGEN_ARCHS),svelte)
     ART_TARGET_CODEGEN_ARCHS := $(sort $(ART_TARGET_ARCH_64) $(ART_TARGET_ARCH_32))
   endif
 endif
 ifeq ($(ART_HOST_CODEGEN_ARCHS),all)
   ART_HOST_CODEGEN_ARCHS := $(sort $(ART_TARGET_SUPPORTED_ARCH) $(ART_HOST_SUPPORTED_ARCH))
-  ART_HOST_COMPILER_TESTS ?= true
 else
-  ART_HOST_COMPILER_TESTS := false
   ifeq ($(ART_HOST_CODEGEN_ARCHS),svelte)
     ART_HOST_CODEGEN_ARCHS := $(sort $(ART_TARGET_CODEGEN_ARCHS) $(ART_HOST_ARCH_64) $(ART_HOST_ARCH_32))
   endif
diff --git a/build/Android.gtest.mk b/build/Android.gtest.mk
index 4739f7d..c61efac 100644
--- a/build/Android.gtest.mk
+++ b/build/Android.gtest.mk
@@ -299,13 +299,7 @@
 COMPILER_GTEST_COMMON_SRC_FILES_all := \
   compiler/jni/jni_cfi_test.cc \
   compiler/optimizing/codegen_test.cc \
-  compiler/optimizing/constant_folding_test.cc \
-  compiler/optimizing/dead_code_elimination_test.cc \
-  compiler/optimizing/linearize_test.cc \
-  compiler/optimizing/liveness_test.cc \
-  compiler/optimizing/live_ranges_test.cc \
   compiler/optimizing/optimizing_cfi_test.cc \
-  compiler/optimizing/register_allocator_test.cc \
 
 COMPILER_GTEST_COMMON_SRC_FILES_arm := \
   compiler/linker/arm/relative_patcher_thumb2_test.cc \
@@ -325,6 +319,16 @@
   compiler/linker/x86/relative_patcher_x86_test.cc \
   compiler/utils/x86/managed_register_x86_test.cc \
 
+# These tests exercise architecture-independent functionality, but happen
+# to use x86 codegen as part of the test.
+COMPILER_GTEST_COMMON_SRC_FILES_x86 += \
+  compiler/optimizing/constant_folding_test.cc \
+  compiler/optimizing/dead_code_elimination_test.cc \
+  compiler/optimizing/linearize_test.cc \
+  compiler/optimizing/live_ranges_test.cc \
+  compiler/optimizing/liveness_test.cc \
+  compiler/optimizing/register_allocator_test.cc \
+
 COMPILER_GTEST_COMMON_SRC_FILES_x86_64 := \
   compiler/linker/x86_64/relative_patcher_x86_64_test.cc \
 
@@ -359,9 +363,7 @@
   $(COMPILER_GTEST_COMMON_SRC_FILES_x86_64) \
 
 $(foreach arch,$(ART_TARGET_CODEGEN_ARCHS),$(eval COMPILER_GTEST_TARGET_SRC_FILES += $$(COMPILER_GTEST_TARGET_SRC_FILES_$(arch))))
-ifeq (true,$(ART_TARGET_COMPILER_TESTS))
-  COMPILER_GTEST_TARGET_SRC_FILES += $(COMPILER_GTEST_TARGET_SRC_FILES_all)
-endif
+COMPILER_GTEST_TARGET_SRC_FILES += $(COMPILER_GTEST_TARGET_SRC_FILES_all)
 
 COMPILER_GTEST_HOST_SRC_FILES := \
   $(COMPILER_GTEST_COMMON_SRC_FILES) \
@@ -395,9 +397,7 @@
   compiler/utils/x86_64/assembler_x86_64_test.cc
 
 $(foreach arch,$(ART_HOST_CODEGEN_ARCHS),$(eval COMPILER_GTEST_HOST_SRC_FILES += $$(COMPILER_GTEST_HOST_SRC_FILES_$(arch))))
-ifeq (true,$(ART_HOST_COMPILER_TESTS))
-  COMPILER_GTEST_HOST_SRC_FILES += $(COMPILER_GTEST_HOST_SRC_FILES_all)
-endif
+COMPILER_GTEST_HOST_SRC_FILES += $(COMPILER_GTEST_HOST_SRC_FILES_all)
 
 ART_TEST_CFLAGS :=
 
diff --git a/compiler/Android.mk b/compiler/Android.mk
index 2426eb9..37f48e1 100644
--- a/compiler/Android.mk
+++ b/compiler/Android.mk
@@ -47,7 +47,6 @@
 	optimizing/code_generator_utils.cc \
 	optimizing/constant_folding.cc \
 	optimizing/dead_code_elimination.cc \
-	optimizing/dex_cache_array_fixups_arm.cc \
 	optimizing/graph_checker.cc \
 	optimizing/graph_visualizer.cc \
 	optimizing/gvn.cc \
@@ -61,7 +60,6 @@
 	optimizing/load_store_elimination.cc \
 	optimizing/locations.cc \
 	optimizing/nodes.cc \
-	optimizing/nodes_arm64.cc \
 	optimizing/optimization.cc \
 	optimizing/optimizing_compiler.cc \
 	optimizing/parallel_move_resolver.cc \
@@ -78,7 +76,6 @@
 	optimizing/ssa_liveness_analysis.cc \
 	optimizing/ssa_phi_elimination.cc \
 	optimizing/stack_map_stream.cc \
-	optimizing/x86_memory_gen.cc \
 	trampolines/trampoline_compiler.cc \
 	utils/assembler.cc \
 	utils/jni_macro_assembler.cc \
@@ -94,6 +91,9 @@
 	linker/arm/relative_patcher_arm_base.cc \
 	linker/arm/relative_patcher_thumb2.cc \
 	optimizing/code_generator_arm.cc \
+	optimizing/dex_cache_array_fixups_arm.cc \
+	optimizing/instruction_simplifier_arm.cc \
+	optimizing/instruction_simplifier_shared.cc \
 	optimizing/intrinsics_arm.cc \
 	utils/arm/assembler_arm.cc \
 	utils/arm/assembler_thumb2.cc \
@@ -108,8 +108,8 @@
     $(LIBART_COMPILER_SRC_FILES_arm) \
 	jni/quick/arm64/calling_convention_arm64.cc \
 	linker/arm64/relative_patcher_arm64.cc \
+	optimizing/nodes_arm64.cc \
 	optimizing/code_generator_arm64.cc \
-	optimizing/instruction_simplifier_arm.cc \
 	optimizing/instruction_simplifier_arm64.cc \
 	optimizing/instruction_simplifier_shared.cc \
 	optimizing/intrinsics_arm64.cc \
@@ -143,6 +143,7 @@
 	optimizing/code_generator_x86.cc \
 	optimizing/intrinsics_x86.cc \
 	optimizing/pc_relative_fixups_x86.cc \
+	optimizing/x86_memory_gen.cc \
 	utils/x86/assembler_x86.cc \
 	utils/x86/jni_macro_assembler_x86.cc \
 	utils/x86/managed_register_x86.cc \
diff --git a/compiler/cfi_test.h b/compiler/cfi_test.h
index f8b7460..c754e55 100644
--- a/compiler/cfi_test.h
+++ b/compiler/cfi_test.h
@@ -22,11 +22,13 @@
 #include <sstream>
 
 #include "arch/instruction_set.h"
+#include "base/enums.h"
 #include "debug/dwarf/dwarf_constants.h"
 #include "debug/dwarf/dwarf_test.h"
 #include "debug/dwarf/headers.h"
 #include "disassembler/disassembler.h"
 #include "gtest/gtest.h"
+#include "thread.h"
 
 namespace art {
 
@@ -57,7 +59,13 @@
     // Pretty-print assembly.
     const uint8_t* asm_base = actual_asm.data();
     const uint8_t* asm_end = asm_base + actual_asm.size();
-    auto* opts = new DisassemblerOptions(false, asm_base, asm_end, true);
+    auto* opts = new DisassemblerOptions(false,
+                                         asm_base,
+                                         asm_end,
+                                         true,
+                                         is64bit
+                                             ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                             : &Thread::DumpThreadOffset<PointerSize::k32>);
     std::unique_ptr<Disassembler> disasm(Disassembler::Create(isa, opts));
     std::stringstream stream;
     const uint8_t* base = actual_asm.data() + (isa == kThumb2 ? 1 : 0);
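
The widened DisassemblerOptions call above picks between two instantiations of Thread::DumpThreadOffset with a ternary, keyed on the pointer size of the target ISA. Below is a standalone sketch of that idiom; DumpThreadOffset, PointerSize and ThreadOffsetDumper are hypothetical stand-ins for illustration, not the real ART declarations.

#include <cstdint>
#include <iostream>

enum class PointerSize { k32 = 4, k64 = 8 };

// Hypothetical stand-in for Thread::DumpThreadOffset<PointerSize>.
template <PointerSize kPointerSize>
void DumpThreadOffset(std::ostream& os, uint32_t offset) {
  os << "thread offset " << offset << " with "
     << static_cast<int>(kPointerSize) << "-byte pointers\n";
}

// Both instantiations share this signature, so either address can be stored
// in a plain function pointer.
using ThreadOffsetDumper = void (*)(std::ostream&, uint32_t);

int main() {
  bool is64bit = true;
  ThreadOffsetDumper dumper = is64bit ? &DumpThreadOffset<PointerSize::k64>
                                      : &DumpThreadOffset<PointerSize::k32>;
  dumper(std::cout, 128);  // prints "thread offset 128 with 8-byte pointers"
  return 0;
}
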
diff --git a/compiler/image_test.cc b/compiler/image_test.cc
index 91579e9..e1ee0d2 100644
--- a/compiler/image_test.cc
+++ b/compiler/image_test.cc
@@ -188,6 +188,7 @@
   }
 
   uint64_t image_file_size;
+  size_t image_size;
   {
     std::unique_ptr<File> file(OS::OpenFileForReading(image_file.GetFilename().c_str()));
     ASSERT_TRUE(file.get() != nullptr);
@@ -206,6 +207,7 @@
     ASSERT_TRUE(space->IsMallocSpace());
 
     image_file_size = file->GetLength();
+    image_size = image_header.GetImageSize();
   }
 
   ASSERT_TRUE(compiler_driver_->GetImageClasses() != nullptr);
@@ -255,10 +257,10 @@
   ASSERT_TRUE(image_space != nullptr);
   if (storage_mode == ImageHeader::kStorageModeUncompressed) {
     // Uncompressed, image should be smaller than file.
-    ASSERT_LE(image_space->Size(), image_file_size);
+    ASSERT_LE(image_size, image_file_size);
   } else {
     // Compressed, file should be smaller than image.
-    ASSERT_LE(image_file_size, image_space->Size());
+    ASSERT_LE(image_file_size, image_size);
   }
 
   image_space->VerifyImageAllocations();
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index efae4d0..bb45999 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -52,6 +52,7 @@
 #include "mirror/array-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
+#include "mirror/dex_cache.h"
 #include "mirror/dex_cache-inl.h"
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
@@ -1418,6 +1419,9 @@
           bin_offset = RoundUp(bin_offset, method_alignment);
           break;
         }
+        case kBinDexCacheArray:
+          bin_offset = RoundUp(bin_offset, DexCacheArraysLayout::Alignment());
+          break;
         case kBinImTable:
         case kBinIMTConflictTable: {
           bin_offset = RoundUp(bin_offset, static_cast<size_t>(target_ptr_size_));
@@ -2034,7 +2038,7 @@
   // 64-bit values here, clearing the top 32 bits for 32-bit targets. The zero-extension is
   // done by casting to the unsigned type uintptr_t before casting to int64_t, i.e.
   //     static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + offset))).
-  GcRoot<mirror::String>* orig_strings = orig_dex_cache->GetStrings();
+  mirror::StringDexCacheType* orig_strings = orig_dex_cache->GetStrings();
   if (orig_strings != nullptr) {
     copy_dex_cache->SetFieldPtrWithSize<false>(mirror::DexCache::StringsOffset(),
                                                NativeLocationInImage(orig_strings),
diff --git a/compiler/jni/jni_cfi_test.cc b/compiler/jni/jni_cfi_test.cc
index 524ce4d..4b056f5 100644
--- a/compiler/jni/jni_cfi_test.cc
+++ b/compiler/jni/jni_cfi_test.cc
@@ -104,12 +104,24 @@
     TestImpl(isa, #isa, expected_asm, expected_cfi); \
   }
 
+#ifdef ART_ENABLE_CODEGEN_arm
 TEST_ISA(kThumb2)
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
 TEST_ISA(kArm64)
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
 TEST_ISA(kX86)
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86_64
 TEST_ISA(kX86_64)
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
 TEST_ISA(kMips)
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips64
 TEST_ISA(kMips64)
+#endif
 
 #endif  // ART_TARGET_ANDROID
 
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index 8273b15..8a80982 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -1189,8 +1189,13 @@
   }
 
   mirror::String* GetTargetString(const LinkerPatch& patch) SHARED_REQUIRES(Locks::mutator_lock_) {
-    mirror::DexCache* dex_cache = GetDexCache(patch.TargetStringDexFile());
-    mirror::String* string = dex_cache->GetResolvedString(patch.TargetStringIndex());
+    ScopedObjectAccessUnchecked soa(Thread::Current());
+    StackHandleScope<1> hs(soa.Self());
+    ClassLinker* linker = Runtime::Current()->GetClassLinker();
+    Handle<mirror::DexCache> dex_cache(hs.NewHandle(GetDexCache(patch.TargetStringDexFile())));
+    mirror::String* string = linker->LookupString(*patch.TargetStringDexFile(),
+                                                  patch.TargetStringIndex(),
+                                                  dex_cache);
     DCHECK(string != nullptr);
     DCHECK(writer_->HasBootImage() ||
            Runtime::Current()->GetHeap()->ObjectIsInBootImageSpace(string));
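
The rewritten GetTargetString wraps the raw DexCache pointer in a Handle inside a StackHandleScope before calling ClassLinker::LookupString, which may run code that suspends the thread and moves objects. The toy model below only sketches why that idiom exists; these Object, StackHandleScope and Handle types are simplified stand-ins, not ART's real classes.

#include <cstddef>
#include <iostream>

struct Object { int value; };

// Simplified stand-in: a real handle scope registers slots that a moving GC
// can find and rewrite when the referenced object is relocated.
template <size_t kNumSlots>
class StackHandleScope {
 public:
  Object** NewHandleSlot(Object* obj) {
    slots_[used_] = obj;
    return &slots_[used_++];
  }
 private:
  Object* slots_[kNumSlots] = {};
  size_t used_ = 0;
};

// Simplified stand-in: a Handle re-reads its slot on every access instead of
// caching a raw pointer across a point where the object might move.
class Handle {
 public:
  explicit Handle(Object** slot) : slot_(slot) {}
  Object* Get() const { return *slot_; }
 private:
  Object** slot_;
};

int main() {
  Object dex_cache{42};
  StackHandleScope<1> hs;
  Handle handle(hs.NewHandleSlot(&dex_cache));
  // A call that can trigger GC would go here; the scope lets the collector
  // update the slot, and the handle still resolves to the current location.
  std::cout << handle.Get()->value << "\n";
  return 0;
}
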
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 5152075..c532e72 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -1228,7 +1228,8 @@
          instruction->IsLoadString() ||
          instruction->IsInstanceOf() ||
          instruction->IsCheckCast() ||
-         (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified()))
+         (instruction->IsInvokeVirtual() && instruction->GetLocations()->Intrinsified()) ||
+         (instruction->IsInvokeStaticOrDirect() && instruction->GetLocations()->Intrinsified()))
       << "instruction->DebugName()=" << instruction->DebugName()
       << " slow_path->GetDescription()=" << slow_path->GetDescription();
 }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 4c4128c..6d9c55c 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -429,7 +429,8 @@
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
            instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified())
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -441,6 +442,9 @@
     DCHECK_NE(reg, SP);
     DCHECK_NE(reg, LR);
     DCHECK_NE(reg, PC);
+    // IP is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary; it cannot be the entry point's input/output.
+    DCHECK_NE(reg, IP);
     DCHECK(0 <= reg && reg < kNumberOfCoreRegisters) << reg;
     // "Compact" slow path, saving two moves.
     //
@@ -5585,55 +5589,15 @@
       __ LoadLiteral(out, codegen_->DeduplicateBootImageAddressLiteral(address));
       return;  // No dex cache slow path.
     }
-    case HLoadString::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(load->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
-      // 16-bit LDR immediate has a 5-bit offset multiplied by the size and that gives
-      // a 128B range. To try and reduce the number of literals if we load multiple strings,
-      // simply split the dex cache address to a 128B aligned base loaded from a literal
-      // and the remaining offset embedded in the load.
-      static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(load->GetAddress(), 4u);
-      constexpr size_t offset_bits = /* encoded bits */ 5 + /* scale */ 2;
-      uint32_t base_address = address & ~MaxInt<uint32_t>(offset_bits);
-      uint32_t offset = address & MaxInt<uint32_t>(offset_bits);
-      __ LoadLiteral(out, codegen_->DeduplicateDexCacheAddressLiteral(base_address));
-      // /* GcRoot<mirror::String> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(load, out_loc, out, offset);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCachePcRelative: {
-      Register base_reg = locations->InAt(0).AsRegister<Register>();
-      HArmDexCacheArraysBase* base = load->InputAt(0)->AsArmDexCacheArraysBase();
-      int32_t offset = load->GetDexCacheElementOffset() - base->GetElementOffset();
-      // /* GcRoot<mirror::String> */ out = *(dex_cache_arrays_base + offset)
-      GenerateGcRootFieldLoad(load, out_loc, base_reg, offset);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCacheViaMethod: {
-      Register current_method = locations->InAt(0).AsRegister<Register>();
-
-      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-      GenerateGcRootFieldLoad(
-          load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-      __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-      // /* GcRoot<mirror::String> */ out = out[string_index]
-      GenerateGcRootFieldLoad(
-          load, out_loc, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-      break;
-    }
     default:
-      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
-      UNREACHABLE();
+      break;
   }
 
-  if (!load->IsInDexCache()) {
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
-    codegen_->AddSlowPath(slow_path);
-    __ CompareAndBranchIfZero(out, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do the string dex cache lookup.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM(load);
+  codegen_->AddSlowPath(slow_path);
+  __ b(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 static int32_t GetExceptionTlsOffset() {
@@ -6413,7 +6377,7 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
+      // Slow path marking the GC root `root`.
       SlowPathCode* slow_path =
           new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, root);
       codegen_->AddSlowPath(slow_path);
@@ -6522,7 +6486,8 @@
   // Introduce a dependency on the lock_word including the rb_state,
   // which shall prevent load-load reordering without using
   // a memory barrier (which would be more expensive).
-  // obj is unchanged by this operation, but its value now depends on temp_reg.
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp_reg`.
   __ add(obj, obj, ShifterOperand(temp_reg, LSR, 32));
 
   // The actual reference load.
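
The comment in the hunk above describes a fake address dependency: `lock_word >> 32` is always zero for a 32-bit value, so the add leaves `obj` unchanged, yet the subsequent reference load now depends on the lock word load and the hardware cannot reorder the two, which is what a (more expensive) memory barrier would otherwise guarantee. The fragment below only models the arithmetic; it is not a portable C++ synchronization technique, since a compiler is free to fold the zero and drop the dependency; the real ordering comes from the emitted ARM instructions.

#include <cstdint>
#include <iostream>

int main() {
  uint32_t lock_word = 0x10000001u;  // monitor word, including the rb_state bits
  uintptr_t obj = 0x1000;            // object address

  // Widening first keeps the shift well defined; the result is always zero,
  // so the address is unchanged.
  uintptr_t dependent_obj =
      obj + static_cast<uintptr_t>(static_cast<uint64_t>(lock_word) >> 32);

  // At the machine level, a reference load through `dependent_obj` now
  // depends on the lock word load, which orders the two loads without a dmb.
  std::cout << std::boolalpha << (dependent_obj == obj) << "\n";  // prints "true"
  return 0;
}
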
@@ -6553,7 +6518,7 @@
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
+  // Slow path marking the object `ref` when it is gray.
   SlowPathCode* slow_path =
       new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM(instruction, ref);
   AddSlowPath(slow_path);
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index d95e7df..cc8985d 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -595,7 +595,8 @@
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
            instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified())
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
@@ -607,7 +608,8 @@
     DCHECK_NE(obj_.reg(), LR);
     DCHECK_NE(obj_.reg(), WSP);
     DCHECK_NE(obj_.reg(), WZR);
-    // WIP0 is used by the slow path as a temp, it can not be the object register.
+    // IP0 is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary; it cannot be the entry point's input/output.
     DCHECK_NE(obj_.reg(), IP0);
     DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg();
     // "Compact" slow path, saving two moves.
@@ -4195,7 +4197,6 @@
 }
 
 void InstructionCodeGeneratorARM64::VisitLoadString(HLoadString* load) {
-  Location out_loc = load->GetLocations()->Out();
   Register out = OutputRegister(load);
 
   switch (load->GetLoadKind()) {
@@ -4231,63 +4232,15 @@
       __ Ldr(out.W(), codegen_->DeduplicateBootImageAddressLiteral(load->GetAddress()));
       return;  // No dex cache slow path.
     }
-    case HLoadString::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(load->GetAddress(), 0u);
-      // LDR immediate has a 12-bit offset multiplied by the size and for 32-bit loads
-      // that gives a 16KiB range. To try and reduce the number of literals if we load
-      // multiple strings, simply split the dex cache address to a 16KiB aligned base
-      // loaded from a literal and the remaining offset embedded in the load.
-      static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(load->GetAddress(), 4u);
-      constexpr size_t offset_bits = /* encoded bits */ 12 + /* scale */ 2;
-      uint64_t base_address = load->GetAddress() & ~MaxInt<uint64_t>(offset_bits);
-      uint32_t offset = load->GetAddress() & MaxInt<uint64_t>(offset_bits);
-      __ Ldr(out.X(), codegen_->DeduplicateDexCacheAddressLiteral(base_address));
-      // /* GcRoot<mirror::String> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(load, out_loc, out.X(), offset);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCachePcRelative: {
-      // Add ADRP with its PC-relative DexCache access patch.
-      const DexFile& dex_file = load->GetDexFile();
-      uint32_t element_offset = load->GetDexCacheElementOffset();
-      vixl::aarch64::Label* adrp_label =
-          codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset);
-      {
-        SingleEmissionCheckScope guard(GetVIXLAssembler());
-        __ Bind(adrp_label);
-        __ adrp(out.X(), /* offset placeholder */ 0);
-      }
-      // Add LDR with its PC-relative DexCache access patch.
-      vixl::aarch64::Label* ldr_label =
-          codegen_->NewPcRelativeDexCacheArrayPatch(dex_file, element_offset, adrp_label);
-      // /* GcRoot<mirror::String> */ out = *(base_address + offset)  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, out.X(), /* offset placeholder */ 0, ldr_label);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCacheViaMethod: {
-      Register current_method = InputRegisterAt(load, 0);
-      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-      GenerateGcRootFieldLoad(
-          load, out_loc, current_method, ArtMethod::DeclaringClassOffset().Int32Value());
-      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-      __ Ldr(out.X(), HeapOperand(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
-      // /* GcRoot<mirror::String> */ out = out[string_index]
-      GenerateGcRootFieldLoad(
-          load, out_loc, out.X(), CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-      break;
-    }
     default:
-      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
-      UNREACHABLE();
+      break;
   }
 
-  if (!load->IsInDexCache()) {
-    SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load);
-    codegen_->AddSlowPath(slow_path);
-    __ Cbz(out, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do the string dex cache lookup.
+  SlowPathCodeARM64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathARM64(load);
+  codegen_->AddSlowPath(slow_path);
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 void LocationsBuilderARM64::VisitLongConstant(HLongConstant* constant) {
@@ -5088,7 +5041,7 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
+      // Slow path marking the GC root `root`.
       SlowPathCodeARM64* slow_path =
           new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, root);
       codegen_->AddSlowPath(slow_path);
@@ -5239,7 +5192,8 @@
   // Introduce a dependency on the lock_word including rb_state,
   // to prevent load-load reordering, and without using
   // a memory barrier (which would be more expensive).
-  // obj is unchanged by this operation, but its value now depends on temp.
+  // `obj` is unchanged by this operation, but its value now depends
+  // on `temp`.
   __ Add(obj.X(), obj.X(), Operand(temp.X(), LSR, 32));
 
   // The actual reference load.
@@ -5285,7 +5239,7 @@
   // Object* ref = ref_addr->AsMirrorPtr()
   GetAssembler()->MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
+  // Slow path marking the object `ref` when it is gray.
   SlowPathCodeARM64* slow_path =
       new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathARM64(instruction, ref);
   AddSlowPath(slow_path);
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 58879bc..8a2f90d 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -1833,11 +1833,19 @@
   }
 }
 
+auto InstructionCodeGeneratorMIPS::GetImplicitNullChecker(HInstruction* instruction) {
+  auto null_checker = [this, instruction]() {
+    this->codegen_->MaybeRecordImplicitNullCheck(instruction);
+  };
+  return null_checker;
+}
+
 void InstructionCodeGeneratorMIPS::VisitArrayGet(HArrayGet* instruction) {
   LocationSummary* locations = instruction->GetLocations();
   Register obj = locations->InAt(0).AsRegister<Register>();
   Location index = locations->InAt(1);
   uint32_t data_offset = CodeGenerator::GetArrayDataOffset(instruction);
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   Primitive::Type type = instruction->GetType();
   switch (type) {
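
The GetImplicitNullChecker helper added above packages MaybeRecordImplicitNullCheck into a lambda that is handed to the assembler's load/store helpers, so the null check is recorded against the memory instruction that can actually fault. A minimal sketch of that callback shape follows; the Instruction, Codegen and LoadFromOffset names are hypothetical stand-ins, and the assumption that the helper runs the callback right after emitting the access is noted in the comments.

#include <iostream>

struct Instruction {};  // stands in for HInstruction

struct Codegen {
  void MaybeRecordImplicitNullCheck(Instruction* /* instruction */) {
    std::cout << "implicit null check recorded at the current PC\n";
  }
};

// Stand-in for an assembler helper such as LoadFromOffset(..., null_checker).
// Assumption: it invokes the callback immediately after emitting the (first)
// memory access, so the recorded PC is the one that can raise SIGSEGV.
template <typename ImplicitNullChecker>
void LoadFromOffset(int /* dst */, int /* base */, int /* offset */,
                    ImplicitNullChecker null_checker) {
  // ... emit the load instruction here ...
  null_checker();
}

int main() {
  Codegen codegen;
  Instruction load;
  auto null_checker = [&codegen, &load]() {
    codegen.MaybeRecordImplicitNullCheck(&load);
  };
  LoadFromOffset(/* dst= */ 1, /* base= */ 2, /* offset= */ 8, null_checker);
  return 0;
}
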
@@ -1846,10 +1854,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset);
+        __ LoadFromOffset(kLoadUnsignedByte, out, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadUnsignedByte, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1859,10 +1867,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ LoadFromOffset(kLoadSignedByte, out, obj, offset);
+        __ LoadFromOffset(kLoadSignedByte, out, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadSignedByte, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1872,11 +1880,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset);
+        __ LoadFromOffset(kLoadSignedHalfword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadSignedHalfword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1886,11 +1894,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset);
+        __ LoadFromOffset(kLoadUnsignedHalfword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadUnsignedHalfword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1902,11 +1910,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ LoadFromOffset(kLoadWord, out, obj, offset);
+        __ LoadFromOffset(kLoadWord, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadWord, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadWord, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1916,11 +1924,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ LoadFromOffset(kLoadDoubleword, out, obj, offset);
+        __ LoadFromOffset(kLoadDoubleword, out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset);
+        __ LoadFromOffset(kLoadDoubleword, out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1930,11 +1938,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ LoadSFromOffset(out, obj, offset);
+        __ LoadSFromOffset(out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ LoadSFromOffset(out, TMP, data_offset);
+        __ LoadSFromOffset(out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1944,11 +1952,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ LoadDFromOffset(out, obj, offset);
+        __ LoadDFromOffset(out, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ LoadDFromOffset(out, TMP, data_offset);
+        __ LoadDFromOffset(out, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -1957,7 +1965,6 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
-  codegen_->MaybeRecordImplicitNullCheck(instruction);
 }
 
 void LocationsBuilderMIPS::VisitArrayLength(HArrayLength* instruction) {
@@ -2004,6 +2011,7 @@
   bool needs_runtime_call = locations->WillCall();
   bool needs_write_barrier =
       CodeGenerator::StoreNeedsWriteBarrier(value_type, instruction->GetValue());
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (value_type) {
     case Primitive::kPrimBoolean:
@@ -2013,10 +2021,10 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_1) + data_offset;
-        __ StoreToOffset(kStoreByte, value, obj, offset);
+        __ StoreToOffset(kStoreByte, value, obj, offset, null_checker);
       } else {
         __ Addu(TMP, obj, index.AsRegister<Register>());
-        __ StoreToOffset(kStoreByte, value, TMP, data_offset);
+        __ StoreToOffset(kStoreByte, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2028,11 +2036,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_2) + data_offset;
-        __ StoreToOffset(kStoreHalfword, value, obj, offset);
+        __ StoreToOffset(kStoreHalfword, value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_2);
         __ Addu(TMP, obj, TMP);
-        __ StoreToOffset(kStoreHalfword, value, TMP, data_offset);
+        __ StoreToOffset(kStoreHalfword, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2045,14 +2053,13 @@
         if (index.IsConstant()) {
           size_t offset =
               (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-          __ StoreToOffset(kStoreWord, value, obj, offset);
+          __ StoreToOffset(kStoreWord, value, obj, offset, null_checker);
         } else {
           DCHECK(index.IsRegister()) << index;
           __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
           __ Addu(TMP, obj, TMP);
-          __ StoreToOffset(kStoreWord, value, TMP, data_offset);
+          __ StoreToOffset(kStoreWord, value, TMP, data_offset, null_checker);
         }
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
         if (needs_write_barrier) {
           DCHECK_EQ(value_type, Primitive::kPrimNot);
           codegen_->MarkGCCard(obj, value);
@@ -2075,11 +2082,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ StoreToOffset(kStoreDoubleword, value, obj, offset);
+        __ StoreToOffset(kStoreDoubleword, value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ StoreToOffset(kStoreDoubleword, value, TMP, data_offset);
+        __ StoreToOffset(kStoreDoubleword, value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2091,11 +2098,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset;
-        __ StoreSToOffset(value, obj, offset);
+        __ StoreSToOffset(value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_4);
         __ Addu(TMP, obj, TMP);
-        __ StoreSToOffset(value, TMP, data_offset);
+        __ StoreSToOffset(value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2107,11 +2114,11 @@
       if (index.IsConstant()) {
         size_t offset =
             (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_8) + data_offset;
-        __ StoreDToOffset(value, obj, offset);
+        __ StoreDToOffset(value, obj, offset, null_checker);
       } else {
         __ Sll(TMP, index.AsRegister<Register>(), TIMES_8);
         __ Addu(TMP, obj, TMP);
-        __ StoreDToOffset(value, TMP, data_offset);
+        __ StoreDToOffset(value, TMP, data_offset, null_checker);
       }
       break;
     }
@@ -2120,11 +2127,6 @@
       LOG(FATAL) << "Unreachable type " << instruction->GetType();
       UNREACHABLE();
   }
-
-  // Ints and objects are handled in the switch.
-  if (value_type != Primitive::kPrimInt && value_type != Primitive::kPrimNot) {
-    codegen_->MaybeRecordImplicitNullCheck(instruction);
-  }
 }
 
 void LocationsBuilderMIPS::VisitBoundsCheck(HBoundsCheck* instruction) {
@@ -3589,6 +3591,7 @@
   LoadOperandType load_type = kLoadUnsignedByte;
   bool is_volatile = field_info.IsVolatile();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -3654,34 +3657,20 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->Out().IsRegisterPair());
         dst = locations->Out().AsRegisterPairLow<Register>();
-        Register dst_high = locations->Out().AsRegisterPairHigh<Register>();
-        if (obj == dst) {
-          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ LoadFromOffset(kLoadWord, dst, obj, offset);
-        } else {
-          __ LoadFromOffset(kLoadWord, dst, obj, offset);
-          codegen_->MaybeRecordImplicitNullCheck(instruction);
-          __ LoadFromOffset(kLoadWord, dst_high, obj, offset + kMipsWordSize);
-        }
       } else {
         DCHECK(locations->Out().IsRegister());
         dst = locations->Out().AsRegister<Register>();
-        __ LoadFromOffset(load_type, dst, obj, offset);
       }
+      __ LoadFromOffset(load_type, dst, obj, offset, null_checker);
     } else {
       DCHECK(locations->Out().IsFpuRegister());
       FRegister dst = locations->Out().AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ LoadSFromOffset(dst, obj, offset);
+        __ LoadSFromOffset(dst, obj, offset, null_checker);
       } else {
-        __ LoadDFromOffset(dst, obj, offset);
+        __ LoadDFromOffset(dst, obj, offset, null_checker);
       }
     }
-    // Longs are handled earlier.
-    if (type != Primitive::kPrimLong) {
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-    }
   }
 
   if (is_volatile) {
@@ -3729,6 +3718,7 @@
   StoreOperandType store_type = kStoreByte;
   bool is_volatile = field_info.IsVolatile();
   uint32_t offset = field_info.GetFieldOffset().Uint32Value();
+  auto null_checker = GetImplicitNullChecker(instruction);
 
   switch (type) {
     case Primitive::kPrimBoolean:
@@ -3800,28 +3790,20 @@
       if (type == Primitive::kPrimLong) {
         DCHECK(locations->InAt(1).IsRegisterPair());
         src = locations->InAt(1).AsRegisterPairLow<Register>();
-        Register src_high = locations->InAt(1).AsRegisterPairHigh<Register>();
-        __ StoreToOffset(kStoreWord, src, obj, offset);
-        codegen_->MaybeRecordImplicitNullCheck(instruction);
-        __ StoreToOffset(kStoreWord, src_high, obj, offset + kMipsWordSize);
       } else {
         DCHECK(locations->InAt(1).IsRegister());
         src = locations->InAt(1).AsRegister<Register>();
-        __ StoreToOffset(store_type, src, obj, offset);
       }
+      __ StoreToOffset(store_type, src, obj, offset, null_checker);
     } else {
       DCHECK(locations->InAt(1).IsFpuRegister());
       FRegister src = locations->InAt(1).AsFpuRegister<FRegister>();
       if (type == Primitive::kPrimFloat) {
-        __ StoreSToOffset(src, obj, offset);
+        __ StoreSToOffset(src, obj, offset, null_checker);
       } else {
-        __ StoreDToOffset(src, obj, offset);
+        __ StoreDToOffset(src, obj, offset, null_checker);
       }
     }
-    // Longs are handled earlier.
-    if (type != Primitive::kPrimLong) {
-      codegen_->MaybeRecordImplicitNullCheck(instruction);
-    }
   }
 
   // TODO: memory barriers?
@@ -4580,11 +4562,6 @@
     case HLoadString::LoadKind::kBootImageLinkTimePcRelative:
       base_or_current_method_reg = isR6 ? ZERO : locations->InAt(0).AsRegister<Register>();
       break;
-    // We need an extra register for PC-relative dex cache accesses.
-    case HLoadString::LoadKind::kDexCachePcRelative:
-    case HLoadString::LoadKind::kDexCacheViaMethod:
-      base_or_current_method_reg = locations->InAt(0).AsRegister<Register>();
-      break;
     default:
       base_or_current_method_reg = ZERO;
       break;
@@ -4628,52 +4605,15 @@
                      codegen_->DeduplicateBootImageAddressLiteral(address));
       return;  // No dex cache slow path.
     }
-    case HLoadString::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(load->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
-      static_assert(sizeof(GcRoot<mirror::String>) == 4u, "Expected GC root to be 4 bytes.");
-      DCHECK_ALIGNED(load->GetAddress(), 4u);
-      int16_t offset = Low16Bits(address);
-      uint32_t base_address = address - offset;  // This accounts for offset sign extension.
-      __ Lui(out, High16Bits(base_address));
-      // /* GcRoot<mirror::String> */ out = *(base_address + offset)
-      GenerateGcRootFieldLoad(load, out_loc, out, offset);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCachePcRelative: {
-      HMipsDexCacheArraysBase* base = load->InputAt(0)->AsMipsDexCacheArraysBase();
-      int32_t offset =
-          load->GetDexCacheElementOffset() - base->GetElementOffset() - kDexCacheArrayLwOffset;
-      // /* GcRoot<mirror::String> */ out = *(dex_cache_arrays_base + offset)
-      GenerateGcRootFieldLoad(load, out_loc, base_or_current_method_reg, offset);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCacheViaMethod: {
-      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-      GenerateGcRootFieldLoad(load,
-                              out_loc,
-                              base_or_current_method_reg,
-                              ArtMethod::DeclaringClassOffset().Int32Value());
-      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-      __ LoadFromOffset(kLoadWord, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-      // /* GcRoot<mirror::String> */ out = out[string_index]
-      GenerateGcRootFieldLoad(load,
-                              out_loc,
-                              out,
-                              CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-      break;
-    }
     default:
-      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
-      UNREACHABLE();
+      break;
   }
 
-  if (!load->IsInDexCache()) {
-    SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load);
-    codegen_->AddSlowPath(slow_path);
-    __ Beqz(out, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do the string dex cache lookup.
+  SlowPathCodeMIPS* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS(load);
+  codegen_->AddSlowPath(slow_path);
+  __ B(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 void LocationsBuilderMIPS::VisitLongConstant(HLongConstant* constant) {
diff --git a/compiler/optimizing/code_generator_mips.h b/compiler/optimizing/code_generator_mips.h
index 63a0345..46810d6 100644
--- a/compiler/optimizing/code_generator_mips.h
+++ b/compiler/optimizing/code_generator_mips.h
@@ -257,6 +257,7 @@
   void GenerateDivRemWithAnyConstant(HBinaryOperation* instruction);
   void GenerateDivRemIntegral(HBinaryOperation* instruction);
   void HandleGoto(HInstruction* got, HBasicBlock* successor);
+  auto GetImplicitNullChecker(HInstruction* instruction);
 
   MipsAssembler* const assembler_;
   CodeGeneratorMIPS* const codegen_;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 4e7a272..4a5755c 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -3261,22 +3261,11 @@
 }
 
 void InstructionCodeGeneratorMIPS64::VisitLoadString(HLoadString* load) {
-  LocationSummary* locations = load->GetLocations();
-  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
-  GpuRegister current_method = locations->InAt(0).AsRegister<GpuRegister>();
-  __ LoadFromOffset(kLoadUnsignedWord, out, current_method,
-                    ArtMethod::DeclaringClassOffset().Int32Value());
-  __ LoadFromOffset(kLoadDoubleword, out, out, mirror::Class::DexCacheStringsOffset().Int32Value());
-  __ LoadFromOffset(
-      kLoadUnsignedWord, out, out, CodeGenerator::GetCacheOffset(load->GetStringIndex()));
-  // TODO: We will need a read barrier here.
-
-  if (!load->IsInDexCache()) {
-    SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load);
-    codegen_->AddSlowPath(slow_path);
-    __ Beqzc(out, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do the string dex cache lookup.
+  SlowPathCodeMIPS64* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathMIPS64(load);
+  codegen_->AddSlowPath(slow_path);
+  __ Bc(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 void LocationsBuilderMIPS64::VisitLongConstant(HLongConstant* constant) {
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 7a561bb..f50eb5c 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -445,8 +445,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj)
-      : SlowPathCode(instruction), obj_(obj) {
+  ReadBarrierMarkSlowPathX86(HInstruction* instruction, Location obj, bool unpoison)
+      : SlowPathCode(instruction), obj_(obj), unpoison_(unpoison) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -464,11 +464,16 @@
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
            instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified())
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    if (unpoison_) {
+      // Object* ref = ref_addr->AsMirrorPtr()
+      __ MaybeUnpoisonHeapReference(reg);
+    }
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -498,6 +503,7 @@
 
  private:
   const Location obj_;
+  const bool unpoison_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86);
 };
@@ -1578,15 +1584,15 @@
   locations->SetOut(Location::SameAsFirstInput());
 }
 
-void InstructionCodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) {
+void CodeGeneratorX86::GenerateIntCompare(Location lhs, Location rhs) {
   Register lhs_reg = lhs.AsRegister<Register>();
   if (rhs.IsConstant()) {
     int32_t value = CodeGenerator::GetInt32ValueOf(rhs.GetConstant());
-    codegen_->Compare32BitValue(lhs_reg, value);
+    Compare32BitValue(lhs_reg, value);
   } else if (rhs.IsStackSlot()) {
-    __ cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex()));
+    assembler_.cmpl(lhs_reg, Address(ESP, rhs.GetStackIndex()));
   } else {
-    __ cmpl(lhs_reg, rhs.AsRegister<Register>());
+    assembler_.cmpl(lhs_reg, rhs.AsRegister<Register>());
   }
 }
 
@@ -1619,7 +1625,7 @@
         DCHECK_NE(condition->InputAt(0)->GetType(), Primitive::kPrimLong);
         DCHECK(!Primitive::IsFloatingPointType(condition->InputAt(0)->GetType()));
         LocationSummary* cond_locations = condition->GetLocations();
-        GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1));
+        codegen_->GenerateIntCompare(cond_locations->InAt(0), cond_locations->InAt(1));
         cond = X86Condition(condition->GetCondition());
       }
     } else {
@@ -1728,7 +1734,7 @@
 
       // Clear output register: setb only sets the low byte.
       __ xorl(reg, reg);
-      GenerateIntCompare(lhs, rhs);
+      codegen_->GenerateIntCompare(lhs, rhs);
       __ setb(X86Condition(cond->GetCondition()), reg);
       return;
     }
@@ -4210,7 +4216,7 @@
     case Primitive::kPrimShort:
     case Primitive::kPrimChar:
     case Primitive::kPrimInt: {
-      GenerateIntCompare(left, right);
+      codegen_->GenerateIntCompare(left, right);
       break;
     }
     case Primitive::kPrimLong: {
@@ -4630,10 +4636,6 @@
     // load the temp into the XMM and then copy the XMM into the
     // output, 32 bits at a time).
     locations->AddTemp(Location::RequiresFpuRegister());
-  } else if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
   }
 }
 
@@ -4677,11 +4679,10 @@
     case Primitive::kPrimNot: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+            instruction, out, base, offset, /* needs_null_check */ true);
         if (is_volatile) {
           codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
         }
@@ -5092,11 +5093,6 @@
             Location::kOutputOverlap :
             Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier.
-  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86::VisitArrayGet(HArrayGet* instruction) {
@@ -5171,11 +5167,10 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier call.
         codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+            instruction, out_loc, obj, data_offset, index, /* needs_null_check */ true);
       } else {
         Register out = out_loc.AsRegister<Register>();
         if (index.IsConstant()) {
@@ -6230,48 +6225,15 @@
       codegen_->RecordSimplePatch();
       return;  // No dex cache slow path.
     }
-    case HLoadString::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(load->GetAddress(), 0u);
-      uint32_t address = dchecked_integral_cast<uint32_t>(load->GetAddress());
-      // /* GcRoot<mirror::String> */ out = *address
-      GenerateGcRootFieldLoad(load, out_loc, Address::Absolute(address));
-      break;
-    }
-    case HLoadString::LoadKind::kDexCachePcRelative: {
-      Register base_reg = locations->InAt(0).AsRegister<Register>();
-      uint32_t offset = load->GetDexCacheElementOffset();
-      Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset);
-      // /* GcRoot<mirror::String> */ out = *(base + offset)  /* PC-relative */
-      GenerateGcRootFieldLoad(
-          load, out_loc, Address(base_reg, CodeGeneratorX86::kDummy32BitOffset), fixup_label);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCacheViaMethod: {
-      Register current_method = locations->InAt(0).AsRegister<Register>();
-
-      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-      GenerateGcRootFieldLoad(
-          load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
-
-      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-      __ movl(out, Address(out, mirror::Class::DexCacheStringsOffset().Int32Value()));
-      // /* GcRoot<mirror::String> */ out = out[string_index]
-      GenerateGcRootFieldLoad(
-          load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
-      break;
-    }
     default:
-      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
-      UNREACHABLE();
+      break;
   }
 
-  if (!load->IsInDexCache()) {
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load);
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do the string dex cache lookup.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86(load);
+  codegen_->AddSlowPath(slow_path);
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
 
 static Address GetExceptionTlsAddress() {
@@ -6313,8 +6275,8 @@
 
 static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
   return kEmitCompilerReadBarrier &&
-      (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+      !kUseBakerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
        type_check_kind == TypeCheckKind::kArrayObjectCheck);
 }
@@ -6375,7 +6337,7 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -6597,7 +6559,7 @@
   }
 
   // /* HeapReference<Class> */ temp = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck:
@@ -6633,8 +6595,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&compare_classes);
@@ -6673,8 +6634,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
@@ -6706,8 +6666,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&check_non_primitive_component_type);
@@ -6715,8 +6674,7 @@
       __ j(kEqual, &done);
       // Same comment as above regarding `temp` and the slow path.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       break;
     }
@@ -6907,17 +6865,17 @@
                                                                    Location maybe_temp) {
   Register out_reg = out.AsRegister<Register>();
   if (kEmitCompilerReadBarrier) {
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, out_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, out_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // Save the value of `out` into `maybe_temp` before overwriting it
       // in the following move operation, as we will need it for the
       // read barrier below.
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       __ movl(maybe_temp.AsRegister<Register>(), out_reg);
       // /* HeapReference<Object> */ out = *(out + offset)
       __ movl(out_reg, Address(out_reg, offset));
@@ -6934,17 +6892,15 @@
 void InstructionCodeGeneratorX86::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                                                     Location out,
                                                                     Location obj,
-                                                                    uint32_t offset,
-                                                                    Location maybe_temp) {
+                                                                    uint32_t offset) {
   Register out_reg = out.AsRegister<Register>();
   Register obj_reg = obj.AsRegister<Register>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, obj_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, obj_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6987,9 +6943,9 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
-      SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, root);
+      // Slow path marking the GC root `root`.
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(
+          instruction, root, /* unpoison */ false);
       codegen_->AddSlowPath(slow_path);
 
       __ fs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86PointerSize>().Int32Value()),
@@ -7023,14 +6979,13 @@
                                                              Location ref,
                                                              Register obj,
                                                              uint32_t offset,
-                                                             Location temp,
                                                              bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Address src(obj, offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -7038,7 +6993,6 @@
                                                              Register obj,
                                                              uint32_t data_offset,
                                                              Location index,
-                                                             Location temp,
                                                              bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -7051,14 +7005,13 @@
   Address src = index.IsConstant() ?
       Address(obj, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset) :
       Address(obj, index.AsRegister<Register>(), TIMES_4, data_offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                  Location ref,
                                                                  Register obj,
                                                                  const Address& src,
-                                                                 Location temp,
                                                                  bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -7088,17 +7041,23 @@
   //   performance reasons.
 
   Register ref_reg = ref.AsRegister<Register>();
-  Register temp_reg = temp.AsRegister<Register>();
   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  __ movl(temp_reg, Address(obj, monitor_offset));
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+  constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+  constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+  // if (rb_state == ReadBarrier::gray_ptr_)
+  //   ref = ReadBarrier::Mark(ref);
+  // At this point, just do the "if" and make sure that flags are preserved until the branch.
+  __ testb(Address(obj, monitor_offset + gray_byte_position), Immediate(test_value));
   if (needs_null_check) {
     MaybeRecordImplicitNullCheck(instruction);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
 
   // Load fence to prevent load-load reordering.
   // Note that this is a no-op, thanks to the x86 memory model.
@@ -7106,25 +7065,20 @@
 
   // The actual reference load.
   // /* HeapReference<Object> */ ref = *src
-  __ movl(ref_reg, src);
+  __ movl(ref_reg, src);  // Flags are unaffected.
+
+  // Note: Reference unpoisoning modifies the flags, so we need to delay it until after the branch.
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(
+      instruction, ref, /* unpoison */ true);
+  AddSlowPath(slow_path);
+
+  // We have done the "if" of the gray bit check above; now branch based on the flags.
+  __ j(kNotZero, slow_path->GetEntryLabel());
 
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
-  SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86(instruction, ref);
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::gray_ptr_)
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with SHR.
-  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-  __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1));
-  __ j(kCarrySet, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
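The hunk above replaces the SHR-based gray check with a single byte-wide `testb` against the lock word, so no temporary register is needed and EFLAGS carry the result up to the branch. A minimal, self-contained sketch of how the byte/bit operands are derived, assuming for illustration that LockWord::kReadBarrierStateShift is 28 (the real constant comes from art::LockWord, not from this patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      constexpr uint32_t kReadBarrierStateShift = 28;  // assumed example value
      constexpr uint32_t kBitsPerByte = 8;
      constexpr uint32_t gray_byte_position = kReadBarrierStateShift / kBitsPerByte;  // 3
      constexpr uint32_t gray_bit_position = kReadBarrierStateShift % kBitsPerByte;   // 4
      constexpr int32_t test_value = static_cast<int8_t>(1u << gray_bit_position);    // 0x10
      // testb [obj + monitor_offset + gray_byte_position], test_value
      // leaves ZF clear exactly when the low rb_state bit is set, i.e. when the
      // object is gray, which is what the j(kNotZero, ...) branch above keys on.
      std::printf("byte %u, bit %u, imm 0x%x\n",
                  gray_byte_position, gray_bit_position, static_cast<unsigned>(test_value));
      return 0;
    }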
 
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index f306b33..c644e40 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -254,8 +254,7 @@
   void GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                          Location out,
                                          Location obj,
-                                         uint32_t offset,
-                                         Location maybe_temp);
+                                         uint32_t offset);
   // Generate a GC root reference load:
   //
   //   root <- *address
@@ -295,7 +294,6 @@
                                    HBasicBlock* default_block);
 
   void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double);
-  void GenerateIntCompare(Location lhs, Location rhs);
 
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
@@ -431,6 +429,8 @@
                   Register value,
                   bool value_can_be_null);
 
+  void GenerateIntCompare(Location lhs, Location rhs);
+
   void GenerateMemoryBarrier(MemBarrierKind kind);
 
   Label* GetLabelOf(HBasicBlock* block) const {
@@ -486,7 +486,6 @@
                                              Location ref,
                                              Register obj,
                                              uint32_t offset,
-                                             Location temp,
                                              bool needs_null_check);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference array load when Baker's read barriers are used.
@@ -495,7 +494,6 @@
                                              Register obj,
                                              uint32_t data_offset,
                                              Location index,
-                                             Location temp,
                                              bool needs_null_check);
   // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -503,7 +501,6 @@
                                                  Location ref,
                                                  Register obj,
                                                  const Address& src,
-                                                 Location temp,
                                                  bool needs_null_check);
 
   // Generate a read barrier for a heap reference within `instruction`
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index cf01a79..ec37e5d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -466,8 +466,8 @@
 // Slow path marking an object during a read barrier.
 class ReadBarrierMarkSlowPathX86_64 : public SlowPathCode {
  public:
-  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj)
-      : SlowPathCode(instruction), obj_(obj) {
+  ReadBarrierMarkSlowPathX86_64(HInstruction* instruction, Location obj, bool unpoison)
+      : SlowPathCode(instruction), obj_(obj), unpoison_(unpoison) {
     DCHECK(kEmitCompilerReadBarrier);
   }
 
@@ -485,11 +485,16 @@
            instruction_->IsLoadString() ||
            instruction_->IsInstanceOf() ||
            instruction_->IsCheckCast() ||
-           (instruction_->IsInvokeVirtual()) && instruction_->GetLocations()->Intrinsified())
+           (instruction_->IsInvokeVirtual() && instruction_->GetLocations()->Intrinsified()) ||
+           (instruction_->IsInvokeStaticOrDirect() && instruction_->GetLocations()->Intrinsified()))
         << "Unexpected instruction in read barrier marking slow path: "
         << instruction_->DebugName();
 
     __ Bind(GetEntryLabel());
+    if (unpoison_) {
+      // Object* ref = ref_addr->AsMirrorPtr()
+      __ MaybeUnpoisonHeapReference(obj_.AsRegister<CpuRegister>());
+    }
     // No need to save live registers; it's taken care of by the
     // entrypoint. Also, there is no need to update the stack mask,
     // as this runtime call will not trigger a garbage collection.
@@ -519,6 +524,7 @@
 
  private:
   const Location obj_;
+  const bool unpoison_;
 
   DISALLOW_COPY_AND_ASSIGN(ReadBarrierMarkSlowPathX86_64);
 };
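The extra `unpoison` constructor argument exists because, in the new fast path, the flags set by `testb` must survive until the conditional jump; `MaybeUnpoisonHeapReference` clobbers them, so unpoisoning can only happen after the branch, and a reference that takes the branch reaches this slow path still poisoned. A toy model of that ordering, assuming negation-style poisoning (`Unpoison` and `Mark` are illustrative stand-ins, not ART entry points):

    #include <cstdint>

    static uint32_t Unpoison(uint32_t ref) { return ~ref + 1u; }  // assumed negation-style poisoning
    static uint32_t Mark(uint32_t ref) { return ref; }            // stands in for the runtime mark entry point

    uint32_t LoadWithBakerBarrier(uint32_t poisoned_ref, bool is_gray) {
      // The gray test (testb) has already set the flags; the raw reference load
      // does not touch them, and the branch must come before any unpoisoning.
      if (is_gray) {
        // Slow path: the reference is still poisoned here, which is why
        // ReadBarrierMarkSlowPathX86_64 unpoisons first when unpoison_ is true.
        return Mark(Unpoison(poisoned_ref));
      }
      // Fast path continuation: unpoison only after the branch.
      return Unpoison(poisoned_ref);
    }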
@@ -4151,11 +4157,6 @@
         Location::RequiresRegister(),
         object_field_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  if (object_field_get_with_read_barrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in CodeGeneratorX86_64::GenerateFieldLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86_64::HandleFieldGet(HInstruction* instruction,
@@ -4199,11 +4200,10 @@
     case Primitive::kPrimNot: {
       // /* HeapReference<Object> */ out = *(base + offset)
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp_loc = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateFieldLoadWithBakerReadBarrier call.
         codegen_->GenerateFieldLoadWithBakerReadBarrier(
-            instruction, out, base, offset, temp_loc, /* needs_null_check */ true);
+            instruction, out, base, offset, /* needs_null_check */ true);
         if (is_volatile) {
           codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
         }
@@ -4587,11 +4587,6 @@
         Location::RequiresRegister(),
         object_array_get_with_read_barrier ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  // We need a temporary register for the read barrier marking slow
-  // path in CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier.
-  if (object_array_get_with_read_barrier && kUseBakerReadBarrier) {
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void InstructionCodeGeneratorX86_64::VisitArrayGet(HArrayGet* instruction) {
@@ -4666,11 +4661,10 @@
       // /* HeapReference<Object> */ out =
       //     *(obj + data_offset + index * sizeof(HeapReference<Object>))
       if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-        Location temp = locations->GetTemp(0);
         // Note that a potential implicit null check is handled in this
         // CodeGeneratorX86::GenerateArrayLoadWithBakerReadBarrier call.
         codegen_->GenerateArrayLoadWithBakerReadBarrier(
-            instruction, out_loc, obj, data_offset, index, temp, /* needs_null_check */ true);
+            instruction, out_loc, obj, data_offset, index, /* needs_null_check */ true);
       } else {
         CpuRegister out = out_loc.AsRegister<CpuRegister>();
         if (index.IsConstant()) {
@@ -5635,53 +5629,15 @@
       codegen_->RecordSimplePatch();
       return;  // No dex cache slow path.
     }
-    case HLoadString::LoadKind::kDexCacheAddress: {
-      DCHECK_NE(load->GetAddress(), 0u);
-      // /* GcRoot<mirror::String> */ out = *address
-      if (IsUint<32>(load->GetAddress())) {
-        Address address = Address::Absolute(load->GetAddress(), /* no_rip */ true);
-        GenerateGcRootFieldLoad(load, out_loc, address);
-      } else {
-        // TODO: Consider using opcode A1, i.e. movl eax, moff32 (with 64-bit address).
-        __ movq(out, Immediate(load->GetAddress()));
-        GenerateGcRootFieldLoad(load, out_loc, Address(out, 0));
-      }
-      break;
-    }
-    case HLoadString::LoadKind::kDexCachePcRelative: {
-      uint32_t offset = load->GetDexCacheElementOffset();
-      Label* fixup_label = codegen_->NewPcRelativeDexCacheArrayPatch(load->GetDexFile(), offset);
-      Address address = Address::Absolute(CodeGeneratorX86_64::kDummy32BitOffset,
-                                          /* no_rip */ false);
-      // /* GcRoot<mirror::String> */ out = *address  /* PC-relative */
-      GenerateGcRootFieldLoad(load, out_loc, address, fixup_label);
-      break;
-    }
-    case HLoadString::LoadKind::kDexCacheViaMethod: {
-      CpuRegister current_method = locations->InAt(0).AsRegister<CpuRegister>();
-
-      // /* GcRoot<mirror::Class> */ out = current_method->declaring_class_
-      GenerateGcRootFieldLoad(
-          load, out_loc, Address(current_method, ArtMethod::DeclaringClassOffset().Int32Value()));
-      // /* GcRoot<mirror::String>[] */ out = out->dex_cache_strings_
-      __ movq(out, Address(out, mirror::Class::DexCacheStringsOffset().Uint32Value()));
-      // /* GcRoot<mirror::String> */ out = out[string_index]
-      GenerateGcRootFieldLoad(
-          load, out_loc, Address(out, CodeGenerator::GetCacheOffset(load->GetStringIndex())));
-      break;
-    }
     default:
-      LOG(FATAL) << "Unexpected load kind: " << load->GetLoadKind();
-      UNREACHABLE();
+      break;
   }
 
-  if (!load->IsInDexCache()) {
-    SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load);
-    codegen_->AddSlowPath(slow_path);
-    __ testl(out, out);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    __ Bind(slow_path->GetExitLabel());
-  }
+  // TODO: Re-add the compiler code to do string dex cache lookup again.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) LoadStringSlowPathX86_64(load);
+  codegen_->AddSlowPath(slow_path);
+  __ jmp(slow_path->GetEntryLabel());
+  __ Bind(slow_path->GetExitLabel());
 }
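After this hunk, only the load kinds handled earlier in the switch return inline; every remaining kind breaks out of the switch and takes an unconditional jump into LoadStringSlowPathX86_64, so the string is always resolved through the runtime for now (see the TODO above). A schematic, self-contained model of that shape; the enum values and ResolveStringAtRuntime below are placeholders, not ART names:

    #include <string>

    enum class LoadKind { kBootImageAddress, kDexCacheViaMethod, kOther };

    static std::string ResolveStringAtRuntime() { return "resolved in slow path"; }

    std::string VisitLoadString(LoadKind kind) {
      switch (kind) {
        case LoadKind::kBootImageAddress:
          return "handled inline";       // kinds still handled above return early
        default:
          break;                         // everything else falls through ...
      }
      return ResolveStringAtRuntime();   // ... to the unconditional slow-path jump
    }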
 
 static Address GetExceptionTlsAddress() {
@@ -5724,8 +5680,8 @@
 
 static bool TypeCheckNeedsATemporary(TypeCheckKind type_check_kind) {
   return kEmitCompilerReadBarrier &&
-      (kUseBakerReadBarrier ||
-       type_check_kind == TypeCheckKind::kAbstractClassCheck ||
+      !kUseBakerReadBarrier &&
+      (type_check_kind == TypeCheckKind::kAbstractClassCheck ||
        type_check_kind == TypeCheckKind::kClassHierarchyCheck ||
        type_check_kind == TypeCheckKind::kArrayObjectCheck);
 }
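The restructured condition means that with Baker read barriers the type checks no longer request an extra temporary at all; only the slow-path (non-Baker) read-barrier configuration still does, and only for the three listed check kinds. A small self-contained check of that change, where `listed_kind` stands for any of those three kinds:

    constexpr bool OldNeedsTemp(bool emit_rb, bool baker, bool listed_kind) {
      return emit_rb && (baker || listed_kind);
    }
    constexpr bool NewNeedsTemp(bool emit_rb, bool baker, bool listed_kind) {
      return emit_rb && !baker && listed_kind;
    }

    // With Baker read barriers the temp is never requested any more ...
    static_assert(OldNeedsTemp(true, true, false) && !NewNeedsTemp(true, true, true), "Baker: temp dropped");
    // ... while the slow-path read barrier still needs it for the listed kinds.
    static_assert(NewNeedsTemp(true, false, true) == OldNeedsTemp(true, false, true), "non-Baker unchanged");
    // And without any read barrier nothing changes either.
    static_assert(!NewNeedsTemp(false, true, true) && !OldNeedsTemp(false, true, true), "no read barrier");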
@@ -5786,7 +5742,7 @@
   }
 
   // /* HeapReference<Class> */ out = obj->klass_
-  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset, maybe_temp_loc);
+  GenerateReferenceLoadTwoRegisters(instruction, out_loc, obj_loc, class_offset);
 
   switch (type_check_kind) {
     case TypeCheckKind::kExactCheck: {
@@ -6016,8 +5972,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       if (cls.IsRegister()) {
         __ cmpl(temp, cls.AsRegister<CpuRegister>());
@@ -6041,8 +5996,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // If the class is abstract, we eagerly fetch the super class of the
       // object to avoid doing a comparison we know will fail.
@@ -6062,8 +6016,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&compare_classes);
@@ -6087,8 +6040,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // Walk over the class hierarchy to find a match.
       NearLabel loop;
@@ -6114,8 +6066,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       __ Bind(&done);
       break;
@@ -6134,8 +6085,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // Do an exact check.
       NearLabel check_non_primitive_component_type;
@@ -6163,8 +6113,7 @@
       // going into the slow path, as it has been overwritten in the
       // meantime.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
 
       __ Bind(&check_non_primitive_component_type);
@@ -6172,8 +6121,7 @@
       __ j(kEqual, &done);
       // Same comment as above regarding `temp` and the slow path.
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
       __ jmp(type_check_slow_path->GetEntryLabel());
       __ Bind(&done);
       break;
@@ -6189,8 +6137,7 @@
       }
 
       // /* HeapReference<Class> */ temp = obj->klass_
-      GenerateReferenceLoadTwoRegisters(
-          instruction, temp_loc, obj_loc, class_offset, maybe_temp2_loc);
+      GenerateReferenceLoadTwoRegisters(instruction, temp_loc, obj_loc, class_offset);
 
       // We always go into the type check slow path for the unresolved
       // and interface check cases.
@@ -6358,17 +6305,17 @@
                                                                       Location maybe_temp) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   if (kEmitCompilerReadBarrier) {
-    DCHECK(maybe_temp.IsRegister()) << maybe_temp;
     if (kUseBakerReadBarrier) {
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(out + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, out_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, out_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // Save the value of `out` into `maybe_temp` before overwriting it
       // in the following move operation, as we will need it for the
       // read barrier below.
+      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       __ movl(maybe_temp.AsRegister<CpuRegister>(), out_reg);
       // /* HeapReference<Object> */ out = *(out + offset)
       __ movl(out_reg, Address(out_reg, offset));
@@ -6385,17 +6332,15 @@
 void InstructionCodeGeneratorX86_64::GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                                                        Location out,
                                                                        Location obj,
-                                                                       uint32_t offset,
-                                                                       Location maybe_temp) {
+                                                                       uint32_t offset) {
   CpuRegister out_reg = out.AsRegister<CpuRegister>();
   CpuRegister obj_reg = obj.AsRegister<CpuRegister>();
   if (kEmitCompilerReadBarrier) {
     if (kUseBakerReadBarrier) {
-      DCHECK(maybe_temp.IsRegister()) << maybe_temp;
       // Load with fast path based Baker's read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
       codegen_->GenerateFieldLoadWithBakerReadBarrier(
-          instruction, out, obj_reg, offset, maybe_temp, /* needs_null_check */ false);
+          instruction, out, obj_reg, offset, /* needs_null_check */ false);
     } else {
       // Load with slow path based read barrier.
       // /* HeapReference<Object> */ out = *(obj + offset)
@@ -6438,9 +6383,9 @@
                     "art::mirror::CompressedReference<mirror::Object> and int32_t "
                     "have different sizes.");
 
-      // Slow path used to mark the GC root `root`.
-      SlowPathCode* slow_path =
-          new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, root);
+      // Slow path marking the GC root `root`.
+      SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(
+          instruction, root, /* unpoison */ false);
       codegen_->AddSlowPath(slow_path);
 
       __ gs()->cmpl(Address::Absolute(Thread::IsGcMarkingOffset<kX86_64PointerSize>().Int32Value(),
@@ -6475,14 +6420,13 @@
                                                                 Location ref,
                                                                 CpuRegister obj,
                                                                 uint32_t offset,
-                                                                Location temp,
                                                                 bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
 
   // /* HeapReference<Object> */ ref = *(obj + offset)
   Address src(obj, offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86_64::GenerateArrayLoadWithBakerReadBarrier(HInstruction* instruction,
@@ -6490,7 +6434,6 @@
                                                                 CpuRegister obj,
                                                                 uint32_t data_offset,
                                                                 Location index,
-                                                                Location temp,
                                                                 bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -6503,14 +6446,13 @@
   Address src = index.IsConstant() ?
       Address(obj, (index.GetConstant()->AsIntConstant()->GetValue() << TIMES_4) + data_offset) :
       Address(obj, index.AsRegister<CpuRegister>(), TIMES_4, data_offset);
-  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, temp, needs_null_check);
+  GenerateReferenceLoadWithBakerReadBarrier(instruction, ref, obj, src, needs_null_check);
 }
 
 void CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier(HInstruction* instruction,
                                                                     Location ref,
                                                                     CpuRegister obj,
                                                                     const Address& src,
-                                                                    Location temp,
                                                                     bool needs_null_check) {
   DCHECK(kEmitCompilerReadBarrier);
   DCHECK(kUseBakerReadBarrier);
@@ -6540,17 +6482,23 @@
   //   performance reasons.
 
   CpuRegister ref_reg = ref.AsRegister<CpuRegister>();
-  CpuRegister temp_reg = temp.AsRegister<CpuRegister>();
   uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
-  // /* int32_t */ monitor = obj->monitor_
-  __ movl(temp_reg, Address(obj, monitor_offset));
+  // Given the numeric representation, it's enough to check the low bit of the rb_state.
+  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+  constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+  constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+  constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+  // if (rb_state == ReadBarrier::gray_ptr_)
+  //   ref = ReadBarrier::Mark(ref);
+  // At this point, just do the "if" and make sure that flags are preserved until the branch.
+  __ testb(Address(obj, monitor_offset + gray_byte_position), Immediate(test_value));
   if (needs_null_check) {
     MaybeRecordImplicitNullCheck(instruction);
   }
-  // /* LockWord */ lock_word = LockWord(monitor)
-  static_assert(sizeof(LockWord) == sizeof(int32_t),
-                "art::LockWord and int32_t have different sizes.");
 
   // Load fence to prevent load-load reordering.
   // Note that this is a no-op, thanks to the x86-64 memory model.
@@ -6558,25 +6506,20 @@
 
   // The actual reference load.
   // /* HeapReference<Object> */ ref = *src
-  __ movl(ref_reg, src);
+  __ movl(ref_reg, src);  // Flags are unaffected.
+
+  // Note: Reference unpoisoning modifies the flags, so we need to delay it until after the branch.
+  // Slow path marking the object `ref` when it is gray.
+  SlowPathCode* slow_path = new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(
+      instruction, ref, /* unpoison */ true);
+  AddSlowPath(slow_path);
+
+  // We have done the "if" of the gray bit check above; now branch based on the flags.
+  __ j(kNotZero, slow_path->GetEntryLabel());
 
   // Object* ref = ref_addr->AsMirrorPtr()
   __ MaybeUnpoisonHeapReference(ref_reg);
 
-  // Slow path used to mark the object `ref` when it is gray.
-  SlowPathCode* slow_path =
-      new (GetGraph()->GetArena()) ReadBarrierMarkSlowPathX86_64(instruction, ref);
-  AddSlowPath(slow_path);
-
-  // if (rb_state == ReadBarrier::gray_ptr_)
-  //   ref = ReadBarrier::Mark(ref);
-  // Given the numeric representation, it's enough to check the low bit of the
-  // rb_state. We do that by shifting the bit out of the lock word with SHR.
-  static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
-  static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
-  static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
-  __ shrl(temp_reg, Immediate(LockWord::kReadBarrierStateShift + 1));
-  __ j(kCarrySet, slow_path->GetEntryLabel());
   __ Bind(slow_path->GetExitLabel());
 }
 
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 4e0e34c..44844ac 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -248,8 +248,7 @@
   void GenerateReferenceLoadTwoRegisters(HInstruction* instruction,
                                          Location out,
                                          Location obj,
-                                         uint32_t offset,
-                                         Location maybe_temp);
+                                         uint32_t offset);
   // Generate a GC root reference load:
   //
   //   root <- *address
@@ -427,7 +426,6 @@
                                              Location ref,
                                              CpuRegister obj,
                                              uint32_t offset,
-                                             Location temp,
                                              bool needs_null_check);
   // Fast path implementation of ReadBarrier::Barrier for a heap
   // reference array load when Baker's read barriers are used.
@@ -436,7 +434,6 @@
                                              CpuRegister obj,
                                              uint32_t data_offset,
                                              Location index,
-                                             Location temp,
                                              bool needs_null_check);
   // Factored implementation used by GenerateFieldLoadWithBakerReadBarrier
   // and GenerateArrayLoadWithBakerReadBarrier.
@@ -444,7 +441,6 @@
                                                  Location ref,
                                                  CpuRegister obj,
                                                  const Address& src,
-                                                 Location temp,
                                                  bool needs_null_check);
 
   // Generate a read barrier for a heap reference within `instruction`
diff --git a/compiler/optimizing/codegen_test.cc b/compiler/optimizing/codegen_test.cc
index 18db507..fe6c0a3 100644
--- a/compiler/optimizing/codegen_test.cc
+++ b/compiler/optimizing/codegen_test.cc
@@ -29,12 +29,6 @@
 #include "arch/x86_64/instruction_set_features_x86_64.h"
 #include "base/macros.h"
 #include "builder.h"
-#include "code_generator_arm.h"
-#include "code_generator_arm64.h"
-#include "code_generator_mips.h"
-#include "code_generator_mips64.h"
-#include "code_generator_x86.h"
-#include "code_generator_x86_64.h"
 #include "code_simulator_container.h"
 #include "common_compiler_test.h"
 #include "dex_file.h"
@@ -52,10 +46,35 @@
 #include "utils/mips64/managed_register_mips64.h"
 #include "utils/x86/managed_register_x86.h"
 
+#ifdef ART_ENABLE_CODEGEN_arm
+#include "code_generator_arm.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_arm64
+#include "code_generator_arm64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86
+#include "code_generator_x86.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_x86_64
+#include "code_generator_x86_64.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips
+#include "code_generator_mips.h"
+#endif
+
+#ifdef ART_ENABLE_CODEGEN_mips64
+#include "code_generator_mips64.h"
+#endif
+
 #include "gtest/gtest.h"
 
 namespace art {
 
+#ifdef ART_ENABLE_CODEGEN_arm
 // Provide our own codegen, that ensures the C calling conventions
 // are preserved. Currently, ART and C do not match as R4 is caller-save
 // in ART, and callee-save in C. Alternatively, we could use or write
@@ -80,7 +99,9 @@
     blocked_register_pairs_[arm::R6_R7] = false;
   }
 };
+#endif
 
+#ifdef ART_ENABLE_CODEGEN_x86
 class TestCodeGeneratorX86 : public x86::CodeGeneratorX86 {
  public:
   TestCodeGeneratorX86(HGraph* graph,
@@ -105,6 +126,7 @@
     blocked_register_pairs_[x86::ECX_EDI] = false;
   }
 };
+#endif
 
 class InternalCodeAllocator : public CodeAllocator {
  public:
@@ -234,37 +256,54 @@
                     bool has_result,
                     Expected expected) {
   CompilerOptions compiler_options;
+#ifdef ART_ENABLE_CODEGEN_arm
   if (target_isa == kArm || target_isa == kThumb2) {
     std::unique_ptr<const ArmInstructionSetFeatures> features_arm(
         ArmInstructionSetFeatures::FromCppDefines());
     TestCodeGeneratorARM codegenARM(graph, *features_arm.get(), compiler_options);
     RunCode(&codegenARM, graph, hook_before_codegen, has_result, expected);
-  } else if (target_isa == kArm64) {
+  }
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
+  if (target_isa == kArm64) {
     std::unique_ptr<const Arm64InstructionSetFeatures> features_arm64(
         Arm64InstructionSetFeatures::FromCppDefines());
     arm64::CodeGeneratorARM64 codegenARM64(graph, *features_arm64.get(), compiler_options);
     RunCode(&codegenARM64, graph, hook_before_codegen, has_result, expected);
-  } else if (target_isa == kX86) {
+  }
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
+  if (target_isa == kX86) {
     std::unique_ptr<const X86InstructionSetFeatures> features_x86(
         X86InstructionSetFeatures::FromCppDefines());
     TestCodeGeneratorX86 codegenX86(graph, *features_x86.get(), compiler_options);
     RunCode(&codegenX86, graph, hook_before_codegen, has_result, expected);
-  } else if (target_isa == kX86_64) {
+  }
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86_64
+  if (target_isa == kX86_64) {
     std::unique_ptr<const X86_64InstructionSetFeatures> features_x86_64(
         X86_64InstructionSetFeatures::FromCppDefines());
     x86_64::CodeGeneratorX86_64 codegenX86_64(graph, *features_x86_64.get(), compiler_options);
     RunCode(&codegenX86_64, graph, hook_before_codegen, has_result, expected);
-  } else if (target_isa == kMips) {
+  }
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
+  if (target_isa == kMips) {
     std::unique_ptr<const MipsInstructionSetFeatures> features_mips(
         MipsInstructionSetFeatures::FromCppDefines());
     mips::CodeGeneratorMIPS codegenMIPS(graph, *features_mips.get(), compiler_options);
     RunCode(&codegenMIPS, graph, hook_before_codegen, has_result, expected);
-  } else if (target_isa == kMips64) {
+  }
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips64
+  if (target_isa == kMips64) {
     std::unique_ptr<const Mips64InstructionSetFeatures> features_mips64(
         Mips64InstructionSetFeatures::FromCppDefines());
     mips64::CodeGeneratorMIPS64 codegenMIPS64(graph, *features_mips64.get(), compiler_options);
     RunCode(&codegenMIPS64, graph, hook_before_codegen, has_result, expected);
   }
+#endif
 }
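The dispatch above is now a sequence of independently #ifdef-guarded `if` blocks rather than one `else if` chain, so a build that compiles out a backend still builds this test and simply skips that ISA. A minimal, self-contained model of the pattern; the macro, enum, and function names below are placeholders, not ART's:

    #include <cstdio>

    enum class Isa { kX86, kArm64 };

    void RunForIsa(Isa isa) {
    #ifdef ENABLE_X86_BACKEND           // stands in for ART_ENABLE_CODEGEN_x86
      if (isa == Isa::kX86) { std::puts("run x86 codegen"); return; }
    #endif
    #ifdef ENABLE_ARM64_BACKEND         // stands in for ART_ENABLE_CODEGEN_arm64
      if (isa == Isa::kArm64) { std::puts("run arm64 codegen"); return; }
    #endif
      // Backends compiled out never match, so the request is silently a no-op,
      // mirroring RunCode() above.
      (void)isa;
    }

    int main() { RunForIsa(Isa::kX86); }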
 
 static ::std::vector<InstructionSet> GetTargetISAs() {
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 89d80cc..b3d5341 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -122,7 +122,10 @@
             new DisassemblerOptions(/* absolute_addresses */ false,
                                     base_address,
                                     end_address,
-                                    /* can_read_literals */ true)));
+                                    /* can_read_literals */ true,
+                                    Is64BitInstructionSet(instruction_set)
+                                        ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                        : &Thread::DumpThreadOffset<PointerSize::k32>)));
   }
 
   ~HGraphVisualizerDisassembler() {
diff --git a/compiler/optimizing/intrinsics_arm.cc b/compiler/optimizing/intrinsics_arm.cc
index 27d9d48..0bbc0e5 100644
--- a/compiler/optimizing/intrinsics_arm.cc
+++ b/compiler/optimizing/intrinsics_arm.cc
@@ -41,6 +41,92 @@
 
 using IntrinsicSlowPathARM = IntrinsicSlowPath<InvokeDexCallingConventionVisitorARM>;
 
+// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
+#define __ down_cast<ArmAssembler*>(codegen->GetAssembler())->  // NOLINT
+
+// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
+class ReadBarrierSystemArrayCopySlowPathARM : public SlowPathCode {
+ public:
+  explicit ReadBarrierSystemArrayCopySlowPathARM(HInstruction* instruction)
+      : SlowPathCode(instruction) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(instruction_->IsInvokeStaticOrDirect())
+        << "Unexpected instruction in read barrier arraycopy slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
+
+    int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+    uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot);
+    uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
+
+    Register dest = locations->InAt(2).AsRegister<Register>();
+    Location dest_pos = locations->InAt(3);
+    Register src_curr_addr = locations->GetTemp(0).AsRegister<Register>();
+    Register dst_curr_addr = locations->GetTemp(1).AsRegister<Register>();
+    Register src_stop_addr = locations->GetTemp(2).AsRegister<Register>();
+    Register tmp = locations->GetTemp(3).AsRegister<Register>();
+
+    __ Bind(GetEntryLabel());
+    // Compute the base destination address in `dst_curr_addr`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ AddConstant(dst_curr_addr, dest, element_size * constant + offset);
+    } else {
+      __ add(dst_curr_addr,
+             dest,
+             ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift));
+      __ AddConstant(dst_curr_addr, offset);
+    }
+
+    Label loop;
+    __ Bind(&loop);
+    __ ldr(tmp, Address(src_curr_addr, element_size, Address::PostIndex));
+    __ MaybeUnpoisonHeapReference(tmp);
+    // TODO: Inline the mark bit check before calling the runtime?
+    // tmp = ReadBarrier::Mark(tmp);
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    // (See ReadBarrierMarkSlowPathARM::EmitNativeCode for more
+    // explanations.)
+    DCHECK_NE(tmp, SP);
+    DCHECK_NE(tmp, LR);
+    DCHECK_NE(tmp, PC);
+    // IP is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary (and not preserved). It thus must not overlap
+    // any live register in this slow path.
+    DCHECK_NE(src_curr_addr, IP);
+    DCHECK_NE(dst_curr_addr, IP);
+    DCHECK_NE(src_stop_addr, IP);
+    DCHECK_NE(tmp, IP);
+    DCHECK(0 <= tmp && tmp < kNumberOfCoreRegisters) << tmp;
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArmPointerSize>(tmp);
+    // This runtime call does not require a stack map.
+    arm_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    __ MaybePoisonHeapReference(tmp);
+    __ str(tmp, Address(dst_curr_addr, element_size, Address::PostIndex));
+    __ cmp(src_curr_addr, ShifterOperand(src_stop_addr));
+    __ b(&loop, NE);
+    __ b(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM"; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM);
+};
+
+#undef __
+
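The slow path's copy loop walks the precomputed [src_curr_addr, src_stop_addr) range with post-indexed loads and stores, unpoisoning each element, passing it to the ReadBarrierMarkRegX entry point, re-poisoning it, and storing it through the advancing destination pointer. A toy, self-contained C++ model of that loop; Mark() and the poison helpers are stand-ins for the runtime entry point and ART's heap-reference poisoning, and the caller is assumed to guarantee a non-empty range (the emitted loop is do-while):

    #include <cstdint>

    static uint32_t Mark(uint32_t ref) { return ref; }            // runtime would return the to-space reference
    static uint32_t Unpoison(uint32_t ref) { return ~ref + 1u; }  // assumed negation-style poisoning
    static uint32_t Poison(uint32_t ref) { return ~ref + 1u; }

    void CopyWithReadBarrier(const uint32_t* src_curr, const uint32_t* src_stop, uint32_t* dst_curr) {
      do {                                  // Bind(&loop)
        uint32_t tmp = *src_curr++;         // ldr tmp, [src_curr_addr], #element_size
        tmp = Mark(Unpoison(tmp));          // MaybeUnpoisonHeapReference + ReadBarrierMarkRegX
        *dst_curr++ = Poison(tmp);          // MaybePoisonHeapReference + str tmp, [dst_curr_addr], #element_size
      } while (src_curr != src_stop);       // cmp src_curr_addr, src_stop_addr; b(&loop, NE)
    }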
 bool IntrinsicLocationsBuilderARM::TryDispatch(HInvoke* invoke) {
   Dispatch(invoke);
   LocationSummary* res = invoke->GetLocations();
@@ -1337,9 +1423,9 @@
 }
 
 void IntrinsicLocationsBuilderARM::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  if (kEmitCompilerReadBarrier) {
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -1362,6 +1448,13 @@
   if (length != nullptr && !assembler_->ShifterOperandCanAlwaysHold(length->GetValue())) {
     locations->SetInAt(4, Location::RequiresRegister());
   }
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Temporary register IP cannot be used in
+    // ReadBarrierSystemArrayCopySlowPathARM (because that register
+    // is clobbered by ReadBarrierMarkRegX entry points). Get an extra
+    // temporary register from the register allocator.
+    locations->AddTemp(Location::RequiresRegister());
+  }
 }
 
 static void CheckPosition(ArmAssembler* assembler,
@@ -1427,9 +1520,9 @@
 }
 
 void IntrinsicCodeGeneratorARM::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  DCHECK(!kEmitCompilerReadBarrier);
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   ArmAssembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -1438,18 +1531,22 @@
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
   Register src = locations->InAt(0).AsRegister<Register>();
   Location src_pos = locations->InAt(1);
   Register dest = locations->InAt(2).AsRegister<Register>();
   Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
-  Register temp1 = locations->GetTemp(0).AsRegister<Register>();
-  Register temp2 = locations->GetTemp(1).AsRegister<Register>();
-  Register temp3 = locations->GetTemp(2).AsRegister<Register>();
+  Location temp1_loc = locations->GetTemp(0);
+  Register temp1 = temp1_loc.AsRegister<Register>();
+  Location temp2_loc = locations->GetTemp(1);
+  Register temp2 = temp2_loc.AsRegister<Register>();
+  Location temp3_loc = locations->GetTemp(2);
+  Register temp3 = temp3_loc.AsRegister<Register>();
 
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM(invoke);
+  codegen_->AddSlowPath(intrinsic_slow_path);
 
   Label conditions_on_positions_validated;
   SystemArrayCopyOptimizations optimizations(invoke);
@@ -1465,7 +1562,7 @@
         DCHECK_GE(src_pos_constant, dest_pos_constant);
       } else if (src_pos_constant < dest_pos_constant) {
         __ cmp(src, ShifterOperand(dest));
-        __ b(slow_path->GetEntryLabel(), EQ);
+        __ b(intrinsic_slow_path->GetEntryLabel(), EQ);
       }
 
       // Checked when building locations.
@@ -1477,7 +1574,7 @@
         __ b(&conditions_on_positions_validated, NE);
       }
       __ cmp(dest_pos.AsRegister<Register>(), ShifterOperand(src_pos_constant));
-      __ b(slow_path->GetEntryLabel(), GT);
+      __ b(intrinsic_slow_path->GetEntryLabel(), GT);
     }
   } else {
     if (!optimizations.GetDestinationIsSource()) {
@@ -1490,19 +1587,19 @@
     } else {
       __ cmp(src_pos.AsRegister<Register>(), ShifterOperand(dest_pos.AsRegister<Register>()));
     }
-    __ b(slow_path->GetEntryLabel(), LT);
+    __ b(intrinsic_slow_path->GetEntryLabel(), LT);
   }
 
   __ Bind(&conditions_on_positions_validated);
 
   if (!optimizations.GetSourceIsNotNull()) {
     // Bail out if the source is null.
-    __ CompareAndBranchIfZero(src, slow_path->GetEntryLabel());
+    __ CompareAndBranchIfZero(src, intrinsic_slow_path->GetEntryLabel());
   }
 
   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
     // Bail out if the destination is null.
-    __ CompareAndBranchIfZero(dest, slow_path->GetEntryLabel());
+    __ CompareAndBranchIfZero(dest, intrinsic_slow_path->GetEntryLabel());
   }
 
   // If the length is negative, bail out.
@@ -1511,7 +1608,7 @@
       !optimizations.GetCountIsSourceLength() &&
       !optimizations.GetCountIsDestinationLength()) {
     __ cmp(length.AsRegister<Register>(), ShifterOperand(0));
-    __ b(slow_path->GetEntryLabel(), LT);
+    __ b(intrinsic_slow_path->GetEntryLabel(), LT);
   }
 
   // Validity checks: source.
@@ -1519,7 +1616,7 @@
                 src_pos,
                 src,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsSourceLength());
 
@@ -1528,7 +1625,7 @@
                 dest_pos,
                 dest,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsDestinationLength());
 
@@ -1537,112 +1634,287 @@
     // type of the destination array. We do two checks: the classes are the same,
     // or the destination is Object[]. If none of these checks succeed, we go to the
     // slow path.
-    __ LoadFromOffset(kLoadWord, temp1, dest, class_offset);
-    __ LoadFromOffset(kLoadWord, temp2, src, class_offset);
-    bool did_unpoison = false;
-    if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
-        !optimizations.GetSourceIsNonPrimitiveArray()) {
-      // One or two of the references need to be unpoisoned. Unpoison them
-      // both to make the identity check valid.
-      __ MaybeUnpoisonHeapReference(temp1);
-      __ MaybeUnpoisonHeapReference(temp2);
-      did_unpoison = true;
-    }
 
-    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
-      // Bail out if the destination is not a non primitive array.
-      // /* HeapReference<Class> */ temp3 = temp1->component_type_
-      __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset);
-      __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp3);
-      __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset);
-      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel());
-    }
-
-    if (!optimizations.GetSourceIsNonPrimitiveArray()) {
-      // Bail out if the source is not a non primitive array.
-      // /* HeapReference<Class> */ temp3 = temp2->component_type_
-      __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset);
-      __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp3);
-      __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset);
-      static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel());
-    }
-
-    __ cmp(temp1, ShifterOperand(temp2));
-
-    if (optimizations.GetDestinationIsTypedObjectArray()) {
-      Label do_copy;
-      __ b(&do_copy, EQ);
-      if (!did_unpoison) {
-        __ MaybeUnpoisonHeapReference(temp1);
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (!optimizations.GetSourceIsNonPrimitiveArray()) {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+        __ CompareAndBranchIfZero(temp1, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp1` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+        // /* uint16_t */ temp1 = static_cast<uint16>(temp1->primitive_type_);
+        __ LoadFromOffset(kLoadUnsignedHalfword, temp1, temp1, primitive_offset);
+        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+        __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
       }
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
-      __ MaybeUnpoisonHeapReference(temp1);
-      // /* HeapReference<Class> */ temp1 = temp1->super_class_
-      __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
-      // No need to unpoison the result, we're comparing against null.
-      __ CompareAndBranchIfNonZero(temp1, slow_path->GetEntryLabel());
-      __ Bind(&do_copy);
+
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, dest, class_offset, temp2_loc, /* needs_null_check */ false);
+
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        // Bail out if the destination is not a non primitive array.
+        //
+        // Register `temp1` is not trashed by the read barrier emitted
+        // by GenerateFieldLoadWithBakerReadBarrier below, as that
+        // method produces a call to a ReadBarrierMarkRegX entry point,
+        // which saves all potentially live registers, including
+        // temporaries such as `temp1`.
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp2_loc, temp1, component_offset, temp3_loc, /* needs_null_check */ false);
+        __ CompareAndBranchIfZero(temp2, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp2` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+        // /* uint16_t */ temp2 = static_cast<uint16>(temp2->primitive_type_);
+        __ LoadFromOffset(kLoadUnsignedHalfword, temp2, temp2, primitive_offset);
+        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+        __ CompareAndBranchIfNonZero(temp2, intrinsic_slow_path->GetEntryLabel());
+      }
+
+      // For the same reason given earlier, `temp1` is not trashed by the
+      // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp2_loc, src, class_offset, temp3_loc, /* needs_null_check */ false);
+      // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
+      __ cmp(temp1, ShifterOperand(temp2));
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        Label do_copy;
+        __ b(&do_copy, EQ);
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+        // /* HeapReference<Class> */ temp1 = temp1->super_class_
+        // We do not need to emit a read barrier for the following
+        // heap reference load, as `temp1` is only used in a
+        // comparison with null below, and this reference is not
+        // kept afterwards.
+        __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
+        __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ b(intrinsic_slow_path->GetEntryLabel(), NE);
+      }
     } else {
-      __ b(slow_path->GetEntryLabel(), NE);
+      // Non read barrier code.
+
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      __ LoadFromOffset(kLoadWord, temp1, dest, class_offset);
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      __ LoadFromOffset(kLoadWord, temp2, src, class_offset);
+      bool did_unpoison = false;
+      if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
+          !optimizations.GetSourceIsNonPrimitiveArray()) {
+        // One or two of the references need to be unpoisoned. Unpoison them
+        // both to make the identity check valid.
+        __ MaybeUnpoisonHeapReference(temp1);
+        __ MaybeUnpoisonHeapReference(temp2);
+        did_unpoison = true;
+      }
+
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        // Bail out if the destination is not a non primitive array.
+        // /* HeapReference<Class> */ temp3 = temp1->component_type_
+        __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset);
+        __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp3);
+        // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_);
+        __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset);
+        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+        __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
+      }
+
+      if (!optimizations.GetSourceIsNonPrimitiveArray()) {
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp3 = temp2->component_type_
+        __ LoadFromOffset(kLoadWord, temp3, temp2, component_offset);
+        __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp3);
+        // /* uint16_t */ temp3 = static_cast<uint16>(temp3->primitive_type_);
+        __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset);
+        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+        __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
+      }
+
+      __ cmp(temp1, ShifterOperand(temp2));
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        Label do_copy;
+        __ b(&do_copy, EQ);
+        if (!did_unpoison) {
+          __ MaybeUnpoisonHeapReference(temp1);
+        }
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ LoadFromOffset(kLoadWord, temp1, temp1, component_offset);
+        __ MaybeUnpoisonHeapReference(temp1);
+        // /* HeapReference<Class> */ temp1 = temp1->super_class_
+        __ LoadFromOffset(kLoadWord, temp1, temp1, super_offset);
+        // No need to unpoison the result, we're comparing against null.
+        __ CompareAndBranchIfNonZero(temp1, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ b(intrinsic_slow_path->GetEntryLabel(), NE);
+      }
     }
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    // /* HeapReference<Class> */ temp1 = src->klass_
-    __ LoadFromOffset(kLoadWord, temp1, src, class_offset);
-    __ MaybeUnpoisonHeapReference(temp1);
-    // /* HeapReference<Class> */ temp3 = temp1->component_type_
-    __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset);
-    __ CompareAndBranchIfZero(temp3, slow_path->GetEntryLabel());
-    __ MaybeUnpoisonHeapReference(temp3);
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, src, class_offset, temp2_loc, /* needs_null_check */ false);
+      // /* HeapReference<Class> */ temp3 = temp1->component_type_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp3_loc, temp1, component_offset, temp2_loc, /* needs_null_check */ false);
+      __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
+      // If heap poisoning is enabled, `temp3` has been unpoisoned
+      // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+    } else {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      __ LoadFromOffset(kLoadWord, temp1, src, class_offset);
+      __ MaybeUnpoisonHeapReference(temp1);
+      // /* HeapReference<Class> */ temp3 = temp1->component_type_
+      __ LoadFromOffset(kLoadWord, temp3, temp1, component_offset);
+      __ CompareAndBranchIfZero(temp3, intrinsic_slow_path->GetEntryLabel());
+      __ MaybeUnpoisonHeapReference(temp3);
+    }
+    // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
     __ LoadFromOffset(kLoadUnsignedHalfword, temp3, temp3, primitive_offset);
     static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-    __ CompareAndBranchIfNonZero(temp3, slow_path->GetEntryLabel());
+    __ CompareAndBranchIfNonZero(temp3, intrinsic_slow_path->GetEntryLabel());
   }
 
-  // Compute base source address, base destination address, and end source address.
-
   int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+  uint32_t element_size_shift = Primitive::ComponentSizeShift(Primitive::kPrimNot);
   uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
+
+  // Compute the base source address in `temp1`.
   if (src_pos.IsConstant()) {
     int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
     __ AddConstant(temp1, src, element_size * constant + offset);
   } else {
-    __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, 2));
+    __ add(temp1, src, ShifterOperand(src_pos.AsRegister<Register>(), LSL, element_size_shift));
     __ AddConstant(temp1, offset);
   }
 
-  if (dest_pos.IsConstant()) {
-    int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
-    __ AddConstant(temp2, dest, element_size * constant + offset);
-  } else {
-    __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, 2));
-    __ AddConstant(temp2, offset);
-  }
-
+  // Compute the end source address in `temp3`.
   if (length.IsConstant()) {
     int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
     __ AddConstant(temp3, temp1, element_size * constant);
   } else {
-    __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, 2));
+    __ add(temp3, temp1, ShifterOperand(length.AsRegister<Register>(), LSL, element_size_shift));
   }
 
-  // Iterate over the arrays and do a raw copy of the objects. We don't need to
-  // poison/unpoison.
-  Label loop, done;
-  __ cmp(temp1, ShifterOperand(temp3));
-  __ b(&done, EQ);
-  __ Bind(&loop);
-  __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
-  __ str(IP, Address(temp2, element_size, Address::PostIndex));
-  __ cmp(temp1, ShifterOperand(temp3));
-  __ b(&loop, NE);
-  __ Bind(&done);
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // The base destination address is computed later, as `temp2` is
+    // used for intermediate computations.
+
+    // SystemArrayCopy implementation for Baker read barriers (see
+    // also CodeGeneratorARM::GenerateReferenceLoadWithBakerReadBarrier):
+    //
+    //   if (src_ptr != end_ptr) {
+    //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
+    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+    //     if (is_gray) {
+    //       // Slow-path copy.
+    //       do {
+    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+    //       } while (src_ptr != end_ptr)
+    //     } else {
+    //       // Fast-path copy.
+    //       do {
+    //         *dest_ptr++ = *src_ptr++;
+    //       } while (src_ptr != end_ptr)
+    //     }
+    //   }
+
+    Label loop, done;
+
+    // Don't enter copy loop if `length == 0`.
+    __ cmp(temp1, ShifterOperand(temp3));
+    __ b(&done, EQ);
+
+    // /* int32_t */ monitor = src->monitor_
+    __ LoadFromOffset(kLoadWord, temp2, src, monitor_offset);
+    // /* LockWord */ lock_word = LockWord(monitor)
+    static_assert(sizeof(LockWord) == sizeof(int32_t),
+                  "art::LockWord and int32_t have different sizes.");
+
+    // Introduce a dependency on the lock_word including the rb_state,
+    // which shall prevent load-load reordering without using
+    // a memory barrier (which would be more expensive).
+    // `src` is unchanged by this operation, but its value now depends
+    // on `temp2`.
+    __ add(src, src, ShifterOperand(temp2, LSR, 32));
+
+    // Slow path used to copy array when `src` is gray.
+    SlowPathCode* read_barrier_slow_path =
+        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM(invoke);
+    codegen_->AddSlowPath(read_barrier_slow_path);
+
+    // Given the numeric representation, it's enough to check the low bit of the
+    // rb_state. We do that by shifting the bit out of the lock word with LSRS
+    // which can be a 16-bit instruction unlike the TST immediate.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    __ Lsrs(temp2, temp2, LockWord::kReadBarrierStateShift + 1);
+    // Carry flag is the last bit shifted out by LSRS.
+    __ b(read_barrier_slow_path->GetEntryLabel(), CS);
+
+    // Fast-path copy.
+
+    // Compute the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ AddConstant(temp2, dest, element_size * constant + offset);
+    } else {
+      __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift));
+      __ AddConstant(temp2, offset);
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    __ Bind(&loop);
+    __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
+    __ str(IP, Address(temp2, element_size, Address::PostIndex));
+    __ cmp(temp1, ShifterOperand(temp3));
+    __ b(&loop, NE);
+
+    __ Bind(read_barrier_slow_path->GetExitLabel());
+    __ Bind(&done);
+  } else {
+    // Non read barrier code.
+
+    // Compute the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ AddConstant(temp2, dest, element_size * constant + offset);
+    } else {
+      __ add(temp2, dest, ShifterOperand(dest_pos.AsRegister<Register>(), LSL, element_size_shift));
+      __ AddConstant(temp2, offset);
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    Label loop, done;
+    __ cmp(temp1, ShifterOperand(temp3));
+    __ b(&done, EQ);
+    __ Bind(&loop);
+    __ ldr(IP, Address(temp1, element_size, Address::PostIndex));
+    __ str(IP, Address(temp2, element_size, Address::PostIndex));
+    __ cmp(temp1, ShifterOperand(temp3));
+    __ b(&loop, NE);
+    __ Bind(&done);
+  }
 
   // We only need one card marking on the destination array.
   codegen_->MarkGCCard(temp1,
@@ -1651,7 +1923,7 @@
                        Register(kNoRegister),
                        /* value_can_be_null */ false);
 
-  __ Bind(slow_path->GetExitLabel());
+  __ Bind(intrinsic_slow_path->GetExitLabel());
 }
 
 static void CreateFPToFPCallLocations(ArenaAllocator* arena, HInvoke* invoke) {
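
The pseudocode comment in the ARM hunk above describes the copy-loop dispatch: check the source object's read barrier state once, then run either a raw fast-path copy or a slow-path copy that marks every reference. The following is a minimal, self-contained C++ sketch of that dispatch; the shift constant, MarkReference() and the 32-bit reference type used here are illustrative assumptions, not the actual ART lock word layout or runtime entry points.

#include <cstdint>

namespace sketch {

// Assumed bit position of the read barrier state in the lock word (illustrative only).
constexpr uint32_t kReadBarrierStateShift = 28;

// Stand-in for the ReadBarrierMarkRegX runtime entry point.
inline uint32_t MarkReference(uint32_t ref) { return ref; }

// Mirrors the commented algorithm: skip empty ranges, then pick the fast or
// slow copy loop based on whether the source object is gray.
inline void CopyReferences(const uint32_t* src_ptr,
                           const uint32_t* end_ptr,
                           uint32_t* dest_ptr,
                           uint32_t src_lock_word) {
  if (src_ptr == end_ptr) {
    return;  // `length == 0`: nothing to copy.
  }
  // Only the low bit of the state matters (white = 0, gray = 1, black = 2).
  const bool is_gray = ((src_lock_word >> kReadBarrierStateShift) & 1u) != 0;
  if (is_gray) {
    do {  // Slow-path copy: every reference goes through the mark entry point.
      *dest_ptr++ = MarkReference(*src_ptr++);
    } while (src_ptr != end_ptr);
  } else {
    do {  // Fast-path copy: raw element-by-element copy.
      *dest_ptr++ = *src_ptr++;
    } while (src_ptr != end_ptr);
  }
}

}  // namespace sketch

The emitted ARM code makes the same decision with a single LSRS of the lock word and a carry-flag branch into the read barrier slow path.
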
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 9cfe3ce..91374b3 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -144,6 +144,73 @@
   DISALLOW_COPY_AND_ASSIGN(IntrinsicSlowPathARM64);
 };
 
+// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
+class ReadBarrierSystemArrayCopySlowPathARM64 : public SlowPathCodeARM64 {
+ public:
+  ReadBarrierSystemArrayCopySlowPathARM64(HInstruction* instruction, Location tmp)
+      : SlowPathCodeARM64(instruction), tmp_(tmp) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen_in) OVERRIDE {
+    CodeGeneratorARM64* codegen = down_cast<CodeGeneratorARM64*>(codegen_in);
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(instruction_->IsInvokeStaticOrDirect())
+        << "Unexpected instruction in read barrier arraycopy slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
+
+    const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+
+    Register src_curr_addr = XRegisterFrom(locations->GetTemp(0));
+    Register dst_curr_addr = XRegisterFrom(locations->GetTemp(1));
+    Register src_stop_addr = XRegisterFrom(locations->GetTemp(2));
+    Register tmp_reg = WRegisterFrom(tmp_);
+
+    __ Bind(GetEntryLabel());
+    vixl::aarch64::Label slow_copy_loop;
+    __ Bind(&slow_copy_loop);
+    __ Ldr(tmp_reg, MemOperand(src_curr_addr, element_size, PostIndex));
+    codegen->GetAssembler()->MaybeUnpoisonHeapReference(tmp_reg);
+    // TODO: Inline the mark bit check before calling the runtime?
+    // tmp_reg = ReadBarrier::Mark(tmp_reg);
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    // (See ReadBarrierMarkSlowPathARM64::EmitNativeCode for more
+    // explanations.)
+    DCHECK_NE(tmp_.reg(), LR);
+    DCHECK_NE(tmp_.reg(), WSP);
+    DCHECK_NE(tmp_.reg(), WZR);
+    // IP0 is used internally by the ReadBarrierMarkRegX entry point
+    // as a temporary (and not preserved).  It thus cannot be used by
+    // any live register in this slow path.
+    DCHECK_NE(LocationFrom(src_curr_addr).reg(), IP0);
+    DCHECK_NE(LocationFrom(dst_curr_addr).reg(), IP0);
+    DCHECK_NE(LocationFrom(src_stop_addr).reg(), IP0);
+    DCHECK_NE(tmp_.reg(), IP0);
+    DCHECK(0 <= tmp_.reg() && tmp_.reg() < kNumberOfWRegisters) << tmp_.reg();
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kArm64PointerSize>(tmp_.reg());
+    // This runtime call does not require a stack map.
+    codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    codegen->GetAssembler()->MaybePoisonHeapReference(tmp_reg);
+    __ Str(tmp_reg, MemOperand(dst_curr_addr, element_size, PostIndex));
+    __ Cmp(src_curr_addr, src_stop_addr);
+    __ B(&slow_copy_loop, ne);
+    __ B(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathARM64"; }
+
+ private:
+  Location tmp_;
+
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathARM64);
+};
 #undef __
 
 bool IntrinsicLocationsBuilderARM64::TryDispatch(HInvoke* invoke) {
@@ -2035,9 +2102,9 @@
 // We want to use two temporary registers in order to reduce the register pressure in arm64.
 // So we don't use the CodeGenerator::CreateSystemArrayCopyLocationSummary.
 void IntrinsicLocationsBuilderARM64::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  if (kEmitCompilerReadBarrier) {
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2090,12 +2157,20 @@
 
   locations->AddTemp(Location::RequiresRegister());
   locations->AddTemp(Location::RequiresRegister());
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // Temporary register IP0, obtained from the VIXL scratch register
+    // pool, cannot be used in ReadBarrierSystemArrayCopySlowPathARM64
+    // (because that register is clobbered by ReadBarrierMarkRegX
+    // entry points). Get an extra temporary register from the
+    // register allocator.
+    locations->AddTemp(Location::RequiresRegister());
+  }
 }
 
 void IntrinsicCodeGeneratorARM64::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  DCHECK(!kEmitCompilerReadBarrier);
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   MacroAssembler* masm = GetVIXLAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -2104,6 +2179,7 @@
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
   Register src = XRegisterFrom(locations->InAt(0));
   Location src_pos = locations->InAt(1);
@@ -2111,10 +2187,12 @@
   Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
   Register temp1 = WRegisterFrom(locations->GetTemp(0));
+  Location temp1_loc = LocationFrom(temp1);
   Register temp2 = WRegisterFrom(locations->GetTemp(1));
+  Location temp2_loc = LocationFrom(temp2);
 
-  SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCodeARM64* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke);
+  codegen_->AddSlowPath(intrinsic_slow_path);
 
   vixl::aarch64::Label conditions_on_positions_validated;
   SystemArrayCopyOptimizations optimizations(invoke);
@@ -2130,7 +2208,7 @@
         DCHECK_GE(src_pos_constant, dest_pos_constant);
       } else if (src_pos_constant < dest_pos_constant) {
         __ Cmp(src, dest);
-        __ B(slow_path->GetEntryLabel(), eq);
+        __ B(intrinsic_slow_path->GetEntryLabel(), eq);
       }
       // Checked when building locations.
       DCHECK(!optimizations.GetDestinationIsSource()
@@ -2141,7 +2219,7 @@
         __ B(&conditions_on_positions_validated, ne);
       }
       __ Cmp(WRegisterFrom(dest_pos), src_pos_constant);
-      __ B(slow_path->GetEntryLabel(), gt);
+      __ B(intrinsic_slow_path->GetEntryLabel(), gt);
     }
   } else {
     if (!optimizations.GetDestinationIsSource()) {
@@ -2150,19 +2228,19 @@
     }
     __ Cmp(RegisterFrom(src_pos, invoke->InputAt(1)->GetType()),
            OperandFrom(dest_pos, invoke->InputAt(3)->GetType()));
-    __ B(slow_path->GetEntryLabel(), lt);
+    __ B(intrinsic_slow_path->GetEntryLabel(), lt);
   }
 
   __ Bind(&conditions_on_positions_validated);
 
   if (!optimizations.GetSourceIsNotNull()) {
     // Bail out if the source is null.
-    __ Cbz(src, slow_path->GetEntryLabel());
+    __ Cbz(src, intrinsic_slow_path->GetEntryLabel());
   }
 
   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
     // Bail out if the destination is null.
-    __ Cbz(dest, slow_path->GetEntryLabel());
+    __ Cbz(dest, intrinsic_slow_path->GetEntryLabel());
   }
 
   // We have already checked in the LocationsBuilder for the constant case.
@@ -2170,17 +2248,17 @@
       !optimizations.GetCountIsSourceLength() &&
       !optimizations.GetCountIsDestinationLength()) {
     // If the length is negative, bail out.
-    __ Tbnz(WRegisterFrom(length), kWRegSize - 1, slow_path->GetEntryLabel());
+    __ Tbnz(WRegisterFrom(length), kWRegSize - 1, intrinsic_slow_path->GetEntryLabel());
     // If the length >= 128 then (currently) prefer native implementation.
     __ Cmp(WRegisterFrom(length), kSystemArrayCopyThreshold);
-    __ B(slow_path->GetEntryLabel(), ge);
+    __ B(intrinsic_slow_path->GetEntryLabel(), ge);
   }
   // Validity checks: source.
   CheckSystemArrayCopyPosition(masm,
                                src_pos,
                                src,
                                length,
-                               slow_path,
+                               intrinsic_slow_path,
                                temp1,
                                optimizations.GetCountIsSourceLength());
 
@@ -2189,90 +2267,236 @@
                                dest_pos,
                                dest,
                                length,
-                               slow_path,
+                               intrinsic_slow_path,
                                temp1,
                                optimizations.GetCountIsDestinationLength());
   {
     // We use a block to end the scratch scope before the write barrier, thus
     // freeing the temporary registers so they can be used in `MarkGCCard`.
     UseScratchRegisterScope temps(masm);
+    // Note: Because it is acquired from VIXL's scratch register pool,
+    // `temp3` might be IP0, and thus cannot be used as `ref` argument
+    // of CodeGeneratorARM64::GenerateFieldLoadWithBakerReadBarrier
+    // calls below (see ReadBarrierMarkSlowPathARM64 for more details).
     Register temp3 = temps.AcquireW();
+
     if (!optimizations.GetDoesNotNeedTypeCheck()) {
       // Check whether all elements of the source array are assignable to the component
       // type of the destination array. We do two checks: the classes are the same,
       // or the destination is Object[]. If none of these checks succeed, we go to the
       // slow path.
-      __ Ldr(temp1, MemOperand(dest, class_offset));
-      __ Ldr(temp2, MemOperand(src, class_offset));
-      bool did_unpoison = false;
-      if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
-          !optimizations.GetSourceIsNonPrimitiveArray()) {
-        // One or two of the references need to be unpoisoned. Unpoison them
-        // both to make the identity check valid.
-        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
-        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
-        did_unpoison = true;
-      }
 
-      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
-        // Bail out if the destination is not a non primitive array.
-        // /* HeapReference<Class> */ temp3 = temp1->component_type_
-        __ Ldr(temp3, HeapOperand(temp1, component_offset));
-        __ Cbz(temp3, slow_path->GetEntryLabel());
-        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
-        __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
-        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-        __ Cbnz(temp3, slow_path->GetEntryLabel());
-      }
-
-      if (!optimizations.GetSourceIsNonPrimitiveArray()) {
-        // Bail out if the source is not a non primitive array.
-        // /* HeapReference<Class> */ temp3 = temp2->component_type_
-        __ Ldr(temp3, HeapOperand(temp2, component_offset));
-        __ Cbz(temp3, slow_path->GetEntryLabel());
-        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
-        __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
-        static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-        __ Cbnz(temp3, slow_path->GetEntryLabel());
-      }
-
-      __ Cmp(temp1, temp2);
-
-      if (optimizations.GetDestinationIsTypedObjectArray()) {
-        vixl::aarch64::Label do_copy;
-        __ B(&do_copy, eq);
-        if (!did_unpoison) {
-          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        if (!optimizations.GetSourceIsNonPrimitiveArray()) {
+          // /* HeapReference<Class> */ temp1 = src->klass_
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                          temp1_loc,
+                                                          src.W(),
+                                                          class_offset,
+                                                          temp2,
+                                                          /* needs_null_check */ false,
+                                                          /* use_load_acquire */ false);
+          // Bail out if the source is not a non primitive array.
+          // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                          temp1_loc,
+                                                          temp1,
+                                                          component_offset,
+                                                          temp2,
+                                                          /* needs_null_check */ false,
+                                                          /* use_load_acquire */ false);
+          __ Cbz(temp1, intrinsic_slow_path->GetEntryLabel());
+          // If heap poisoning is enabled, `temp1` has been unpoisoned
+          // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+          // /* uint16_t */ temp1 = static_cast<uint16_t>(temp1->primitive_type_);
+          __ Ldrh(temp1, HeapOperand(temp1, primitive_offset));
+          static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+          __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
         }
-        // /* HeapReference<Class> */ temp1 = temp1->component_type_
-        __ Ldr(temp1, HeapOperand(temp1, component_offset));
-        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
-        // /* HeapReference<Class> */ temp1 = temp1->super_class_
-        __ Ldr(temp1, HeapOperand(temp1, super_offset));
-        // No need to unpoison the result, we're comparing against null.
-        __ Cbnz(temp1, slow_path->GetEntryLabel());
-        __ Bind(&do_copy);
+
+        // /* HeapReference<Class> */ temp1 = dest->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                        temp1_loc,
+                                                        dest.W(),
+                                                        class_offset,
+                                                        temp2,
+                                                        /* needs_null_check */ false,
+                                                        /* use_load_acquire */ false);
+
+        if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+          // Bail out if the destination is not a non primitive array.
+          //
+          // Register `temp1` is not trashed by the read barrier emitted
+          // by GenerateFieldLoadWithBakerReadBarrier below, as that
+          // method produces a call to a ReadBarrierMarkRegX entry point,
+          // which saves all potentially live registers, including
+          // temporaries such as `temp1`.
+          // /* HeapReference<Class> */ temp2 = temp1->component_type_
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                          temp2_loc,
+                                                          temp1,
+                                                          component_offset,
+                                                          temp3,
+                                                          /* needs_null_check */ false,
+                                                          /* use_load_acquire */ false);
+          __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
+          // If heap poisoning is enabled, `temp2` has been unpoisoned
+          // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+          // /* uint16_t */ temp2 = static_cast<uint16_t>(temp2->primitive_type_);
+          __ Ldrh(temp2, HeapOperand(temp2, primitive_offset));
+          static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+          __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
+        }
+
+        // For the same reason given earlier, `temp1` is not trashed by the
+        // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
+        // /* HeapReference<Class> */ temp2 = src->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                        temp2_loc,
+                                                        src.W(),
+                                                        class_offset,
+                                                        temp3,
+                                                        /* needs_null_check */ false,
+                                                        /* use_load_acquire */ false);
+        // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
+        __ Cmp(temp1, temp2);
+
+        if (optimizations.GetDestinationIsTypedObjectArray()) {
+          vixl::aarch64::Label do_copy;
+          __ B(&do_copy, eq);
+          // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                          temp1_loc,
+                                                          temp1,
+                                                          component_offset,
+                                                          temp2,
+                                                          /* needs_null_check */ false,
+                                                          /* use_load_acquire */ false);
+          // /* HeapReference<Class> */ temp1 = temp1->super_class_
+          // We do not need to emit a read barrier for the following
+          // heap reference load, as `temp1` is only used in a
+          // comparison with null below, and this reference is not
+          // kept afterwards.
+          __ Ldr(temp1, HeapOperand(temp1, super_offset));
+          __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
+          __ Bind(&do_copy);
+        } else {
+          __ B(intrinsic_slow_path->GetEntryLabel(), ne);
+        }
       } else {
-        __ B(slow_path->GetEntryLabel(), ne);
+        // Non read barrier code.
+
+        // /* HeapReference<Class> */ temp1 = dest->klass_
+        __ Ldr(temp1, MemOperand(dest, class_offset));
+        // /* HeapReference<Class> */ temp2 = src->klass_
+        __ Ldr(temp2, MemOperand(src, class_offset));
+        bool did_unpoison = false;
+        if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
+            !optimizations.GetSourceIsNonPrimitiveArray()) {
+          // One or two of the references need to be unpoisoned. Unpoison them
+          // both to make the identity check valid.
+          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
+          did_unpoison = true;
+        }
+
+        if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+          // Bail out if the destination is not a non primitive array.
+          // /* HeapReference<Class> */ temp3 = temp1->component_type_
+          __ Ldr(temp3, HeapOperand(temp1, component_offset));
+          __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel());
+          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
+          // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
+          __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
+          static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+          __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel());
+        }
+
+        if (!optimizations.GetSourceIsNonPrimitiveArray()) {
+          // Bail out if the source is not a non primitive array.
+          // /* HeapReference<Class> */ temp3 = temp2->component_type_
+          __ Ldr(temp3, HeapOperand(temp2, component_offset));
+          __ Cbz(temp3, intrinsic_slow_path->GetEntryLabel());
+          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
+          // /* uint16_t */ temp3 = static_cast<uint16_t>(temp3->primitive_type_);
+          __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
+          static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
+          __ Cbnz(temp3, intrinsic_slow_path->GetEntryLabel());
+        }
+
+        __ Cmp(temp1, temp2);
+
+        if (optimizations.GetDestinationIsTypedObjectArray()) {
+          vixl::aarch64::Label do_copy;
+          __ B(&do_copy, eq);
+          if (!did_unpoison) {
+            codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+          }
+          // /* HeapReference<Class> */ temp1 = temp1->component_type_
+          __ Ldr(temp1, HeapOperand(temp1, component_offset));
+          codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+          // /* HeapReference<Class> */ temp1 = temp1->super_class_
+          __ Ldr(temp1, HeapOperand(temp1, super_offset));
+          // No need to unpoison the result, we're comparing against null.
+          __ Cbnz(temp1, intrinsic_slow_path->GetEntryLabel());
+          __ Bind(&do_copy);
+        } else {
+          __ B(intrinsic_slow_path->GetEntryLabel(), ne);
+        }
       }
     } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
       DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
       // Bail out if the source is not a non primitive array.
-      // /* HeapReference<Class> */ temp1 = src->klass_
-      __ Ldr(temp1, HeapOperand(src.W(), class_offset));
-      codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
-      // /* HeapReference<Class> */ temp3 = temp1->component_type_
-      __ Ldr(temp3, HeapOperand(temp1, component_offset));
-      __ Cbz(temp3, slow_path->GetEntryLabel());
-      codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp3);
-      __ Ldrh(temp3, HeapOperand(temp3, primitive_offset));
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                        temp1_loc,
+                                                        src.W(),
+                                                        class_offset,
+                                                        temp2,
+                                                        /* needs_null_check */ false,
+                                                        /* use_load_acquire */ false);
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(invoke,
+                                                        temp2_loc,
+                                                        temp1,
+                                                        component_offset,
+                                                        temp3,
+                                                        /* needs_null_check */ false,
+                                                        /* use_load_acquire */ false);
+        __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp2` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+      } else {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        __ Ldr(temp1, HeapOperand(src.W(), class_offset));
+        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp1);
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        __ Ldr(temp2, HeapOperand(temp1, component_offset));
+        __ Cbz(temp2, intrinsic_slow_path->GetEntryLabel());
+        codegen_->GetAssembler()->MaybeUnpoisonHeapReference(temp2);
+      }
+      // /* uint16_t */ temp2 = static_cast<uint16_t>(temp2->primitive_type_);
+      __ Ldrh(temp2, HeapOperand(temp2, primitive_offset));
       static_assert(Primitive::kPrimNot == 0, "Expected 0 for kPrimNot");
-      __ Cbnz(temp3, slow_path->GetEntryLabel());
+      __ Cbnz(temp2, intrinsic_slow_path->GetEntryLabel());
     }
 
     Register src_curr_addr = temp1.X();
     Register dst_curr_addr = temp2.X();
-    Register src_stop_addr = temp3.X();
+    Register src_stop_addr;
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // Temporary register IP0, obtained from the VIXL scratch
+      // register pool as `temp3`, cannot be used in
+      // ReadBarrierSystemArrayCopySlowPathARM64 (because that
+      // register is clobbered by ReadBarrierMarkRegX entry points).
+      // So another temporary register, allocated by the register
+      // allocator, is used instead.
+      DCHECK_EQ(LocationFrom(temp3).reg(), IP0);
+      src_stop_addr = XRegisterFrom(locations->GetTemp(2));
+    } else {
+      src_stop_addr = temp3.X();
+    }
 
     GenSystemArrayCopyAddresses(masm,
                                 Primitive::kPrimNot,
@@ -2285,25 +2509,98 @@
                                 dst_curr_addr,
                                 src_stop_addr);
 
-    // Iterate over the arrays and do a raw copy of the objects. We don't need to
-    // poison/unpoison.
-    vixl::aarch64::Label loop, done;
     const int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
-    __ Bind(&loop);
-    __ Cmp(src_curr_addr, src_stop_addr);
-    __ B(&done, eq);
-    {
+
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // SystemArrayCopy implementation for Baker read barriers (see
+      // also CodeGeneratorARM64::GenerateReferenceLoadWithBakerReadBarrier):
+      //
+      //   if (src_ptr != end_ptr) {
+      //     uint32_t rb_state = LockWord(src->monitor_).ReadBarrierState();
+      //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+      //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+      //     if (is_gray) {
+      //       // Slow-path copy.
+      //       do {
+      //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+      //       } while (src_ptr != end_ptr)
+      //     } else {
+      //       // Fast-path copy.
+      //       do {
+      //         *dest_ptr++ = *src_ptr++;
+      //       } while (src_ptr != end_ptr)
+      //     }
+      //   }
+
+      vixl::aarch64::Label loop, done;
+
+      // Don't enter copy loop if `length == 0`.
+      __ Cmp(src_curr_addr, src_stop_addr);
+      __ B(&done, eq);
+
       Register tmp = temps.AcquireW();
+      // Make sure `tmp` is not IP0, as it is clobbered by
+      // ReadBarrierMarkRegX entry points in
+      // ReadBarrierSystemArrayCopySlowPathARM64.
+      DCHECK_NE(LocationFrom(tmp).reg(), IP0);
+
+      // /* int32_t */ monitor = src->monitor_
+      __ Ldr(tmp, HeapOperand(src.W(), monitor_offset));
+      // /* LockWord */ lock_word = LockWord(monitor)
+      static_assert(sizeof(LockWord) == sizeof(int32_t),
+                    "art::LockWord and int32_t have different sizes.");
+
+      // Introduce a dependency on the lock_word including rb_state,
+      // to prevent load-load reordering, and without using
+      // a memory barrier (which would be more expensive).
+      // `src` is unchanged by this operation, but its value now depends
+      // on `tmp`.
+      __ Add(src.X(), src.X(), Operand(tmp.X(), LSR, 32));
+
+      // Slow path used to copy array when `src` is gray.
+      SlowPathCodeARM64* read_barrier_slow_path =
+          new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathARM64(invoke, LocationFrom(tmp));
+      codegen_->AddSlowPath(read_barrier_slow_path);
+
+      // Given the numeric representation, it's enough to check the low bit of the rb_state.
+      static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+      static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+      static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+      __ Tbnz(tmp, LockWord::kReadBarrierStateShift, read_barrier_slow_path->GetEntryLabel());
+
+      // Fast-path copy.
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      __ Bind(&loop);
       __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
       __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
+      __ Cmp(src_curr_addr, src_stop_addr);
+      __ B(&loop, ne);
+
+      __ Bind(read_barrier_slow_path->GetExitLabel());
+      __ Bind(&done);
+    } else {
+      // Non read barrier code.
+
+      // Iterate over the arrays and do a raw copy of the objects. We don't need to
+      // poison/unpoison.
+      vixl::aarch64::Label loop, done;
+      __ Bind(&loop);
+      __ Cmp(src_curr_addr, src_stop_addr);
+      __ B(&done, eq);
+      {
+        Register tmp = temps.AcquireW();
+        __ Ldr(tmp, MemOperand(src_curr_addr, element_size, PostIndex));
+        __ Str(tmp, MemOperand(dst_curr_addr, element_size, PostIndex));
+      }
+      __ B(&loop);
+      __ Bind(&done);
     }
-    __ B(&loop);
-    __ Bind(&done);
   }
   // We only need one card marking on the destination array.
   codegen_->MarkGCCard(dest.W(), Register(), /* value_can_be_null */ false);
 
-  __ Bind(slow_path->GetExitLabel());
+  __ Bind(intrinsic_slow_path->GetExitLabel());
 }
 
 static void GenIsInfinite(LocationSummary* locations,
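
Both the ARM and ARM64 hunks above guard the fast-path copy with an artificial address dependency: the lock word is shifted right by 32 (which always produces zero for a 32-bit value) and added to `src`, so `src` keeps its value but now depends on the monitor load. The following is a minimal C++ illustration of that arithmetic, assuming a zero-extended 32-bit monitor value; the actual load-load ordering guarantee comes from the CPU's address dependency and cannot be expressed in portable C++.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t src = 0x12345678u;      // Pretend object address held in a register.
  uint32_t monitor = 0x90000001u;  // Pretend lock word loaded from src->monitor_.
  // LSR #32 on the zero-extended 32-bit value always yields zero...
  uint64_t dependency = static_cast<uint64_t>(monitor) >> 32;
  // ...so the add leaves `src` numerically unchanged while tying it to the monitor load.
  src += dependency;
  assert(src == 0x12345678u);
  return 0;
}
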
diff --git a/compiler/optimizing/intrinsics_mips.cc b/compiler/optimizing/intrinsics_mips.cc
index 55e1ab2..6e5eb66 100644
--- a/compiler/optimizing/intrinsics_mips.cc
+++ b/compiler/optimizing/intrinsics_mips.cc
@@ -2456,16 +2456,18 @@
   __ FloorWS(FTMP, in);
   __ Mfc1(out, FTMP);
 
-  __ LoadConst32(TMP, 1);
+  if (!IsR6()) {
+    __ LoadConst32(TMP, -1);
+  }
 
-  // TMP = (out = java.lang.Integer.MAX_VALUE) ? 1 : 0;
+  // TMP = (out == java.lang.Integer.MAX_VALUE) ? -1 : 0;
   __ LoadConst32(AT, std::numeric_limits<int32_t>::max());
   __ Bne(AT, out, &finite);
 
   __ Mtc1(ZERO, FTMP);
   if (IsR6()) {
     __ CmpLtS(FTMP, in, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   } else {
     __ ColtS(in, FTMP);
   }
@@ -2474,28 +2476,26 @@
 
   __ Bind(&finite);
 
-  // TMP = (0.5f <= (in - out)) ? 1 : 0;
+  // TMP = (0.5f <= (in - out)) ? -1 : 0;
   __ Cvtsw(FTMP, FTMP);  // Convert output of floor.w.s back to "float".
   __ LoadConst32(AT, bit_cast<int32_t, float>(0.5f));
   __ SubS(FTMP, in, FTMP);
   __ Mtc1(AT, half);
   if (IsR6()) {
     __ CmpLeS(FTMP, half, FTMP);
-    __ Mfc1(AT, FTMP);
+    __ Mfc1(TMP, FTMP);
   } else {
     __ ColeS(half, FTMP);
   }
 
   __ Bind(&add);
 
-  if (IsR6()) {
-    __ Selnez(TMP, TMP, AT);
-  } else {
+  if (!IsR6()) {
     __ Movf(TMP, ZERO);
   }
 
-  // Return out += TMP.
-  __ Addu(out, out, TMP);
+  // Return out -= TMP.
+  __ Subu(out, out, TMP);
 
   __ Bind(&done);
 }
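
The rewritten round() sequence above computes floor(in) and then adds one when the remaining fraction is at least 0.5, by building a -1/0 mask (the form the R6 compare instructions naturally produce) and subtracting it from the result. Below is a minimal C++ sketch of that rounding rule; the Integer.MAX_VALUE saturation branch handled by the assembly is deliberately omitted.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Sketch of the rounding rule: out = floor(in); out -= ((in - out) >= 0.5f ? -1 : 0).
int32_t RoundFloatSketch(float in) {
  int32_t out = static_cast<int32_t>(std::floor(in));  // floor.w.s
  float frac = in - static_cast<float>(out);           // in - floor(in)
  int32_t tmp = (0.5f <= frac) ? -1 : 0;                // all-ones mask, as CmpLeS produces
  return out - tmp;                                     // out -= TMP, i.e. out += (frac >= 0.5f)
}

int main() {
  // Prints "3 -2 2", matching Java's Math.round semantics for these inputs.
  std::printf("%d %d %d\n", RoundFloatSketch(2.5f), RoundFloatSketch(-2.5f), RoundFloatSketch(2.4f));
  return 0;
}
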
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 22f4181..cf4a040 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -70,6 +70,105 @@
 
 using IntrinsicSlowPathX86 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86>;
 
+// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
+#define __ down_cast<X86Assembler*>(codegen->GetAssembler())->  // NOLINT
+
+// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
+class ReadBarrierSystemArrayCopySlowPathX86 : public SlowPathCode {
+ public:
+  explicit ReadBarrierSystemArrayCopySlowPathX86(HInstruction* instruction)
+      : SlowPathCode(instruction) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(instruction_->IsInvokeStaticOrDirect())
+        << "Unexpected instruction in read barrier arraycopy slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
+
+    int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+    uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
+
+    Register src = locations->InAt(0).AsRegister<Register>();
+    Location src_pos = locations->InAt(1);
+    Register dest = locations->InAt(2).AsRegister<Register>();
+    Location dest_pos = locations->InAt(3);
+    Location length = locations->InAt(4);
+    Location temp1_loc = locations->GetTemp(0);
+    Register temp1 = temp1_loc.AsRegister<Register>();
+    Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+    Register temp3 = locations->GetTemp(2).AsRegister<Register>();
+
+    __ Bind(GetEntryLabel());
+    // In this code path, registers `temp1`, `temp2`, and `temp3`
+    // (resp.) are not used for the base source address, the base
+    // destination address, and the end source address (resp.), as in
+    // other SystemArrayCopy intrinsic code paths.  Instead they are
+    // (resp.) used for:
+    // - the loop index (`i`);
+    // - the source index (`src_index`) and the loaded (source)
+    //   reference (`value`); and
+    // - the destination index (`dest_index`).
+
+    // i = 0
+    __ xorl(temp1, temp1);
+    NearLabel loop;
+    __ Bind(&loop);
+    // value = src_array[i + src_pos]
+    if (src_pos.IsConstant()) {
+      int32_t constant = src_pos.GetConstant()->AsIntConstant()->GetValue();
+      int32_t adjusted_offset = offset + constant * element_size;
+      __ movl(temp2, Address(src, temp1, ScaleFactor::TIMES_4, adjusted_offset));
+    } else {
+      __ leal(temp2, Address(src_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
+      __ movl(temp2, Address(src, temp2, ScaleFactor::TIMES_4, offset));
+    }
+    __ MaybeUnpoisonHeapReference(temp2);
+    // TODO: Inline the mark bit check before calling the runtime?
+    // value = ReadBarrier::Mark(value)
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
+    // (See ReadBarrierMarkSlowPathX86::EmitNativeCode for more
+    // explanations.)
+    DCHECK_NE(temp2, ESP);
+    DCHECK(0 <= temp2 && temp2 < kNumberOfCpuRegisters) << temp2;
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86PointerSize>(temp2);
+    // This runtime call does not require a stack map.
+    x86_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    __ MaybePoisonHeapReference(temp2);
+    // dest_array[i + dest_pos] = value
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      int32_t adjusted_offset = offset + constant * element_size;
+      __ movl(Address(dest, temp1, ScaleFactor::TIMES_4, adjusted_offset), temp2);
+    } else {
+      __ leal(temp3, Address(dest_pos.AsRegister<Register>(), temp1, ScaleFactor::TIMES_1, 0));
+      __ movl(Address(dest, temp3, ScaleFactor::TIMES_4, offset), temp2);
+    }
+    // ++i
+    __ addl(temp1, Immediate(1));
+    // if (i != length) goto loop
+    x86_codegen->GenerateIntCompare(temp1_loc, length);
+    __ j(kNotEqual, &loop);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86"; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86);
+};
+
+#undef __
+
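
Unlike the ARM and ARM64 slow paths, which advance raw source and destination pointers, the x86 slow path above keeps a loop index in `temp1` and rescales the element addresses on every iteration, as its register comments describe. The following is a minimal C++ sketch of that indexing scheme; Mark() is an illustrative stand-in for the ReadBarrierMarkRegX entry point, and heap reference (un)poisoning is omitted.

#include <cstdint>

namespace sketch {

// Stand-in for the ReadBarrier::Mark runtime call made by the slow path.
inline uint32_t Mark(uint32_t ref) { return ref; }

// Index-based slow-path copy: value = Mark(src_array[i + src_pos]);
// dest_array[i + dest_pos] = value; for i in [0, length).
inline void SlowPathCopy(const uint32_t* src_array, int32_t src_pos,
                         uint32_t* dest_array, int32_t dest_pos,
                         int32_t length) {
  for (int32_t i = 0; i != length; ++i) {
    uint32_t value = src_array[i + src_pos];  // value = src_array[i + src_pos]
    value = Mark(value);                      // value = ReadBarrier::Mark(value)
    dest_array[i + dest_pos] = value;         // dest_array[i + dest_pos] = value
  }
}

}  // namespace sketch

Recomputing the scaled addresses each iteration keeps the register usage down to a loop index plus two scratch indices, matching the register assignments listed in the slow path's comments.
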
 #define __ assembler->
 
 static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke, bool is64bit) {
@@ -1835,10 +1934,9 @@
       Register output = output_loc.AsRegister<Register>();
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
-          Location temp = locations->GetTemp(0);
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
-              invoke, output_loc, base, src, temp, /* needs_null_check */ false);
+              invoke, output_loc, base, src, /* needs_null_check */ false);
         } else {
           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
           codegen->GenerateReadBarrierSlow(
@@ -1901,11 +1999,6 @@
     locations->SetOut(Location::RequiresRegister(),
                       can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
   }
-  if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in InstructionCodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void IntrinsicLocationsBuilderX86::VisitUnsafeGet(HInvoke* invoke) {
@@ -2678,9 +2771,9 @@
 }
 
 void IntrinsicLocationsBuilderX86::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  if (kEmitCompilerReadBarrier) {
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -2710,9 +2803,9 @@
 }
 
 void IntrinsicCodeGeneratorX86::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  DCHECK(!kEmitCompilerReadBarrier);
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   X86Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -2721,17 +2814,21 @@
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
   Register src = locations->InAt(0).AsRegister<Register>();
   Location src_pos = locations->InAt(1);
   Register dest = locations->InAt(2).AsRegister<Register>();
   Location dest_pos = locations->InAt(3);
-  Location length = locations->InAt(4);
-  Register temp1 = locations->GetTemp(0).AsRegister<Register>();
-  Register temp2 = locations->GetTemp(1).AsRegister<Register>();
+  Location length_arg = locations->InAt(4);
+  Location length = length_arg;
+  Location temp1_loc = locations->GetTemp(0);
+  Register temp1 = temp1_loc.AsRegister<Register>();
+  Location temp2_loc = locations->GetTemp(1);
+  Register temp2 = temp2_loc.AsRegister<Register>();
 
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86(invoke);
+  codegen_->AddSlowPath(intrinsic_slow_path);
 
   NearLabel conditions_on_positions_validated;
   SystemArrayCopyOptimizations optimizations(invoke);
@@ -2747,7 +2844,7 @@
         DCHECK_GE(src_pos_constant, dest_pos_constant);
       } else if (src_pos_constant < dest_pos_constant) {
         __ cmpl(src, dest);
-        __ j(kEqual, slow_path->GetEntryLabel());
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
       }
     } else {
       if (!optimizations.GetDestinationIsSource()) {
@@ -2755,7 +2852,7 @@
         __ j(kNotEqual, &conditions_on_positions_validated);
       }
       __ cmpl(dest_pos.AsRegister<Register>(), Immediate(src_pos_constant));
-      __ j(kGreater, slow_path->GetEntryLabel());
+      __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
     }
   } else {
     if (!optimizations.GetDestinationIsSource()) {
@@ -2765,10 +2862,10 @@
     if (dest_pos.IsConstant()) {
       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
       __ cmpl(src_pos.AsRegister<Register>(), Immediate(dest_pos_constant));
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     } else {
       __ cmpl(src_pos.AsRegister<Register>(), dest_pos.AsRegister<Register>());
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     }
   }
 
@@ -2777,16 +2874,17 @@
   if (!optimizations.GetSourceIsNotNull()) {
     // Bail out if the source is null.
     __ testl(src, src);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
     // Bail out if the destination is null.
     __ testl(dest, dest);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
-  Register temp3 = locations->GetTemp(2).AsRegister<Register>();
+  Location temp3_loc = locations->GetTemp(2);
+  Register temp3 = temp3_loc.AsRegister<Register>();
   if (length.IsStackSlot()) {
     __ movl(temp3, Address(ESP, length.GetStackIndex()));
     length = Location::RegisterLocation(temp3);
@@ -2798,7 +2896,7 @@
       !optimizations.GetCountIsSourceLength() &&
       !optimizations.GetCountIsDestinationLength()) {
     __ testl(length.AsRegister<Register>(), length.AsRegister<Register>());
-    __ j(kLess, slow_path->GetEntryLabel());
+    __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   }
 
   // Validity checks: source.
@@ -2806,7 +2904,7 @@
                 src_pos,
                 src,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsSourceLength());
 
@@ -2815,7 +2913,7 @@
                 dest_pos,
                 dest,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsDestinationLength());
 
@@ -2824,72 +2922,159 @@
     // type of the destination array. We do two checks: the classes are the same,
     // or the destination is Object[]. If none of these checks succeed, we go to the
     // slow path.
+
     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(src, class_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      // Bail out if the source is not a non primitive array.
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ movl(temp1, Address(temp1, component_offset));
-      __ testl(temp1, temp1);
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp1);
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
+        __ testl(temp1, temp1);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp1` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+      } else {
+        // /* HeapReference<Class> */ temp1 = src->klass_
+        __ movl(temp1, Address(src, class_offset));
+        __ MaybeUnpoisonHeapReference(temp1);
+        // Bail out if the source is not a non primitive array.
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ movl(temp1, Address(temp1, component_offset));
+        __ testl(temp1, temp1);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp1);
+      }
       __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
 
-    if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(dest, class_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      // Bail out if the destination is not a non primitive array.
-      // /* HeapReference<Class> */ temp2 = temp1->component_type_
-      __ movl(temp2, Address(temp1, component_offset));
-      __ testl(temp2, temp2);
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(temp2);
-      __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
-      // Re-poison the heap reference to make the compare instruction below
-      // compare two poisoned references.
-      __ PoisonHeapReference(temp1);
-    } else {
-      // /* HeapReference<Class> */ temp1 = temp1->klass_
-      __ movl(temp1, Address(dest, class_offset));
-    }
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      if (length.Equals(Location::RegisterLocation(temp3))) {
+        // When Baker read barriers are enabled, register `temp3`,
+        // which in the present case contains the `length` parameter,
+        // will be overwritten below.  Make the `length` location
+        // reference the original stack location; it will be moved
+        // back to `temp3` later if necessary.
+        DCHECK(length_arg.IsStackSlot());
+        length = length_arg;
+      }
 
-    // Note: if poisoning is on, we are here comparing two poisoned references.
-    __ cmpl(temp1, Address(src, class_offset));
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
 
-    if (optimizations.GetDestinationIsTypedObjectArray()) {
-      NearLabel do_copy;
-      __ j(kEqual, &do_copy);
-      __ MaybeUnpoisonHeapReference(temp1);
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ movl(temp1, Address(temp1, component_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      __ cmpl(Address(temp1, super_offset), Immediate(0));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
-      __ Bind(&do_copy);
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        // Bail out if the destination is not a non primitive array.
+        //
+        // Register `temp1` is not trashed by the read barrier emitted
+        // by GenerateFieldLoadWithBakerReadBarrier below, as that
+        // method produces a call to a ReadBarrierMarkRegX entry point,
+        // which saves all potentially live registers, including
+        // temporaries such as `temp1`.
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp2_loc, temp1, component_offset, /* needs_null_check */ false);
+        __ testl(temp2, temp2);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `temp2` has been unpoisoned
+        // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+        __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
+
+      // For the same reason given earlier, `temp1` is not trashed by the
+      // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
+      // Note: if heap poisoning is on, we are comparing two unpoisoned references here.
+      __ cmpl(temp1, temp2);
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        NearLabel do_copy;
+        __ j(kEqual, &do_copy);
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
+        // We do not need to emit a read barrier for the following
+        // heap reference load, as `temp1` is only used in a
+        // comparison with null below, and this reference is not
+        // kept afterwards.
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
     } else {
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      // Non read barrier code.
+
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      __ movl(temp1, Address(dest, class_offset));
+      if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
+        __ MaybeUnpoisonHeapReference(temp1);
+        // Bail out if the destination is not a non primitive array.
+        // /* HeapReference<Class> */ temp2 = temp1->component_type_
+        __ movl(temp2, Address(temp1, component_offset));
+        __ testl(temp2, temp2);
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(temp2);
+        __ cmpw(Address(temp2, primitive_offset), Immediate(Primitive::kPrimNot));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        // Re-poison the heap reference to make the compare instruction below
+        // compare two poisoned references.
+        __ PoisonHeapReference(temp1);
+      }
+
+      // Note: if heap poisoning is on, we are comparing two poisoned references here.
+      __ cmpl(temp1, Address(src, class_offset));
+
+      if (optimizations.GetDestinationIsTypedObjectArray()) {
+        NearLabel do_copy;
+        __ j(kEqual, &do_copy);
+        __ MaybeUnpoisonHeapReference(temp1);
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ movl(temp1, Address(temp1, component_offset));
+        __ MaybeUnpoisonHeapReference(temp1);
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+        __ Bind(&do_copy);
+      } else {
+        __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
+      }
     }
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    // /* HeapReference<Class> */ temp1 = src->klass_
-    __ movl(temp1, Address(src, class_offset));
-    __ MaybeUnpoisonHeapReference(temp1);
-    // /* HeapReference<Class> */ temp1 = temp1->component_type_
-    __ movl(temp1, Address(temp1, component_offset));
-    __ testl(temp1, temp1);
-    __ j(kEqual, slow_path->GetEntryLabel());
-    __ MaybeUnpoisonHeapReference(temp1);
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
+      // /* HeapReference<Class> */ temp1 = temp1->component_type_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
+      __ testl(temp1, temp1);
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+      // If heap poisoning is enabled, `temp1` has been unpoisoned
+      // by the previous call to GenerateFieldLoadWithBakerReadBarrier.
+    } else {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      __ movl(temp1, Address(src, class_offset));
+      __ MaybeUnpoisonHeapReference(temp1);
+      // /* HeapReference<Class> */ temp1 = temp1->component_type_
+      __ movl(temp1, Address(temp1, component_offset));
+      __ testl(temp1, temp1);
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+      __ MaybeUnpoisonHeapReference(temp1);
+    }
     __ cmpw(Address(temp1, primitive_offset), Immediate(Primitive::kPrimNot));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
-  // Compute base source address, base destination address, and end source address.
+  // Compute the base source address in `temp1`.
   int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
   DCHECK_EQ(element_size, 4);
   uint32_t offset = mirror::Array::DataOffset(element_size).Uint32Value();
@@ -2900,35 +3085,138 @@
     __ leal(temp1, Address(src, src_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
   }
 
-  if (dest_pos.IsConstant()) {
-    int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(temp2, Address(dest, element_size * constant + offset));
-  } else {
-    __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
-  }
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // If it is needed (in the case of the fast-path loop), the base
+    // destination address is computed later, as `temp2` is used for
+    // intermediate computations.
 
-  if (length.IsConstant()) {
-    int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
-    __ leal(temp3, Address(temp1, element_size * constant));
-  } else {
-    __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
-  }
+    // Compute the end source address in `temp3`.
+    if (length.IsConstant()) {
+      int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp3, Address(temp1, element_size * constant));
+    } else {
+      if (length.IsStackSlot()) {
+        // Location `length` is again pointing at a stack slot, as
+        // register `temp3` (which previously held the length parameter)
+        // has been overwritten; restore it now.
+        DCHECK(length.Equals(length_arg));
+        __ movl(temp3, Address(ESP, length.GetStackIndex()));
+        length = Location::RegisterLocation(temp3);
+      }
+      __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
+    }
 
-  // Iterate over the arrays and do a raw copy of the objects. We don't need to
-  // poison/unpoison.
-  NearLabel loop, done;
-  __ cmpl(temp1, temp3);
-  __ j(kEqual, &done);
-  __ Bind(&loop);
-  __ pushl(Address(temp1, 0));
-  __ cfi().AdjustCFAOffset(4);
-  __ popl(Address(temp2, 0));
-  __ cfi().AdjustCFAOffset(-4);
-  __ addl(temp1, Immediate(element_size));
-  __ addl(temp2, Immediate(element_size));
-  __ cmpl(temp1, temp3);
-  __ j(kNotEqual, &loop);
-  __ Bind(&done);
+    // SystemArrayCopy implementation for Baker read barriers (see
+    // also CodeGeneratorX86::GenerateReferenceLoadWithBakerReadBarrier):
+    //
+    //   if (src_ptr != end_ptr) {
+    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+    //     if (is_gray) {
+    //       // Slow-path copy.
+    //       for (size_t i = 0; i != length; ++i) {
+    //         dest_array[dest_pos + i] =
+    //             MaybePoison(ReadBarrier::Mark(MaybeUnpoison(src_array[src_pos + i])));
+    //       }
+    //     } else {
+    //       // Fast-path copy.
+    //       do {
+    //         *dest_ptr++ = *src_ptr++;
+    //       } while (src_ptr != end_ptr)
+    //     }
+    //   }
+
+    NearLabel loop, done;
+
+    // Don't enter copy loop if `length == 0`.
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
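+    // For illustration: assuming LockWord::kReadBarrierStateShift is 28 (the current
+    // lock word layout), gray_byte_position is 3, gray_bit_position is 4 and
+    // test_value is 0x10, so the testb below probes bit 28 of src->monitor_.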
+
+    // if (rb_state == ReadBarrier::gray_ptr_)
+    //   goto slow_path;
+    // At this point, just do the "if" and make sure that flags are preserved until the branch.
+    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
+
+    // Load fence to prevent load-load reordering.
+    // Note that this is a no-op, thanks to the x86 memory model.
+    codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
+
+    // Slow path used to copy array when `src` is gray.
+    SlowPathCode* read_barrier_slow_path =
+        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86(invoke);
+    codegen_->AddSlowPath(read_barrier_slow_path);
+
+    // We have done the "if" of the gray bit check above, now branch based on the flags.
+    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
+
+    // Fast-path copy.
+
+    // Set the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp2, Address(dest, element_size * constant + offset));
+    } else {
+      __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    __ Bind(&loop);
+    __ pushl(Address(temp1, 0));
+    __ cfi().AdjustCFAOffset(4);
+    __ popl(Address(temp2, 0));
+    __ cfi().AdjustCFAOffset(-4);
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+
+    __ Bind(read_barrier_slow_path->GetExitLabel());
+    __ Bind(&done);
+  } else {
+    // Non read barrier code.
+
+    // Compute the base destination address in `temp2`.
+    if (dest_pos.IsConstant()) {
+      int32_t constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp2, Address(dest, element_size * constant + offset));
+    } else {
+      __ leal(temp2, Address(dest, dest_pos.AsRegister<Register>(), ScaleFactor::TIMES_4, offset));
+    }
+
+    // Compute the end source address in `temp3`.
+    if (length.IsConstant()) {
+      int32_t constant = length.GetConstant()->AsIntConstant()->GetValue();
+      __ leal(temp3, Address(temp1, element_size * constant));
+    } else {
+      __ leal(temp3, Address(temp1, length.AsRegister<Register>(), ScaleFactor::TIMES_4, 0));
+    }
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    NearLabel loop, done;
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+    __ Bind(&loop);
+    __ pushl(Address(temp1, 0));
+    __ cfi().AdjustCFAOffset(4);
+    __ popl(Address(temp2, 0));
+    __ cfi().AdjustCFAOffset(-4);
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+    __ Bind(&done);
+  }
 
   // We only need one card marking on the destination array.
   codegen_->MarkGCCard(temp1,
@@ -2937,7 +3225,7 @@
                        Register(kNoRegister),
                        /* value_can_be_null */ false);
 
-  __ Bind(slow_path->GetExitLabel());
+  __ Bind(intrinsic_slow_path->GetExitLabel());
 }
 
 UNIMPLEMENTED_INTRINSIC(X86, MathRoundDouble)
diff --git a/compiler/optimizing/intrinsics_x86_64.cc b/compiler/optimizing/intrinsics_x86_64.cc
index ab8b05c..a4ee546 100644
--- a/compiler/optimizing/intrinsics_x86_64.cc
+++ b/compiler/optimizing/intrinsics_x86_64.cc
@@ -64,6 +64,65 @@
 
 using IntrinsicSlowPathX86_64 = IntrinsicSlowPath<InvokeDexCallingConventionVisitorX86_64>;
 
+// NOLINT on __ macro to suppress wrong warning/fix (misc-macro-parentheses) from clang-tidy.
+#define __ down_cast<X86_64Assembler*>(codegen->GetAssembler())->  // NOLINT
+
+// Slow path implementing the SystemArrayCopy intrinsic copy loop with read barriers.
+class ReadBarrierSystemArrayCopySlowPathX86_64 : public SlowPathCode {
+ public:
+  explicit ReadBarrierSystemArrayCopySlowPathX86_64(HInstruction* instruction)
+      : SlowPathCode(instruction) {
+    DCHECK(kEmitCompilerReadBarrier);
+    DCHECK(kUseBakerReadBarrier);
+  }
+
+  void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
+    CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
+    LocationSummary* locations = instruction_->GetLocations();
+    DCHECK(locations->CanCall());
+    DCHECK(instruction_->IsInvokeStaticOrDirect())
+        << "Unexpected instruction in read barrier arraycopy slow path: "
+        << instruction_->DebugName();
+    DCHECK(instruction_->GetLocations()->Intrinsified());
+    DCHECK_EQ(instruction_->AsInvoke()->GetIntrinsic(), Intrinsics::kSystemArrayCopy);
+
+    int32_t element_size = Primitive::ComponentSize(Primitive::kPrimNot);
+
+    CpuRegister src_curr_addr = locations->GetTemp(0).AsRegister<CpuRegister>();
+    CpuRegister dst_curr_addr = locations->GetTemp(1).AsRegister<CpuRegister>();
+    CpuRegister src_stop_addr = locations->GetTemp(2).AsRegister<CpuRegister>();
+
+    __ Bind(GetEntryLabel());
+    NearLabel loop;
+    __ Bind(&loop);
+    __ movl(CpuRegister(TMP), Address(src_curr_addr, 0));
+    __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+    // TODO: Inline the mark bit check before calling the runtime?
+    // TMP = ReadBarrier::Mark(TMP);
+    // No need to save live registers; it's taken care of by the
+    // entrypoint. Also, there is no need to update the stack mask,
+    // as this runtime call will not trigger a garbage collection.
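+    // The offset selects the ReadBarrierMarkReg entry point corresponding to
+    // register TMP, so the reference is marked in place and returned in TMP.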
+    int32_t entry_point_offset =
+        CodeGenerator::GetReadBarrierMarkEntryPointsOffset<kX86_64PointerSize>(TMP);
+    // This runtime call does not require a stack map.
+    x86_64_codegen->InvokeRuntimeWithoutRecordingPcInfo(entry_point_offset, instruction_, this);
+    __ MaybePoisonHeapReference(CpuRegister(TMP));
+    __ movl(Address(dst_curr_addr, 0), CpuRegister(TMP));
+    __ addl(src_curr_addr, Immediate(element_size));
+    __ addl(dst_curr_addr, Immediate(element_size));
+    __ cmpl(src_curr_addr, src_stop_addr);
+    __ j(kNotEqual, &loop);
+    __ jmp(GetExitLabel());
+  }
+
+  const char* GetDescription() const OVERRIDE { return "ReadBarrierSystemArrayCopySlowPathX86_64"; }
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(ReadBarrierSystemArrayCopySlowPathX86_64);
+};
+
+#undef __
+
 #define __ assembler->
 
 static void CreateFPToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
@@ -1053,9 +1112,9 @@
 
 
 void IntrinsicLocationsBuilderX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  if (kEmitCompilerReadBarrier) {
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  if (kEmitCompilerReadBarrier && !kUseBakerReadBarrier) {
     return;
   }
 
@@ -1063,9 +1122,9 @@
 }
 
 void IntrinsicCodeGeneratorX86_64::VisitSystemArrayCopy(HInvoke* invoke) {
-  // TODO(rpl): Implement read barriers in the SystemArrayCopy
-  // intrinsic and re-enable it (b/29516905).
-  DCHECK(!kEmitCompilerReadBarrier);
+  // The only read barrier implementation supporting the
+  // SystemArrayCopy intrinsic is the Baker-style read barriers.
+  DCHECK(!kEmitCompilerReadBarrier || kUseBakerReadBarrier);
 
   X86_64Assembler* assembler = GetAssembler();
   LocationSummary* locations = invoke->GetLocations();
@@ -1074,18 +1133,23 @@
   uint32_t super_offset = mirror::Class::SuperClassOffset().Int32Value();
   uint32_t component_offset = mirror::Class::ComponentTypeOffset().Int32Value();
   uint32_t primitive_offset = mirror::Class::PrimitiveTypeOffset().Int32Value();
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Int32Value();
 
   CpuRegister src = locations->InAt(0).AsRegister<CpuRegister>();
   Location src_pos = locations->InAt(1);
   CpuRegister dest = locations->InAt(2).AsRegister<CpuRegister>();
   Location dest_pos = locations->InAt(3);
   Location length = locations->InAt(4);
-  CpuRegister temp1 = locations->GetTemp(0).AsRegister<CpuRegister>();
-  CpuRegister temp2 = locations->GetTemp(1).AsRegister<CpuRegister>();
-  CpuRegister temp3 = locations->GetTemp(2).AsRegister<CpuRegister>();
+  Location temp1_loc = locations->GetTemp(0);
+  CpuRegister temp1 = temp1_loc.AsRegister<CpuRegister>();
+  Location temp2_loc = locations->GetTemp(1);
+  CpuRegister temp2 = temp2_loc.AsRegister<CpuRegister>();
+  Location temp3_loc = locations->GetTemp(2);
+  CpuRegister temp3 = temp3_loc.AsRegister<CpuRegister>();
+  Location TMP_loc = Location::RegisterLocation(TMP);
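+  // Note: TMP is the scratch register reserved by the x86-64 code generator;
+  // it is used below for transient heap reference loads.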
 
-  SlowPathCode* slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
-  codegen_->AddSlowPath(slow_path);
+  SlowPathCode* intrinsic_slow_path = new (GetAllocator()) IntrinsicSlowPathX86_64(invoke);
+  codegen_->AddSlowPath(intrinsic_slow_path);
 
   NearLabel conditions_on_positions_validated;
   SystemArrayCopyOptimizations optimizations(invoke);
@@ -1101,7 +1165,7 @@
         DCHECK_GE(src_pos_constant, dest_pos_constant);
       } else if (src_pos_constant < dest_pos_constant) {
         __ cmpl(src, dest);
-        __ j(kEqual, slow_path->GetEntryLabel());
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
       }
     } else {
       if (!optimizations.GetDestinationIsSource()) {
@@ -1109,7 +1173,7 @@
         __ j(kNotEqual, &conditions_on_positions_validated);
       }
       __ cmpl(dest_pos.AsRegister<CpuRegister>(), Immediate(src_pos_constant));
-      __ j(kGreater, slow_path->GetEntryLabel());
+      __ j(kGreater, intrinsic_slow_path->GetEntryLabel());
     }
   } else {
     if (!optimizations.GetDestinationIsSource()) {
@@ -1119,10 +1183,10 @@
     if (dest_pos.IsConstant()) {
       int32_t dest_pos_constant = dest_pos.GetConstant()->AsIntConstant()->GetValue();
       __ cmpl(src_pos.AsRegister<CpuRegister>(), Immediate(dest_pos_constant));
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     } else {
       __ cmpl(src_pos.AsRegister<CpuRegister>(), dest_pos.AsRegister<CpuRegister>());
-      __ j(kLess, slow_path->GetEntryLabel());
+      __ j(kLess, intrinsic_slow_path->GetEntryLabel());
     }
   }
 
@@ -1131,13 +1195,13 @@
   if (!optimizations.GetSourceIsNotNull()) {
     // Bail out if the source is null.
     __ testl(src, src);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
   if (!optimizations.GetDestinationIsNotNull() && !optimizations.GetDestinationIsSource()) {
     // Bail out if the destination is null.
     __ testl(dest, dest);
-    __ j(kEqual, slow_path->GetEntryLabel());
+    __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
   // If the length is negative, bail out.
@@ -1146,7 +1210,7 @@
       !optimizations.GetCountIsSourceLength() &&
       !optimizations.GetCountIsDestinationLength()) {
     __ testl(length.AsRegister<CpuRegister>(), length.AsRegister<CpuRegister>());
-    __ j(kLess, slow_path->GetEntryLabel());
+    __ j(kLess, intrinsic_slow_path->GetEntryLabel());
   }
 
   // Validity checks: source.
@@ -1154,7 +1218,7 @@
                 src_pos,
                 src,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsSourceLength());
 
@@ -1163,7 +1227,7 @@
                 dest_pos,
                 dest,
                 length,
-                slow_path,
+                intrinsic_slow_path,
                 temp1,
                 optimizations.GetCountIsDestinationLength());
 
@@ -1172,38 +1236,80 @@
     // type of the destination array. We do two checks: the classes are the same,
     // or the destination is Object[]. If none of these checks succeed, we go to the
     // slow path.
-    __ movl(temp1, Address(dest, class_offset));
-    __ movl(temp2, Address(src, class_offset));
+
     bool did_unpoison = false;
-    if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
-        !optimizations.GetSourceIsNonPrimitiveArray()) {
-      // One or two of the references need to be unpoisoned. Unpoison them
-      // both to make the identity check valid.
-      __ MaybeUnpoisonHeapReference(temp1);
-      __ MaybeUnpoisonHeapReference(temp2);
-      did_unpoison = true;
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, dest, class_offset, /* needs_null_check */ false);
+      // Register `temp1` is not trashed by the read barrier emitted
+      // by GenerateFieldLoadWithBakerReadBarrier below, as that
+      // method produces a call to a ReadBarrierMarkRegX entry point,
+      // which saves all potentially live registers, including
+      // temporaries such as `temp1`.
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp2_loc, src, class_offset, /* needs_null_check */ false);
+      // If heap poisoning is enabled, `temp1` and `temp2` have been
+      // unpoisoned by the previous calls to
+      // GenerateFieldLoadWithBakerReadBarrier.
+    } else {
+      // /* HeapReference<Class> */ temp1 = dest->klass_
+      __ movl(temp1, Address(dest, class_offset));
+      // /* HeapReference<Class> */ temp2 = src->klass_
+      __ movl(temp2, Address(src, class_offset));
+      if (!optimizations.GetDestinationIsNonPrimitiveArray() ||
+          !optimizations.GetSourceIsNonPrimitiveArray()) {
+        // One or two of the references need to be unpoisoned. Unpoison them
+        // both to make the identity check valid.
+        __ MaybeUnpoisonHeapReference(temp1);
+        __ MaybeUnpoisonHeapReference(temp2);
+        did_unpoison = true;
+      }
     }
 
     if (!optimizations.GetDestinationIsNonPrimitiveArray()) {
       // Bail out if the destination is not a non primitive array.
-      // /* HeapReference<Class> */ TMP = temp1->component_type_
-      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
-      __ testl(CpuRegister(TMP), CpuRegister(TMP));
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // /* HeapReference<Class> */ TMP = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
+        __ testl(CpuRegister(TMP), CpuRegister(TMP));
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `TMP` has been unpoisoned by
+        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
+      } else {
+        // /* HeapReference<Class> */ TMP = temp1->component_type_
+        __ movl(CpuRegister(TMP), Address(temp1, component_offset));
+        __ testl(CpuRegister(TMP), CpuRegister(TMP));
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+      }
       __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
 
     if (!optimizations.GetSourceIsNonPrimitiveArray()) {
       // Bail out if the source is not a non primitive array.
-      // /* HeapReference<Class> */ TMP = temp2->component_type_
-      __ movl(CpuRegister(TMP), Address(temp2, component_offset));
-      __ testl(CpuRegister(TMP), CpuRegister(TMP));
-      __ j(kEqual, slow_path->GetEntryLabel());
-      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // For the same reason given earlier, `temp1` is not trashed by the
+        // read barrier emitted by GenerateFieldLoadWithBakerReadBarrier below.
+        // /* HeapReference<Class> */ TMP = temp2->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, TMP_loc, temp2, component_offset, /* needs_null_check */ false);
+        __ testl(CpuRegister(TMP), CpuRegister(TMP));
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        // If heap poisoning is enabled, `TMP` has been unpoisoned by
+        // the previous call to GenerateFieldLoadWithBakerReadBarrier.
+      } else {
+        // /* HeapReference<Class> */ TMP = temp2->component_type_
+        __ movl(CpuRegister(TMP), Address(temp2, component_offset));
+        __ testl(CpuRegister(TMP), CpuRegister(TMP));
+        __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+        __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+      }
       __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
 
     __ cmpl(temp1, temp2);
@@ -1211,34 +1317,56 @@
     if (optimizations.GetDestinationIsTypedObjectArray()) {
       NearLabel do_copy;
       __ j(kEqual, &do_copy);
-      if (!did_unpoison) {
+      if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        codegen_->GenerateFieldLoadWithBakerReadBarrier(
+            invoke, temp1_loc, temp1, component_offset, /* needs_null_check */ false);
+        // We do not need to emit a read barrier for the following
+        // heap reference load, as `temp1` is only used in a
+        // comparison with null below, and this reference is not
+        // kept afterwards.
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
+      } else {
+        if (!did_unpoison) {
+          __ MaybeUnpoisonHeapReference(temp1);
+        }
+        // /* HeapReference<Class> */ temp1 = temp1->component_type_
+        __ movl(temp1, Address(temp1, component_offset));
         __ MaybeUnpoisonHeapReference(temp1);
+        // No need to unpoison the reference loaded below, as we
+        // are only comparing it against null.
+        __ cmpl(Address(temp1, super_offset), Immediate(0));
       }
-      // /* HeapReference<Class> */ temp1 = temp1->component_type_
-      __ movl(temp1, Address(temp1, component_offset));
-      __ MaybeUnpoisonHeapReference(temp1);
-      // /* HeapReference<Class> */ temp1 = temp1->super_class_
-      __ movl(temp1, Address(temp1, super_offset));
-      // No need to unpoison the result, we're comparing against null.
-      __ testl(temp1, temp1);
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
       __ Bind(&do_copy);
     } else {
-      __ j(kNotEqual, slow_path->GetEntryLabel());
+      __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
     }
   } else if (!optimizations.GetSourceIsNonPrimitiveArray()) {
     DCHECK(optimizations.GetDestinationIsNonPrimitiveArray());
     // Bail out if the source is not a non primitive array.
-    // /* HeapReference<Class> */ temp1 = src->klass_
-    __ movl(temp1, Address(src, class_offset));
-    __ MaybeUnpoisonHeapReference(temp1);
-    // /* HeapReference<Class> */ TMP = temp1->component_type_
-    __ movl(CpuRegister(TMP), Address(temp1, component_offset));
-    __ testl(CpuRegister(TMP), CpuRegister(TMP));
-    __ j(kEqual, slow_path->GetEntryLabel());
-    __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+    if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, temp1_loc, src, class_offset, /* needs_null_check */ false);
+      // /* HeapReference<Class> */ TMP = temp1->component_type_
+      codegen_->GenerateFieldLoadWithBakerReadBarrier(
+          invoke, TMP_loc, temp1, component_offset, /* needs_null_check */ false);
+      __ testl(CpuRegister(TMP), CpuRegister(TMP));
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+    } else {
+      // /* HeapReference<Class> */ temp1 = src->klass_
+      __ movl(temp1, Address(src, class_offset));
+      __ MaybeUnpoisonHeapReference(temp1);
+      // /* HeapReference<Class> */ TMP = temp1->component_type_
+      __ movl(CpuRegister(TMP), Address(temp1, component_offset));
+      // No need to unpoison `TMP` now, as we're comparing against null.
+      __ testl(CpuRegister(TMP), CpuRegister(TMP));
+      __ j(kEqual, intrinsic_slow_path->GetEntryLabel());
+      __ MaybeUnpoisonHeapReference(CpuRegister(TMP));
+    }
     __ cmpw(Address(CpuRegister(TMP), primitive_offset), Immediate(Primitive::kPrimNot));
-    __ j(kNotEqual, slow_path->GetEntryLabel());
+    __ j(kNotEqual, intrinsic_slow_path->GetEntryLabel());
   }
 
   // Compute base source address, base destination address, and end source address.
@@ -1266,19 +1394,88 @@
     __ leal(temp3, Address(temp1, length.AsRegister<CpuRegister>(), ScaleFactor::TIMES_4, 0));
   }
 
-  // Iterate over the arrays and do a raw copy of the objects. We don't need to
-  // poison/unpoison.
-  NearLabel loop, done;
-  __ cmpl(temp1, temp3);
-  __ j(kEqual, &done);
-  __ Bind(&loop);
-  __ movl(CpuRegister(TMP), Address(temp1, 0));
-  __ movl(Address(temp2, 0), CpuRegister(TMP));
-  __ addl(temp1, Immediate(element_size));
-  __ addl(temp2, Immediate(element_size));
-  __ cmpl(temp1, temp3);
-  __ j(kNotEqual, &loop);
-  __ Bind(&done);
+  if (kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
+    // SystemArrayCopy implementation for Baker read barriers (see
+    // also CodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier):
+    //
+    //   if (src_ptr != end_ptr) {
+    //     uint32_t rb_state = Lockword(src->monitor_).ReadBarrierState();
+    //     lfence;  // Load fence or artificial data dependency to prevent load-load reordering
+    //     bool is_gray = (rb_state == ReadBarrier::gray_ptr_);
+    //     if (is_gray) {
+    //       // Slow-path copy.
+    //       do {
+    //         *dest_ptr++ = MaybePoison(ReadBarrier::Mark(MaybeUnpoison(*src_ptr++)));
+    //       } while (src_ptr != end_ptr)
+    //     } else {
+    //       // Fast-path copy.
+    //       do {
+    //         *dest_ptr++ = *src_ptr++;
+    //       } while (src_ptr != end_ptr)
+    //     }
+    //   }
+
+    NearLabel loop, done;
+
+    // Don't enter copy loop if `length == 0`.
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+
+    // Given the numeric representation, it's enough to check the low bit of the rb_state.
+    static_assert(ReadBarrier::white_ptr_ == 0, "Expecting white to have value 0");
+    static_assert(ReadBarrier::gray_ptr_ == 1, "Expecting gray to have value 1");
+    static_assert(ReadBarrier::black_ptr_ == 2, "Expecting black to have value 2");
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    constexpr uint32_t gray_bit_position = LockWord::kReadBarrierStateShift % kBitsPerByte;
+    constexpr int32_t test_value = static_cast<int8_t>(1 << gray_bit_position);
+
+    // if (rb_state == ReadBarrier::gray_ptr_)
+    //   goto slow_path;
+    // At this point, just do the "if" and make sure that flags are preserved until the branch.
+    __ testb(Address(src, monitor_offset + gray_byte_position), Immediate(test_value));
+
+    // Load fence to prevent load-load reordering.
+    // Note that this is a no-op, thanks to the x86-64 memory model.
+    codegen_->GenerateMemoryBarrier(MemBarrierKind::kLoadAny);
+
+    // Slow path used to copy array when `src` is gray.
+    SlowPathCode* read_barrier_slow_path =
+        new (GetAllocator()) ReadBarrierSystemArrayCopySlowPathX86_64(invoke);
+    codegen_->AddSlowPath(read_barrier_slow_path);
+
+    // We have done the "if" of the gray bit check above, now branch based on the flags.
+    __ j(kNotZero, read_barrier_slow_path->GetEntryLabel());
+
+    // Fast-path copy.
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    __ Bind(&loop);
+    __ movl(CpuRegister(TMP), Address(temp1, 0));
+    __ movl(Address(temp2, 0), CpuRegister(TMP));
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+
+    __ Bind(read_barrier_slow_path->GetExitLabel());
+    __ Bind(&done);
+  } else {
+    // Non read barrier code.
+
+    // Iterate over the arrays and do a raw copy of the objects. We don't need to
+    // poison/unpoison.
+    NearLabel loop, done;
+    __ cmpl(temp1, temp3);
+    __ j(kEqual, &done);
+    __ Bind(&loop);
+    __ movl(CpuRegister(TMP), Address(temp1, 0));
+    __ movl(Address(temp2, 0), CpuRegister(TMP));
+    __ addl(temp1, Immediate(element_size));
+    __ addl(temp2, Immediate(element_size));
+    __ cmpl(temp1, temp3);
+    __ j(kNotEqual, &loop);
+    __ Bind(&done);
+  }
 
   // We only need one card marking on the destination array.
   codegen_->MarkGCCard(temp1,
@@ -1287,7 +1484,7 @@
                        CpuRegister(kNoRegister),
                        /* value_can_be_null */ false);
 
-  __ Bind(slow_path->GetExitLabel());
+  __ Bind(intrinsic_slow_path->GetExitLabel());
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitStringCompareTo(HInvoke* invoke) {
@@ -1892,10 +2089,9 @@
     case Primitive::kPrimNot: {
       if (kEmitCompilerReadBarrier) {
         if (kUseBakerReadBarrier) {
-          Location temp = locations->GetTemp(0);
           Address src(base, offset, ScaleFactor::TIMES_1, 0);
           codegen->GenerateReferenceLoadWithBakerReadBarrier(
-              invoke, output_loc, base, src, temp, /* needs_null_check */ false);
+              invoke, output_loc, base, src, /* needs_null_check */ false);
         } else {
           __ movl(output, Address(base, offset, ScaleFactor::TIMES_1, 0));
           codegen->GenerateReadBarrierSlow(
@@ -1918,9 +2114,7 @@
   }
 }
 
-static void CreateIntIntIntToIntLocations(ArenaAllocator* arena,
-                                          HInvoke* invoke,
-                                          Primitive::Type type) {
+static void CreateIntIntIntToIntLocations(ArenaAllocator* arena, HInvoke* invoke) {
   bool can_call = kEmitCompilerReadBarrier &&
       (invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObject ||
        invoke->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile);
@@ -1934,30 +2128,25 @@
   locations->SetInAt(2, Location::RequiresRegister());
   locations->SetOut(Location::RequiresRegister(),
                     can_call ? Location::kOutputOverlap : Location::kNoOutputOverlap);
-  if (type == Primitive::kPrimNot && kEmitCompilerReadBarrier && kUseBakerReadBarrier) {
-    // We need a temporary register for the read barrier marking slow
-    // path in InstructionCodeGeneratorX86_64::GenerateReferenceLoadWithBakerReadBarrier.
-    locations->AddTemp(Location::RequiresRegister());
-  }
 }
 
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGet(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimInt);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLong(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetLongVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimLong);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObject(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 void IntrinsicLocationsBuilderX86_64::VisitUnsafeGetObjectVolatile(HInvoke* invoke) {
-  CreateIntIntIntToIntLocations(arena_, invoke, Primitive::kPrimNot);
+  CreateIntIntIntToIntLocations(arena_, invoke);
 }
 
 
diff --git a/compiler/optimizing/optimizing_cfi_test.cc b/compiler/optimizing/optimizing_cfi_test.cc
index a6d234d..8c0231e 100644
--- a/compiler/optimizing/optimizing_cfi_test.cc
+++ b/compiler/optimizing/optimizing_cfi_test.cc
@@ -157,13 +157,26 @@
     TestImpl(isa, #isa, expected_asm, expected_cfi);          \
   }
 
+#ifdef ART_ENABLE_CODEGEN_arm
 TEST_ISA(kThumb2)
+#endif
+#ifdef ART_ENABLE_CODEGEN_arm64
 TEST_ISA(kArm64)
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86
 TEST_ISA(kX86)
+#endif
+#ifdef ART_ENABLE_CODEGEN_x86_64
 TEST_ISA(kX86_64)
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips
 TEST_ISA(kMips)
+#endif
+#ifdef ART_ENABLE_CODEGEN_mips64
 TEST_ISA(kMips64)
+#endif
 
+#ifdef ART_ENABLE_CODEGEN_arm
 TEST_F(OptimizingCFITest, kThumb2Adjust) {
   std::vector<uint8_t> expected_asm(
       expected_asm_kThumb2_adjust,
@@ -184,7 +197,9 @@
   Finish();
   Check(kThumb2, "kThumb2_adjust", expected_asm, expected_cfi);
 }
+#endif
 
+#ifdef ART_ENABLE_CODEGEN_mips
 TEST_F(OptimizingCFITest, kMipsAdjust) {
   // One NOP in delay slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum.
   static constexpr size_t kNumNops = 1u + (1u << 15);
@@ -212,7 +227,9 @@
   Finish();
   Check(kMips, "kMips_adjust", expected_asm, expected_cfi);
 }
+#endif
 
+#ifdef ART_ENABLE_CODEGEN_mips64
 TEST_F(OptimizingCFITest, kMips64Adjust) {
   // One NOP in forbidden slot, 1 << 15 NOPS have size 1 << 17 which exceeds 18-bit signed maximum.
   static constexpr size_t kNumNops = 1u + (1u << 15);
@@ -240,6 +257,7 @@
   Finish();
   Check(kMips64, "kMips64_adjust", expected_asm, expected_cfi);
 }
+#endif
 
 #endif  // ART_TARGET_ANDROID
 
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index 698b0b6..f7c325e 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -428,8 +428,14 @@
       || instruction_set == kX86_64;
 }
 
+// Strip pass name suffix to get optimization name.
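+// For example, assuming kPassNameSeparator is "$", a pass name such as
+// "dead_code_elimination$initial" maps to the "dead_code_elimination" optimization.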
+static std::string ConvertPassNameToOptimizationName(const std::string& pass_name) {
+  size_t pos = pass_name.find(kPassNameSeparator);
+  return pos == std::string::npos ? pass_name : pass_name.substr(0, pos);
+}
+
 static HOptimization* BuildOptimization(
-    const std::string& opt_name,
+    const std::string& pass_name,
     ArenaAllocator* arena,
     HGraph* graph,
     OptimizingCompilerStats* stats,
@@ -439,6 +445,7 @@
     StackHandleScopeCollection* handles,
     SideEffectsAnalysis* most_recent_side_effects,
     HInductionVarAnalysis* most_recent_induction) {
+  std::string opt_name = ConvertPassNameToOptimizationName(pass_name);
   if (opt_name == BoundsCheckElimination::kBoundsCheckEliminationPassName) {
     CHECK(most_recent_side_effects != nullptr && most_recent_induction != nullptr);
     return new (arena) BoundsCheckElimination(graph,
@@ -446,11 +453,11 @@
                                               most_recent_induction);
   } else if (opt_name == GVNOptimization::kGlobalValueNumberingPassName) {
     CHECK(most_recent_side_effects != nullptr);
-    return new (arena) GVNOptimization(graph, *most_recent_side_effects);
+    return new (arena) GVNOptimization(graph, *most_recent_side_effects, pass_name.c_str());
   } else if (opt_name == HConstantFolding::kConstantFoldingPassName) {
-    return new (arena) HConstantFolding(graph);
+    return new (arena) HConstantFolding(graph, pass_name.c_str());
   } else if (opt_name == HDeadCodeElimination::kDeadCodeEliminationPassName) {
-    return new (arena) HDeadCodeElimination(graph, stats);
+    return new (arena) HDeadCodeElimination(graph, stats, pass_name.c_str());
   } else if (opt_name == HInliner::kInlinerPassName) {
     size_t number_of_dex_registers = dex_compilation_unit.GetCodeItem()->registers_size_;
     return new (arena) HInliner(graph,                   // outer_graph
@@ -470,7 +477,7 @@
   } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
     return new (arena) HInductionVarAnalysis(graph);
   } else if (opt_name == InstructionSimplifier::kInstructionSimplifierPassName) {
-    return new (arena) InstructionSimplifier(graph, stats);
+    return new (arena) InstructionSimplifier(graph, stats, pass_name.c_str());
   } else if (opt_name == IntrinsicsRecognizer::kIntrinsicsRecognizerPassName) {
     return new (arena) IntrinsicsRecognizer(graph, driver, stats);
   } else if (opt_name == LICM::kLoopInvariantCodeMotionPassName) {
@@ -522,12 +529,9 @@
   SideEffectsAnalysis* most_recent_side_effects = nullptr;
   HInductionVarAnalysis* most_recent_induction = nullptr;
   ArenaVector<HOptimization*> ret(arena->Adapter());
-  for (std::string pass_name : pass_names) {
-    size_t pos = pass_name.find(kPassNameSeparator);    // Strip suffix to get base pass name.
-    std::string opt_name = pos == std::string::npos ? pass_name : pass_name.substr(0, pos);
-
+  for (const std::string& pass_name : pass_names) {
     HOptimization* opt = BuildOptimization(
-        opt_name,
+        pass_name,
         arena,
         graph,
         stats,
@@ -540,6 +544,7 @@
     CHECK(opt != nullptr) << "Couldn't build optimization: \"" << pass_name << "\"";
     ret.push_back(opt);
 
+    std::string opt_name = ConvertPassNameToOptimizationName(pass_name);
     if (opt_name == SideEffectsAnalysis::kSideEffectsAnalysisPassName) {
       most_recent_side_effects = down_cast<SideEffectsAnalysis*>(opt);
     } else if (opt_name == HInductionVarAnalysis::kInductionPassName) {
diff --git a/compiler/optimizing/sharpening.cc b/compiler/optimizing/sharpening.cc
index b73f738..6effc30 100644
--- a/compiler/optimizing/sharpening.cc
+++ b/compiler/optimizing/sharpening.cc
@@ -279,8 +279,7 @@
   const DexFile& dex_file = load_string->GetDexFile();
   uint32_t string_index = load_string->GetStringIndex();
 
-  bool is_in_dex_cache = false;
-  HLoadString::LoadKind desired_load_kind;
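+  // Default to the unsharpened load kind (through the dex cache of the current
+  // method); the cases below override it when a better kind is known to apply.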
+  HLoadString::LoadKind desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
   uint64_t address = 0u;  // String or dex cache element address.
   {
     Runtime* runtime = Runtime::Current();
@@ -296,33 +295,14 @@
       DCHECK(!runtime->UseJitCompilation());
       mirror::String* string = class_linker->ResolveString(dex_file, string_index, dex_cache);
       CHECK(string != nullptr);
-      if (!compiler_driver_->GetSupportBootImageFixup()) {
-        // MIPS/MIPS64 or compiler_driver_test. Do not sharpen.
-        desired_load_kind = HLoadString::LoadKind::kDexCacheViaMethod;
-      } else {
-        DCHECK(ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file));
-        is_in_dex_cache = true;
-        desired_load_kind = codegen_->GetCompilerOptions().GetCompilePic()
-            ? HLoadString::LoadKind::kBootImageLinkTimePcRelative
-            : HLoadString::LoadKind::kBootImageLinkTimeAddress;
-      }
+      // TODO: In follow up CL, add PcRelative and Address back in.
     } else if (runtime->UseJitCompilation()) {
       // TODO: Make sure we don't set the "compile PIC" flag for JIT as that's bogus.
       // DCHECK(!codegen_->GetCompilerOptions().GetCompilePic());
       mirror::String* string = dex_cache->GetResolvedString(string_index);
-      is_in_dex_cache = (string != nullptr);
       if (string != nullptr && runtime->GetHeap()->ObjectIsInBootImageSpace(string)) {
-        // TODO: Use direct pointers for all non-moving spaces, not just boot image. Bug: 29530787
         desired_load_kind = HLoadString::LoadKind::kBootImageAddress;
         address = reinterpret_cast64<uint64_t>(string);
-      } else {
-        // Note: If the string is not in the dex cache, the instruction needs environment
-        // and will not be inlined across dex files. Within a dex file, the slow-path helper
-        // loads the correct string and inlined frames are used correctly for OOM stack trace.
-        // TODO: Write a test for this. Bug: 29416588
-        desired_load_kind = HLoadString::LoadKind::kDexCacheAddress;
-        void* dex_cache_element_address = &dex_cache->GetStrings()[string_index];
-        address = reinterpret_cast64<uint64_t>(dex_cache_element_address);
       }
     } else {
       // AOT app compilation. Try to lookup the string without allocating if not found.
@@ -332,19 +312,9 @@
           !codegen_->GetCompilerOptions().GetCompilePic()) {
         desired_load_kind = HLoadString::LoadKind::kBootImageAddress;
         address = reinterpret_cast64<uint64_t>(string);
-      } else {
-        // Not JIT and either the string is not in boot image or we are compiling in PIC mode.
-        // Use PC-relative load from the dex cache if the dex file belongs
-        // to the oat file that we're currently compiling.
-        desired_load_kind = ContainsElement(compiler_driver_->GetDexFilesForOatFile(), &dex_file)
-            ? HLoadString::LoadKind::kDexCachePcRelative
-            : HLoadString::LoadKind::kDexCacheViaMethod;
       }
     }
   }
-  if (is_in_dex_cache) {
-    load_string->MarkInDexCache();
-  }
 
   HLoadString::LoadKind load_kind = codegen_->GetSupportedLoadStringKind(desired_load_kind);
   switch (load_kind) {
diff --git a/compiler/utils/arm/assembler_arm.h b/compiler/utils/arm/assembler_arm.h
index 8ba6fb4..17a6650 100644
--- a/compiler/utils/arm/assembler_arm.h
+++ b/compiler/utils/arm/assembler_arm.h
@@ -906,6 +906,12 @@
     // reg = -reg.
     rsb(reg, reg, ShifterOperand(0));
   }
+  // Poison a heap reference contained in `reg` if heap poisoning is enabled.
+  void MaybePoisonHeapReference(Register reg) {
+    if (kPoisonHeapReferences) {
+      PoisonHeapReference(reg);
+    }
+  }
   // Unpoison a heap reference contained in `reg` if heap poisoning is enabled.
   void MaybeUnpoisonHeapReference(Register reg) {
     if (kPoisonHeapReferences) {
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index 19450b3..f91bcfa 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -146,6 +146,12 @@
   ___ Neg(reg, Operand(reg));
 }
 
+void Arm64Assembler::MaybePoisonHeapReference(Register reg) {
+  if (kPoisonHeapReferences) {
+    PoisonHeapReference(reg);
+  }
+}
+
 void Arm64Assembler::MaybeUnpoisonHeapReference(Register reg) {
   if (kPoisonHeapReferences) {
     UnpoisonHeapReference(reg);
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 2847cb8..66a7fed 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -93,6 +93,8 @@
   void PoisonHeapReference(vixl::aarch64::Register reg);
   // Unpoison a heap reference contained in `reg`.
   void UnpoisonHeapReference(vixl::aarch64::Register reg);
+  // Poison a heap reference contained in `reg` if heap poisoning is enabled.
+  void MaybePoisonHeapReference(vixl::aarch64::Register reg);
   // Unpoison a heap reference contained in `reg` if heap poisoning is enabled.
   void MaybeUnpoisonHeapReference(vixl::aarch64::Register reg);
 
diff --git a/compiler/utils/jni_macro_assembler.cc b/compiler/utils/jni_macro_assembler.cc
index 797a98c..1b74313 100644
--- a/compiler/utils/jni_macro_assembler.cc
+++ b/compiler/utils/jni_macro_assembler.cc
@@ -99,6 +99,7 @@
       return MacroAsm64UniquePtr(new (arena) x86_64::X86_64JNIMacroAssembler(arena));
 #endif
     default:
+      UNUSED(arena);
       LOG(FATAL) << "Unknown/unsupported 8B InstructionSet: " << instruction_set;
       UNREACHABLE();
   }
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index 8b7da3f..bfc63d1 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -1407,44 +1407,6 @@
   }
 }
 
-void MipsAssembler::StoreConst32ToOffset(int32_t value,
-                                         Register base,
-                                         int32_t offset,
-                                         Register temp) {
-  CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false);
-  if (value == 0) {
-    temp = ZERO;
-  } else {
-    LoadConst32(temp, value);
-  }
-  Sw(temp, base, offset);
-}
-
-void MipsAssembler::StoreConst64ToOffset(int64_t value,
-                                         Register base,
-                                         int32_t offset,
-                                         Register temp) {
-  CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true);
-  uint32_t low = Low32Bits(value);
-  uint32_t high = High32Bits(value);
-  if (low == 0) {
-    Sw(ZERO, base, offset);
-  } else {
-    LoadConst32(temp, low);
-    Sw(temp, base, offset);
-  }
-  if (high == 0) {
-    Sw(ZERO, base, offset + kMipsWordSize);
-  } else {
-    if (high != low) {
-      LoadConst32(temp, high);
-    }
-    Sw(temp, base, offset + kMipsWordSize);
-  }
-}
-
 void MipsAssembler::LoadSConst32(FRegister r, int32_t value, Register temp) {
   if (value == 0) {
     temp = ZERO;
@@ -2533,61 +2495,19 @@
   CHECK_EQ(misalignment, offset & (kMipsDoublewordSize - 1));
 }
 
-void MipsAssembler::LoadFromOffset(LoadOperandType type, Register reg, Register base,
+void MipsAssembler::LoadFromOffset(LoadOperandType type,
+                                   Register reg,
+                                   Register base,
                                    int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
-  switch (type) {
-    case kLoadSignedByte:
-      Lb(reg, base, offset);
-      break;
-    case kLoadUnsignedByte:
-      Lbu(reg, base, offset);
-      break;
-    case kLoadSignedHalfword:
-      Lh(reg, base, offset);
-      break;
-    case kLoadUnsignedHalfword:
-      Lhu(reg, base, offset);
-      break;
-    case kLoadWord:
-      Lw(reg, base, offset);
-      break;
-    case kLoadDoubleword:
-      if (reg == base) {
-        // This will clobber the base when loading the lower register. Since we have to load the
-        // higher register as well, this will fail. Solution: reverse the order.
-        Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-        Lw(reg, base, offset);
-      } else {
-        Lw(reg, base, offset);
-        Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-      }
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-  }
+  LoadFromOffset<>(type, reg, base, offset);
 }
 
 void MipsAssembler::LoadSFromOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
-  Lwc1(reg, base, offset);
+  LoadSFromOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::LoadDFromOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
-  if (offset & 0x7) {
-    if (Is32BitFPU()) {
-      Lwc1(reg, base, offset);
-      Lwc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
-    } else {
-      // 64-bit FPU.
-      Lwc1(reg, base, offset);
-      Lw(T8, base, offset + kMipsWordSize);
-      Mthc1(T8, reg);
-    }
-  } else {
-    Ldc1(reg, base, offset);
-  }
+  LoadDFromOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::EmitLoad(ManagedRegister m_dst, Register src_register, int32_t src_offset,
@@ -2611,53 +2531,19 @@
   }
 }
 
-void MipsAssembler::StoreToOffset(StoreOperandType type, Register reg, Register base,
+void MipsAssembler::StoreToOffset(StoreOperandType type,
+                                  Register reg,
+                                  Register base,
                                   int32_t offset) {
-  // Must not use AT as `reg`, so as not to overwrite the value being stored
-  // with the adjusted `base`.
-  CHECK_NE(reg, AT);
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
-  switch (type) {
-    case kStoreByte:
-      Sb(reg, base, offset);
-      break;
-    case kStoreHalfword:
-      Sh(reg, base, offset);
-      break;
-    case kStoreWord:
-      Sw(reg, base, offset);
-      break;
-    case kStoreDoubleword:
-      CHECK_NE(reg, base);
-      CHECK_NE(static_cast<Register>(reg + 1), base);
-      Sw(reg, base, offset);
-      Sw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
-      break;
-    default:
-      LOG(FATAL) << "UNREACHABLE";
-  }
+  StoreToOffset<>(type, reg, base, offset);
 }
 
 void MipsAssembler::StoreSToOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
-  Swc1(reg, base, offset);
+  StoreSToOffset<>(reg, base, offset);
 }
 
 void MipsAssembler::StoreDToOffset(FRegister reg, Register base, int32_t offset) {
-  AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
-  if (offset & 0x7) {
-    if (Is32BitFPU()) {
-      Swc1(reg, base, offset);
-      Swc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
-    } else {
-      // 64-bit FPU.
-      Mfhc1(T8, reg);
-      Swc1(reg, base, offset);
-      Sw(T8, base, offset + kMipsWordSize);
-    }
-  } else {
-    Sdc1(reg, base, offset);
-  }
+  StoreDToOffset<>(reg, base, offset);
 }
 
 static dwarf::Reg DWARFReg(Register reg) {
diff --git a/compiler/utils/mips/assembler_mips.h b/compiler/utils/mips/assembler_mips.h
index 41b6c6b..434ca67 100644
--- a/compiler/utils/mips/assembler_mips.h
+++ b/compiler/utils/mips/assembler_mips.h
@@ -412,8 +412,6 @@
   void LoadConst64(Register reg_hi, Register reg_lo, int64_t value);
   void LoadDConst64(FRegister rd, int64_t value, Register temp);
   void LoadSConst32(FRegister r, int32_t value, Register temp);
-  void StoreConst32ToOffset(int32_t value, Register base, int32_t offset, Register temp);
-  void StoreConst64ToOffset(int64_t value, Register base, int32_t offset, Register temp);
   void Addiu32(Register rt, Register rs, int32_t value, Register rtmp = AT);
 
   // These will generate R2 branches or R6 branches as appropriate.
@@ -444,6 +442,204 @@
                            int32_t& offset,
                            bool is_doubleword,
                            bool is_float = false);
+
+ private:
+  struct NoImplicitNullChecker {
+    void operator()() {}
+  };
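+  // The `ImplicitNullChecker` functor lets the code generator record an implicit
+  // null check right after the first memory instruction that may fault. For
+  // instance (illustrative sketch only), a caller could pass a lambda:
+  //   auto null_checker = [&]() { codegen->MaybeRecordImplicitNullCheck(instruction); };
+  //   assembler->StoreToOffset(kStoreWord, reg, base, offset, null_checker);
+  // With the default NoImplicitNullChecker, the call is a no-op.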
+
+ public:
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreConst32ToOffset(int32_t value,
+                            Register base,
+                            int32_t offset,
+                            Register temp,
+                            ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false);
+    if (value == 0) {
+      temp = ZERO;
+    } else {
+      LoadConst32(temp, value);
+    }
+    Sw(temp, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreConst64ToOffset(int64_t value,
+                            Register base,
+                            int32_t offset,
+                            Register temp,
+                            ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    CHECK_NE(temp, AT);  // Must not use AT as temp, so as not to overwrite the adjusted base.
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true);
+    uint32_t low = Low32Bits(value);
+    uint32_t high = High32Bits(value);
+    if (low == 0) {
+      Sw(ZERO, base, offset);
+    } else {
+      LoadConst32(temp, low);
+      Sw(temp, base, offset);
+    }
+    null_checker();
+    if (high == 0) {
+      Sw(ZERO, base, offset + kMipsWordSize);
+    } else {
+      if (high != low) {
+        LoadConst32(temp, high);
+      }
+      Sw(temp, base, offset + kMipsWordSize);
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadFromOffset(LoadOperandType type,
+                      Register reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kLoadDoubleword));
+    switch (type) {
+      case kLoadSignedByte:
+        Lb(reg, base, offset);
+        break;
+      case kLoadUnsignedByte:
+        Lbu(reg, base, offset);
+        break;
+      case kLoadSignedHalfword:
+        Lh(reg, base, offset);
+        break;
+      case kLoadUnsignedHalfword:
+        Lhu(reg, base, offset);
+        break;
+      case kLoadWord:
+        Lw(reg, base, offset);
+        break;
+      case kLoadDoubleword:
+        if (reg == base) {
+          // This will clobber the base when loading the lower register. Since we have to load the
+          // higher register as well, this will fail. Solution: reverse the order.
+          Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+          null_checker();
+          Lw(reg, base, offset);
+        } else {
+          Lw(reg, base, offset);
+          null_checker();
+          Lw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+        }
+        break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
+    }
+    if (type != kLoadDoubleword) {
+      null_checker();
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadSFromOffset(FRegister reg,
+                       Register base,
+                       int32_t offset,
+                       ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
+    Lwc1(reg, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void LoadDFromOffset(FRegister reg,
+                       Register base,
+                       int32_t offset,
+                       ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
+    if (IsAligned<kMipsDoublewordSize>(offset)) {
+      Ldc1(reg, base, offset);
+      null_checker();
+    } else {
+      if (Is32BitFPU()) {
+        Lwc1(reg, base, offset);
+        null_checker();
+        Lwc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
+      } else {
+        // 64-bit FPU.
+        Lwc1(reg, base, offset);
+        null_checker();
+        Lw(T8, base, offset + kMipsWordSize);
+        Mthc1(T8, reg);
+      }
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreToOffset(StoreOperandType type,
+                     Register reg,
+                     Register base,
+                     int32_t offset,
+                     ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    // Must not use AT as `reg`, so as not to overwrite the value being stored
+    // with the adjusted `base`.
+    CHECK_NE(reg, AT);
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ (type == kStoreDoubleword));
+    switch (type) {
+      case kStoreByte:
+        Sb(reg, base, offset);
+        break;
+      case kStoreHalfword:
+        Sh(reg, base, offset);
+        break;
+      case kStoreWord:
+        Sw(reg, base, offset);
+        break;
+      case kStoreDoubleword:
+        CHECK_NE(reg, base);
+        CHECK_NE(static_cast<Register>(reg + 1), base);
+        Sw(reg, base, offset);
+        null_checker();
+        Sw(static_cast<Register>(reg + 1), base, offset + kMipsWordSize);
+        break;
+      default:
+        LOG(FATAL) << "UNREACHABLE";
+    }
+    if (type != kStoreDoubleword) {
+      null_checker();
+    }
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreSToOffset(FRegister reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ false, /* is_float */ true);
+    Swc1(reg, base, offset);
+    null_checker();
+  }
+
+  template <typename ImplicitNullChecker = NoImplicitNullChecker>
+  void StoreDToOffset(FRegister reg,
+                      Register base,
+                      int32_t offset,
+                      ImplicitNullChecker null_checker = NoImplicitNullChecker()) {
+    AdjustBaseAndOffset(base, offset, /* is_doubleword */ true, /* is_float */ true);
+    if (IsAligned<kMipsDoublewordSize>(offset)) {
+      Sdc1(reg, base, offset);
+      null_checker();
+    } else {
+      if (Is32BitFPU()) {
+        Swc1(reg, base, offset);
+        null_checker();
+        Swc1(static_cast<FRegister>(reg + 1), base, offset + kMipsWordSize);
+      } else {
+        // 64-bit FPU.
+        Mfhc1(T8, reg);
+        Swc1(reg, base, offset);
+        null_checker();
+        Sw(T8, base, offset + kMipsWordSize);
+      }
+    }
+  }
+
   void LoadFromOffset(LoadOperandType type, Register reg, Register base, int32_t offset);
   void LoadSFromOffset(FRegister reg, Register base, int32_t offset);
   void LoadDFromOffset(FRegister reg, Register base, int32_t offset);
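The templated MIPS load/store helpers above accept an ImplicitNullChecker callable that runs right after the first instruction that can fault on a null base register, defaulting to a no-op so existing call sites stay unchanged. A minimal standalone sketch of that default-callable pattern follows; the emit function, register numbers, and output are invented for illustration and are not ART code.

#include <cstdint>
#include <iostream>

// No-op default, mirroring NoImplicitNullChecker above.
struct NoChecker {
  void operator()() const {}
};

// A pretend "emit store" routine: the callable is invoked immediately after the
// first memory access that could fault, which is where a code generator would
// record the PC for an implicit null check.
template <typename Checker = NoChecker>
void EmitStoreWord(int base_reg, int32_t offset, Checker null_checker = NoChecker()) {
  std::cout << "sw reg, " << offset << "(r" << base_reg << ")\n";
  null_checker();
}

int main() {
  EmitStoreWord(4, 16);                                          // default: nothing recorded
  EmitStoreWord(4, 16, [] { std::cout << "record implicit null check pc\n"; });
}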
diff --git a/compiler/utils/x86/assembler_x86.cc b/compiler/utils/x86/assembler_x86.cc
index f1a9915..f2ef41f 100644
--- a/compiler/utils/x86/assembler_x86.cc
+++ b/compiler/utils/x86/assembler_x86.cc
@@ -1148,6 +1148,23 @@
 }
 
 
+void X86Assembler::testb(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF6);
+  EmitOperand(EAX, dst);
+  CHECK(imm.is_int8());
+  EmitUint8(imm.value() & 0xFF);
+}
+
+
+void X86Assembler::testl(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0xF7);
+  EmitOperand(0, dst);
+  EmitImmediate(imm);
+}
+
+
 void X86Assembler::andl(Register dst, Register src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitUint8(0x23);
diff --git a/compiler/utils/x86/assembler_x86.h b/compiler/utils/x86/assembler_x86.h
index 92a92a5..2ddcd76 100644
--- a/compiler/utils/x86/assembler_x86.h
+++ b/compiler/utils/x86/assembler_x86.h
@@ -496,6 +496,9 @@
   void testl(Register reg, const Immediate& imm);
   void testl(Register reg1, const Address& address);
 
+  void testb(const Address& dst, const Immediate& imm);
+  void testl(const Address& dst, const Immediate& imm);
+
   void andl(Register dst, const Immediate& imm);
   void andl(Register dst, Register src);
   void andl(Register dst, const Address& address);
@@ -639,6 +642,12 @@
   void PoisonHeapReference(Register reg) { negl(reg); }
   // Unpoison a heap reference contained in `reg`.
   void UnpoisonHeapReference(Register reg) { negl(reg); }
+  // Poison a heap reference contained in `reg` if heap poisoning is enabled.
+  void MaybePoisonHeapReference(Register reg) {
+    if (kPoisonHeapReferences) {
+      PoisonHeapReference(reg);
+    }
+  }
   // Unpoison a heap reference contained in `reg` if heap poisoning is enabled.
   void MaybeUnpoisonHeapReference(Register reg) {
     if (kPoisonHeapReferences) {
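The new MaybePoisonHeapReference mirrors the existing MaybeUnpoisonHeapReference: poisoning is a two's-complement negation of the 32-bit reference (what negl does to the register), guarded by the compile-time kPoisonHeapReferences flag, so applying it twice is the identity. A standalone sketch of that scheme (plain C++, not ART code; the local flag is an assumption):

#include <cassert>
#include <cstdint>

// Compile-time switch standing in for ART's kPoisonHeapReferences (assumption).
constexpr bool kPoisonHeapReferences = true;

// Poisoning negates the 32-bit reference; unsigned wraparound keeps it well defined.
uint32_t MaybePoison(uint32_t ref) {
  return kPoisonHeapReferences ? (0u - ref) : ref;
}
uint32_t MaybeUnpoison(uint32_t ref) {
  return kPoisonHeapReferences ? (0u - ref) : ref;
}

int main() {
  const uint32_t ref = 0x12345678u;
  assert(MaybeUnpoison(MaybePoison(ref)) == ref);  // negation is its own inverse
  return 0;
}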
diff --git a/compiler/utils/x86/assembler_x86_test.cc b/compiler/utils/x86/assembler_x86_test.cc
index 307e034..61d70d7 100644
--- a/compiler/utils/x86/assembler_x86_test.cc
+++ b/compiler/utils/x86/assembler_x86_test.cc
@@ -375,6 +375,42 @@
   DriverStr(expected, "cmovl_address");
 }
 
+TEST_F(AssemblerX86Test, TestbAddressImmediate) {
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12),
+      x86::Immediate(1));
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::ESP), FrameOffset(7)),
+      x86::Immediate(-128));
+  GetAssembler()->testb(
+      x86::Address(x86::Register(x86::EBX), MemberOffset(130)),
+      x86::Immediate(127));
+  const char* expected =
+      "testb $1, 0xc(%EDI,%EBX,4)\n"
+      "testb $-128, 0x7(%ESP)\n"
+      "testb $127, 0x82(%EBX)\n";
+
+  DriverStr(expected, "TestbAddressImmediate");
+}
+
+TEST_F(AssemblerX86Test, TestlAddressImmediate) {
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::EDI), x86::Register(x86::EBX), x86::TIMES_4, 12),
+      x86::Immediate(1));
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::ESP), FrameOffset(7)),
+      x86::Immediate(-100000));
+  GetAssembler()->testl(
+      x86::Address(x86::Register(x86::EBX), MemberOffset(130)),
+      x86::Immediate(77777777));
+  const char* expected =
+      "testl $1, 0xc(%EDI,%EBX,4)\n"
+      "testl $-100000, 0x7(%ESP)\n"
+      "testl $77777777, 0x82(%EBX)\n";
+
+  DriverStr(expected, "TestlAddressImmediate");
+}
+
 /////////////////
 // Near labels //
 /////////////////
diff --git a/compiler/utils/x86_64/assembler_x86_64.cc b/compiler/utils/x86_64/assembler_x86_64.cc
index ddc8244..1f73aa7 100644
--- a/compiler/utils/x86_64/assembler_x86_64.cc
+++ b/compiler/utils/x86_64/assembler_x86_64.cc
@@ -1389,6 +1389,25 @@
 }
 
 
+void X86_64Assembler::testb(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xF6);
+  EmitOperand(Register::RAX, dst);
+  CHECK(imm.is_int8());
+  EmitUint8(imm.value() & 0xFF);
+}
+
+
+void X86_64Assembler::testl(const Address& dst, const Immediate& imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitOptionalRex32(dst);
+  EmitUint8(0xF7);
+  EmitOperand(0, dst);
+  EmitImmediate(imm);
+}
+
+
 void X86_64Assembler::andl(CpuRegister dst, CpuRegister src) {
   AssemblerBuffer::EnsureCapacity ensured(&buffer_);
   EmitOptionalRex32(dst, src);
diff --git a/compiler/utils/x86_64/assembler_x86_64.h b/compiler/utils/x86_64/assembler_x86_64.h
index 370f49c..3a4bfca 100644
--- a/compiler/utils/x86_64/assembler_x86_64.h
+++ b/compiler/utils/x86_64/assembler_x86_64.h
@@ -528,6 +528,9 @@
   void testq(CpuRegister reg1, CpuRegister reg2);
   void testq(CpuRegister reg, const Address& address);
 
+  void testb(const Address& address, const Immediate& imm);
+  void testl(const Address& address, const Immediate& imm);
+
   void andl(CpuRegister dst, const Immediate& imm);
   void andl(CpuRegister dst, CpuRegister src);
   void andl(CpuRegister reg, const Address& address);
@@ -741,6 +744,12 @@
   void PoisonHeapReference(CpuRegister reg) { negl(reg); }
   // Unpoison a heap reference contained in `reg`.
   void UnpoisonHeapReference(CpuRegister reg) { negl(reg); }
+  // Poison a heap reference contained in `reg` if heap poisoning is enabled.
+  void MaybePoisonHeapReference(CpuRegister reg) {
+    if (kPoisonHeapReferences) {
+      PoisonHeapReference(reg);
+    }
+  }
   // Unpoison a heap reference contained in `reg` if heap poisoning is enabled.
   void MaybeUnpoisonHeapReference(CpuRegister reg) {
     if (kPoisonHeapReferences) {
diff --git a/compiler/utils/x86_64/assembler_x86_64_test.cc b/compiler/utils/x86_64/assembler_x86_64_test.cc
index 36c966b..48a1876 100644
--- a/compiler/utils/x86_64/assembler_x86_64_test.cc
+++ b/compiler/utils/x86_64/assembler_x86_64_test.cc
@@ -1526,6 +1526,48 @@
   DriverStr(expected, "cmpb");
 }
 
+TEST_F(AssemblerX86_64Test, TestbAddressImmediate) {
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                      x86_64::CpuRegister(x86_64::RBX),
+                      x86_64::TIMES_4,
+                      12),
+      x86_64::Immediate(1));
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
+      x86_64::Immediate(-128));
+  GetAssembler()->testb(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
+      x86_64::Immediate(127));
+  const char* expected =
+      "testb $1, 0xc(%RDI,%RBX,4)\n"
+      "testb $-128, 0x7(%RSP)\n"
+      "testb $127, 0x82(%RBX)\n";
+
+  DriverStr(expected, "TestbAddressImmediate");
+}
+
+TEST_F(AssemblerX86_64Test, TestlAddressImmediate) {
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RDI),
+                      x86_64::CpuRegister(x86_64::RBX),
+                      x86_64::TIMES_4,
+                      12),
+      x86_64::Immediate(1));
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RSP), FrameOffset(7)),
+      x86_64::Immediate(-100000));
+  GetAssembler()->testl(
+      x86_64::Address(x86_64::CpuRegister(x86_64::RBX), MemberOffset(130)),
+      x86_64::Immediate(77777777));
+  const char* expected =
+      "testl $1, 0xc(%RDI,%RBX,4)\n"
+      "testl $-100000, 0x7(%RSP)\n"
+      "testl $77777777, 0x82(%RBX)\n";
+
+  DriverStr(expected, "TestlAddressImmediate");
+}
+
 class JNIMacroAssemblerX86_64Test : public JNIMacroAssemblerTest<x86_64::X86_64JNIMacroAssembler> {
  public:
   using Base = JNIMacroAssemblerTest<x86_64::X86_64JNIMacroAssembler>;
diff --git a/dalvikvm/Android.mk b/dalvikvm/Android.mk
index 71e9a28..6c0bcb1 100644
--- a/dalvikvm/Android.mk
+++ b/dalvikvm/Android.mk
@@ -18,7 +18,7 @@
 
 include art/build/Android.common.mk
 
-dalvikvm_cflags := -Wall -Werror -Wextra -std=gnu++11
+dalvikvm_cflags := -Wall -Werror -Wextra
 
 include $(CLEAR_VARS)
 LOCAL_MODULE := dalvikvm
diff --git a/disassembler/disassembler.cc b/disassembler/disassembler.cc
index e604c1f..bcd0d16 100644
--- a/disassembler/disassembler.cc
+++ b/disassembler/disassembler.cc
@@ -32,10 +32,8 @@
     return new arm::DisassemblerArm(options);
   } else if (instruction_set == kArm64) {
     return new arm64::DisassemblerArm64(options);
-  } else if (instruction_set == kMips) {
-    return new mips::DisassemblerMips(options, false);
-  } else if (instruction_set == kMips64) {
-    return new mips::DisassemblerMips(options, true);
+  } else if (instruction_set == kMips || instruction_set == kMips64) {
+    return new mips::DisassemblerMips(options);
   } else if (instruction_set == kX86) {
     return new x86::DisassemblerX86(options, false);
   } else if (instruction_set == kX86_64) {
diff --git a/disassembler/disassembler.h b/disassembler/disassembler.h
index b080315..86793cc 100644
--- a/disassembler/disassembler.h
+++ b/disassembler/disassembler.h
@@ -28,8 +28,9 @@
 
 class DisassemblerOptions {
  public:
-  // Should the disassembler print absolute or relative addresses.
-  const bool absolute_addresses_;
+  using ThreadOffsetNameFunction = void (*)(std::ostream& os, uint32_t offset);
+
+  ThreadOffsetNameFunction thread_offset_name_function_;
 
   // Base address for calculating relative code offsets when absolute_addresses_ is false.
   const uint8_t* const base_address_;
@@ -37,6 +38,9 @@
   // End address (exclusive);
   const uint8_t* const end_address_;
 
+  // Should the disassembler print absolute or relative addresses.
+  const bool absolute_addresses_;
+
   // If set, the disassembler is allowed to look at load targets in literal
   // pools.
   const bool can_read_literals_;
@@ -44,10 +48,12 @@
   DisassemblerOptions(bool absolute_addresses,
                       const uint8_t* base_address,
                       const uint8_t* end_address,
-                      bool can_read_literals)
-      : absolute_addresses_(absolute_addresses),
+                      bool can_read_literals,
+                      ThreadOffsetNameFunction fn)
+      : thread_offset_name_function_(fn),
         base_address_(base_address),
         end_address_(end_address),
+        absolute_addresses_(absolute_addresses),
         can_read_literals_(can_read_literals) {}
 
  private:
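The DisassemblerOptions change removes the disassembler's direct dependency on Thread::DumpThreadOffset by injecting a plain function pointer that the embedder (oatdump, further down in this patch) supplies. A standalone analog of that injection pattern, with every name here invented for illustration:

#include <cstdint>
#include <iostream>

using ThreadOffsetNameFunction = void (*)(std::ostream& os, uint32_t offset);

struct MiniDisassemblerOptions {
  ThreadOffsetNameFunction thread_offset_name_function_;
};

void PrintThreadOffset(std::ostream& os, uint32_t offset) {
  os << "<thread offset 0x" << std::hex << offset << std::dec << ">";
}

int main() {
  MiniDisassemblerOptions options{&PrintThreadOffset};
  // Where the disassembler used to call Thread::DumpThreadOffset directly,
  // it now goes through the injected callback:
  options.thread_offset_name_function_(std::cout, 0x9cu);
  std::cout << "\n";
}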
diff --git a/disassembler/disassembler_arm.cc b/disassembler/disassembler_arm.cc
index 4f0e144..a47b6ad 100644
--- a/disassembler/disassembler_arm.cc
+++ b/disassembler/disassembler_arm.cc
@@ -25,7 +25,6 @@
 #include "base/bit_utils.h"
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace arm {
@@ -329,7 +328,7 @@
           }
           if (rn.r == 9) {
             args << "  ; ";
-            Thread::DumpThreadOffset<kArmPointerSize>(args, offset);
+            GetDisassemblerOptions()->thread_offset_name_function_(args, offset);
           }
         }
       }
@@ -1401,7 +1400,7 @@
             args << Rt << ", [" << Rn << ", #" << (U != 0u ? "" : "-") << imm12 << "]";
             if (Rn.r == TR && is_load) {
               args << "  ; ";
-              Thread::DumpThreadOffset<kArmPointerSize>(args, imm12);
+              GetDisassemblerOptions()->thread_offset_name_function_(args, imm12);
             } else if (Rn.r == PC) {
               T2LitType lit_type[] = {
                   kT2LitUByte, kT2LitUHalf, kT2LitHexWord, kT2LitInvalid,
diff --git a/disassembler/disassembler_arm64.cc b/disassembler/disassembler_arm64.cc
index 0ef9025..80bacb2 100644
--- a/disassembler/disassembler_arm64.cc
+++ b/disassembler/disassembler_arm64.cc
@@ -22,7 +22,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 using namespace vixl::aarch64;  // NOLINT(build/namespaces)
 
@@ -102,7 +101,7 @@
   if (instr->GetRn() == TR) {
     int64_t offset = instr->GetImmLSUnsigned() << instr->GetSizeLS();
     std::ostringstream tmp_stream;
-    Thread::DumpThreadOffset<kArm64PointerSize>(tmp_stream, static_cast<uint32_t>(offset));
+    options_->thread_offset_name_function_(tmp_stream, static_cast<uint32_t>(offset));
     AppendToOutput(" ; %s", tmp_stream.str().c_str());
   }
 }
diff --git a/disassembler/disassembler_arm64.h b/disassembler/disassembler_arm64.h
index 7c64792..19e4dfb 100644
--- a/disassembler/disassembler_arm64.h
+++ b/disassembler/disassembler_arm64.h
@@ -35,7 +35,8 @@
       : vixl::aarch64::Disassembler(),
         read_literals_(options->can_read_literals_),
         base_address_(options->base_address_),
-        end_address_(options->end_address_) {
+        end_address_(options->end_address_),
+        options_(options) {
     if (!options->absolute_addresses_) {
       MapCodeAddress(0,
                      reinterpret_cast<const vixl::aarch64::Instruction*>(options->base_address_));
@@ -64,6 +65,8 @@
   // Valid address range: [base_address_, end_address_)
   const void* const base_address_;
   const void* const end_address_;
+
+  DisassemblerOptions* options_;
 };
 
 class DisassemblerArm64 FINAL : public Disassembler {
diff --git a/disassembler/disassembler_mips.cc b/disassembler/disassembler_mips.cc
index 3448878..02c6d71 100644
--- a/disassembler/disassembler_mips.cc
+++ b/disassembler/disassembler_mips.cc
@@ -21,7 +21,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace mips {
@@ -503,11 +502,7 @@
               args << StringPrintf("%+d(r%d)", offset, rs);
               if (rs == 17) {
                 args << "  ; ";
-                if (is64bit_) {
-                  Thread::DumpThreadOffset<kMips64PointerSize>(args, offset);
-                } else {
-                  Thread::DumpThreadOffset<kMipsPointerSize>(args, offset);
-                }
+                GetDisassemblerOptions()->thread_offset_name_function_(args, offset);
               }
             }
             break;
diff --git a/disassembler/disassembler_mips.h b/disassembler/disassembler_mips.h
index b0e49b3..6342f22 100644
--- a/disassembler/disassembler_mips.h
+++ b/disassembler/disassembler_mips.h
@@ -26,9 +26,8 @@
 
 class DisassemblerMips FINAL : public Disassembler {
  public:
-  DisassemblerMips(DisassemblerOptions* options, bool is64bit)
+  explicit DisassemblerMips(DisassemblerOptions* options)
       : Disassembler(options),
-        is64bit_(is64bit),
         last_ptr_(nullptr),
         last_instr_(0) {}
 
@@ -36,8 +35,6 @@
   void Dump(std::ostream& os, const uint8_t* begin, const uint8_t* end) OVERRIDE;
 
  private:
-  const bool is64bit_;
-
   // Address and encoding of the last disassembled instruction.
   // Needed to produce more readable disassembly of certain 2-instruction sequences.
   const uint8_t* last_ptr_;
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 147e0b1..2ca84e5 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -23,7 +23,6 @@
 
 #include "base/logging.h"
 #include "base/stringprintf.h"
-#include "thread.h"
 
 namespace art {
 namespace x86 {
@@ -1409,11 +1408,11 @@
   }
   if (prefix[1] == kFs && !supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset<kX86PointerSize>(args, address_bits);
+    GetDisassemblerOptions()->thread_offset_name_function_(args, address_bits);
   }
   if (prefix[1] == kGs && supports_rex_) {
     args << "  ; ";
-    Thread::DumpThreadOffset<kX86_64PointerSize>(args, address_bits);
+    GetDisassemblerOptions()->thread_offset_name_function_(args, address_bits);
   }
   const char* prefix_str;
   switch (prefix[0]) {
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 77730b9..96c8e94 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -335,10 +335,14 @@
       resolved_addr2instr_(0),
       instruction_set_(oat_file_.GetOatHeader().GetInstructionSet()),
       disassembler_(Disassembler::Create(instruction_set_,
-                                         new DisassemblerOptions(options_.absolute_addresses_,
-                                                                 oat_file.Begin(),
-                                                                 oat_file.End(),
-                                                                 true /* can_read_literals_ */))) {
+                                         new DisassemblerOptions(
+                                             options_.absolute_addresses_,
+                                             oat_file.Begin(),
+                                             oat_file.End(),
+                                             true /* can_read_literals_ */,
+                                             Is64BitInstructionSet(instruction_set_)
+                                                 ? &Thread::DumpThreadOffset<PointerSize::k64>
+                                                 : &Thread::DumpThreadOffset<PointerSize::k32>))) {
     CHECK(options_.class_loader_ != nullptr);
     CHECK(options_.class_filter_ != nullptr);
     CHECK(options_.method_filter_ != nullptr);
@@ -1402,7 +1406,7 @@
   const std::vector<const OatFile::OatDexFile*> oat_dex_files_;
   const OatDumperOptions& options_;
   uint32_t resolved_addr2instr_;
-  InstructionSet instruction_set_;
+  const InstructionSet instruction_set_;
   std::set<uintptr_t> offsets_;
   Disassembler* disassembler_;
 };
diff --git a/patchoat/patchoat.cc b/patchoat/patchoat.cc
index 9432384..3f6531b 100644
--- a/patchoat/patchoat.cc
+++ b/patchoat/patchoat.cc
@@ -37,6 +37,7 @@
 #include "gc/space/image_space.h"
 #include "image-inl.h"
 #include "mirror/abstract_method.h"
+#include "mirror/dex_cache.h"
 #include "mirror/object-inl.h"
 #include "mirror/method.h"
 #include "mirror/reference.h"
@@ -592,8 +593,8 @@
     // 64-bit values here, clearing the top 32 bits for 32-bit targets. The zero-extension is
     // done by casting to the unsigned type uintptr_t before casting to int64_t, i.e.
     //     static_cast<int64_t>(reinterpret_cast<uintptr_t>(image_begin_ + offset))).
-    GcRoot<mirror::String>* orig_strings = orig_dex_cache->GetStrings();
-    GcRoot<mirror::String>* relocated_strings = RelocatedAddressOfPointer(orig_strings);
+    mirror::StringDexCacheType* orig_strings = orig_dex_cache->GetStrings();
+    mirror::StringDexCacheType* relocated_strings = RelocatedAddressOfPointer(orig_strings);
     copy_dex_cache->SetField64<false>(
         mirror::DexCache::StringsOffset(),
         static_cast<int64_t>(reinterpret_cast<uintptr_t>(relocated_strings)));
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index c4ec726..e25e93f 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -191,7 +191,7 @@
     .cfi_rel_offset r11, 44
     .cfi_rel_offset ip, 48
     .cfi_rel_offset lr, 52
-    vpush {d0-d15}                      @ 32 words of float args.
+    vpush {d0-d15}                      @ 32 words, 2 for each of the 16 saved doubles.
     .cfi_adjust_cfa_offset 128
     sub sp, #8                          @ 2 words of space, alignment padding and Method*
     .cfi_adjust_cfa_offset 8
@@ -1030,11 +1030,49 @@
 END art_quick_set64_instance
 
     /*
-     * Entry from managed code to resolve a string, this stub will allocate a String and deliver an
-     * exception on error. On success the String is returned. R0 holds the string index. The fast
-     * path check for hit in strings cache has already been performed.
+     * Entry from managed code to resolve a string. This stub checks the dex cache
+     * for a matching string (the fast path) and, if none is found, allocates a
+     * String, delivering an exception on error.
+     * On success the String is returned. R0 holds the string index.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+ENTRY art_quick_resolve_string
+    ldr    r1, [sp]                                              @ load referrer
+    ldr    r1, [r1, #ART_METHOD_DECLARING_CLASS_OFFSET]          @ load declaring class
+    ldr    r1, [r1, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]   @ load string dex cache
+    ubfx   r2, r0, #0, #STRING_DEX_CACHE_HASH_BITS
+    add    r1, r1, r2, LSL #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT
+    ldrd   r2, r3, [r1]                                    @ load index into r3 and pointer into r2
+    cmp    r0, r3
+    bne    .Lart_quick_resolve_string_slow_path
+#ifdef USE_READ_BARRIER
+    ldr    r3, [rSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   r3, .Lart_quick_resolve_string_marking
+#endif
+    mov    r0, r2
+    bx     lr
+// Slow path case, the index did not match
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME r2                    @ save callee saves in case of GC
+    mov    r1, r9                                    @ pass Thread::Current
+    mov    r3, sp
+    bl     artResolveStringFromCode                  @ (uint32_t string_idx, Thread* self)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    ldr    r3, [r2, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tst    r3, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    mov    r0, r2
+    bne    .Lart_quick_resolve_string_no_rb
+    push   {r1, r2, r3, lr}                          @ Save r1-r3 and LR (16 bytes keep sp 8-byte aligned).
+    .cfi_adjust_cfa_offset 16
+    bl     artReadBarrierMark                        @ Get the marked string back.
+    pop    {r1, r2, r3, lr}                          @ Restore registers.
+    .cfi_adjust_cfa_offset -16
+.Lart_quick_resolve_string_no_rb:
+    bx     lr
+END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
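All four architectures' new stubs implement the same fast path: mask the string index to select a slot in a power-of-two dex cache, load the packed 64-bit pair (reference in the low 32 bits, index in the high 32 bits), and compare the stored index against the requested one, falling back to artResolveStringFromCode on a mismatch. A C++ model of that check follows; the cache size and all names are assumptions for illustration, the real size comes from STRING_DEX_CACHE_SIZE_MINUS_ONE.

#include <cstdint>
#include <cstdio>

// Standalone model (not ART code) of the stub's fast path. Assumptions: a
// 1024-entry cache, and a uint32_t standing in for the 32-bit heap reference.
constexpr uint32_t kStringCacheSize = 1024;  // must be a power of two

struct StringDexCachePair {
  uint32_t string_ref;  // low 32 bits of the packed pair: the cached reference
  uint32_t string_idx;  // high 32 bits: the dex string index it resolves
};

StringDexCachePair gStringCache[kStringCacheSize];

// Returns the cached reference on a hit, or 0 to signal "take the slow path"
// (the stubs branch to artResolveStringFromCode in that case).
uint32_t ResolveStringFastPath(uint32_t string_idx) {
  const StringDexCachePair& pair = gStringCache[string_idx & (kStringCacheSize - 1)];
  return (pair.string_idx == string_idx) ? pair.string_ref : 0u;
}

int main() {
  gStringCache[5u] = {0xCAFEu, 5u};
  std::printf("hit=%#x miss=%#x\n", ResolveStringFastPath(5u), ResolveStringFastPath(6u));
}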
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index 4289cab..202846a 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -331,6 +331,7 @@
 #endif
 
     // Save FP registers.
+    // For better performance, store d0 and d31 separately, so that all STPs are 16-byte aligned.
     str d0,       [sp, #8]
     stp d1, d2,   [sp, #16]
     stp d3, d4,   [sp, #32]
@@ -431,6 +432,7 @@
 
 .macro RESTORE_SAVE_EVERYTHING_FRAME
     // Restore FP registers.
+    // For better performance, load d0 and d31 separately, so that all LDPs are 16-byte aligned.
     ldr d0,       [sp, #8]
     ldp d1, d2,   [sp, #16]
     ldp d3, d4,   [sp, #32]
@@ -1784,11 +1786,48 @@
 END art_quick_set64_static
 
     /*
-     * Entry from managed code to resolve a string, this stub will allocate a String and deliver an
-     * exception on error. On success the String is returned. w0 holds the string index. The fast
-     * path check for hit in strings cache has already been performed.
+     * Entry from managed code to resolve a string. This stub checks the dex cache
+     * for a matching string (the fast path) and, if none is found, allocates a
+     * String, delivering an exception on error.
+     * On success the String is returned. W0 holds the string index.
      */
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+ENTRY art_quick_resolve_string
+    ldr   x1, [sp]                                               // load referrer
+    ldr   w2, [x1, #ART_METHOD_DECLARING_CLASS_OFFSET]           // load declaring class
+    ldr   x1, [x2, #DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET]    // load string dex cache
+    and   x2, x0, #STRING_DEX_CACHE_SIZE_MINUS_ONE               // get masked string index into x2
+    ldr   x2, [x1, x2, lsl #STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT]  // load dex cache pair into x2
+    cmp   x0, x2, lsr #32                                         // compare against upper 32 bits
+    bne   .Lart_quick_resolve_string_slow_path
+    ubfx  x0, x2, #0, #32                                        // extract lower 32 bits into x0
+#ifdef USE_READ_BARRIER
+    // Most common case: GC is not marking.
+    ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
+    cbnz   x3, .Lart_quick_resolve_string_marking
+#endif
+    ret
+
+// Slow path case, the index did not match.
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME                      // save callee saves in case of GC
+    mov   x1, xSELF                                 // pass Thread::Current
+    bl    artResolveStringFromCode                  // (int32_t string_idx, Thread* self)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    ldr   x3, [x0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbnz  x3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_resolve_string_no_rb
+    // Save LR so that we can return; also save x1 to keep sp 16-byte aligned.
+    stp    x1, xLR, [sp, #-16]!                     // Save x1, LR.
+    bl     artReadBarrierMark                       // Get the marked string back.
+    ldp    x1, xLR, [sp], #16                       // Restore registers.
+.Lart_quick_resolve_string_no_rb:
+    ret
+
+END art_quick_resolve_string
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS
diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc
index 80bb51d..10adb3a 100644
--- a/runtime/arch/stub_test.cc
+++ b/runtime/arch/stub_test.cc
@@ -1203,6 +1203,7 @@
 
 
 TEST_F(StubTest, StringCompareTo) {
+  TEST_DISABLED_FOR_STRING_COMPRESSION();
   // There is no StringCompareTo runtime entrypoint for __arm__ or __aarch64__.
 #if defined(__i386__) || defined(__mips__) || \
     (defined(__x86_64__) && !defined(__APPLE__))
diff --git a/runtime/arch/x86/fault_handler_x86.cc b/runtime/arch/x86/fault_handler_x86.cc
index 3efeb40..c7af249 100644
--- a/runtime/arch/x86/fault_handler_x86.cc
+++ b/runtime/arch/x86/fault_handler_x86.cc
@@ -191,6 +191,27 @@
         immediate_size = operand_size_prefix ? 2 : 4;
         break;
 
+      case 0xf6:
+      case 0xf7:
+        modrm = *pc++;
+        has_modrm = true;
+        switch ((modrm >> 3) & 7) {  // Extract "reg/opcode" from "modr/m".
+          case 0:  // test
+            immediate_size = (opcode == 0xf6) ? 1 : (operand_size_prefix ? 2 : 4);
+            break;
+          case 2:  // not
+          case 3:  // neg
+          case 4:  // mul
+          case 5:  // imul
+          case 6:  // div
+          case 7:  // idiv
+            break;
+          default:
+            unhandled_instruction = true;
+            break;
+        }
+        break;
+
       default:
         unhandled_instruction = true;
         break;
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 2e9682e..282f10d 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1108,7 +1108,54 @@
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeRegionTLAB
 END_FUNCTION art_quick_alloc_object_region_tlab
 
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+DEFINE_FUNCTION art_quick_resolve_string
+    SETUP_SAVE_REFS_ONLY_FRAME  ebx, ebx
+    movl FRAME_SIZE_SAVE_REFS_ONLY(%esp), %ecx                   // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%ecx), %ecx           // get declaring class
+    movl DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %ecx    // get string dex cache
+    movl LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %edx
+    andl %eax, %edx
+    shl LITERAL(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), %edx
+    addl %ecx, %edx
+    movlps (%edx), %xmm0                                     // load string idx and pointer to xmm0
+    movd %xmm0, %ecx                                         // extract pointer
+    pshufd LITERAL(0x55), %xmm0, %xmm0                       // shuffle index into lowest bits
+    movd %xmm0, %edx                                         // extract index
+    cmp %edx, %eax
+    jne .Lart_quick_resolve_string_slow_path
+    movl %ecx, %eax
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
+    jne .Lart_quick_resolve_string_marking
+#endif
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    ret
+.Lart_quick_resolve_string_slow_path:
+    // Outgoing argument set up
+    subl LITERAL(8), %esp                                        // push padding
+    CFI_ADJUST_CFA_OFFSET(8)
+    pushl %fs:THREAD_SELF_OFFSET                                 // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    PUSH eax                                                     // pass arg1
+    call SYMBOL(artResolveStringFromCode)
+    addl LITERAL(16), %esp                                       // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+.Lart_quick_resolve_string_marking:
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%eax)
+    jnz .Lart_quick_resolve_string_no_rb
+    subl LITERAL(12), %esp                                   // alignment padding
+    CFI_ADJUST_CFA_OFFSET(12)
+    PUSH eax                                                 // Pass the string as the first param.
+    call SYMBOL(artReadBarrierMark)
+    addl LITERAL(16), %esp
+    CFI_ADJUST_CFA_OFFSET(-16)
+.Lart_quick_resolve_string_no_rb:
+    RESTORE_SAVE_REFS_ONLY_FRAME
+    ret
+END_FUNCTION art_quick_resolve_string
+
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 32768b0..62808ab 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -1330,7 +1330,52 @@
     ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedRegionTLAB
 END_FUNCTION art_quick_alloc_object_initialized_region_tlab
 
-ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+DEFINE_FUNCTION art_quick_resolve_string
+    movq 8(%rsp), %rcx                                         // get referrer
+    movl ART_METHOD_DECLARING_CLASS_OFFSET(%rcx), %ecx         // get declaring class
+    movq DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET(%ecx), %rcx  // get string dex cache
+    movq LITERAL(STRING_DEX_CACHE_SIZE_MINUS_ONE), %rdx
+    andq %rdi, %rdx
+    shlq LITERAL(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), %rdx
+    addq %rcx, %rdx
+    movq %rax, %rcx
+    movq (%rdx), %rdx
+    movq %rdx, %rax
+    movl %eax, %eax
+    shrq LITERAL(32), %rdx
+    cmp %rdx, %rdi
+    jne .Lart_quick_resolve_string_slow_path
+#ifdef USE_READ_BARRIER
+    cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
+    jne .Lart_quick_resolve_string_marking
+#endif
+    ret
+// Slow path, the index did not match
+.Lart_quick_resolve_string_slow_path:
+    SETUP_SAVE_REFS_ONLY_FRAME
+    movq %rcx, %rax
+    // Outgoing argument set up
+    movq %gs:THREAD_SELF_OFFSET, %rsi           // pass Thread::Current()
+    call SYMBOL(artResolveStringFromCode)       // artResolveStringFromCode(string_idx, Thread*)
+    RESTORE_SAVE_REFS_ONLY_FRAME                // restore frame up to return address
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+// GC is marking case, need to check the mark bit.
+.Lart_quick_resolve_string_marking:
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%rax)
+    jnz .Lart_quick_resolve_string_no_rb
+    // Save the argument registers rdi and rsi; the extra 8 bytes keep the stack 16-byte aligned.
+    PUSH rdi
+    PUSH rsi
+    subq LITERAL(8), %rsp                         // 16 byte alignment
+    movq %rax, %rdi
+    call SYMBOL(artReadBarrierMark)
+    addq LITERAL(8), %rsp
+    POP  rsi
+    POP  rdi
+.Lart_quick_resolve_string_no_rb:
+    ret
+END_FUNCTION art_quick_resolve_string
+
 ONE_ARG_DOWNCALL art_quick_initialize_static_storage, artInitializeStaticStorageFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type, artInitializeTypeFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 ONE_ARG_DOWNCALL art_quick_initialize_type_and_verify_access, artInitializeTypeAndVerifyAccessFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index d812590..f9bc249 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -477,7 +477,7 @@
 
   DCHECK(method_header->Contains(pc))
       << PrettyMethod(this)
-      << std::hex << pc << " " << oat_entry_point
+      << " " << std::hex << pc << " " << oat_entry_point
       << " " << (uintptr_t)(method_header->code_ + method_header->code_size_);
   return method_header;
 }
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index 848f8e5..102b993 100644
--- a/runtime/asm_support.h
+++ b/runtime/asm_support.h
@@ -19,12 +19,15 @@
 
 #if defined(__cplusplus)
 #include "art_method.h"
+#include "base/bit_utils.h"
 #include "gc/allocator/rosalloc.h"
 #include "gc/heap.h"
 #include "jit/jit.h"
 #include "lock_word.h"
 #include "mirror/class.h"
+#include "mirror/dex_cache.h"
 #include "mirror/string.h"
+#include "utils/dex_cache_arrays_layout.h"
 #include "runtime.h"
 #include "thread.h"
 #endif
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index b84e29f..aeb990c 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -163,6 +163,7 @@
 MallocArena::MallocArena(size_t size) {
   memory_ = reinterpret_cast<uint8_t*>(calloc(1, size));
   CHECK(memory_ != nullptr);  // Abort on OOM.
+  DCHECK_ALIGNED(memory_, ArenaAllocator::kAlignment);
   size_ = size;
 }
 
@@ -370,6 +371,7 @@
     arena_head_ = new_arena;
     // Update our internal data structures.
     begin_ = new_arena->Begin();
+    DCHECK_ALIGNED(begin_, kAlignment);
     ptr_ = begin_ + bytes;
     end_ = new_arena->End();
   }
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index 6c1a898..3fad96b 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -310,6 +310,7 @@
       return AllocFromNewArena(bytes);
     }
     uint8_t* ret = ptr_;
+    DCHECK_ALIGNED(ret, kAlignment);
     ptr_ += bytes;
     return ret;
   }
@@ -319,20 +320,24 @@
                 ArenaAllocKind kind = kArenaAllocMisc) ALWAYS_INLINE {
     DCHECK_GE(new_size, ptr_size);
     DCHECK_EQ(ptr == nullptr, ptr_size == 0u);
-    auto* end = reinterpret_cast<uint8_t*>(ptr) + ptr_size;
+    // We always allocate aligned.
+    const size_t aligned_ptr_size = RoundUp(ptr_size, kAlignment);
+    auto* end = reinterpret_cast<uint8_t*>(ptr) + aligned_ptr_size;
     // If we haven't allocated anything else, we can safely extend.
     if (end == ptr_) {
       DCHECK(!IsRunningOnMemoryTool());  // Red zone prevents end == ptr_.
-      const size_t size_delta = new_size - ptr_size;
+      const size_t aligned_new_size = RoundUp(new_size, kAlignment);
+      const size_t size_delta = aligned_new_size - aligned_ptr_size;
      // Check remaining space.
       const size_t remain = end_ - ptr_;
       if (remain >= size_delta) {
         ptr_ += size_delta;
         ArenaAllocatorStats::RecordAlloc(size_delta, kind);
+        DCHECK_ALIGNED(ptr_, kAlignment);
         return ptr;
       }
     }
-    auto* new_ptr = Alloc(new_size, kind);
+    auto* new_ptr = Alloc(new_size, kind);  // Note: Alloc will take care of aligning new_size.
     memcpy(new_ptr, ptr, ptr_size);
     // TODO: Call free on ptr if linear alloc supports free.
     return new_ptr;
@@ -362,11 +367,12 @@
 
   bool Contains(const void* ptr) const;
 
+  static constexpr size_t kAlignment = 8;
+
  private:
   void* AllocWithMemoryTool(size_t bytes, ArenaAllocKind kind);
   uint8_t* AllocFromNewArena(size_t bytes);
 
-  static constexpr size_t kAlignment = 8;
 
   void UpdateBytesAllocated();
 
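The Realloc change in arena_allocator.h only extends in place when the previous allocation, rounded up to kAlignment, ends exactly at the bump pointer; otherwise it falls back to Alloc plus memcpy, and the new tests below exercise both cases. A self-contained toy sketch of that logic (a single fixed buffer, not ART's multi-arena allocator):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Toy bump-pointer arena illustrating the extend-in-place rule above. Not ART code:
// kAlignment matches the constant the header now exposes.
constexpr std::size_t kAlignment = 8;

inline std::size_t RoundUp(std::size_t x, std::size_t n) {  // n must be a power of two
  return (x + n - 1) & ~(n - 1);
}

struct ToyArena {
  uint8_t buffer[1024];
  uint8_t* ptr = buffer;                    // bump pointer
  uint8_t* end = buffer + sizeof(buffer);

  void* Alloc(std::size_t bytes) {
    bytes = RoundUp(bytes, kAlignment);     // every allocation is aligned
    if (static_cast<std::size_t>(end - ptr) < bytes) return nullptr;
    uint8_t* ret = ptr;
    ptr += bytes;
    return ret;
  }

  void* Realloc(void* p, std::size_t old_size, std::size_t new_size) {
    uint8_t* old_end = static_cast<uint8_t*>(p) + RoundUp(old_size, kAlignment);
    if (old_end == ptr) {                   // `p` was the last allocation: try to extend in place
      std::size_t delta = RoundUp(new_size, kAlignment) - RoundUp(old_size, kAlignment);
      if (static_cast<std::size_t>(end - ptr) >= delta) {
        ptr += delta;
        return p;
      }
    }
    void* fresh = Alloc(new_size);          // otherwise allocate anew and copy
    if (fresh != nullptr) std::memcpy(fresh, p, old_size);
    return fresh;
  }
};

int main() {
  ToyArena arena;
  void* p = arena.Alloc(16);
  void* q = arena.Realloc(p, 16, 24);       // extends in place, so q == p
  return (p == q) ? 0 : 1;
}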
diff --git a/runtime/base/arena_allocator_test.cc b/runtime/base/arena_allocator_test.cc
index 9de3cc4..fd48a3f 100644
--- a/runtime/base/arena_allocator_test.cc
+++ b/runtime/base/arena_allocator_test.cc
@@ -16,6 +16,7 @@
 
 #include "base/arena_allocator.h"
 #include "base/arena_bit_vector.h"
+#include "base/memory_tool.h"
 #include "gtest/gtest.h"
 
 namespace art {
@@ -124,4 +125,221 @@
   }
 }
 
+TEST_F(ArenaAllocatorTest, AllocAlignment) {
+  ArenaPool pool;
+  ArenaAllocator arena(&pool);
+  for (size_t iterations = 0; iterations <= 10; ++iterations) {
+    for (size_t size = 1; size <= ArenaAllocator::kAlignment + 1; ++size) {
+      void* allocation = arena.Alloc(size);
+      EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(allocation))
+          << reinterpret_cast<uintptr_t>(allocation);
+    }
+  }
+}
+
+TEST_F(ArenaAllocatorTest, ReallocReuse) {
+  // Realloc does not reuse arenas when running under sanitization. So we cannot do those
+  if (RUNNING_ON_MEMORY_TOOL != 0) {
+    printf("WARNING: TEST DISABLED FOR MEMORY_TOOL\n");
+    return;
+  }
+
+  {
+    // Case 1: small aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 2: small aligned allocation, non-aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 3: small non-aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 4;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 4: small non-aligned allocation, aligned non-extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_EQ(original_allocation, realloc_allocation);
+  }
+
+  // The next part is brittle, as the default arena size is variable and sanitization (red zones)
+  // changes the effective allocation sizes.
+
+  {
+    // Case 5: large allocation, aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_NE(original_allocation, realloc_allocation);
+  }
+
+  {
+    // Case 6: large allocation, non-aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize -
+        ArenaAllocator::kAlignment * 4 -
+        ArenaAllocator::kAlignment / 2;
+    void* original_allocation = arena.Alloc(original_size);
+
+    const size_t new_size = Arena::kDefaultSize +
+        ArenaAllocator::kAlignment * 2 +
+        ArenaAllocator::kAlignment / 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_NE(original_allocation, realloc_allocation);
+  }
+}
+
+TEST_F(ArenaAllocatorTest, ReallocAlignment) {
+  {
+    // Case 1: small aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 2: small aligned allocation, non-aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 3: small non-aligned allocation, aligned extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 4;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 4: small non-aligned allocation, aligned non-extend inside arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = ArenaAllocator::kAlignment * 2 + (ArenaAllocator::kAlignment / 2);
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = ArenaAllocator::kAlignment * 3;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  // The next part is brittle, as the default arena size is variable and sanitization (red zones)
+  // changes the effective allocation sizes.
+
+  {
+    // Case 5: large allocation, aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize - ArenaAllocator::kAlignment * 5;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = Arena::kDefaultSize + ArenaAllocator::kAlignment * 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+
+  {
+    // Case 6: large allocation, non-aligned extend into next arena.
+    ArenaPool pool;
+    ArenaAllocator arena(&pool);
+
+    const size_t original_size = Arena::kDefaultSize -
+        ArenaAllocator::kAlignment * 4 -
+        ArenaAllocator::kAlignment / 2;
+    void* original_allocation = arena.Alloc(original_size);
+    ASSERT_TRUE(IsAligned<ArenaAllocator::kAlignment>(original_allocation));
+
+    const size_t new_size = Arena::kDefaultSize +
+        ArenaAllocator::kAlignment * 2 +
+        ArenaAllocator::kAlignment / 2;
+    void* realloc_allocation = arena.Realloc(original_allocation, original_size, new_size);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(realloc_allocation));
+
+    void* after_alloc = arena.Alloc(1);
+    EXPECT_TRUE(IsAligned<ArenaAllocator::kAlignment>(after_alloc));
+  }
+}
+
+
 }  // namespace art
diff --git a/runtime/base/dchecked_vector.h b/runtime/base/dchecked_vector.h
index 51dfba8..77f0ea2 100644
--- a/runtime/base/dchecked_vector.h
+++ b/runtime/base/dchecked_vector.h
@@ -59,10 +59,8 @@
       : Base() { }
   explicit dchecked_vector(const allocator_type& alloc)
       : Base(alloc) { }
-  // Note that we cannot forward to std::vector(size_type, const allocator_type&) because it is not
-  // available in C++11, which is the latest GCC can support. http://b/25022512
   explicit dchecked_vector(size_type n, const allocator_type& alloc = allocator_type())
-      : Base(alloc) { resize(n); }
+      : Base(n, alloc) { }
   dchecked_vector(size_type n,
                   const value_type& value,
                   const allocator_type& alloc = allocator_type())
diff --git a/runtime/base/histogram-inl.h b/runtime/base/histogram-inl.h
index 4af47d1..ca9a694 100644
--- a/runtime/base/histogram-inl.h
+++ b/runtime/base/histogram-inl.h
@@ -228,10 +228,8 @@
   DCHECK_LE(std::abs(out_data->perc_.back() - 1.0), 0.001);
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
 template <class Value>
 inline double Histogram<Value>::Percentile(double per, const CumulativeData& data) const {
@@ -273,9 +271,7 @@
   return value;
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
 }  // namespace art
 #endif  // ART_RUNTIME_BASE_HISTOGRAM_INL_H_
diff --git a/runtime/base/macros.h b/runtime/base/macros.h
index 5a50247..0ec6e6d 100644
--- a/runtime/base/macros.h
+++ b/runtime/base/macros.h
@@ -30,16 +30,8 @@
   _rc; })
 #endif
 
-#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-
-// C++11 final and override keywords that were introduced in GCC version 4.7.
-#if defined(__clang__) || GCC_VERSION >= 40700
 #define OVERRIDE override
 #define FINAL final
-#else
-#define OVERRIDE
-#define FINAL
-#endif
 
 // Declare a friend relationship in a class with a test. Used rather than FRIEND_TEST to avoid
 // globally importing gtest/gtest.h into the main ART header files.
@@ -158,12 +150,9 @@
 #define ALWAYS_INLINE  __attribute__ ((always_inline))
 #endif
 
-#ifdef __clang__
-/* clang doesn't like attributes on lambda functions */
+// clang doesn't like attributes on lambda functions. It would be nice to say:
+//   #define ALWAYS_INLINE_LAMBDA ALWAYS_INLINE
 #define ALWAYS_INLINE_LAMBDA
-#else
-#define ALWAYS_INLINE_LAMBDA ALWAYS_INLINE
-#endif
 
 #define NO_INLINE __attribute__ ((noinline))
 
@@ -228,75 +217,46 @@
 //
 //  In either case this macro has no effect on runtime behavior and performance
 //  of code.
-#if defined(__clang__) && __cplusplus >= 201103L && defined(__has_warning)
 #if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
 #define FALLTHROUGH_INTENDED [[clang::fallthrough]]  // NOLINT
 #endif
-#endif
 
 #ifndef FALLTHROUGH_INTENDED
 #define FALLTHROUGH_INTENDED do { } while (0)
 #endif
 
 // Annotalysis thread-safety analysis support.
-#if defined(__SUPPORT_TS_ANNOTATION__) || defined(__clang__)
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   __attribute__((x))
-#else
-#define THREAD_ANNOTATION_ATTRIBUTE__(x)   // no-op
-#endif
 
-#define ACQUIRED_AFTER(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_after(__VA_ARGS__))
-#define ACQUIRED_BEFORE(...) THREAD_ANNOTATION_ATTRIBUTE__(acquired_before(__VA_ARGS__))
-#define GUARDED_BY(x) THREAD_ANNOTATION_ATTRIBUTE__(guarded_by(x))
-#define GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(guarded)
-#define LOCK_RETURNED(x) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(x))
-#define NO_THREAD_SAFETY_ANALYSIS THREAD_ANNOTATION_ATTRIBUTE__(no_thread_safety_analysis)
+#define ACQUIRED_AFTER(...) __attribute__((acquired_after(__VA_ARGS__)))
+#define ACQUIRED_BEFORE(...) __attribute__((acquired_before(__VA_ARGS__)))
+#define GUARDED_BY(x) __attribute__((guarded_by(x)))
+#define GUARDED_VAR __attribute__((guarded))
+#define LOCK_RETURNED(x) __attribute__((lock_returned(x)))
+#define NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
 #define PT_GUARDED_BY(x)
 // THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded_by(x))
-#define PT_GUARDED_VAR THREAD_ANNOTATION_ATTRIBUTE__(point_to_guarded)
-#define SCOPED_LOCKABLE THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
+#define PT_GUARDED_VAR __attribute__((point_to_guarded))
+#define SCOPED_LOCKABLE __attribute__((scoped_lockable))
 
-#if defined(__clang__)
-#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock_function(__VA_ARGS__))
-#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock_function(__VA_ARGS__))
-#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock_function(__VA_ARGS__))
-#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock_function(__VA_ARGS__))
-#define UNLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(unlock_function(__VA_ARGS__))
-#define REQUIRES(...) THREAD_ANNOTATION_ATTRIBUTE__(requires_capability(__VA_ARGS__))
-#define SHARED_REQUIRES(...) THREAD_ANNOTATION_ATTRIBUTE__(requires_shared_capability(__VA_ARGS__))
-#define CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(capability(__VA_ARGS__))
-#define SHARED_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_capability(__VA_ARGS__))
-#define ASSERT_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(assert_capability(__VA_ARGS__))
-#define ASSERT_SHARED_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(assert_shared_capability(__VA_ARGS__))
-#define RETURN_CAPABILITY(...) THREAD_ANNOTATION_ATTRIBUTE__(lock_returned(__VA_ARGS__))
-#define TRY_ACQUIRE(...) THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_capability(__VA_ARGS__))
-#define TRY_ACQUIRE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(try_acquire_shared_capability(__VA_ARGS__))
-#define ACQUIRE(...) THREAD_ANNOTATION_ATTRIBUTE__(acquire_capability(__VA_ARGS__))
-#define ACQUIRE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(acquire_shared_capability(__VA_ARGS__))
-#define RELEASE(...) THREAD_ANNOTATION_ATTRIBUTE__(release_capability(__VA_ARGS__))
-#define RELEASE_SHARED(...) THREAD_ANNOTATION_ATTRIBUTE__(release_shared_capability(__VA_ARGS__))
-#define SCOPED_CAPABILITY THREAD_ANNOTATION_ATTRIBUTE__(scoped_lockable)
-#else
-#define EXCLUSIVE_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_lock(__VA_ARGS__))
-#define EXCLUSIVE_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(exclusive_trylock(__VA_ARGS__))
-#define SHARED_LOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_lock(__VA_ARGS__))
-#define SHARED_TRYLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(shared_trylock(__VA_ARGS__))
-#define UNLOCK_FUNCTION(...) THREAD_ANNOTATION_ATTRIBUTE__(unlock(__VA_ARGS__))
-#define REQUIRES(...)
-#define SHARED_REQUIRES(...)
-#define CAPABILITY(...)
-#define SHARED_CAPABILITY(...)
-#define ASSERT_CAPABILITY(...)
-#define ASSERT_SHARED_CAPABILITY(...)
-#define RETURN_CAPABILITY(...)
-#define TRY_ACQUIRE(...)
-#define TRY_ACQUIRE_SHARED(...)
-#define ACQUIRE(...)
-#define ACQUIRE_SHARED(...)
-#define RELEASE(...)
-#define RELEASE_SHARED(...)
-#define SCOPED_CAPABILITY
-#endif
+#define EXCLUSIVE_LOCK_FUNCTION(...) __attribute__((exclusive_lock_function(__VA_ARGS__)))
+#define EXCLUSIVE_TRYLOCK_FUNCTION(...) __attribute__((exclusive_trylock_function(__VA_ARGS__)))
+#define SHARED_LOCK_FUNCTION(...) __attribute__((shared_lock_function(__VA_ARGS__)))
+#define SHARED_TRYLOCK_FUNCTION(...) __attribute__((shared_trylock_function(__VA_ARGS__)))
+#define UNLOCK_FUNCTION(...) __attribute__((unlock_function(__VA_ARGS__)))
+#define REQUIRES(...) __attribute__((requires_capability(__VA_ARGS__)))
+#define SHARED_REQUIRES(...) __attribute__((requires_shared_capability(__VA_ARGS__)))
+#define CAPABILITY(...) __attribute__((capability(__VA_ARGS__)))
+#define SHARED_CAPABILITY(...) __attribute__((shared_capability(__VA_ARGS__)))
+#define ASSERT_CAPABILITY(...) __attribute__((assert_capability(__VA_ARGS__)))
+#define ASSERT_SHARED_CAPABILITY(...) __attribute__((assert_shared_capability(__VA_ARGS__)))
+#define RETURN_CAPABILITY(...) __attribute__((lock_returned(__VA_ARGS__)))
+#define TRY_ACQUIRE(...) __attribute__((try_acquire_capability(__VA_ARGS__)))
+#define TRY_ACQUIRE_SHARED(...) __attribute__((try_acquire_shared_capability(__VA_ARGS__)))
+#define ACQUIRE(...) __attribute__((acquire_capability(__VA_ARGS__)))
+#define ACQUIRE_SHARED(...) __attribute__((acquire_shared_capability(__VA_ARGS__)))
+#define RELEASE(...) __attribute__((release_capability(__VA_ARGS__)))
+#define RELEASE_SHARED(...) __attribute__((release_shared_capability(__VA_ARGS__)))
+#define SCOPED_CAPABILITY __attribute__((scoped_lockable))
 
 #define LOCKABLE CAPABILITY("mutex")
 #define SHARED_LOCKABLE SHARED_CAPABILITY("mutex")
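
For context, a minimal sketch (not ART code; Lock, Counter, mu_ and value_ are invented names) of how the capability macros defined above are typically consumed, assuming a Clang build with -Wthread-safety enabled:

#include <mutex>

class LOCKABLE Lock {
 public:
  void Acquire() ACQUIRE() { mu_.lock(); }
  void Release() RELEASE() { mu_.unlock(); }

 private:
  std::mutex mu_;  // the annotations describe Lock itself, not std::mutex
};

class Counter {
 public:
  void Increment() REQUIRES(lock_) { ++value_; }  // analysis: caller must hold lock_

 private:
  Lock lock_;
  int value_ GUARDED_BY(lock_) = 0;  // analysis: only touch while lock_ is held
};
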
diff --git a/runtime/base/mutex.cc b/runtime/base/mutex.cc
index 264a530..fec918b 100644
--- a/runtime/base/mutex.cc
+++ b/runtime/base/mutex.cc
@@ -98,12 +98,7 @@
   }
 
   ~ScopedAllMutexesLock() {
-#if !defined(__clang__)
-    // TODO: remove this workaround target GCC/libc++/bionic bug "invalid failure memory model".
-    while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakSequentiallyConsistent(mutex_, 0)) {
-#else
     while (!gAllMutexData->all_mutexes_guard.CompareExchangeWeakRelease(mutex_, 0)) {
-#endif
       NanoSleep(100);
     }
   }
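
The same release-CAS spin, distilled with std::atomic for illustration (not the art::Atomic<> API; the function and variable names are invented). The failure ordering can stay relaxed because a failed exchange simply retries:

#include <atomic>

void ReleaseGuard(std::atomic<void*>& guard, void* owner) {
  void* expected = owner;
  while (!guard.compare_exchange_weak(expected, nullptr,
                                      std::memory_order_release,
                                      std::memory_order_relaxed)) {
    expected = owner;  // compare_exchange_weak rewrites 'expected' on failure
    // the real code sleeps briefly (NanoSleep(100)) between retries
  }
}
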
diff --git a/runtime/class_linker-inl.h b/runtime/class_linker-inl.h
index f2575f7..97aa499 100644
--- a/runtime/class_linker-inl.h
+++ b/runtime/class_linker-inl.h
@@ -27,6 +27,8 @@
 #include "mirror/object_array.h"
 #include "handle_scope-inl.h"
 
+#include <atomic>
+
 namespace art {
 
 inline mirror::Class* ClassLinker::FindSystemClass(Thread* self, const char* descriptor) {
@@ -63,18 +65,21 @@
 inline mirror::String* ClassLinker::ResolveString(uint32_t string_idx, ArtMethod* referrer) {
   mirror::Class* declaring_class = referrer->GetDeclaringClass();
   // MethodVerifier refuses methods with string_idx out of bounds.
-  DCHECK_LT(string_idx, declaring_class->GetDexCache()->NumStrings());
-  mirror::String* resolved_string = declaring_class->GetDexCacheStrings()[string_idx].Read();
-  if (UNLIKELY(resolved_string == nullptr)) {
+  DCHECK_LT(string_idx, declaring_class->GetDexFile().NumStringIds());
+  mirror::String* string =
+        mirror::StringDexCachePair::LookupString(declaring_class->GetDexCacheStrings(),
+                                                 string_idx,
+                                                 mirror::DexCache::kDexCacheStringCacheSize).Read();
+  if (UNLIKELY(string == nullptr)) {
     StackHandleScope<1> hs(Thread::Current());
     Handle<mirror::DexCache> dex_cache(hs.NewHandle(declaring_class->GetDexCache()));
     const DexFile& dex_file = *dex_cache->GetDexFile();
-    resolved_string = ResolveString(dex_file, string_idx, dex_cache);
-    if (resolved_string != nullptr) {
-      DCHECK_EQ(dex_cache->GetResolvedString(string_idx), resolved_string);
+    string = ResolveString(dex_file, string_idx, dex_cache);
+    if (string != nullptr) {
+      DCHECK_EQ(dex_cache->GetResolvedString(string_idx), string);
     }
   }
-  return resolved_string;
+  return string;
 }
 
 inline mirror::Class* ClassLinker::ResolveType(uint16_t type_idx, ArtMethod* referrer) {
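
A sketch of the lookup scheme behind StringDexCachePair::LookupString, with simplified types (CachedString, string_ref, and Lookup are invented; the real pair packs the index and a compressed reference into a single 64-bit atomic entry): the cache is a fixed, power-of-two-sized array indexed by string_idx modulo the size, and each slot remembers which index filled it, so a hit requires the stored index to match.

#include <atomic>
#include <cstddef>
#include <cstdint>

struct CachedString {
  uint32_t index;       // dex string index this slot was filled for
  uint32_t string_ref;  // stands in for a compressed GcRoot<mirror::String>
};

uint32_t Lookup(std::atomic<CachedString>* cache, uint32_t string_idx, size_t cache_size) {
  // cache_size is a power of two, so the modulo is a cheap mask.
  CachedString entry = cache[string_idx % cache_size].load(std::memory_order_relaxed);
  return (entry.index == string_idx) ? entry.string_ref : 0u;  // 0 == miss; resolve slowly
}
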
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 4d48da6..f4400c3 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -66,6 +66,7 @@
 #include "mirror/class.h"
 #include "mirror/class-inl.h"
 #include "mirror/class_loader.h"
+#include "mirror/dex_cache.h"
 #include "mirror/dex_cache-inl.h"
 #include "mirror/field.h"
 #include "mirror/iftable-inl.h"
@@ -1271,7 +1272,10 @@
       // If the oat file expects the dex cache arrays to be in the BSS, then allocate there and
         // copy over the arrays.
         DCHECK(dex_file != nullptr);
-        const size_t num_strings = dex_file->NumStringIds();
+        size_t num_strings = mirror::DexCache::kDexCacheStringCacheSize;
+        if (dex_file->NumStringIds() < num_strings) {
+          num_strings = dex_file->NumStringIds();
+        }
         const size_t num_types = dex_file->NumTypeIds();
         const size_t num_methods = dex_file->NumMethodIds();
         const size_t num_fields = dex_file->NumFieldIds();
@@ -1281,16 +1285,17 @@
         CHECK_EQ(num_fields, dex_cache->NumResolvedFields());
         DexCacheArraysLayout layout(image_pointer_size_, dex_file);
         uint8_t* const raw_arrays = oat_dex_file->GetDexCacheArrays();
-        // The space is not yet visible to the GC, we can avoid the read barriers and use
-        // std::copy_n.
         if (num_strings != 0u) {
-          GcRoot<mirror::String>* const image_resolved_strings = dex_cache->GetStrings();
-          GcRoot<mirror::String>* const strings =
-              reinterpret_cast<GcRoot<mirror::String>*>(raw_arrays + layout.StringsOffset());
-          for (size_t j = 0; kIsDebugBuild && j < num_strings; ++j) {
-            DCHECK(strings[j].IsNull());
+          mirror::StringDexCacheType* const image_resolved_strings = dex_cache->GetStrings();
+          mirror::StringDexCacheType* const strings =
+              reinterpret_cast<mirror::StringDexCacheType*>(raw_arrays + layout.StringsOffset());
+          for (size_t j = 0; j < num_strings; ++j) {
+            DCHECK_EQ(strings[j].load(std::memory_order_relaxed).string_index, 0u);
+            DCHECK(strings[j].load(std::memory_order_relaxed).string_pointer.IsNull());
+            strings[j].store(image_resolved_strings[j].load(std::memory_order_relaxed),
+                             std::memory_order_relaxed);
           }
-          std::copy_n(image_resolved_strings, num_strings, strings);
+          mirror::StringDexCachePair::Initialize(strings);
           dex_cache->SetStrings(strings);
         }
         if (num_types != 0u) {
@@ -1473,14 +1478,14 @@
 
   bool operator()(mirror::Class* klass) const SHARED_REQUIRES(Locks::mutator_lock_) {
     if (forward_strings_) {
-      GcRoot<mirror::String>* strings = klass->GetDexCacheStrings();
+      mirror::StringDexCacheType* strings = klass->GetDexCacheStrings();
       if (strings != nullptr) {
         DCHECK(
             space_->GetImageHeader().GetImageSection(ImageHeader::kSectionDexCacheArrays).Contains(
                 reinterpret_cast<uint8_t*>(strings) - space_->Begin()))
             << "String dex cache array for " << PrettyClass(klass) << " is not in app image";
         // Dex caches have already been updated, so take the strings pointer from there.
-        GcRoot<mirror::String>* new_strings = klass->GetDexCache()->GetStrings();
+        mirror::StringDexCacheType* new_strings = klass->GetDexCache()->GetStrings();
         DCHECK_NE(strings, new_strings);
         klass->SetDexCacheStrings(new_strings);
       }
@@ -2079,18 +2084,31 @@
     // Zero-initialized.
     raw_arrays = reinterpret_cast<uint8_t*>(linear_alloc->Alloc(self, layout.Size()));
   }
-  GcRoot<mirror::String>* strings = (dex_file.NumStringIds() == 0u) ? nullptr :
-      reinterpret_cast<GcRoot<mirror::String>*>(raw_arrays + layout.StringsOffset());
+  mirror::StringDexCacheType* strings = (dex_file.NumStringIds() == 0u) ? nullptr :
+      reinterpret_cast<mirror::StringDexCacheType*>(raw_arrays + layout.StringsOffset());
   GcRoot<mirror::Class>* types = (dex_file.NumTypeIds() == 0u) ? nullptr :
       reinterpret_cast<GcRoot<mirror::Class>*>(raw_arrays + layout.TypesOffset());
   ArtMethod** methods = (dex_file.NumMethodIds() == 0u) ? nullptr :
       reinterpret_cast<ArtMethod**>(raw_arrays + layout.MethodsOffset());
   ArtField** fields = (dex_file.NumFieldIds() == 0u) ? nullptr :
       reinterpret_cast<ArtField**>(raw_arrays + layout.FieldsOffset());
+  size_t num_strings = mirror::DexCache::kDexCacheStringCacheSize;
+  if (dex_file.NumStringIds() < num_strings) {
+    num_strings = dex_file.NumStringIds();
+  }
+  DCHECK_ALIGNED(raw_arrays, alignof(mirror::StringDexCacheType)) <<
+                 "Expected raw_arrays to align to StringDexCacheType.";
+  DCHECK_ALIGNED(layout.StringsOffset(), alignof(mirror::StringDexCacheType)) <<
+                 "Expected StringsOffset() to align to StringDexCacheType.";
+  DCHECK_ALIGNED(strings, alignof(mirror::StringDexCacheType)) <<
+                 "Expected strings to align to StringDexCacheType.";
+  static_assert(alignof(mirror::StringDexCacheType) == 8u,
+                "Expected StringDexCacheType to have align of 8.");
   if (kIsDebugBuild) {
     // Sanity check to make sure all the dex cache arrays are empty. b/28992179
-    for (size_t i = 0; i < dex_file.NumStringIds(); ++i) {
-      CHECK(strings[i].Read<kWithoutReadBarrier>() == nullptr);
+    for (size_t i = 0; i < num_strings; ++i) {
+      CHECK_EQ(strings[i].load(std::memory_order_relaxed).string_index, 0u);
+      CHECK(strings[i].load(std::memory_order_relaxed).string_pointer.IsNull());
     }
     for (size_t i = 0; i < dex_file.NumTypeIds(); ++i) {
       CHECK(types[i].Read<kWithoutReadBarrier>() == nullptr);
@@ -2102,10 +2120,13 @@
       CHECK(mirror::DexCache::GetElementPtrSize(fields, i, image_pointer_size_) == nullptr);
     }
   }
+  if (strings != nullptr) {
+    mirror::StringDexCachePair::Initialize(strings);
+  }
   dex_cache->Init(&dex_file,
                   location.Get(),
                   strings,
-                  dex_file.NumStringIds(),
+                  num_strings,
                   types,
                   dex_file.NumTypeIds(),
                   methods,
@@ -4525,7 +4546,8 @@
     }
     self->AllowThreadSuspension();
 
-    CHECK_EQ(klass->GetStatus(), mirror::Class::kStatusVerified) << PrettyClass(klass.Get());
+    CHECK_EQ(klass->GetStatus(), mirror::Class::kStatusVerified) << PrettyClass(klass.Get())
+        << " self.tid=" << self->GetTid() << " clinit.tid=" << klass->GetClinitThreadId();
 
     // From here out other threads may observe that we're initializing and so changes of state
     // require a notification.
diff --git a/runtime/common_runtime_test.h b/runtime/common_runtime_test.h
index f445e52..2d16a49 100644
--- a/runtime/common_runtime_test.h
+++ b/runtime/common_runtime_test.h
@@ -207,6 +207,12 @@
     return; \
   }
 
+#define TEST_DISABLED_FOR_STRING_COMPRESSION() \
+  if (mirror::kUseStringCompression) { \
+    printf("WARNING: TEST DISABLED FOR STRING COMPRESSION\n"); \
+    return; \
+  }
+
 }  // namespace art
 
 namespace std {
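
A hypothetical use of the new macro (the test suite and test name are invented; assumes the usual gtest setup inside the art namespace):

TEST(StringLayoutTest, BackingArrayIsUtf16) {
  TEST_DISABLED_FOR_STRING_COMPRESSION();  // bail out early when compression is enabled
  // ... assertions below this point may rely on the uncompressed uint16_t value array ...
}
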
diff --git a/runtime/common_throws.cc b/runtime/common_throws.cc
index 99732c6..e1da23c 100644
--- a/runtime/common_throws.cc
+++ b/runtime/common_throws.cc
@@ -402,6 +402,16 @@
                                                dex_file, type);
 }
 
+static bool IsValidReadBarrierImplicitCheck(uintptr_t addr) {
+  DCHECK(kEmitCompilerReadBarrier);
+  uint32_t monitor_offset = mirror::Object::MonitorOffset().Uint32Value();
+  if (kUseBakerReadBarrier && (kRuntimeISA == kX86 || kRuntimeISA == kX86_64)) {
+    constexpr uint32_t gray_byte_position = LockWord::kReadBarrierStateShift / kBitsPerByte;
+    monitor_offset += gray_byte_position;
+  }
+  return addr == monitor_offset;
+}
+
 static bool IsValidImplicitCheck(uintptr_t addr, ArtMethod* method, const Instruction& instr)
     SHARED_REQUIRES(Locks::mutator_lock_) {
   if (!CanDoImplicitNullCheckOn(addr)) {
@@ -424,9 +434,13 @@
       return true;
     }
 
+    case Instruction::IGET_OBJECT:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::IGET:
     case Instruction::IGET_WIDE:
-    case Instruction::IGET_OBJECT:
     case Instruction::IGET_BOOLEAN:
     case Instruction::IGET_BYTE:
     case Instruction::IGET_CHAR:
@@ -440,18 +454,20 @@
     case Instruction::IPUT_SHORT: {
       ArtField* field =
           Runtime::Current()->GetClassLinker()->ResolveField(instr.VRegC_22c(), method, false);
-      return (addr == 0) ||
-          (addr == field->GetOffset().Uint32Value()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0) || (addr == field->GetOffset().Uint32Value());
     }
 
+    case Instruction::IGET_OBJECT_QUICK:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::IGET_QUICK:
     case Instruction::IGET_BOOLEAN_QUICK:
     case Instruction::IGET_BYTE_QUICK:
     case Instruction::IGET_CHAR_QUICK:
     case Instruction::IGET_SHORT_QUICK:
     case Instruction::IGET_WIDE_QUICK:
-    case Instruction::IGET_OBJECT_QUICK:
     case Instruction::IPUT_QUICK:
     case Instruction::IPUT_BOOLEAN_QUICK:
     case Instruction::IPUT_BYTE_QUICK:
@@ -459,14 +475,16 @@
     case Instruction::IPUT_SHORT_QUICK:
     case Instruction::IPUT_WIDE_QUICK:
     case Instruction::IPUT_OBJECT_QUICK: {
-      return (addr == 0u) ||
-          (addr == instr.VRegC_22c()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0u) || (addr == instr.VRegC_22c());
     }
 
+    case Instruction::AGET_OBJECT:
+      if (kEmitCompilerReadBarrier && IsValidReadBarrierImplicitCheck(addr)) {
+        return true;
+      }
+      FALLTHROUGH_INTENDED;
     case Instruction::AGET:
     case Instruction::AGET_WIDE:
-    case Instruction::AGET_OBJECT:
     case Instruction::AGET_BOOLEAN:
     case Instruction::AGET_BYTE:
     case Instruction::AGET_CHAR:
@@ -482,9 +500,7 @@
     case Instruction::ARRAY_LENGTH: {
       // The length access should crash. We currently do not do implicit checks on
       // the array access itself.
-      return (addr == 0u) ||
-          (addr == mirror::Array::LengthOffset().Uint32Value()) ||
-          (kEmitCompilerReadBarrier && (addr == mirror::Object::MonitorOffset().Uint32Value()));
+      return (addr == 0u) || (addr == mirror::Array::LengthOffset().Uint32Value());
     }
 
     default: {
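
Worked example of the address check above, under illustrative values that are not taken from this diff: assume the monitor word sits at offset 4 in mirror::Object and LockWord::kReadBarrierStateShift is 28. On x86/x86-64 the Baker barrier loads a single byte, so the expected fault address is the monitor offset plus the byte holding the read barrier state:

#include <cstdint>

constexpr uint32_t kMonitorOffset = 4;           // assumed object layout
constexpr uint32_t kReadBarrierStateShift = 28;  // assumed LockWord constant
constexpr uint32_t kBitsPerByte = 8;
constexpr uint32_t kGrayBytePosition = kReadBarrierStateShift / kBitsPerByte;  // == 3
constexpr uint32_t kExpectedFaultAddr = kMonitorOffset + kGrayBytePosition;    // == 7
static_assert(kExpectedFaultAddr == 7, "fault lands on the read-barrier-state byte");
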
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 2a5198b..a5b0689 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -1286,8 +1286,7 @@
   if (c->IsStringClass()) {
     // Special case for java.lang.String.
     gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-    mirror::SetStringCountVisitor visitor(0);
-    new_object = mirror::String::Alloc<true>(self, 0, allocator_type, visitor);
+    new_object = mirror::String::AllocEmptyString<true>(self, allocator_type);
   } else {
     new_object = c->AllocObject(self);
   }
@@ -4059,7 +4058,7 @@
   // Prepare JDWP ids for the reply.
   JDWP::JdwpTag result_tag = BasicTagFromDescriptor(m->GetShorty());
   const bool is_object_result = (result_tag == JDWP::JT_OBJECT);
-  StackHandleScope<2> hs(soa.Self());
+  StackHandleScope<3> hs(soa.Self());
   Handle<mirror::Object> object_result = hs.NewHandle(is_object_result ? result.GetL() : nullptr);
   Handle<mirror::Throwable> exception = hs.NewHandle(soa.Self()->GetException());
   soa.Self()->ClearException();
@@ -4098,10 +4097,17 @@
     // unless we threw, in which case we return null.
     DCHECK_EQ(JDWP::JT_VOID, result_tag);
     if (exceptionObjectId == 0) {
-      // TODO we could keep the receiver ObjectId in the DebugInvokeReq to avoid looking into the
-      // object registry.
-      result_value = GetObjectRegistry()->Add(pReq->receiver.Read());
-      result_tag = TagFromObject(soa, pReq->receiver.Read());
+      if (m->GetDeclaringClass()->IsStringClass()) {
+        // For string constructors, the new string is remapped to the receiver (stored in ref).
+        Handle<mirror::Object> decoded_ref = hs.NewHandle(soa.Self()->DecodeJObject(ref.get()));
+        result_value = gRegistry->Add(decoded_ref);
+        result_tag = TagFromObject(soa, decoded_ref.Get());
+      } else {
+        // TODO we could keep the receiver ObjectId in the DebugInvokeReq to avoid looking into the
+        // object registry.
+        result_value = GetObjectRegistry()->Add(pReq->receiver.Read());
+        result_tag = TagFromObject(soa, pReq->receiver.Read());
+      }
     } else {
       result_value = 0;
       result_tag = JDWP::JT_OBJECT;
@@ -4327,10 +4333,16 @@
     Handle<mirror::String> name(hs.NewHandle(t->GetThreadName(soa)));
     size_t char_count = (name.Get() != nullptr) ? name->GetLength() : 0;
     const jchar* chars = (name.Get() != nullptr) ? name->GetValue() : nullptr;
+    bool is_compressed = (name.Get() != nullptr) ? name->IsCompressed() : false;
 
     std::vector<uint8_t> bytes;
     JDWP::Append4BE(bytes, t->GetThreadId());
-    JDWP::AppendUtf16BE(bytes, chars, char_count);
+    if (is_compressed) {
+      const uint8_t* chars_compressed = name->GetValueCompressed();
+      JDWP::AppendUtf16CompressedBE(bytes, chars_compressed, char_count);
+    } else {
+      JDWP::AppendUtf16BE(bytes, chars, char_count);
+    }
     CHECK_EQ(bytes.size(), char_count*2 + sizeof(uint32_t)*2);
     Dbg::DdmSendChunk(type, bytes);
   }
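
The new JDWP::AppendUtf16CompressedBE helper is defined elsewhere in this change; presumably it widens each 8-bit char of a compressed name to a big-endian 16-bit code unit, which is why the size check below still expects char_count*2 bytes. A sketch of that behaviour (the body is an assumption, not the real implementation):

#include <cstddef>
#include <cstdint>
#include <vector>

void AppendUtf16CompressedBE(std::vector<uint8_t>& bytes, const uint8_t* chars, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    bytes.push_back(0);         // high byte: compressed chars are plain ASCII
    bytes.push_back(chars[i]);  // low byte
  }
}
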
diff --git a/runtime/entrypoints/quick/quick_math_entrypoints.cc b/runtime/entrypoints/quick/quick_math_entrypoints.cc
index 1c658b7..51d2784 100644
--- a/runtime/entrypoints/quick/quick_math_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_math_entrypoints.cc
@@ -18,10 +18,8 @@
 
 namespace art {
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
 int CmplFloat(float a, float b) {
   if (a == b) {
@@ -67,9 +65,7 @@
   return -1;
 }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
 extern "C" int64_t artLmul(int64_t a, int64_t b) {
   return a * b;
diff --git a/runtime/gc/accounting/space_bitmap-inl.h b/runtime/gc/accounting/space_bitmap-inl.h
index 4cf5b4f..9feaf41 100644
--- a/runtime/gc/accounting/space_bitmap-inl.h
+++ b/runtime/gc/accounting/space_bitmap-inl.h
@@ -36,7 +36,7 @@
   const uintptr_t offset = addr - heap_begin_;
   const size_t index = OffsetToIndex(offset);
   const uintptr_t mask = OffsetToMask(offset);
-  Atomic<uintptr_t>* atomic_entry = reinterpret_cast<Atomic<uintptr_t>*>(&bitmap_begin_[index]);
+  Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
   uintptr_t old_word;
   do {
@@ -58,7 +58,7 @@
   DCHECK(bitmap_begin_ != nullptr);
   DCHECK_GE(addr, heap_begin_);
   const uintptr_t offset = addr - heap_begin_;
-  return (bitmap_begin_[OffsetToIndex(offset)] & OffsetToMask(offset)) != 0;
+  return (bitmap_begin_[OffsetToIndex(offset)].LoadRelaxed() & OffsetToMask(offset)) != 0;
 }
 
 template<size_t kAlignment> template<typename Visitor>
@@ -116,7 +116,7 @@
 
     // Traverse the middle, full part.
     for (size_t i = index_start + 1; i < index_end; ++i) {
-      uintptr_t w = bitmap_begin_[i];
+      uintptr_t w = bitmap_begin_[i].LoadRelaxed();
       if (w != 0) {
         const uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
         do {
@@ -164,8 +164,8 @@
   const size_t index = OffsetToIndex(offset);
   const uintptr_t mask = OffsetToMask(offset);
   DCHECK_LT(index, bitmap_size_ / sizeof(intptr_t)) << " bitmap_size_ = " << bitmap_size_;
-  uintptr_t* address = &bitmap_begin_[index];
-  uintptr_t old_word = *address;
+  Atomic<uintptr_t>* atomic_entry = &bitmap_begin_[index];
+  uintptr_t old_word = atomic_entry->LoadRelaxed();
   if (kSetBit) {
     // Check the bit before setting the word in case we are trying to mark a read-only bitmap
     // like an image space bitmap. This bitmap is mapped as read only and will fault if we
@@ -173,10 +173,10 @@
     // occur if we check before setting the bit. This also prevents dirty pages that would
     // occur if the bitmap was read write and we did not check the bit.
     if ((old_word & mask) == 0) {
-      *address = old_word | mask;
+      atomic_entry->StoreRelaxed(old_word | mask);
     }
   } else {
-    *address = old_word & ~mask;
+    atomic_entry->StoreRelaxed(old_word & ~mask);
   }
   DCHECK_EQ(Test(obj), kSetBit);
   return (old_word & mask) != 0;
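
The AtomicTestAndSet pattern used above, restated with std::atomic for illustration (not the art::Atomic<> API; the free function is invented):

#include <atomic>
#include <cstdint>

bool AtomicTestAndSet(std::atomic<uintptr_t>* word, uintptr_t mask) {
  uintptr_t old_word;
  do {
    old_word = word->load(std::memory_order_relaxed);
    if ((old_word & mask) != 0) {
      return true;  // bit already set; nothing to do
    }
  } while (!word->compare_exchange_weak(old_word, old_word | mask,
                                        std::memory_order_relaxed));
  return false;  // we set the bit
}
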
diff --git a/runtime/gc/accounting/space_bitmap.cc b/runtime/gc/accounting/space_bitmap.cc
index b43f77f..3df02ed 100644
--- a/runtime/gc/accounting/space_bitmap.cc
+++ b/runtime/gc/accounting/space_bitmap.cc
@@ -51,7 +51,9 @@
 template<size_t kAlignment>
 SpaceBitmap<kAlignment>::SpaceBitmap(const std::string& name, MemMap* mem_map, uintptr_t* bitmap_begin,
                                      size_t bitmap_size, const void* heap_begin)
-    : mem_map_(mem_map), bitmap_begin_(bitmap_begin), bitmap_size_(bitmap_size),
+    : mem_map_(mem_map),
+      bitmap_begin_(reinterpret_cast<Atomic<uintptr_t>*>(bitmap_begin)),
+      bitmap_size_(bitmap_size),
       heap_begin_(reinterpret_cast<uintptr_t>(heap_begin)),
       name_(name) {
   CHECK(bitmap_begin_ != nullptr);
@@ -104,7 +106,12 @@
 template<size_t kAlignment>
 void SpaceBitmap<kAlignment>::CopyFrom(SpaceBitmap* source_bitmap) {
   DCHECK_EQ(Size(), source_bitmap->Size());
-  std::copy(source_bitmap->Begin(), source_bitmap->Begin() + source_bitmap->Size() / sizeof(intptr_t), Begin());
+  const size_t count = source_bitmap->Size() / sizeof(intptr_t);
+  Atomic<uintptr_t>* const src = source_bitmap->Begin();
+  Atomic<uintptr_t>* const dest = Begin();
+  for (size_t i = 0; i < count; ++i) {
+    dest[i].StoreRelaxed(src[i].LoadRelaxed());
+  }
 }
 
 template<size_t kAlignment>
@@ -113,9 +120,9 @@
   CHECK(callback != nullptr);
 
   uintptr_t end = OffsetToIndex(HeapLimit() - heap_begin_ - 1);
-  uintptr_t* bitmap_begin = bitmap_begin_;
+  Atomic<uintptr_t>* bitmap_begin = bitmap_begin_;
   for (uintptr_t i = 0; i <= end; ++i) {
-    uintptr_t w = bitmap_begin[i];
+    uintptr_t w = bitmap_begin[i].LoadRelaxed();
     if (w != 0) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       do {
@@ -160,10 +167,10 @@
   size_t start = OffsetToIndex(sweep_begin - live_bitmap.heap_begin_);
   size_t end = OffsetToIndex(sweep_end - live_bitmap.heap_begin_ - 1);
   CHECK_LT(end, live_bitmap.Size() / sizeof(intptr_t));
-  uintptr_t* live = live_bitmap.bitmap_begin_;
-  uintptr_t* mark = mark_bitmap.bitmap_begin_;
+  Atomic<uintptr_t>* live = live_bitmap.bitmap_begin_;
+  Atomic<uintptr_t>* mark = mark_bitmap.bitmap_begin_;
   for (size_t i = start; i <= end; i++) {
-    uintptr_t garbage = live[i] & ~mark[i];
+    uintptr_t garbage = live[i].LoadRelaxed() & ~mark[i].LoadRelaxed();
     if (UNLIKELY(garbage != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + live_bitmap.heap_begin_;
       do {
@@ -251,7 +258,7 @@
   uintptr_t end = Size() / sizeof(intptr_t);
   for (uintptr_t i = 0; i < end; ++i) {
     // Need uint for unsigned shift.
-    uintptr_t w = bitmap_begin_[i];
+    uintptr_t w = bitmap_begin_[i].LoadRelaxed();
     if (UNLIKELY(w != 0)) {
       uintptr_t ptr_base = IndexToOffset(i) + heap_begin_;
       while (w != 0) {
diff --git a/runtime/gc/accounting/space_bitmap.h b/runtime/gc/accounting/space_bitmap.h
index b8ff471..829b1b1 100644
--- a/runtime/gc/accounting/space_bitmap.h
+++ b/runtime/gc/accounting/space_bitmap.h
@@ -147,7 +147,7 @@
   void CopyFrom(SpaceBitmap* source_bitmap);
 
   // Starting address of our internal storage.
-  uintptr_t* Begin() {
+  Atomic<uintptr_t>* Begin() {
     return bitmap_begin_;
   }
 
@@ -215,7 +215,7 @@
   std::unique_ptr<MemMap> mem_map_;
 
   // This bitmap itself, word sized for efficiency in scanning.
-  uintptr_t* const bitmap_begin_;
+  Atomic<uintptr_t>* const bitmap_begin_;
 
   // Size of this bitmap.
   size_t bitmap_size_;
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index fb774a4..76f500c 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -34,32 +34,27 @@
   // to gray even though the object has already been marked through. This happens if a mutator
   // thread gets preempted before the AtomicSetReadBarrierPointer below, GC marks through the
   // object (changes it from white to gray and back to white), and the thread runs and
-  // incorrectly changes it from white to gray. We need to detect such "false gray" cases and
-  // change the objects back to white at the end of marking.
+  // incorrectly changes it from white to gray. If this happens, the object will get added to the
+  // mark stack again and get changed back to white after it is processed.
   if (kUseBakerReadBarrier) {
-    // Test the bitmap first to reduce the chance of false gray cases.
+    // Test the bitmap first; most of the time this avoids graying an object that has already
+    // been marked through.
     if (bitmap->Test(ref)) {
       return ref;
     }
   }
   // This may or may not succeed, which is ok because the object may already be gray.
-  bool cas_success = false;
+  bool success = false;
   if (kUseBakerReadBarrier) {
-    cas_success = ref->AtomicSetReadBarrierPointer(ReadBarrier::WhitePtr(),
-                                                   ReadBarrier::GrayPtr());
-  }
-  if (bitmap->AtomicTestAndSet(ref)) {
-    // Already marked.
-    if (kUseBakerReadBarrier &&
-        cas_success &&
-        // The object could be white here if a thread gets preempted after a success at the
-        // above AtomicSetReadBarrierPointer, GC has marked through it, and the thread runs up
-        // to this point.
-        ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr()) {
-      // Register a "false-gray" object to change it from gray to white at the end of marking.
-      PushOntoFalseGrayStack(ref);
-    }
+    // The GC will mark the bitmap when popping from the mark stack. If only the GC is touching
+    // the bitmap, we can avoid an expensive CAS.
+    // For the Baker case, an object is marked if either the mark bit is marked or the bitmap bit
+    // is set.
+    success = ref->AtomicSetReadBarrierPointer(ReadBarrier::WhitePtr(), ReadBarrier::GrayPtr());
   } else {
+    success = !bitmap->AtomicTestAndSet(ref);
+  }
+  if (success) {
     // Newly marked.
     if (kUseBakerReadBarrier) {
       DCHECK_EQ(ref->GetReadBarrierPointer(), ReadBarrier::GrayPtr());
@@ -99,13 +94,16 @@
   return ref;
 }
 
-template<bool kGrayImmuneObject>
+template<bool kGrayImmuneObject, bool kFromGCThread>
 inline mirror::Object* ConcurrentCopying::Mark(mirror::Object* from_ref) {
   if (from_ref == nullptr) {
     return nullptr;
   }
   DCHECK(heap_->collector_type_ == kCollectorTypeCC);
-  if (UNLIKELY(kUseBakerReadBarrier && !is_active_)) {
+  if (kFromGCThread) {
+    DCHECK(is_active_);
+    DCHECK_EQ(Thread::Current(), thread_running_gc_);
+  } else if (UNLIKELY(kUseBakerReadBarrier && !is_active_)) {
     // In the lock word forward address state, the read barrier bits
     // in the lock word are part of the stored forwarding address and
     // invalid. This is usually OK as the from-space copy of objects
@@ -192,6 +190,16 @@
   }
 }
 
+inline bool ConcurrentCopying::IsMarkedInUnevacFromSpace(mirror::Object* from_ref) {
+  // Use load acquire on the read barrier pointer to ensure that we never see a white read barrier
+  // pointer with an unmarked bit due to reordering.
+  DCHECK(region_space_->IsInUnevacFromSpace(from_ref));
+  if (kUseBakerReadBarrier && from_ref->GetReadBarrierPointerAcquire() == ReadBarrier::GrayPtr()) {
+    return true;
+  }
+  return region_space_bitmap_->Test(from_ref);
+}
+
 }  // namespace collector
 }  // namespace gc
 }  // namespace art
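
A distilled sketch of the ordering argument behind the acquire load (the states and names are simplified, not ART's actual encoding or API): the GC sets the bitmap bit before it turns the object back from gray to white, and publishing that last store with release means a reader observing white through an acquire load is guaranteed to also observe the bitmap bit, so it can never see "white and unmarked" for a processed object.

#include <atomic>

enum RbState { kWhite, kGray };

std::atomic<RbState> rb_state{kGray};   // object currently on the mark stack
std::atomic<bool> bitmap_bit{false};

void FinishProcessingOnGcThread() {
  bitmap_bit.store(true, std::memory_order_relaxed);
  rb_state.store(kWhite, std::memory_order_release);  // publish: marked and processed
}

bool IsMarkedSketch() {
  if (rb_state.load(std::memory_order_acquire) == kGray) {
    return true;  // still gray: already pushed for marking
  }
  return bitmap_bit.load(std::memory_order_relaxed);  // white implies the bitmap store is visible
}
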
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index 42816a0..651669e 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -1302,8 +1302,19 @@
         << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
         << " is_marked=" << IsMarked(to_ref);
   }
-  // Scan ref fields.
-  Scan(to_ref);
+  bool add_to_live_bytes = false;
+  if (region_space_->IsInUnevacFromSpace(to_ref)) {
+    // Mark the bitmap only in the GC thread here so that we don't need a CAS.
+    if (!kUseBakerReadBarrier || !region_space_bitmap_->Set(to_ref)) {
+      // It may already be marked if we accidentally pushed the same object twice due to the racy
+      // bitmap read in MarkUnevacFromSpaceRegion.
+      Scan(to_ref);
+      // Only add to the live bytes if the object was not already marked.
+      add_to_live_bytes = true;
+    }
+  } else {
+    Scan(to_ref);
+  }
   if (kUseBakerReadBarrier) {
     DCHECK(to_ref->GetReadBarrierPointer() == ReadBarrier::GrayPtr())
         << " " << to_ref << " " << to_ref->GetReadBarrierPointer()
@@ -1332,7 +1343,7 @@
   DCHECK(!kUseBakerReadBarrier);
 #endif
 
-  if (region_space_->IsInUnevacFromSpace(to_ref)) {
+  if (add_to_live_bytes) {
     // Add to the live bytes per unevacuated from space. Note this code is always run by the
     // GC-running thread (no synchronization required).
     DCHECK(region_space_bitmap_->Test(to_ref));
@@ -1567,7 +1578,7 @@
       // OK.
       return;
     } else if (region_space_->IsInUnevacFromSpace(ref)) {
-      CHECK(region_space_bitmap_->Test(ref)) << ref;
+      CHECK(IsMarkedInUnevacFromSpace(ref)) << ref;
     } else if (region_space_->IsInFromSpace(ref)) {
       // Not OK. Do extra logging.
       if (obj != nullptr) {
@@ -1614,7 +1625,7 @@
       // OK.
       return;
     } else if (region_space_->IsInUnevacFromSpace(ref)) {
-      CHECK(region_space_bitmap_->Test(ref)) << ref;
+      CHECK(IsMarkedInUnevacFromSpace(ref)) << ref;
     } else if (region_space_->IsInFromSpace(ref)) {
       // Not OK. Do extra logging.
       if (gc_root_source == nullptr) {
@@ -1654,7 +1665,7 @@
     LOG(INFO) << "holder is in the to-space.";
   } else if (region_space_->IsInUnevacFromSpace(obj)) {
     LOG(INFO) << "holder is in the unevac from-space.";
-    if (region_space_bitmap_->Test(obj)) {
+    if (IsMarkedInUnevacFromSpace(obj)) {
       LOG(INFO) << "holder is marked in the region space bitmap.";
     } else {
       LOG(INFO) << "holder is not marked in the region space bitmap.";
@@ -1783,7 +1794,7 @@
   DCHECK_EQ(Thread::Current(), thread_running_gc_);
   mirror::Object* ref = obj->GetFieldObject<
       mirror::Object, kVerifyNone, kWithoutReadBarrier, false>(offset);
-  mirror::Object* to_ref = Mark</*kGrayImmuneObject*/false>(ref);
+  mirror::Object* to_ref = Mark</*kGrayImmuneObject*/false, /*kFromGCThread*/true>(ref);
   if (to_ref == ref) {
     return;
   }
@@ -2126,7 +2137,7 @@
            heap_->non_moving_space_->HasAddress(to_ref))
         << "from_ref=" << from_ref << " to_ref=" << to_ref;
   } else if (rtype == space::RegionSpace::RegionType::kRegionTypeUnevacFromSpace) {
-    if (region_space_bitmap_->Test(from_ref)) {
+    if (IsMarkedInUnevacFromSpace(from_ref)) {
       to_ref = from_ref;
     } else {
       to_ref = nullptr;
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 5b0e2d6..97f4555 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -104,7 +104,7 @@
     DCHECK(ref != nullptr);
     return IsMarked(ref) == ref;
   }
-  template<bool kGrayImmuneObject = true>
+  template<bool kGrayImmuneObject = true, bool kFromGCThread = false>
   ALWAYS_INLINE mirror::Object* Mark(mirror::Object* from_ref)
       SHARED_REQUIRES(Locks::mutator_lock_)
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
@@ -179,6 +179,8 @@
       REQUIRES(!mark_stack_lock_, !skipped_blocks_lock_, !immune_gray_stack_lock_);
   virtual mirror::Object* IsMarked(mirror::Object* from_ref) OVERRIDE
       SHARED_REQUIRES(Locks::mutator_lock_);
+  bool IsMarkedInUnevacFromSpace(mirror::Object* from_ref)
+      SHARED_REQUIRES(Locks::mutator_lock_);
   virtual bool IsMarkedHeapReference(mirror::HeapReference<mirror::Object>* field) OVERRIDE
       SHARED_REQUIRES(Locks::mutator_lock_);
   void SweepSystemWeaks(Thread* self)
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index 4505c24..c87312b 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -454,8 +454,7 @@
                                           const std::string& image_filename,
                                           bool is_zygote,
                                           bool is_global_cache,
-                                          bool is_system,
-                                          bool relocated_version_used,
+                                          bool validate_oat_file,
                                           std::string* error_msg)
       SHARED_REQUIRES(Locks::mutator_lock_) {
     // Note that we must not use the file descriptor associated with
@@ -483,7 +482,7 @@
     // file name.
     return Init(image_filename.c_str(),
                 image_location,
-                !(is_system || relocated_version_used),
+                validate_oat_file,
                 /* oat_file */nullptr,
                 error_msg);
   }
@@ -1197,9 +1196,9 @@
       for (int32_t i = 0, count = dex_caches->GetLength(); i < count; ++i) {
         mirror::DexCache* dex_cache = dex_caches->Get<kVerifyNone, kWithoutReadBarrier>(i);
         // Fix up dex cache pointers.
-        GcRoot<mirror::String>* strings = dex_cache->GetStrings();
+        mirror::StringDexCacheType* strings = dex_cache->GetStrings();
         if (strings != nullptr) {
-          GcRoot<mirror::String>* new_strings = fixup_adapter.ForwardObject(strings);
+          mirror::StringDexCacheType* new_strings = fixup_adapter.ForwardObject(strings);
           if (strings != new_strings) {
             dex_cache->SetStrings(new_strings);
           }
@@ -1473,8 +1472,7 @@
                                  cache_filename,
                                  is_zygote,
                                  is_global_cache,
-                                 /* is_system */ false,
-                                 /* relocated_version_used */ true,
+                                 /* validate_oat_file */ false,
                                  &local_error_msg);
       if (relocated_space != nullptr) {
         return relocated_space;
@@ -1491,8 +1489,7 @@
                                cache_filename,
                                is_zygote,
                                is_global_cache,
-                               /* is_system */ false,
-                               /* relocated_version_used */ true,
+                               /* validate_oat_file */ true,
                                &local_error_msg);
     if (cache_space != nullptr) {
       return cache_space;
@@ -1512,8 +1509,7 @@
                                system_filename,
                                is_zygote,
                                is_global_cache,
-                               /* is_system */ true,
-                               /* relocated_version_used */ false,
+                               /* validate_oat_file */ false,
                                &local_error_msg);
     if (system_space != nullptr) {
       return system_space;
@@ -1538,8 +1534,7 @@
                                    cache_filename,
                                    is_zygote,
                                    is_global_cache,
-                                   /* is_system */ false,
-                                   /* relocated_version_used */ true,
+                                   /* validate_oat_file */ false,
                                    &local_error_msg);
         if (patched_space != nullptr) {
           return patched_space;
@@ -1568,8 +1563,7 @@
                                    cache_filename,
                                    is_zygote,
                                    is_global_cache,
-                                   /* is_system */ false,
-                                   /* relocated_version_used */ true,
+                                   /* validate_oat_file */ false,
                                    &local_error_msg);
         if (compiled_space != nullptr) {
           return compiled_space;
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 716c23d..40b71c4 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -70,6 +70,16 @@
 DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_QUICK_CODE_OFFSET_32), (static_cast<int32_t>(art::ArtMethod:: EntryPointFromQuickCompiledCodeOffset(art::PointerSize::k32).Int32Value())))
 #define ART_METHOD_QUICK_CODE_OFFSET_64 48
 DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_QUICK_CODE_OFFSET_64), (static_cast<int32_t>(art::ArtMethod:: EntryPointFromQuickCompiledCodeOffset(art::PointerSize::k64).Int32Value())))
+#define ART_METHOD_DECLARING_CLASS_OFFSET 0
+DEFINE_CHECK_EQ(static_cast<int32_t>(ART_METHOD_DECLARING_CLASS_OFFSET), (static_cast<int32_t>(art::ArtMethod:: DeclaringClassOffset().Int32Value())))
+#define DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET 40
+DEFINE_CHECK_EQ(static_cast<int32_t>(DECLARING_CLASS_DEX_CACHE_STRINGS_OFFSET), (static_cast<int32_t>(art::mirror::Class:: DexCacheStringsOffset().Int32Value())))
+#define STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT 3
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT), (static_cast<int32_t>(art::WhichPowerOf2(sizeof(art::mirror::StringDexCachePair)))))
+#define STRING_DEX_CACHE_SIZE_MINUS_ONE 1023
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_SIZE_MINUS_ONE), (static_cast<int32_t>(art::mirror::DexCache::kDexCacheStringCacheSize - 1)))
+#define STRING_DEX_CACHE_HASH_BITS 10
+DEFINE_CHECK_EQ(static_cast<int32_t>(STRING_DEX_CACHE_HASH_BITS), (static_cast<int32_t>(art::LeastSignificantBit(art::mirror::DexCache::kDexCacheStringCacheSize))))
 #define MIN_LARGE_OBJECT_THRESHOLD 0x3000
 DEFINE_CHECK_EQ(static_cast<size_t>(MIN_LARGE_OBJECT_THRESHOLD), (static_cast<size_t>(art::gc::Heap::kMinLargeObjectThreshold)))
 #define LOCK_WORD_STATE_SHIFT 30
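
Quick arithmetic check of the new generated constants, which together encode a 1024-entry string cache of 8-byte pairs (the check itself is illustrative, not part of the generated header):

#include <cstdint>

constexpr uint32_t kCacheSize = 1024;  // kDexCacheStringCacheSize
static_assert(kCacheSize - 1 == 1023, "STRING_DEX_CACHE_SIZE_MINUS_ONE");
static_assert(__builtin_ctz(kCacheSize) == 10, "STRING_DEX_CACHE_HASH_BITS");
static_assert(sizeof(uint64_t) == (1u << 3), "STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT: 8-byte pairs");
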
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 9895395..4005f05 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -223,6 +223,12 @@
     HandleU1List(values, count);
     length_ += count;
   }
+  void AddU1AsU2List(const uint8_t* values, size_t count) {
+    HandleU1AsU2List(values, count);
+    // A compressed String's char array (8-bit values) is added as 16-bit blocks.
+    int ceil_count_to_even = count + ((count & 1) ? 1 : 0);
+    length_ += ceil_count_to_even * sizeof(uint8_t);
+  }
   void AddU2List(const uint16_t* values, size_t count) {
     HandleU2List(values, count);
     length_ += count * sizeof(uint16_t);
@@ -268,6 +274,9 @@
   virtual void HandleU1List(const uint8_t* values ATTRIBUTE_UNUSED,
                             size_t count ATTRIBUTE_UNUSED) {
   }
+  virtual void HandleU1AsU2List(const uint8_t* values ATTRIBUTE_UNUSED,
+                                size_t count ATTRIBUTE_UNUSED) {
+  }
   virtual void HandleU2List(const uint16_t* values ATTRIBUTE_UNUSED,
                             size_t count ATTRIBUTE_UNUSED) {
   }
@@ -308,6 +317,19 @@
     buffer_.insert(buffer_.end(), values, values + count);
   }
 
+  void HandleU1AsU2List(const uint8_t* values, size_t count) OVERRIDE {
+    DCHECK_EQ(length_, buffer_.size());
+    // The 8-bit values are grouped in pairs to form 16-bit blocks, like a Java char.
+    if (count & 1) {
+      buffer_.push_back(0);
+    }
+    for (size_t i = 0; i < count; ++i) {
+      uint8_t value = *values;
+      buffer_.push_back(value);
+      values++;
+    }
+  }
+
   void HandleU2List(const uint16_t* values, size_t count) OVERRIDE {
     DCHECK_EQ(length_, buffer_.size());
     for (size_t i = 0; i < count; ++i) {
@@ -1354,7 +1376,11 @@
         string_value = reinterpret_cast<mirror::Object*>(
             reinterpret_cast<uintptr_t>(s) + kObjectAlignment);
       } else {
-        string_value = reinterpret_cast<mirror::Object*>(s->GetValue());
+        if (s->IsCompressed()) {
+          string_value = reinterpret_cast<mirror::Object*>(s->GetValueCompressed());
+        } else {
+          string_value = reinterpret_cast<mirror::Object*>(s->GetValue());
+        }
       }
       __ AddObjectId(string_value);
     }
@@ -1369,12 +1395,18 @@
   CHECK_EQ(obj->IsString(), string_value != nullptr);
   if (string_value != nullptr) {
     mirror::String* s = obj->AsString();
+    // A compressed (8-bit) string's length is ceil(length/2) when counted in 16-bit blocks.
+    int length_in_16_bit = (s->IsCompressed()) ? ((s->GetLength() + 1) / 2) : s->GetLength();
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddObjectId(string_value);
     __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
-    __ AddU4(s->GetLength());
+    __ AddU4(length_in_16_bit);
     __ AddU1(hprof_basic_char);
-    __ AddU2List(s->GetValue(), s->GetLength());
+    if (s->IsCompressed()) {
+      __ AddU1AsU2List(s->GetValueCompressed(), s->GetLength());
+    } else {
+      __ AddU2List(s->GetValue(), s->GetLength());
+    }
   }
 }
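
Worked example of the length computation above: a compressed 5-character string is dumped as ceil(5/2) == 3 sixteen-bit array elements, with one padding byte keeping the byte count even.

constexpr int kCharCount = 5;
constexpr int kLengthIn16Bit = (kCharCount + 1) / 2;  // matches the expression above
static_assert(kLengthIn16Bit == 3, "5 compressed chars -> 3 hprof char elements");
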
 
diff --git a/runtime/intern_table.cc b/runtime/intern_table.cc
index eceb593..1940d67 100644
--- a/runtime/intern_table.cc
+++ b/runtime/intern_table.cc
@@ -386,8 +386,23 @@
   if (a_length != b.GetUtf16Length()) {
     return false;
   }
-  const uint16_t* a_value = a_string->GetValue();
-  return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
+  if (a_string->IsCompressed()) {
+    size_t b_byte_count = strlen(b.GetUtf8Data());
+    size_t b_utf8_length = CountModifiedUtf8Chars(b.GetUtf8Data(), b_byte_count);
+    // The modified UTF-8 single-byte character range is 0x01 .. 0x7f.
+    // String compression applies to regular ASCII with that exact same range,
+    // not to extended ASCII, which goes up to 0xff.
+    const bool is_b_regular_ascii = (b_byte_count == b_utf8_length);
+    if (is_b_regular_ascii) {
+      return memcmp(b.GetUtf8Data(),
+                    a_string->GetValueCompressed(), a_length * sizeof(uint8_t)) == 0;
+    } else {
+      return false;
+    }
+  } else {
+    const uint16_t* a_value = a_string->GetValue();
+    return CompareModifiedUtf8ToUtf16AsCodePointValues(b.GetUtf8Data(), a_value, a_length) == 0;
+  }
 }
 
 size_t InternTable::Table::AddTableFromMemory(const uint8_t* ptr) {
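
A sketch of the ASCII test used above (the wrapper is invented; CountModifiedUtf8Chars is the existing art:: helper whose two-argument form the change already uses): modified UTF-8 encodes every code point in 0x01..0x7f as a single byte, so a string is plain ASCII exactly when its byte length equals its decoded character count.

#include <cstddef>

size_t CountModifiedUtf8Chars(const char* utf8, size_t byte_count);  // existing art:: helper

bool IsPlainAscii(const char* utf8, size_t byte_count) {
  return CountModifiedUtf8Chars(utf8, byte_count) == byte_count;
}
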
diff --git a/runtime/interpreter/interpreter.cc b/runtime/interpreter/interpreter.cc
index f1f7f42..101c9a1 100644
--- a/runtime/interpreter/interpreter.cc
+++ b/runtime/interpreter/interpreter.cc
@@ -20,6 +20,9 @@
 
 #include "common_throws.h"
 #include "interpreter_common.h"
+#include "interpreter_goto_table_impl.h"
+#include "interpreter_mterp_impl.h"
+#include "interpreter_switch_impl.h"
 #include "mirror/string-inl.h"
 #include "scoped_thread_state_change.h"
 #include "ScopedLocalRef.h"
@@ -242,28 +245,6 @@
 
 static constexpr InterpreterImplKind kInterpreterImplKind = kMterpImplKind;
 
-#if defined(__clang__)
-// Clang 3.4 fails to build the goto interpreter implementation.
-template<bool do_access_check, bool transaction_active>
-JValue ExecuteGotoImpl(Thread*, const DexFile::CodeItem*, ShadowFrame&, JValue) {
-  LOG(FATAL) << "UNREACHABLE";
-  UNREACHABLE();
-}
-// Explicit definitions of ExecuteGotoImpl.
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
-                                    ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
-                                     ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<true, true>(Thread* self,  const DexFile::CodeItem* code_item,
-                                   ShadowFrame& shadow_frame, JValue result_register);
-template<> SHARED_REQUIRES(Locks::mutator_lock_)
-JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
-                                    ShadowFrame& shadow_frame, JValue result_register);
-#endif
-
 static inline JValue Execute(
     Thread* self,
     const DexFile::CodeItem* code_item,
diff --git a/runtime/interpreter/interpreter_common.h b/runtime/interpreter/interpreter_common.h
index 4fd1514..7b38473 100644
--- a/runtime/interpreter/interpreter_common.h
+++ b/runtime/interpreter/interpreter_common.h
@@ -23,6 +23,7 @@
 
 #include <iostream>
 #include <sstream>
+#include <atomic>
 
 #include "art_field-inl.h"
 #include "art_method-inl.h"
@@ -37,6 +38,8 @@
 #include "handle_scope-inl.h"
 #include "jit/jit.h"
 #include "mirror/class-inl.h"
+#include "mirror/dex_cache.h"
+#include "mirror/method.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
 #include "mirror/string-inl.h"
@@ -62,21 +65,6 @@
 namespace art {
 namespace interpreter {
 
-// External references to all interpreter implementations.
-
-template<bool do_access_check, bool transaction_active>
-extern JValue ExecuteSwitchImpl(Thread* self, const DexFile::CodeItem* code_item,
-                                ShadowFrame& shadow_frame, JValue result_register,
-                                bool interpret_one_instruction);
-
-template<bool do_access_check, bool transaction_active>
-extern JValue ExecuteGotoImpl(Thread* self, const DexFile::CodeItem* code_item,
-                              ShadowFrame& shadow_frame, JValue result_register);
-
-// Mterp does not support transactions or access check, thus no templated versions.
-extern "C" bool ExecuteMterpImpl(Thread* self, const DexFile::CodeItem* code_item,
-                                 ShadowFrame* shadow_frame, JValue* result_register);
-
 void ThrowNullPointerExceptionFromInterpreter()
     SHARED_REQUIRES(Locks::mutator_lock_);
 
@@ -264,15 +252,20 @@
   ArtMethod* method = shadow_frame.GetMethod();
   mirror::Class* declaring_class = method->GetDeclaringClass();
   // MethodVerifier refuses methods with string_idx out of bounds.
-  DCHECK_LT(string_idx, declaring_class->GetDexCache()->NumStrings());
-  mirror::String* s = declaring_class->GetDexCacheStrings()[string_idx].Read();
-  if (UNLIKELY(s == nullptr)) {
+  DCHECK_LT(string_idx % mirror::DexCache::kDexCacheStringCacheSize,
+            declaring_class->GetDexFile().NumStringIds());
+  mirror::String* string_ptr =
+      mirror::StringDexCachePair::LookupString(declaring_class->GetDexCacheStrings(),
+                                               string_idx,
+                                               mirror::DexCache::kDexCacheStringCacheSize).Read();
+  if (UNLIKELY(string_ptr == nullptr)) {
     StackHandleScope<1> hs(self);
     Handle<mirror::DexCache> dex_cache(hs.NewHandle(declaring_class->GetDexCache()));
-    s = Runtime::Current()->GetClassLinker()->ResolveString(*method->GetDexFile(), string_idx,
-                                                            dex_cache);
+    string_ptr = Runtime::Current()->GetClassLinker()->ResolveString(*method->GetDexFile(),
+                                                                     string_idx,
+                                                                     dex_cache);
   }
-  return s;
+  return string_ptr;
 }
 
 // Handles div-int, div-int/2addr, div-int/li16 and div-int/lit8 instructions.
@@ -442,7 +435,7 @@
       oss << StringPrintf(" vreg%u=0x%08X", i, raw_value);
       if (ref_value != nullptr) {
         if (ref_value->GetClass()->IsStringClass() &&
-            ref_value->AsString()->GetValue() != nullptr) {
+            !ref_value->AsString()->IsValueNull()) {
           oss << "/java.lang.String \"" << ref_value->AsString()->ToModifiedUtf8() << "\"";
         } else {
           oss << "/" << PrettyTypeOf(ref_value);
diff --git a/runtime/interpreter/interpreter_goto_table_impl.cc b/runtime/interpreter/interpreter_goto_table_impl.cc
index 43b2778..37dd63b 100644
--- a/runtime/interpreter/interpreter_goto_table_impl.cc
+++ b/runtime/interpreter/interpreter_goto_table_impl.cc
@@ -14,18 +14,29 @@
  * limitations under the License.
  */
 
+#include "interpreter_goto_table_impl.h"
+
+// Common includes
+#include "base/logging.h"
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "stack.h"
+#include "thread.h"
+
+// Clang compiles the GOTO interpreter very slowly, so we skip it. These includes are only
+// needed when the implementation is actually compiled.
 #if !defined(__clang__)
-// Clang 3.4 fails to build the goto interpreter implementation.
-
-
 #include "experimental_flags.h"
 #include "interpreter_common.h"
 #include "jit/jit.h"
 #include "safe_math.h"
+#endif
 
 namespace art {
 namespace interpreter {
 
+#if !defined(__clang__)
+
 // In the following macros, we expect the following local variables to exist:
 // - "self": the current Thread*.
 // - "inst" : the current Instruction*.
@@ -530,8 +541,7 @@
     if (LIKELY(c != nullptr)) {
       if (UNLIKELY(c->IsStringClass())) {
         gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-        mirror::SetStringCountVisitor visitor(0);
-        obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+        obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
       } else {
         obj = AllocObjectFromCode<do_access_check, true>(
             inst->VRegB_21c(), shadow_frame.GetMethod(), self,
@@ -2558,20 +2568,40 @@
 }  // NOLINT(readability/fn_size)
 
 // Explicit definitions of ExecuteGotoImpl.
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
                                     ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
                                      ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteGotoImpl<true, true>(Thread* self, const DexFile::CodeItem* code_item,
                                    ShadowFrame& shadow_frame, JValue result_register);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
                                     ShadowFrame& shadow_frame, JValue result_register);
 
+#else
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteGotoImpl(Thread*, const DexFile::CodeItem*, ShadowFrame&, JValue) {
+  LOG(FATAL) << "UNREACHABLE";
+  UNREACHABLE();
+}
+// Explicit definitions of ExecuteGotoImpl.
+template<>
+JValue ExecuteGotoImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
+                                    ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
+                                     ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<true, true>(Thread* self,  const DexFile::CodeItem* code_item,
+                                   ShadowFrame& shadow_frame, JValue result_register);
+template<>
+JValue ExecuteGotoImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
+                                    ShadowFrame& shadow_frame, JValue result_register);
+#endif
+
 }  // namespace interpreter
 }  // namespace art
-
-#endif
diff --git a/runtime/interpreter/interpreter_goto_table_impl.h b/runtime/interpreter/interpreter_goto_table_impl.h
new file mode 100644
index 0000000..bb9be88
--- /dev/null
+++ b/runtime/interpreter/interpreter_goto_table_impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteGotoImpl(Thread* self,
+                       const DexFile::CodeItem* code_item,
+                       ShadowFrame& shadow_frame,
+                       JValue result_register) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_GOTO_TABLE_IMPL_H_
diff --git a/runtime/interpreter/interpreter_mterp_impl.h b/runtime/interpreter/interpreter_mterp_impl.h
new file mode 100644
index 0000000..322df4e
--- /dev/null
+++ b/runtime/interpreter/interpreter_mterp_impl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+// Mterp does not support transactions or access check, thus no templated versions.
+extern "C" bool ExecuteMterpImpl(Thread* self,
+                                 const DexFile::CodeItem* code_item,
+                                 ShadowFrame* shadow_frame,
+                                 JValue* result_register) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_MTERP_IMPL_H_
diff --git a/runtime/interpreter/interpreter_switch_impl.cc b/runtime/interpreter/interpreter_switch_impl.cc
index a6349fc..227130e 100644
--- a/runtime/interpreter/interpreter_switch_impl.cc
+++ b/runtime/interpreter/interpreter_switch_impl.cc
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "interpreter_switch_impl.h"
+
 #include "base/enums.h"
 #include "experimental_flags.h"
 #include "interpreter_common.h"
@@ -477,8 +479,7 @@
         if (LIKELY(c != nullptr)) {
           if (UNLIKELY(c->IsStringClass())) {
             gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-            mirror::SetStringCountVisitor visitor(0);
-            obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+            obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
           } else {
             obj = AllocObjectFromCode<do_access_check, true>(
               inst->VRegB_21c(), shadow_frame.GetMethod(), self,
@@ -622,10 +623,8 @@
         break;
       }
 
-#if defined(__clang__)
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wfloat-equal"
-#endif
 
       case Instruction::CMPL_FLOAT: {
         PREAMBLE();
@@ -693,9 +692,7 @@
         break;
       }
 
-#if defined(__clang__)
 #pragma clang diagnostic pop
-#endif
 
       case Instruction::CMP_LONG: {
         PREAMBLE();
@@ -2337,19 +2334,19 @@
 }  // NOLINT(readability/fn_size)
 
 // Explicit definitions of ExecuteSwitchImpl.
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteSwitchImpl<true, false>(Thread* self, const DexFile::CodeItem* code_item,
                                       ShadowFrame& shadow_frame, JValue result_register,
                                       bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_) HOT_ATTR
+template HOT_ATTR
 JValue ExecuteSwitchImpl<false, false>(Thread* self, const DexFile::CodeItem* code_item,
                                        ShadowFrame& shadow_frame, JValue result_register,
                                        bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteSwitchImpl<true, true>(Thread* self, const DexFile::CodeItem* code_item,
                                      ShadowFrame& shadow_frame, JValue result_register,
                                      bool interpret_one_instruction);
-template SHARED_REQUIRES(Locks::mutator_lock_)
+template
 JValue ExecuteSwitchImpl<false, true>(Thread* self, const DexFile::CodeItem* code_item,
                                       ShadowFrame& shadow_frame, JValue result_register,
                                       bool interpret_one_instruction);
diff --git a/runtime/interpreter/interpreter_switch_impl.h b/runtime/interpreter/interpreter_switch_impl.h
new file mode 100644
index 0000000..90ec908
--- /dev/null
+++ b/runtime/interpreter/interpreter_switch_impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
+#define ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
+
+#include "base/macros.h"
+#include "base/mutex.h"
+#include "dex_file.h"
+#include "jvalue.h"
+
+namespace art {
+
+class ShadowFrame;
+class Thread;
+
+namespace interpreter {
+
+template<bool do_access_check, bool transaction_active>
+JValue ExecuteSwitchImpl(Thread* self,
+                         const DexFile::CodeItem* code_item,
+                         ShadowFrame& shadow_frame,
+                         JValue result_register,
+                         bool interpret_one_instruction) SHARED_REQUIRES(Locks::mutator_lock_);
+
+}  // namespace interpreter
+}  // namespace art
+
+#endif  // ART_RUNTIME_INTERPRETER_INTERPRETER_SWITCH_IMPL_H_
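
The new interpreter_*_impl.h headers keep the execute functions as templates: ExecuteSwitchImpl and ExecuteGotoImpl are declared once in their headers and the four <do_access_check, transaction_active> combinations are explicitly instantiated in the corresponding .cc files so other translation units can link against them. A minimal standalone sketch of that declare/instantiate split, with hypothetical names rather than ART code:

    // sketch.h
    template <bool kDoAccessCheck, bool kTransactionActive>
    int ExecuteSketch(int input);

    // sketch.cc
    template <bool kDoAccessCheck, bool kTransactionActive>
    int ExecuteSketch(int input) {
      // Stand-in body; the template parameters select compile-time behavior.
      return input + (kDoAccessCheck ? 1 : 0) + (kTransactionActive ? 2 : 0);
    }

    // Explicit instantiations: exactly the combinations callers may link against.
    template int ExecuteSketch<true, false>(int);
    template int ExecuteSketch<false, false>(int);
    template int ExecuteSketch<true, true>(int);
    template int ExecuteSketch<false, true>(int);
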
diff --git a/runtime/interpreter/mterp/mterp.cc b/runtime/interpreter/mterp/mterp.cc
index c25cd78..20a0753 100644
--- a/runtime/interpreter/mterp/mterp.cc
+++ b/runtime/interpreter/mterp/mterp.cc
@@ -358,8 +358,7 @@
   if (LIKELY(c != nullptr)) {
     if (UNLIKELY(c->IsStringClass())) {
       gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-      mirror::SetStringCountVisitor visitor(0);
-      obj = String::Alloc<true>(self, 0, allocator_type, visitor);
+      obj = mirror::String::AllocEmptyString<true>(self, allocator_type);
     } else {
       obj = AllocObjectFromCode<false, true>(
         inst->VRegB_21c(), shadow_frame->GetMethod(), self,
diff --git a/runtime/interpreter/unstarted_runtime_test.cc b/runtime/interpreter/unstarted_runtime_test.cc
index 7e1f795..c324600 100644
--- a/runtime/interpreter/unstarted_runtime_test.cc
+++ b/runtime/interpreter/unstarted_runtime_test.cc
@@ -401,8 +401,23 @@
   interpreter::DoCall<false, false>(method, self, *shadow_frame, inst, inst_data[0], &result);
   mirror::String* string_result = reinterpret_cast<mirror::String*>(result.GetL());
   EXPECT_EQ(string_arg->GetLength(), string_result->GetLength());
-  EXPECT_EQ(memcmp(string_arg->GetValue(), string_result->GetValue(),
-                   string_arg->GetLength() * sizeof(uint16_t)), 0);
+
+  if (string_arg->IsCompressed() && string_result->IsCompressed()) {
+    EXPECT_EQ(memcmp(string_arg->GetValueCompressed(), string_result->GetValueCompressed(),
+                     string_arg->GetLength() * sizeof(uint8_t)), 0);
+  } else if (!string_arg->IsCompressed() && !string_result->IsCompressed()) {
+    EXPECT_EQ(memcmp(string_arg->GetValue(), string_result->GetValue(),
+                     string_arg->GetLength() * sizeof(uint16_t)), 0);
+  } else {
+    bool equal = true;
+    for (int i = 0; i < string_arg->GetLength(); ++i) {
+      if (string_arg->CharAt(i) != string_result->CharAt(i)) {
+        equal = false;
+        break;
+      }
+    }
+    EXPECT_TRUE(equal);
+  }
 
   ShadowFrame::DeleteDeoptimizedFrame(shadow_frame);
 }
diff --git a/runtime/jdwp/jdwp_bits.h b/runtime/jdwp/jdwp_bits.h
index f9cf9ca..33b98f3 100644
--- a/runtime/jdwp/jdwp_bits.h
+++ b/runtime/jdwp/jdwp_bits.h
@@ -59,13 +59,22 @@
   bytes.push_back(static_cast<uint8_t>(value));
 }
 
-static inline void AppendUtf16BE(std::vector<uint8_t>& bytes, const uint16_t* chars, size_t char_count) {
+static inline void AppendUtf16BE(std::vector<uint8_t>& bytes, const uint16_t* chars,
+                                 size_t char_count) {
   Append4BE(bytes, char_count);
   for (size_t i = 0; i < char_count; ++i) {
     Append2BE(bytes, chars[i]);
   }
 }
 
+static inline void AppendUtf16CompressedBE(std::vector<uint8_t>& bytes,
+                                           const uint8_t* chars, size_t char_count) {
+  Append4BE(bytes, char_count);
+  for (size_t i = 0; i < char_count; ++i) {
+    Append2BE(bytes, static_cast<uint16_t>(chars[i]));
+  }
+}
+
 // @deprecated
 static inline void Set1(uint8_t* buf, uint8_t val) {
   *buf = val;
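
AppendUtf16CompressedBE mirrors AppendUtf16BE for compressed strings: each 8-bit character is widened to a 16-bit big-endian code unit after the 4-byte big-endian length, so the JDWP wire format is identical whether or not the string is stored compressed. A self-contained sketch of the resulting byte layout (function name is illustrative only):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint8_t> Utf16BEFromCompressed(const uint8_t* chars, size_t char_count) {
      std::vector<uint8_t> bytes;
      bytes.push_back((char_count >> 24) & 0xFF);  // 4-byte big-endian character count
      bytes.push_back((char_count >> 16) & 0xFF);
      bytes.push_back((char_count >> 8) & 0xFF);
      bytes.push_back(char_count & 0xFF);
      for (size_t i = 0; i < char_count; ++i) {
        bytes.push_back(0x00);      // high byte of the widened UTF-16 code unit
        bytes.push_back(chars[i]);  // low byte: the stored 8-bit character
      }
      return bytes;
    }
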
diff --git a/runtime/jit/profile_saver.cc b/runtime/jit/profile_saver.cc
index b35c958..927681c 100644
--- a/runtime/jit/profile_saver.cc
+++ b/runtime/jit/profile_saver.cc
@@ -63,19 +63,6 @@
       options_(options) {
   DCHECK(options_.IsEnabled());
   AddTrackedLocations(output_filename, app_data_dir, code_paths);
-  if (!app_data_dir.empty()) {
-    // The application directory is used to determine which dex files are owned by app.
-    // Since it could be a symlink (e.g. /data/data instead of /data/user/0), and we
-    // don't have control over how the dex files are actually loaded (symlink or canonical path),
-    // store it's canonical form to be sure we use the same base when comparing.
-    UniqueCPtr<const char[]> app_data_dir_real_path(realpath(app_data_dir.c_str(), nullptr));
-    if (app_data_dir_real_path != nullptr) {
-      app_data_dirs_.emplace(app_data_dir_real_path.get());
-    } else {
-      LOG(WARNING) << "Failed to get the real path for app dir: " << app_data_dir
-          << ". The app dir will not be used to determine which dex files belong to the app";
-    }
-  }
 }
 
 void ProfileSaver::Run() {
@@ -498,12 +485,18 @@
   if (it == tracked_dex_base_locations_.end()) {
     tracked_dex_base_locations_.Put(output_filename,
                                     std::set<std::string>(code_paths.begin(), code_paths.end()));
-    app_data_dirs_.insert(app_data_dir);
+    if (!app_data_dir.empty()) {
+      app_data_dirs_.insert(app_data_dir);
+    }
   } else {
     it->second.insert(code_paths.begin(), code_paths.end());
   }
 }
 
+// TODO(calin): This may lead to several calls to realpath.
+// Consider moving the logic to the saver thread (i.e. when notified,
+// only cache the location, and then wake up the saver thread to do the
+// comparisons with the real file paths and to create the markers).
 void ProfileSaver::NotifyDexUse(const std::string& dex_location) {
   if (!ShouldProfileLocation(dex_location)) {
     return;
@@ -536,63 +529,32 @@
   }
 }
 
-bool ProfileSaver::MaybeRecordDexUseInternal(
-      const std::string& dex_location,
-      const std::set<std::string>& app_code_paths,
-      const std::string& foreign_dex_profile_path,
-      const std::set<std::string>& app_data_dirs) {
-  if (dex_location.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use with an empty dex location.";
-    return false;
-  }
-  if (foreign_dex_profile_path.empty()) {
-    LOG(WARNING) << "Asked to record foreign dex use without a valid profile path ";
-    return false;
-  }
-
-  UniqueCPtr<const char[]> dex_location_real_path(realpath(dex_location.c_str(), nullptr));
-  if (dex_location_real_path == nullptr) {
-    PLOG(WARNING) << "Could not get realpath for " << dex_location;
-  }
-  std::string dex_location_real_path_str((dex_location_real_path == nullptr)
-    ? dex_location.c_str()
-    : dex_location_real_path.get());
-
-  if (app_data_dirs.find(dex_location_real_path_str) != app_data_dirs.end()) {
-    // The dex location is under the application folder. Nothing to record.
-    return false;
-  }
-
-  if (app_code_paths.find(dex_location) != app_code_paths.end()) {
-    // The dex location belongs to the application code paths. Nothing to record.
-    return false;
-  }
-  // Do another round of checks with the real paths.
-  // Note that we could cache all the real locations in the saver (since it's an expensive
-  // operation). However we expect that app_code_paths is small (usually 1 element), and
-  // NotifyDexUse is called just a few times in the app lifetime. So we make the compromise
-  // to save some bytes of memory usage.
-  for (const auto& app_code_location : app_code_paths) {
-    UniqueCPtr<const char[]> real_app_code_location(realpath(app_code_location.c_str(), nullptr));
-    if (real_app_code_location == nullptr) {
-      PLOG(WARNING) << "Could not get realpath for " << app_code_location;
+static bool CheckContainsWithRealPath(const std::set<std::string>& paths_set,
+                                      const std::string& path_to_check) {
+  for (const auto& path : paths_set) {
+    UniqueCPtr<const char[]> real_path(realpath(path.c_str(), nullptr));
+    if (real_path == nullptr) {
+      PLOG(WARNING) << "Could not get realpath for " << path;
+      continue;
     }
-    std::string real_app_code_location_str((real_app_code_location == nullptr)
-        ? app_code_location.c_str()
-        : real_app_code_location.get());
-    if (real_app_code_location_str == dex_location_real_path_str) {
-      // The dex location belongs to the application code paths. Nothing to record.
-      return false;
+    std::string real_path_str(real_path.get());
+    if (real_path_str == path_to_check) {
+      return true;
     }
   }
+  return false;
+}
 
+// After the call, dex_location_real_path will contain the marker's name.
+static bool CreateForeignDexMarker(const std::string& foreign_dex_profile_path,
+                                   /*in-out*/ std::string* dex_location_real_path) {
   // For foreign dex files we record a flag on disk. PackageManager will (potentially) take this
   // into account when deciding how to optimize the loaded dex file.
   // The expected flag name is the canonical path of the apk where '/' is substituted to '@'.
   // (it needs to be kept in sync with
   // frameworks/base/services/core/java/com/android/server/pm/PackageDexOptimizer.java)
-  std::replace(dex_location_real_path_str.begin(), dex_location_real_path_str.end(), '/', '@');
-  std::string flag_path = foreign_dex_profile_path + "/" + dex_location_real_path_str;
+  std::replace(dex_location_real_path->begin(), dex_location_real_path->end(), '/', '@');
+  std::string flag_path = foreign_dex_profile_path + "/" + *dex_location_real_path;
   // We use O_RDONLY as the access mode because we must supply some access
   // mode, and there is no access mode that means 'create but do not read' the
   // file. We will not actually read from the file.
@@ -614,6 +576,57 @@
   }
 }
 
+bool ProfileSaver::MaybeRecordDexUseInternal(
+      const std::string& dex_location,
+      const std::set<std::string>& app_code_paths,
+      const std::string& foreign_dex_profile_path,
+      const std::set<std::string>& app_data_dirs) {
+  if (dex_location.empty()) {
+    LOG(WARNING) << "Asked to record foreign dex use with an empty dex location.";
+    return false;
+  }
+  if (foreign_dex_profile_path.empty()) {
+    LOG(WARNING) << "Asked to record foreign dex use without a valid profile path ";
+    return false;
+  }
+
+  if (app_code_paths.find(dex_location) != app_code_paths.end()) {
+    // The dex location belongs to the application code paths. Nothing to record.
+    return false;
+  }
+
+  if (app_data_dirs.find(dex_location) != app_data_dirs.end()) {
+    // The dex location is under the application folder. Nothing to record.
+    return false;
+  }
+
+  // Do another round of checks with the real paths.
+  // Application directory could be a symlink (e.g. /data/data instead of /data/user/0), and we
+  // don't have control over how the dex files are actually loaded (symlink or canonical path),
+
+  // Note that we could cache all the real locations in the saver (since it's an expensive
+  // operation). However we expect that app_code_paths is small (usually 1 element), and
+  // NotifyDexUse is called just a few times in the app lifetime. So we make the compromise
+  // to save some bytes of memory usage.
+
+  UniqueCPtr<const char[]> dex_location_real_path(realpath(dex_location.c_str(), nullptr));
+  if (dex_location_real_path == nullptr) {
+    PLOG(WARNING) << "Could not get realpath for " << dex_location;
+    return false;
+  }
+  std::string dex_location_real_path_str(dex_location_real_path.get());
+
+  if (CheckContainsWithRealPath(app_code_paths, dex_location_real_path_str)) {
+    return false;
+  }
+
+  if (CheckContainsWithRealPath(app_data_dirs, dex_location_real_path_str)) {
+    return false;
+  }
+
+  return CreateForeignDexMarker(foreign_dex_profile_path, &dex_location_real_path_str);
+}
+
 void ProfileSaver::DumpInstanceInfo(std::ostream& os) {
   MutexLock mu(Thread::Current(), *Locks::profiler_lock_);
   if (instance_ != nullptr) {
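
The refactored CreateForeignDexMarker keeps the existing on-disk contract: the marker file name is the canonical path of the apk with every '/' replaced by '@', created under the foreign dex profile directory. A minimal sketch of just that transform (function name is hypothetical):

    #include <algorithm>
    #include <string>

    std::string MarkerNameForCanonicalPath(std::string canonical_path) {
      // e.g. "/data/app/foo/base.apk" -> "@data@app@foo@base.apk"
      std::replace(canonical_path.begin(), canonical_path.end(), '/', '@');
      return canonical_path;
    }
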
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index c322475..7bcadd8 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -592,9 +592,8 @@
     }
     if (c->IsStringClass()) {
       gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-      mirror::SetStringCountVisitor visitor(0);
-      return soa.AddLocalReference<jobject>(mirror::String::Alloc<true>(soa.Self(), 0,
-                                                                        allocator_type, visitor));
+      return soa.AddLocalReference<jobject>(
+          mirror::String::AllocEmptyString<true>(soa.Self(), allocator_type));
     }
     return soa.AddLocalReference<jobject>(c->AllocObject(soa.Self()));
   }
@@ -1673,8 +1672,14 @@
       ThrowSIOOBE(soa, start, length, s->GetLength());
     } else {
       CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf);
-      const jchar* chars = s->GetValue();
-      memcpy(buf, chars + start, length * sizeof(jchar));
+      if (s->IsCompressed()) {
+        for (int i = 0; i < length; ++i) {
+          buf[i] = static_cast<jchar>(s->CharAt(start+i));
+        }
+      } else {
+        const jchar* chars = static_cast<jchar*>(s->GetValue());
+        memcpy(buf, chars + start, length * sizeof(jchar));
+      }
     }
   }
 
@@ -1687,9 +1692,15 @@
       ThrowSIOOBE(soa, start, length, s->GetLength());
     } else {
       CHECK_NON_NULL_MEMCPY_ARGUMENT(length, buf);
-      const jchar* chars = s->GetValue();
-      size_t bytes = CountUtf8Bytes(chars + start, length);
-      ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
+      if (s->IsCompressed()) {
+        for (int i = 0; i < length; ++i) {
+          buf[i] = s->CharAt(start+i);
+        }
+      } else {
+        const jchar* chars = s->GetValue();
+        size_t bytes = CountUtf8Bytes(chars + start, length);
+        ConvertUtf16ToModifiedUtf8(buf, bytes, chars + start, length);
+      }
     }
   }
 
@@ -1698,9 +1709,16 @@
     ScopedObjectAccess soa(env);
     mirror::String* s = soa.Decode<mirror::String*>(java_string);
     gc::Heap* heap = Runtime::Current()->GetHeap();
-    if (heap->IsMovableObject(s)) {
+    if (heap->IsMovableObject(s) || s->IsCompressed()) {
       jchar* chars = new jchar[s->GetLength()];
-      memcpy(chars, s->GetValue(), sizeof(jchar) * s->GetLength());
+      if (s->IsCompressed()) {
+        int32_t length = s->GetLength();
+        for (int i = 0; i < length; ++i) {
+          chars[i] = s->CharAt(i);
+        }
+      } else {
+        memcpy(chars, s->GetValue(), sizeof(jchar) * s->GetLength());
+      }
       if (is_copy != nullptr) {
         *is_copy = JNI_TRUE;
       }
@@ -1716,7 +1734,7 @@
     CHECK_NON_NULL_ARGUMENT_RETURN_VOID(java_string);
     ScopedObjectAccess soa(env);
     mirror::String* s = soa.Decode<mirror::String*>(java_string);
-    if (chars != s->GetValue()) {
+    if (s->IsCompressed() || chars != s->GetValue()) {
       delete[] chars;
     }
   }
@@ -1737,15 +1755,27 @@
         heap->IncrementDisableThreadFlip(soa.Self());
       }
     }
-    if (is_copy != nullptr) {
-      *is_copy = JNI_FALSE;
+    if (s->IsCompressed()) {
+      if (is_copy != nullptr) {
+        *is_copy = JNI_TRUE;
+      }
+      int32_t length = s->GetLength();
+      jchar* chars = new jchar[length];
+      for (int i = 0; i < length; ++i) {
+        chars[i] = s->CharAt(i);
+      }
+      return chars;
+    } else {
+      if (is_copy != nullptr) {
+        *is_copy = JNI_FALSE;
+      }
+      return static_cast<jchar*>(s->GetValue());
     }
-    return static_cast<jchar*>(s->GetValue());
   }
 
   static void ReleaseStringCritical(JNIEnv* env,
                                     jstring java_string,
-                                    const jchar* chars ATTRIBUTE_UNUSED) {
+                                    const jchar* chars) {
     CHECK_NON_NULL_ARGUMENT_RETURN_VOID(java_string);
     ScopedObjectAccess soa(env);
     gc::Heap* heap = Runtime::Current()->GetHeap();
@@ -1757,6 +1787,9 @@
         heap->DecrementDisableThreadFlip(soa.Self());
       }
     }
+    if (s->IsCompressed() || s->GetValue() != chars) {
+      delete[] chars;
+    }
   }
 
   static const char* GetStringUTFChars(JNIEnv* env, jstring java_string, jboolean* is_copy) {
@@ -1771,8 +1804,14 @@
     size_t byte_count = s->GetUtfLength();
     char* bytes = new char[byte_count + 1];
     CHECK(bytes != nullptr);  // bionic aborts anyway.
-    const uint16_t* chars = s->GetValue();
-    ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+    if (s->IsCompressed()) {
+      for (size_t i = 0; i < byte_count; ++i) {
+        bytes[i] = s->CharAt(i);
+      }
+    } else {
+      const uint16_t* chars = s->GetValue();
+      ConvertUtf16ToModifiedUtf8(bytes, byte_count, chars, s->GetLength());
+    }
     bytes[byte_count] = '\0';
     return bytes;
   }
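
With compression enabled, GetStringChars, GetStringCritical and GetStringUTFChars may hand back a freshly allocated copy (reporting is_copy == JNI_TRUE) instead of a pointer into the String's backing store, and the matching Release calls free that copy. Well-behaved callers are unaffected as long as every Get is paired with a Release; a hedged usage sketch (function name is illustrative):

    #include <jni.h>

    // Returns the first UTF-16 unit of a non-empty Java string, 0 otherwise.
    jchar FirstChar(JNIEnv* env, jstring s) {
      if (env->GetStringLength(s) == 0) {
        return 0;
      }
      jboolean is_copy;  // may come back JNI_TRUE for compressed strings
      const jchar* chars = env->GetStringCritical(s, &is_copy);
      if (chars == nullptr) {
        return 0;  // allocation failed, exception already pending
      }
      jchar first = chars[0];  // no other JNI calls inside the critical region
      env->ReleaseStringCritical(s, chars);  // always release, copy or not
      return first;
    }
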
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc
index 04ba8df..6495474 100644
--- a/runtime/jni_internal_test.cc
+++ b/runtime/jni_internal_test.cc
@@ -880,8 +880,15 @@
   ASSERT_NE(fid2, nullptr);
   // Make sure we can actually use it.
   jstring s = env_->NewStringUTF("poop");
-  ASSERT_EQ(4, env_->GetIntField(s, fid2));
-
+  if (mirror::kUseStringCompression) {
+    // Negative because s is compressed (the top bit of the count field is the compression flag).
+    ASSERT_EQ(-2147483644, env_->GetIntField(s, fid2));
+    // Create incompressible string
+    jstring s_16 = env_->NewStringUTF("\u0444\u0444");
+    ASSERT_EQ(2, env_->GetIntField(s_16, fid2));
+  } else {
+    ASSERT_EQ(4, env_->GetIntField(s, fid2));
+  }
   // Bad arguments.
   GetFromReflectedField_ToReflectedFieldBadArgumentTest(false);
   GetFromReflectedField_ToReflectedFieldBadArgumentTest(true);
@@ -1632,13 +1639,28 @@
 
   jboolean is_copy = JNI_TRUE;
   chars = env_->GetStringCritical(s, &is_copy);
-  EXPECT_EQ(JNI_FALSE, is_copy);
+  if (mirror::kUseStringCompression) {
+    // is_copy has to be JNI_TRUE because "hello" is all-ASCII
+    EXPECT_EQ(JNI_TRUE, is_copy);
+  } else {
+    EXPECT_EQ(JNI_FALSE, is_copy);
+  }
   EXPECT_EQ(expected[0], chars[0]);
   EXPECT_EQ(expected[1], chars[1]);
   EXPECT_EQ(expected[2], chars[2]);
   EXPECT_EQ(expected[3], chars[3]);
   EXPECT_EQ(expected[4], chars[4]);
   env_->ReleaseStringCritical(s, chars);
+
+  if (mirror::kUseStringCompression) {
+    // is_copy has to be JNI_FALSE because "\xed\xa0\x81\xed\xb0\x80" is incompressible
+    jboolean is_copy_16 = JNI_TRUE;
+    jstring s_16 = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80");
+    chars = env_->GetStringCritical(s_16, &is_copy_16);
+    EXPECT_EQ(2, env_->GetStringLength(s_16));
+    EXPECT_EQ(4, env_->GetStringUTFLength(s_16));
+    env_->ReleaseStringCritical(s_16, chars);
+  }
 }
 
 TEST_F(JniInternalTest, GetObjectArrayElement_SetObjectArrayElement) {
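
The -2147483644 expectation above pins down the count encoding this change relies on: for a compressed string the count field holds the length with its most significant bit set, which is what String::GetFlaggedCount produces and String::GetLengthFromCount strips. A tiny standalone check of that packing (the bit layout is inferred from the test value, not from code shown here):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t count = 4u | 0x80000000u;  // length 4 with the compression bit set
      std::printf("count as jint : %d\n", static_cast<int32_t>(count));   // -2147483644
      std::printf("length        : %u\n", count & 0x7FFFFFFFu);           // 4
      std::printf("compressed    : %d\n", (count & 0x80000000u) != 0u);   // 1
      return 0;
    }
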
diff --git a/runtime/mirror/class-inl.h b/runtime/mirror/class-inl.h
index 8ad47eb..0f2aac2 100644
--- a/runtime/mirror/class-inl.h
+++ b/runtime/mirror/class-inl.h
@@ -26,7 +26,6 @@
 #include "base/length_prefixed_array.h"
 #include "class_loader.h"
 #include "common_throws.h"
-#include "dex_cache.h"
 #include "dex_file.h"
 #include "gc/heap-inl.h"
 #include "iftable.h"
@@ -899,12 +898,12 @@
   }
 }
 
-inline void Class::SetDexCacheStrings(GcRoot<String>* new_dex_cache_strings) {
+inline void Class::SetDexCacheStrings(StringDexCacheType* new_dex_cache_strings) {
   SetFieldPtr<false>(DexCacheStringsOffset(), new_dex_cache_strings);
 }
 
-inline GcRoot<String>* Class::GetDexCacheStrings() {
-  return GetFieldPtr<GcRoot<String>*>(DexCacheStringsOffset());
+inline StringDexCacheType* Class::GetDexCacheStrings() {
+  return GetFieldPtr64<StringDexCacheType*>(DexCacheStringsOffset());
 }
 
 template<ReadBarrierOption kReadBarrierOption, class Visitor>
@@ -1058,8 +1057,8 @@
     dest->SetMethodsPtrInternal(new_methods);
   }
   // Update dex cache strings.
-  GcRoot<mirror::String>* strings = GetDexCacheStrings();
-  GcRoot<mirror::String>* new_strings = visitor(strings);
+  StringDexCacheType* strings = GetDexCacheStrings();
+  StringDexCacheType* new_strings = visitor(strings);
   if (strings != new_strings) {
     dest->SetDexCacheStrings(new_strings);
   }
diff --git a/runtime/mirror/class.h b/runtime/mirror/class.h
index 978fc4c..e2cd649 100644
--- a/runtime/mirror/class.h
+++ b/runtime/mirror/class.h
@@ -54,6 +54,9 @@
 class DexCache;
 class IfTable;
 class Method;
+struct StringDexCachePair;
+
+using StringDexCacheType = std::atomic<mirror::StringDexCachePair>;
 
 // C++ mirror of java.lang.Class
 class MANAGED Class FINAL : public Object {
@@ -1219,8 +1222,8 @@
   bool GetSlowPathEnabled() SHARED_REQUIRES(Locks::mutator_lock_);
   void SetSlowPath(bool enabled) SHARED_REQUIRES(Locks::mutator_lock_);
 
-  GcRoot<String>* GetDexCacheStrings() SHARED_REQUIRES(Locks::mutator_lock_);
-  void SetDexCacheStrings(GcRoot<String>* new_dex_cache_strings)
+  StringDexCacheType* GetDexCacheStrings() SHARED_REQUIRES(Locks::mutator_lock_);
+  void SetDexCacheStrings(StringDexCacheType* new_dex_cache_strings)
       SHARED_REQUIRES(Locks::mutator_lock_);
   static MemberOffset DexCacheStringsOffset() {
     return OFFSET_OF_OBJECT_MEMBER(Class, dex_cache_strings_);
diff --git a/runtime/mirror/dex_cache-inl.h b/runtime/mirror/dex_cache-inl.h
index 84469ea..a3071b7 100644
--- a/runtime/mirror/dex_cache-inl.h
+++ b/runtime/mirror/dex_cache-inl.h
@@ -27,6 +27,8 @@
 #include "mirror/class.h"
 #include "runtime.h"
 
+#include <atomic>
+
 namespace art {
 namespace mirror {
 
@@ -35,15 +37,18 @@
   return Class::ComputeClassSize(true, vtable_entries, 0, 0, 0, 0, 0, pointer_size);
 }
 
-inline String* DexCache::GetResolvedString(uint32_t string_idx) {
-  DCHECK_LT(string_idx, NumStrings());
-  return GetStrings()[string_idx].Read();
+inline mirror::String* DexCache::GetResolvedString(uint32_t string_idx) {
+  DCHECK_LT(string_idx, GetDexFile()->NumStringIds());
+  return StringDexCachePair::LookupString(GetStrings(), string_idx, NumStrings()).Read();
 }
 
-inline void DexCache::SetResolvedString(uint32_t string_idx, String* resolved) {
-  DCHECK_LT(string_idx, NumStrings());
+inline void DexCache::SetResolvedString(uint32_t string_idx, mirror::String* resolved) {
+  DCHECK_LT(string_idx, GetDexFile()->NumStringIds());
   // TODO default transaction support.
-  GetStrings()[string_idx] = GcRoot<String>(resolved);
+  StringDexCachePair idx_ptr;
+  idx_ptr.string_index = string_idx;
+  idx_ptr.string_pointer = GcRoot<String>(resolved);
+  GetStrings()[string_idx % NumStrings()].store(idx_ptr, std::memory_order_relaxed);
   // TODO: Fine-grained marking, so that we don't need to go through all arrays in full.
   Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(this);
 }
@@ -131,9 +136,16 @@
   VisitInstanceFieldsReferences<kVerifyFlags, kReadBarrierOption>(klass, visitor);
   // Visit arrays after.
   if (kVisitNativeRoots) {
-    GcRoot<mirror::String>* strings = GetStrings();
+    mirror::StringDexCacheType* strings = GetStrings();
     for (size_t i = 0, num_strings = NumStrings(); i != num_strings; ++i) {
-      visitor.VisitRootIfNonNull(strings[i].AddressWithoutBarrier());
+      StringDexCachePair source = strings[i].load(std::memory_order_relaxed);
+      mirror::String* before = source.string_pointer.Read<kReadBarrierOption>();
+      GcRoot<mirror::String> root(before);
+      visitor.VisitRootIfNonNull(root.AddressWithoutBarrier());
+      if (root.Read() != before) {
+        source.string_pointer = GcRoot<String>(root.Read());
+        strings[i].store(source, std::memory_order_relaxed);
+      }
     }
     GcRoot<mirror::Class>* resolved_types = GetResolvedTypes();
     for (size_t i = 0, num_types = NumResolvedTypes(); i != num_types; ++i) {
@@ -143,12 +155,14 @@
 }
 
 template <ReadBarrierOption kReadBarrierOption, typename Visitor>
-inline void DexCache::FixupStrings(GcRoot<mirror::String>* dest, const Visitor& visitor) {
-  GcRoot<mirror::String>* src = GetStrings();
+inline void DexCache::FixupStrings(mirror::StringDexCacheType* dest, const Visitor& visitor) {
+  mirror::StringDexCacheType* src = GetStrings();
   for (size_t i = 0, count = NumStrings(); i < count; ++i) {
-    mirror::String* source = src[i].Read<kReadBarrierOption>();
-    mirror::String* new_source = visitor(source);
-    dest[i] = GcRoot<mirror::String>(new_source);
+    StringDexCachePair source = src[i].load(std::memory_order_relaxed);
+    mirror::String* ptr = source.string_pointer.Read<kReadBarrierOption>();
+    mirror::String* new_source = visitor(ptr);
+    source.string_pointer = GcRoot<String>(new_source);
+    dest[i].store(source, std::memory_order_relaxed);
   }
 }
 
diff --git a/runtime/mirror/dex_cache.cc b/runtime/mirror/dex_cache.cc
index 57066d8..cfcec9c 100644
--- a/runtime/mirror/dex_cache.cc
+++ b/runtime/mirror/dex_cache.cc
@@ -33,7 +33,7 @@
 
 void DexCache::Init(const DexFile* dex_file,
                     String* location,
-                    GcRoot<String>* strings,
+                    StringDexCacheType* strings,
                     uint32_t num_strings,
                     GcRoot<Class>* resolved_types,
                     uint32_t num_resolved_types,
diff --git a/runtime/mirror/dex_cache.h b/runtime/mirror/dex_cache.h
index d02a0d8..4ddfc7b 100644
--- a/runtime/mirror/dex_cache.h
+++ b/runtime/mirror/dex_cache.h
@@ -35,12 +35,61 @@
 
 class String;
 
+struct PACKED(8) StringDexCachePair {
+  GcRoot<String> string_pointer;
+  uint32_t string_index;
+  // The array is initially [ {0,0}, {0,0}, {0,0} ... ]
+  // We maintain the invariant that once a dex cache entry is populated,
+  // the pointer is always non-0
+  // Any given entry would thus be:
+  // {non-0, non-0} OR {0,0}
+  //
+  // It is then generally sufficient to check whether the lookup string index
+  // matches the stored string index (for a string index > 0), because if it
+  // matches the pointer is also non-null.
+  //
+  // For the 0th entry which is a special case, the value is either
+  // {0,0} (initial state) or {non-0, 0} which indicates
+  // that a valid string is stored at that index for a dex string id of 0.
+  //
+  // As an optimization, we want to avoid branching on the string pointer since
+  // it's always non-null if the string id branch succeeds (except for the 0th string id).
+  // Set the initial state for the 0th entry to be {0,1} which is guaranteed to fail
+  // the lookup string id == stored id branch.
+  static void Initialize(StringDexCacheType* strings) {
+    mirror::StringDexCachePair first_elem;
+    first_elem.string_pointer = GcRoot<String>(nullptr);
+    first_elem.string_index = 1;
+    strings[0].store(first_elem, std::memory_order_relaxed);
+  }
+  static GcRoot<String> LookupString(StringDexCacheType* dex_cache,
+                                     uint32_t string_idx,
+                                     uint32_t cache_size) {
+    StringDexCachePair index_string = dex_cache[string_idx % cache_size]
+        .load(std::memory_order_relaxed);
+    if (string_idx != index_string.string_index) return GcRoot<String>(nullptr);
+    DCHECK(!index_string.string_pointer.IsNull());
+    return index_string.string_pointer;
+  }
+};
+using StringDexCacheType = std::atomic<StringDexCachePair>;
+
+
 // C++ mirror of java.lang.DexCache.
 class MANAGED DexCache FINAL : public Object {
  public:
   // Size of java.lang.DexCache.class.
   static uint32_t ClassSize(PointerSize pointer_size);
 
+  // Size of string dex cache. Needs to be a power of 2 for entrypoint assumptions to hold.
+  static constexpr size_t kDexCacheStringCacheSize = 1024;
+  static_assert(IsPowerOfTwo(kDexCacheStringCacheSize),
+                "String dex cache size is not a power of 2.");
+
+  static constexpr size_t StaticStringSize() {
+    return kDexCacheStringCacheSize;
+  }
+
   // Size of an instance of java.lang.DexCache not including referenced values.
   static constexpr uint32_t InstanceSize() {
     return sizeof(DexCache);
@@ -48,7 +97,7 @@
 
   void Init(const DexFile* dex_file,
             String* location,
-            GcRoot<String>* strings,
+            StringDexCacheType* strings,
             uint32_t num_strings,
             GcRoot<Class>* resolved_types,
             uint32_t num_resolved_types,
@@ -62,7 +111,7 @@
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier, typename Visitor>
-  void FixupStrings(GcRoot<mirror::String>* dest, const Visitor& visitor)
+  void FixupStrings(StringDexCacheType* dest, const Visitor& visitor)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   template <ReadBarrierOption kReadBarrierOption = kWithReadBarrier, typename Visitor>
@@ -109,10 +158,10 @@
     return OFFSET_OF_OBJECT_MEMBER(DexCache, num_resolved_methods_);
   }
 
-  String* GetResolvedString(uint32_t string_idx) ALWAYS_INLINE
+  mirror::String* GetResolvedString(uint32_t string_idx) ALWAYS_INLINE
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  void SetResolvedString(uint32_t string_idx, String* resolved) ALWAYS_INLINE
+  void SetResolvedString(uint32_t string_idx, mirror::String* resolved) ALWAYS_INLINE
       SHARED_REQUIRES(Locks::mutator_lock_);
 
   Class* GetResolvedType(uint32_t type_idx) SHARED_REQUIRES(Locks::mutator_lock_);
@@ -135,11 +184,11 @@
   ALWAYS_INLINE void SetResolvedField(uint32_t idx, ArtField* field, PointerSize ptr_size)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
-  GcRoot<String>* GetStrings() ALWAYS_INLINE SHARED_REQUIRES(Locks::mutator_lock_) {
-    return GetFieldPtr<GcRoot<String>*>(StringsOffset());
+  StringDexCacheType* GetStrings() ALWAYS_INLINE SHARED_REQUIRES(Locks::mutator_lock_) {
+    return GetFieldPtr64<StringDexCacheType*>(StringsOffset());
   }
 
-  void SetStrings(GcRoot<String>* strings) ALWAYS_INLINE SHARED_REQUIRES(Locks::mutator_lock_) {
+  void SetStrings(StringDexCacheType* strings) ALWAYS_INLINE SHARED_REQUIRES(Locks::mutator_lock_) {
     SetFieldPtr<false>(StringsOffset(), strings);
   }
 
@@ -224,7 +273,8 @@
   uint64_t resolved_fields_;    // ArtField*, array with num_resolved_fields_ elements.
   uint64_t resolved_methods_;   // ArtMethod*, array with num_resolved_methods_ elements.
   uint64_t resolved_types_;     // GcRoot<Class>*, array with num_resolved_types_ elements.
-  uint64_t strings_;            // GcRoot<String>*, array with num_strings_ elements.
+  uint64_t strings_;            // std::atomic<StringDexCachePair>*,
+                                // array with num_strings_ elements.
   uint32_t num_resolved_fields_;    // Number of elements in the resolved_fields_ array.
   uint32_t num_resolved_methods_;   // Number of elements in the resolved_methods_ array.
   uint32_t num_resolved_types_;     // Number of elements in the resolved_types_ array.
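
The comments on StringDexCachePair describe a fixed-size, direct-mapped cache: a string id maps to slot id % kDexCacheStringCacheSize, and the slot only counts as a hit when the stored id matches, so colliding ids simply overwrite each other. A simplified, non-atomic sketch of that lookup (placeholder types; the 0th-entry special case is ignored):

    #include <array>
    #include <cstdint>

    struct SlotSketch {
      const char* pointer = nullptr;  // stands in for GcRoot<String>
      uint32_t index = 0;
    };

    constexpr uint32_t kCacheSizeSketch = 1024;  // must remain a power of two

    const char* LookupSketch(const std::array<SlotSketch, kCacheSizeSketch>& cache,
                             uint32_t string_idx) {
      const SlotSketch& slot = cache[string_idx % kCacheSizeSketch];
      // Miss when the slot is empty or currently holds a different string id.
      return (slot.index == string_idx) ? slot.pointer : nullptr;
    }
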
diff --git a/runtime/mirror/dex_cache_test.cc b/runtime/mirror/dex_cache_test.cc
index 48f2ca5..175997c 100644
--- a/runtime/mirror/dex_cache_test.cc
+++ b/runtime/mirror/dex_cache_test.cc
@@ -22,6 +22,7 @@
 #include "common_runtime_test.h"
 #include "linear_alloc.h"
 #include "mirror/class_loader-inl.h"
+#include "mirror/dex_cache-inl.h"
 #include "handle_scope-inl.h"
 #include "scoped_thread_state_change.h"
 
@@ -40,7 +41,8 @@
                                                 Runtime::Current()->GetLinearAlloc())));
   ASSERT_TRUE(dex_cache.Get() != nullptr);
 
-  EXPECT_EQ(java_lang_dex_file_->NumStringIds(), dex_cache->NumStrings());
+  EXPECT_TRUE(dex_cache->StaticStringSize() == dex_cache->NumStrings()
+      || java_lang_dex_file_->NumStringIds() == dex_cache->NumStrings());
   EXPECT_EQ(java_lang_dex_file_->NumTypeIds(),   dex_cache->NumResolvedTypes());
   EXPECT_EQ(java_lang_dex_file_->NumMethodIds(), dex_cache->NumResolvedMethods());
   EXPECT_EQ(java_lang_dex_file_->NumFieldIds(),  dex_cache->NumResolvedFields());
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 0495c95..27f8bd7 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -147,6 +147,18 @@
 #endif
 }
 
+inline Object* Object::GetReadBarrierPointerAcquire() {
+#ifdef USE_BAKER_READ_BARRIER
+  DCHECK(kUseBakerReadBarrier);
+  LockWord lw(GetFieldAcquire<uint32_t>(OFFSET_OF_OBJECT_MEMBER(Object, monitor_)));
+  return reinterpret_cast<Object*>(lw.ReadBarrierState());
+#else
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
+#endif
+}
+
+
 inline uint32_t Object::GetMarkBit() {
 #ifdef USE_READ_BARRIER
   return GetLockWord(false).MarkBitState();
@@ -814,6 +826,13 @@
   }
 }
 
+template<typename kSize>
+inline kSize Object::GetFieldAcquire(MemberOffset field_offset) {
+  const uint8_t* raw_addr = reinterpret_cast<const uint8_t*>(this) + field_offset.Int32Value();
+  const kSize* addr = reinterpret_cast<const kSize*>(raw_addr);
+  return reinterpret_cast<const Atomic<kSize>*>(addr)->LoadAcquire();
+}
+
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakSequentiallyConsistent64(MemberOffset field_offset,
                                                          int64_t old_value, int64_t new_value) {
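
GetFieldAcquire is the load-acquire counterpart of the existing volatile accessors: it reinterprets the field's storage as an Atomic<kSize> and performs an acquire load, so reads that follow it in program order cannot be reordered ahead of it. A rough standalone equivalent in terms of std::atomic (the offset and field type are placeholders):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    uint32_t LoadFieldAcquire(const void* object, size_t field_offset) {
      const uint8_t* raw_addr = static_cast<const uint8_t*>(object) + field_offset;
      const auto* atomic_addr = reinterpret_cast<const std::atomic<uint32_t>*>(raw_addr);
      return atomic_addr->load(std::memory_order_acquire);
    }
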
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 5b129bf..8649294 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -93,9 +93,12 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   void SetClass(Class* new_klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
-  // TODO: Clean this up and change to return int32_t
+  // TODO: Clean these up and change to return int32_t
   Object* GetReadBarrierPointer() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Get the read barrier pointer with acquire semantics, only supported for Baker read barriers.
+  Object* GetReadBarrierPointerAcquire() SHARED_REQUIRES(Locks::mutator_lock_);
+
 #ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
   NO_RETURN
 #endif
@@ -574,6 +577,10 @@
   template<typename kSize, bool kIsVolatile>
   ALWAYS_INLINE kSize GetField(MemberOffset field_offset)
       SHARED_REQUIRES(Locks::mutator_lock_);
+  // Get a field with acquire semantics.
+  template<typename kSize>
+  ALWAYS_INLINE kSize GetFieldAcquire(MemberOffset field_offset)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Verify the type correctness of stores to fields.
   // TODO: This can cause thread suspension and isn't moving GC safe.
diff --git a/runtime/mirror/object_test.cc b/runtime/mirror/object_test.cc
index 0034220..b35a479 100644
--- a/runtime/mirror/object_test.cc
+++ b/runtime/mirror/object_test.cc
@@ -62,7 +62,7 @@
     Handle<String> string(
         hs.NewHandle(String::AllocFromModifiedUtf8(self, expected_utf16_length, utf8_in)));
     ASSERT_EQ(expected_utf16_length, string->GetLength());
-    ASSERT_TRUE(string->GetValue() != nullptr);
+    ASSERT_FALSE(string->IsValueNull());
     // strlen is necessary because the 1-character string "\x00\x00" is interpreted as ""
     ASSERT_TRUE(string->Equals(utf8_in) || (expected_utf16_length == 1 && strlen(utf8_in) == 0));
     ASSERT_TRUE(string->Equals(StringPiece(utf8_in)) ||
diff --git a/runtime/mirror/string-inl.h b/runtime/mirror/string-inl.h
index d3660e5..bc39ea8 100644
--- a/runtime/mirror/string-inl.h
+++ b/runtime/mirror/string-inl.h
@@ -13,7 +13,6 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #ifndef ART_RUNTIME_MIRROR_STRING_INL_H_
 #define ART_RUNTIME_MIRROR_STRING_INL_H_
 
@@ -49,6 +48,7 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
+    DCHECK(!string->IsCompressed() || kUseStringCompression);
   }
 
  private:
@@ -68,10 +68,19 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
-    uint16_t* value = string->GetValue();
+    DCHECK(!string->IsCompressed() || kUseStringCompression);
+    int32_t length = String::GetLengthFromCount(count_);
     const uint8_t* const src = reinterpret_cast<uint8_t*>(src_array_->GetData()) + offset_;
-    for (int i = 0; i < count_; i++) {
-      value[i] = high_byte_ + (src[i] & 0xFF);
+    if (string->IsCompressed()) {
+      uint8_t* value_compressed = string->GetValueCompressed();
+      for (int i = 0; i < length; i++) {
+        value_compressed[i] = (src[i] & 0xFF);
+      }
+    } else {
+      uint16_t* value = string->GetValue();
+      for (int i = 0; i < length; i++) {
+        value[i] = high_byte_ + (src[i] & 0xFF);
+      }
     }
   }
 
@@ -96,7 +105,16 @@
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
     const uint16_t* const src = src_array_->GetData() + offset_;
-    memcpy(string->GetValue(), src, count_ * sizeof(uint16_t));
+    const int32_t length = String::GetLengthFromCount(count_);
+    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
+    DCHECK(!compressible || kUseStringCompression);
+    if (compressible) {
+      for (int i = 0; i < length; ++i) {
+        string->GetValueCompressed()[i] = static_cast<uint8_t>(src[i]);
+      }
+    } else {
+      memcpy(string->GetValue(), src, length * sizeof(uint16_t));
+    }
   }
 
  private:
@@ -118,8 +136,22 @@
     // Avoid AsString as object is not yet in live bitmap or allocation stack.
     String* string = down_cast<String*>(obj);
     string->SetCount(count_);
-    const uint16_t* const src = src_string_->GetValue() + offset_;
-    memcpy(string->GetValue(), src, count_ * sizeof(uint16_t));
+    const int32_t length = String::GetLengthFromCount(count_);
+    bool compressible = kUseStringCompression && String::GetCompressionFlagFromCount(count_);
+    DCHECK(!compressible || kUseStringCompression);
+    if (src_string_->IsCompressed()) {
+      const uint8_t* const src = src_string_->GetValueCompressed() + offset_;
+      memcpy(string->GetValueCompressed(), src, length * sizeof(uint8_t));
+    } else {
+      const uint16_t* const src = src_string_->GetValue() + offset_;
+      if (compressible) {
+        for (int i = 0; i < length; ++i) {
+          string->GetValueCompressed()[i] = static_cast<uint8_t>(src[i]);
+        }
+      } else {
+        memcpy(string->GetValue(), src, length * sizeof(uint16_t));
+      }
+    }
   }
 
  private:
@@ -133,17 +165,38 @@
 }
 
 inline uint16_t String::CharAt(int32_t index) {
-  int32_t count = GetField32(OFFSET_OF_OBJECT_MEMBER(String, count_));
+  int32_t count = GetLength();
   if (UNLIKELY((index < 0) || (index >= count))) {
     ThrowStringIndexOutOfBoundsException(index, count);
     return 0;
   }
-  return GetValue()[index];
+  if (IsCompressed()) {
+    return GetValueCompressed()[index];
+  } else {
+    return GetValue()[index];
+  }
+}
+
+template <typename MemoryType>
+int32_t String::FastIndexOf(MemoryType* chars, int32_t ch, int32_t start) {
+  const MemoryType* p = chars + start;
+  const MemoryType* end = chars + GetLength();
+  while (p < end) {
+    if (*p++ == ch) {
+      return (p - 1) - chars;
+    }
+  }
+  return -1;
 }
 
 template<VerifyObjectFlags kVerifyFlags>
 inline size_t String::SizeOf() {
-  size_t size = sizeof(String) + (sizeof(uint16_t) * GetLength<kVerifyFlags>());
+  size_t size = sizeof(String);
+  if (IsCompressed()) {
+    size += (sizeof(uint8_t) * GetLength<kVerifyFlags>());
+  } else {
+    size += (sizeof(uint16_t) * GetLength<kVerifyFlags>());
+  }
   // String.equals() intrinsics assume zero-padding up to kObjectAlignment,
   // so make sure the zero-padding is actually copied around if GC compaction
   // chooses to copy only SizeOf() bytes.
@@ -152,31 +205,35 @@
 }
 
 template <bool kIsInstrumented, typename PreFenceVisitor>
-inline String* String::Alloc(Thread* self, int32_t utf16_length, gc::AllocatorType allocator_type,
+inline String* String::Alloc(Thread* self, int32_t utf16_length_with_flag,
+                             gc::AllocatorType allocator_type,
                              const PreFenceVisitor& pre_fence_visitor) {
   constexpr size_t header_size = sizeof(String);
-  static_assert(sizeof(utf16_length) <= sizeof(size_t),
+  const bool compressible = kUseStringCompression &&
+                            String::GetCompressionFlagFromCount(utf16_length_with_flag);
+  const size_t block_size = (compressible) ? sizeof(uint8_t) : sizeof(uint16_t);
+  size_t length = String::GetLengthFromCount(utf16_length_with_flag);
+  static_assert(sizeof(length) <= sizeof(size_t),
                 "static_cast<size_t>(utf16_length) must not lose bits.");
-  size_t length = static_cast<size_t>(utf16_length);
-  size_t data_size = sizeof(uint16_t) * length;
+  size_t data_size = block_size * length;
   size_t size = header_size + data_size;
   // String.equals() intrinsics assume zero-padding up to kObjectAlignment,
   // so make sure the allocator clears the padding as well.
   // http://b/23528461
   size_t alloc_size = RoundUp(size, kObjectAlignment);
-  Class* string_class = GetJavaLangString();
 
+  Class* string_class = GetJavaLangString();
   // Check for overflow and throw OutOfMemoryError if this was an unreasonable request.
   // Do this by comparing with the maximum length that will _not_ cause an overflow.
-  constexpr size_t overflow_length = (-header_size) / sizeof(uint16_t);  // Unsigned arithmetic.
-  constexpr size_t max_alloc_length = overflow_length - 1u;
+  const size_t overflow_length = (-header_size) / block_size;   // Unsigned arithmetic.
+  const size_t max_alloc_length = overflow_length - 1u;
   static_assert(IsAligned<sizeof(uint16_t)>(kObjectAlignment),
                 "kObjectAlignment must be at least as big as Java char alignment");
-  constexpr size_t max_length = RoundDown(max_alloc_length, kObjectAlignment / sizeof(uint16_t));
+  const size_t max_length = RoundDown(max_alloc_length, kObjectAlignment / block_size);
   if (UNLIKELY(length > max_length)) {
     self->ThrowOutOfMemoryError(StringPrintf("%s of length %d would overflow",
                                              PrettyDescriptor(string_class).c_str(),
-                                             utf16_length).c_str());
+                                             static_cast<int>(length)).c_str());
     return nullptr;
   }
 
@@ -187,11 +244,22 @@
 }
 
 template <bool kIsInstrumented>
+inline String* String::AllocEmptyString(Thread* self, gc::AllocatorType allocator_type) {
+  SetStringCountVisitor visitor(0);
+  return Alloc<kIsInstrumented>(self, 0, allocator_type, visitor);
+}
+
+template <bool kIsInstrumented>
 inline String* String::AllocFromByteArray(Thread* self, int32_t byte_length,
                                           Handle<ByteArray> array, int32_t offset,
                                           int32_t high_byte, gc::AllocatorType allocator_type) {
-  SetStringCountAndBytesVisitor visitor(byte_length, array, offset, high_byte << 8);
-  String* string = Alloc<kIsInstrumented>(self, byte_length, allocator_type, visitor);
+  const uint8_t* const src = reinterpret_cast<uint8_t*>(array->GetData()) + offset;
+  const bool compressible = kUseStringCompression && (high_byte == 0) &&
+                            String::AllASCII<uint8_t>(src, byte_length);
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(byte_length)
+                                                  : byte_length;
+  SetStringCountAndBytesVisitor visitor(length_with_flag, array, offset, high_byte << 8);
+  String* string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return string;
 }
 
@@ -201,16 +269,24 @@
                                           gc::AllocatorType allocator_type) {
   // It is a caller error to have a count less than the actual array's size.
   DCHECK_GE(array->GetLength(), count);
-  SetStringCountAndValueVisitorFromCharArray visitor(count, array, offset);
-  String* new_string = Alloc<kIsInstrumented>(self, count, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+                            String::AllASCII<uint16_t>(array->GetData() + offset, count);
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(count) : count;
+  SetStringCountAndValueVisitorFromCharArray visitor(length_with_flag, array, offset);
+  String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
 }
 
 template <bool kIsInstrumented>
 inline String* String::AllocFromString(Thread* self, int32_t string_length, Handle<String> string,
                                        int32_t offset, gc::AllocatorType allocator_type) {
-  SetStringCountAndValueVisitorFromString visitor(string_length, string, offset);
-  String* new_string = Alloc<kIsInstrumented>(self, string_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+      (string->IsCompressed() ||
+       String::AllASCII<uint16_t>(string->GetValue() + offset, string_length));
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(string_length)
+                                                  : string_length;
+  SetStringCountAndValueVisitorFromString visitor(length_with_flag, string, offset);
+  String* new_string = Alloc<kIsInstrumented>(self, length_with_flag, allocator_type, visitor);
   return new_string;
 }
 
@@ -219,11 +295,28 @@
   if (UNLIKELY(result == 0)) {
     result = ComputeHashCode();
   }
-  DCHECK(result != 0 || ComputeUtf16Hash(GetValue(), GetLength()) == 0)
-      << ToModifiedUtf8() << " " << result;
+  if (kIsDebugBuild) {
+    if (IsCompressed()) {
+      DCHECK(result != 0 || ComputeUtf16Hash(GetValueCompressed(), GetLength()) == 0)
+          << ToModifiedUtf8() << " " << result;
+    } else {
+      DCHECK(result != 0 || ComputeUtf16Hash(GetValue(), GetLength()) == 0)
+          << ToModifiedUtf8() << " " << result;
+    }
+  }
   return result;
 }
 
+template<typename MemoryType>
+bool String::AllASCII(const MemoryType* const chars, const int length) {
+  for (int i = 0; i < length; ++i) {
+    if (chars[i] >= 0x80) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace mirror
 }  // namespace art
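
String::Alloc now sizes the object as the header plus one byte per character when compressed and two otherwise, rounded up to kObjectAlignment so the zero padding that the String.equals() intrinsics depend on is really allocated and cleared. A back-of-the-envelope sketch of that computation (the header size and alignment used below are placeholders, not ART's actual values):

    #include <cstddef>
    #include <cstdint>

    constexpr size_t RoundUpTo(size_t x, size_t alignment) {
      return (x + alignment - 1) & ~(alignment - 1);
    }

    constexpr size_t StringAllocSize(size_t header, size_t length, bool compressed,
                                     size_t alignment = 8) {
      return RoundUpTo(header + length * (compressed ? sizeof(uint8_t) : sizeof(uint16_t)),
                       alignment);
    }

    // A 4-character string: 16 + 4 -> 24 bytes compressed, 16 + 8 -> 24 bytes uncompressed.
    static_assert(StringAllocSize(16, 4, /*compressed=*/ true) == 24, "compressed size");
    static_assert(StringAllocSize(16, 4, /*compressed=*/ false) == 24, "uncompressed size");
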
 
diff --git a/runtime/mirror/string.cc b/runtime/mirror/string.cc
index 33aca03..46caa4d 100644
--- a/runtime/mirror/string.cc
+++ b/runtime/mirror/string.cc
@@ -41,15 +41,11 @@
   } else if (start > count) {
     start = count;
   }
-  const uint16_t* chars = GetValue();
-  const uint16_t* p = chars + start;
-  const uint16_t* end = chars + count;
-  while (p < end) {
-    if (*p++ == ch) {
-      return (p - 1) - chars;
-    }
+  if (IsCompressed()) {
+    return FastIndexOf<uint8_t>(GetValueCompressed(), ch, start);
+  } else {
+    return FastIndexOf<uint16_t>(GetValue(), ch, start);
   }
-  return -1;
 }
 
 void String::SetClass(Class* java_lang_String) {
@@ -65,45 +61,91 @@
 }
 
 int String::ComputeHashCode() {
-  const int32_t hash_code = ComputeUtf16Hash(GetValue(), GetLength());
+  int32_t hash_code = 0;
+  if (IsCompressed()) {
+    hash_code = ComputeUtf16Hash(GetValueCompressed(), GetLength());
+  } else {
+    hash_code = ComputeUtf16Hash(GetValue(), GetLength());
+  }
   SetHashCode(hash_code);
   return hash_code;
 }
 
 int32_t String::GetUtfLength() {
-  return CountUtf8Bytes(GetValue(), GetLength());
+  if (IsCompressed()) {
+    return GetLength();
+  } else {
+    return CountUtf8Bytes(GetValue(), GetLength());
+  }
 }
 
 void String::SetCharAt(int32_t index, uint16_t c) {
-  DCHECK((index >= 0) && (index < count_));
-  GetValue()[index] = c;
+  DCHECK((index >= 0) && (index < GetLength()));
+  if (IsCompressed()) {
+    // TODO: Handle the case where String is compressed and c is non-ASCII
+    GetValueCompressed()[index] = static_cast<uint8_t>(c);
+  } else {
+    GetValue()[index] = c;
+  }
 }
 
 String* String::AllocFromStrings(Thread* self, Handle<String> string, Handle<String> string2) {
   int32_t length = string->GetLength();
   int32_t length2 = string2->GetLength();
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(length + length2);
-  String* new_string = Alloc<true>(self, length + length2, allocator_type, visitor);
+  const bool compressible = kUseStringCompression && (string->IsCompressed() && string2->IsCompressed());
+  const int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(length + length2)
+                                                  : (length + length2);
+
+  SetStringCountVisitor visitor(length_with_flag);
+  String* new_string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
   if (UNLIKELY(new_string == nullptr)) {
     return nullptr;
   }
-  uint16_t* new_value = new_string->GetValue();
-  memcpy(new_value, string->GetValue(), length * sizeof(uint16_t));
-  memcpy(new_value + length, string2->GetValue(), length2 * sizeof(uint16_t));
+  if (compressible) {
+    uint8_t* new_value = new_string->GetValueCompressed();
+    memcpy(new_value, string->GetValueCompressed(), length * sizeof(uint8_t));
+    memcpy(new_value + length, string2->GetValueCompressed(), length2 * sizeof(uint8_t));
+  } else {
+    uint16_t* new_value = new_string->GetValue();
+    if (string->IsCompressed()) {
+      for (int i = 0; i < length; ++i) {
+        new_value[i] = string->CharAt(i);
+      }
+    } else {
+      memcpy(new_value, string->GetValue(), length * sizeof(uint16_t));
+    }
+    if (string2->IsCompressed()) {
+      for (int i = 0; i < length2; ++i) {
+        new_value[i+length] = string2->CharAt(i);
+      }
+    } else {
+      memcpy(new_value + length, string2->GetValue(), length2 * sizeof(uint16_t));
+    }
+  }
   return new_string;
 }
 
 String* String::AllocFromUtf16(Thread* self, int32_t utf16_length, const uint16_t* utf16_data_in) {
   CHECK(utf16_data_in != nullptr || utf16_length == 0);
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(utf16_length);
-  String* string = Alloc<true>(self, utf16_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression &&
+                            String::AllASCII<uint16_t>(utf16_data_in, utf16_length);
+  int32_t length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
+                                            : utf16_length;
+  SetStringCountVisitor visitor(length_with_flag);
+  String* string = Alloc<true>(self, length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
     return nullptr;
   }
-  uint16_t* array = string->GetValue();
-  memcpy(array, utf16_data_in, utf16_length * sizeof(uint16_t));
+  if (compressible) {
+    for (int i = 0; i < utf16_length; ++i) {
+      string->GetValueCompressed()[i] = static_cast<uint8_t>(utf16_data_in[i]);
+    }
+  } else {
+    uint16_t* array = string->GetValue();
+    memcpy(array, utf16_data_in, utf16_length * sizeof(uint16_t));
+  }
   return string;
 }
 
@@ -121,13 +163,20 @@
 String* String::AllocFromModifiedUtf8(Thread* self, int32_t utf16_length,
                                       const char* utf8_data_in, int32_t utf8_length) {
   gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-  SetStringCountVisitor visitor(utf16_length);
-  String* string = Alloc<true>(self, utf16_length, allocator_type, visitor);
+  const bool compressible = kUseStringCompression && (utf16_length == utf8_length);
+  const int32_t utf16_length_with_flag = (compressible) ? String::GetFlaggedCount(utf16_length)
+                                                        : utf16_length;
+  SetStringCountVisitor visitor(utf16_length_with_flag);
+  String* string = Alloc<true>(self, utf16_length_with_flag, allocator_type, visitor);
   if (UNLIKELY(string == nullptr)) {
     return nullptr;
   }
-  uint16_t* utf16_data_out = string->GetValue();
-  ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length);
+  if (compressible) {
+    memcpy(string->GetValueCompressed(), utf8_data_in, utf16_length * sizeof(uint8_t));
+  } else {
+    uint16_t* utf16_data_out = string->GetValue();
+    ConvertModifiedUtf8ToUtf16(utf16_data_out, utf16_length, utf8_data_in, utf8_length);
+  }
   return string;
 }
 
@@ -219,10 +268,16 @@
 
 // Create a modified UTF-8 encoded std::string from a java/lang/String object.
 std::string String::ToModifiedUtf8() {
-  const uint16_t* chars = GetValue();
   size_t byte_count = GetUtfLength();
   std::string result(byte_count, static_cast<char>(0));
-  ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength());
+  if (IsCompressed()) {
+    for (size_t i = 0; i < byte_count; ++i) {
+      result[i] = static_cast<char>(CharAt(i));
+    }
+  } else {
+    const uint16_t* chars = GetValue();
+    ConvertUtf16ToModifiedUtf8(&result[0], byte_count, chars, GetLength());
+  }
   return result;
 }
 
@@ -242,11 +297,24 @@
   int32_t rhsCount = rhs->GetLength();
   int32_t countDiff = lhsCount - rhsCount;
   int32_t minCount = (countDiff < 0) ? lhsCount : rhsCount;
-  const uint16_t* lhsChars = lhs->GetValue();
-  const uint16_t* rhsChars = rhs->GetValue();
-  int32_t otherRes = MemCmp16(lhsChars, rhsChars, minCount);
-  if (otherRes != 0) {
-    return otherRes;
+  if (lhs->IsCompressed() && rhs->IsCompressed()) {
+    int32_t comparison =
+        memcmp(lhs->GetValueCompressed(), rhs->GetValueCompressed(), minCount * sizeof(uint8_t));
+    if (comparison != 0) {
+      return comparison;
+    }
+  } else if (lhs->IsCompressed() || rhs->IsCompressed()) {
+    for (int32_t i = 0; i < minCount; ++i) {
+      if (lhs->CharAt(i) != rhs->CharAt(i)) {
+        return static_cast<int32_t>(lhs->CharAt(i)) - static_cast<int32_t>(rhs->CharAt(i));
+      }
+    }
+  } else {
+    const uint16_t* lhsChars = lhs->GetValue();
+    const uint16_t* rhsChars = rhs->GetValue();
+    int32_t otherRes = MemCmp16(lhsChars, rhsChars, minCount);
+    if (otherRes != 0) {
+      return otherRes;
+    }
   }
   return countDiff;
 }
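
When exactly one operand is compressed, the loop above compares code unit by code unit and only then falls back to the length difference. A standalone sketch of that ordering, using std::u16string and a plain ASCII std::string as stand-ins for the two encodings (hypothetical helper, not the mirror::String API):

    #include <algorithm>
    #include <cstdint>
    #include <string>

    int32_t CompareMixedSketch(const std::u16string& lhs, const std::string& rhs_ascii) {
      const int32_t lhs_count = static_cast<int32_t>(lhs.size());
      const int32_t rhs_count = static_cast<int32_t>(rhs_ascii.size());
      const int32_t min_count = std::min(lhs_count, rhs_count);
      for (int32_t i = 0; i < min_count; ++i) {
        const int32_t diff = static_cast<int32_t>(lhs[i]) -
                             static_cast<int32_t>(static_cast<uint8_t>(rhs_ascii[i]));
        if (diff != 0) {
          return diff;  // First differing code unit decides, as in the CharAt() loop.
        }
      }
      return lhs_count - rhs_count;  // Prefix match: the shorter string sorts first.
    }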
@@ -260,7 +328,14 @@
   Handle<String> string(hs.NewHandle(this));
   CharArray* result = CharArray::Alloc(self, GetLength());
   if (result != nullptr) {
-    memcpy(result->GetData(), string->GetValue(), string->GetLength() * sizeof(uint16_t));
+    if (string->IsCompressed()) {
+      int32_t length = string->GetLength();
+      for (int i = 0; i < length; ++i) {
+        result->GetData()[i] = string->CharAt(i);
+      }
+    } else {
+      memcpy(result->GetData(), string->GetValue(), string->GetLength() * sizeof(uint16_t));
+    }
   } else {
     self->AssertPendingOOMException();
   }
@@ -269,8 +344,18 @@
 
 void String::GetChars(int32_t start, int32_t end, Handle<CharArray> array, int32_t index) {
   uint16_t* data = array->GetData() + index;
-  uint16_t* value = GetValue() + start;
-  memcpy(data, value, (end - start) * sizeof(uint16_t));
+  if (IsCompressed()) {
+    for (int i = start; i < end; ++i) {
+      data[i - start] = CharAt(i);
+    }
+  } else {
+    uint16_t* value = GetValue() + start;
+    memcpy(data, value, (end - start) * sizeof(uint16_t));
+  }
+}
+
+bool String::IsValueNull() {
+  return (IsCompressed()) ? (GetValueCompressed() == nullptr) : (GetValue() == nullptr);
 }
 
 }  // namespace mirror
diff --git a/runtime/mirror/string.h b/runtime/mirror/string.h
index d492ba3..8695fe8 100644
--- a/runtime/mirror/string.h
+++ b/runtime/mirror/string.h
@@ -31,6 +31,9 @@
 
 namespace mirror {
 
+// String Compression
+static constexpr bool kUseStringCompression = false;
+
 // C++ mirror of java.lang.String
 class MANAGED String FINAL : public Object {
  public:
@@ -54,18 +57,28 @@
     return &value_[0];
   }
 
+  uint8_t* GetValueCompressed() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return &value_compressed_[0];
+  }
+
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   size_t SizeOf() SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Strips the most significant (compression flag) bit; it is not part of the length.
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   int32_t GetLength() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return GetLengthFromCount(GetCount<kVerifyFlags>());
+  }
+
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  int32_t GetCount() SHARED_REQUIRES(Locks::mutator_lock_) {
     return GetField32<kVerifyFlags>(OFFSET_OF_OBJECT_MEMBER(String, count_));
   }
 
   void SetCount(int32_t new_count) SHARED_REQUIRES(Locks::mutator_lock_) {
     // Count is invariant so use non-transactional mode. Also disable check as we may run inside
     // a transaction.
-    DCHECK_LE(0, new_count);
+    DCHECK_LE(0, (new_count & INT32_MAX));
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, count_), new_count);
   }
 
@@ -82,12 +95,6 @@
 
   String* Intern() SHARED_REQUIRES(Locks::mutator_lock_);
 
-  template <bool kIsInstrumented, typename PreFenceVisitor>
-  ALWAYS_INLINE static String* Alloc(Thread* self, int32_t utf16_length,
-                                     gc::AllocatorType allocator_type,
-                                     const PreFenceVisitor& pre_fence_visitor)
-      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
-
   template <bool kIsInstrumented>
   ALWAYS_INLINE static String* AllocFromByteArray(Thread* self, int32_t byte_length,
                                                   Handle<ByteArray> array, int32_t offset,
@@ -107,6 +114,11 @@
                                                gc::AllocatorType allocator_type)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
+  template <bool kIsInstrumented>
+  ALWAYS_INLINE static String* AllocEmptyString(Thread* self,
+                                                gc::AllocatorType allocator_type)
+      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   static String* AllocFromStrings(Thread* self, Handle<String> string, Handle<String> string2)
       SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
 
@@ -149,6 +161,10 @@
 
   int32_t FastIndexOf(int32_t ch, int32_t start) SHARED_REQUIRES(Locks::mutator_lock_);
 
+  template <typename MemoryType>
+  int32_t FastIndexOf(MemoryType* chars, int32_t ch, int32_t start)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   int32_t CompareTo(String* other) SHARED_REQUIRES(Locks::mutator_lock_);
 
   CharArray* ToCharArray(Thread* self) SHARED_REQUIRES(Locks::mutator_lock_)
@@ -157,6 +173,28 @@
   void GetChars(int32_t start, int32_t end, Handle<CharArray> array, int32_t index)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool IsCompressed() SHARED_REQUIRES(Locks::mutator_lock_) {
+    return kUseStringCompression && GetCompressionFlagFromCount(GetCount());
+  }
+
+  bool IsValueNull() SHARED_REQUIRES(Locks::mutator_lock_);
+
+  template<typename MemoryType>
+  static bool AllASCII(const MemoryType* const chars, const int length);
+
+  ALWAYS_INLINE static bool GetCompressionFlagFromCount(const int32_t count) {
+    return kUseStringCompression && ((count & (1u << 31)) != 0);
+  }
+
+  ALWAYS_INLINE static int32_t GetLengthFromCount(const int32_t count) {
+    return kUseStringCompression ? (count & INT32_MAX) : count;
+  }
+
+  ALWAYS_INLINE static int32_t GetFlaggedCount(const int32_t count) {
+    return kUseStringCompression ? (count | (1u << 31)) : count;
+  }
+
   static Class* GetJavaLangString() SHARED_REQUIRES(Locks::mutator_lock_) {
     DCHECK(!java_lang_String_.IsNull());
     return java_lang_String_.Read();
@@ -174,12 +212,24 @@
     SetField32<false, false>(OFFSET_OF_OBJECT_MEMBER(String, hash_code_), new_hash_code);
   }
 
+  template <bool kIsInstrumented, typename PreFenceVisitor>
+  ALWAYS_INLINE static String* Alloc(Thread* self, int32_t utf16_length_with_flag,
+                                     gc::AllocatorType allocator_type,
+                                     const PreFenceVisitor& pre_fence_visitor)
+      SHARED_REQUIRES(Locks::mutator_lock_) REQUIRES(!Roles::uninterruptible_);
+
   // Field order required by test "ValidateFieldOrderOfJavaCppUnionClasses".
+  // The most significant bit of count_ stores the compression flag:
+  //   0 - uncompressed: characters are stored in 16-bit units (value_)
+  //   1 - compressed: characters are stored in 8-bit units (value_compressed_)
   int32_t count_;
 
   uint32_t hash_code_;
 
-  uint16_t value_[0];
+  // All-ASCII strings are compressed into 8-bit storage, so exactly one of these
+  // fields is in use, as selected by the compression flag in count_.
+  union {
+    uint16_t value_[0];
+    uint8_t value_compressed_[0];
+  };
 
   static GcRoot<Class> java_lang_String_;
 
diff --git a/runtime/native/java_lang_Class.cc b/runtime/native/java_lang_Class.cc
index 6d5e7c7..d4e54cf 100644
--- a/runtime/native/java_lang_Class.cc
+++ b/runtime/native/java_lang_Class.cc
@@ -198,12 +198,25 @@
   }
   size_t low = 0;
   size_t high = fields->size();
-  const uint16_t* const data = name->GetValue();
+  const bool is_name_compressed = name->IsCompressed();
+  const uint16_t* const data = (is_name_compressed) ? nullptr : name->GetValue();
+  const uint8_t* const data_compressed = (is_name_compressed) ? name->GetValueCompressed()
+                                                              : nullptr;
   const size_t length = name->GetLength();
   while (low < high) {
     auto mid = (low + high) / 2;
     ArtField& field = fields->At(mid);
-    int result = CompareModifiedUtf8ToUtf16AsCodePointValues(field.GetName(), data, length);
+    int result = 0;
+    if (is_name_compressed) {
+      size_t field_length = strlen(field.GetName());
+      size_t min_size = (length < field_length) ? length : field_length;
+      result = memcmp(field.GetName(), data_compressed, min_size);
+      if (result == 0) {
+        result = static_cast<int>(field_length) - static_cast<int>(length);
+      }
+    } else {
+      result = CompareModifiedUtf8ToUtf16AsCodePointValues(field.GetName(), data, length);
+    }
     // Alternate approach, only a few % faster at the cost of more allocations.
     // int result = field->GetStringName(self, true)->CompareTo(name);
     if (result < 0) {
@@ -636,8 +649,7 @@
   // Invoke the string allocator to return an empty string for the string class.
   if (klass->IsStringClass()) {
     gc::AllocatorType allocator_type = Runtime::Current()->GetHeap()->GetCurrentAllocator();
-    mirror::SetStringCountVisitor visitor(0);
-    mirror::Object* obj = mirror::String::Alloc<true>(soa.Self(), 0, allocator_type, visitor);
+    mirror::Object* obj = mirror::String::AllocEmptyString<true>(soa.Self(), allocator_type);
     if (UNLIKELY(soa.Self()->IsExceptionPending())) {
       return nullptr;
     } else {
diff --git a/runtime/native/java_lang_DexCache.cc b/runtime/native/java_lang_DexCache.cc
index 994ccb1..f0140a3 100644
--- a/runtime/native/java_lang_DexCache.cc
+++ b/runtime/native/java_lang_DexCache.cc
@@ -59,7 +59,7 @@
 static jobject DexCache_getResolvedString(JNIEnv* env, jobject javaDexCache, jint string_index) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
-  CHECK_LT(static_cast<size_t>(string_index), dex_cache->NumStrings());
+  CHECK_LT(static_cast<size_t>(string_index), dex_cache->GetDexFile()->NumStringIds());
   return soa.AddLocalReference<jobject>(dex_cache->GetResolvedString(string_index));
 }
 
@@ -75,7 +75,7 @@
                                        jobject string) {
   ScopedFastNativeObjectAccess soa(env);
   mirror::DexCache* dex_cache = soa.Decode<mirror::DexCache*>(javaDexCache);
-  CHECK_LT(static_cast<size_t>(string_index), dex_cache->NumStrings());
+  CHECK_LT(static_cast<size_t>(string_index), dex_cache->GetDexFile()->NumStringIds());
   dex_cache->SetResolvedString(string_index, soa.Decode<mirror::String*>(string));
 }
 
diff --git a/runtime/native/libcore_util_CharsetUtils.cc b/runtime/native/libcore_util_CharsetUtils.cc
index 1216824..64d56f6 100644
--- a/runtime/native/libcore_util_CharsetUtils.cc
+++ b/runtime/native/libcore_util_CharsetUtils.cc
@@ -165,10 +165,9 @@
     return nullptr;
   }
 
-  const jchar* src = &(string->GetValue()[offset]);
   jbyte* dst = &bytes[0];
-  for (int i = length - 1; i >= 0; --i) {
-    jchar ch = *src++;
+  for (int i = 0; i < length; ++i) {
+    jchar ch = string->CharAt(offset + i);
     if (ch > maxValidChar) {
       ch = '?';
     }
diff --git a/runtime/simulator/Android.mk b/runtime/simulator/Android.mk
index a34a841..e39af2d 100644
--- a/runtime/simulator/Android.mk
+++ b/runtime/simulator/Android.mk
@@ -22,6 +22,9 @@
   code_simulator.cc \
   code_simulator_arm64.cc
 
+LIBART_SIMULATOR_CFLAGS := \
+  -DVIXL_INCLUDE_SIMULATOR_AARCH64
+
 # $(1): target or host
 # $(2): ndebug or debug
 define build-libart-simulator
@@ -54,6 +57,7 @@
   LOCAL_MODULE_CLASS := SHARED_LIBRARIES
 
   LOCAL_SRC_FILES := $$(LIBART_SIMULATOR_SRC_FILES)
+  LOCAL_CFLAGS := $$(LIBART_SIMULATOR_CFLAGS)
 
   ifeq ($$(art_target_or_host),target)
     $(call set-target-local-clang-vars)
diff --git a/runtime/stack.h b/runtime/stack.h
index cf33ae1..850d2a4 100644
--- a/runtime/stack.h
+++ b/runtime/stack.h
@@ -224,7 +224,6 @@
   int64_t GetVRegLong(size_t i) const {
     DCHECK_LT(i, NumberOfVRegs());
     const uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef const int64_t unaligned_int64 __attribute__ ((aligned (4)));
     return *reinterpret_cast<unaligned_int64*>(vreg);
   }
@@ -232,7 +231,6 @@
   double GetVRegDouble(size_t i) const {
     DCHECK_LT(i, NumberOfVRegs());
     const uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef const double unaligned_double __attribute__ ((aligned (4)));
     return *reinterpret_cast<unaligned_double*>(vreg);
   }
@@ -289,7 +287,6 @@
   void SetVRegLong(size_t i, int64_t val) {
     DCHECK_LT(i, NumberOfVRegs());
     uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef int64_t unaligned_int64 __attribute__ ((aligned (4)));
     *reinterpret_cast<unaligned_int64*>(vreg) = val;
     // This is needed for moving collectors since these can update the vreg references if they
@@ -303,7 +300,6 @@
   void SetVRegDouble(size_t i, double val) {
     DCHECK_LT(i, NumberOfVRegs());
     uint32_t* vreg = &vregs_[i];
-    // Alignment attribute required for GCC 4.8
     typedef double unaligned_double __attribute__ ((aligned (4)));
     *reinterpret_cast<unaligned_double*>(vreg) = val;
     // This is needed for moving collectors since these can update the vreg references if they
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 0457ba0..79b9f02 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1431,6 +1431,12 @@
     if (o == nullptr) {
       os << "an unknown object";
     } else {
+      if (kUseReadBarrier && Thread::Current()->GetIsGcMarking()) {
+        // We may call Thread::Dump() in the middle of the CC thread flip, before this thread's
+        // stack has been flipped, so "o" may be a from-space (stale) reference and the
+        // IdentityHashCode call below would crash. Explicitly mark/forward it here.
+        o = ReadBarrier::Mark(o);
+      }
       if ((o->GetLockWord(false).GetState() == LockWord::kThinLocked) &&
           Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
         // Getting the identity hashcode here would result in lock inflation and suspension of the
diff --git a/runtime/thread_list.cc b/runtime/thread_list.cc
index 688514c..ab1f198 100644
--- a/runtime/thread_list.cc
+++ b/runtime/thread_list.cc
@@ -60,7 +60,8 @@
 
 // Whether we should try to dump the native stack of unattached threads. See commit ed8b723 for
 // some history.
-static constexpr bool kDumpUnattachedThreadNativeStack = true;
+// Turned off again. b/29248079
+static constexpr bool kDumpUnattachedThreadNativeStack = false;
 
 ThreadList::ThreadList()
     : suspend_all_count_(0),
diff --git a/runtime/utf.cc b/runtime/utf.cc
index 5e9fdf7..7e06482 100644
--- a/runtime/utf.cc
+++ b/runtime/utf.cc
@@ -170,14 +170,6 @@
   }
 }
 
-int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count) {
-  uint32_t hash = 0;
-  while (char_count--) {
-    hash = hash * 31 + *chars++;
-  }
-  return static_cast<int32_t>(hash);
-}
-
 int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length) {
   uint32_t hash = 0;
   while (utf16_length != 0u) {
diff --git a/runtime/utf.h b/runtime/utf.h
index 27d2fd5..7c9c333 100644
--- a/runtime/utf.h
+++ b/runtime/utf.h
@@ -82,7 +82,16 @@
  */
 int32_t ComputeUtf16Hash(mirror::CharArray* chars, int32_t offset, size_t char_count)
     SHARED_REQUIRES(Locks::mutator_lock_);
-int32_t ComputeUtf16Hash(const uint16_t* chars, size_t char_count);
+
+template<typename MemoryType>
+int32_t ComputeUtf16Hash(const MemoryType* chars, size_t char_count) {
+  uint32_t hash = 0;
+  while (char_count--) {
+    hash = hash * 31 + *chars++;
+  }
+  return static_cast<int32_t>(hash);
+}
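
Templating the hash lets compressed (8-bit) and uncompressed (16-bit) character data go through the same hashing path. A self-contained check that both representations of the same ASCII text produce the same value (Utf16HashSketch duplicates the template above purely for illustration):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    template <typename MemoryType>
    int32_t Utf16HashSketch(const MemoryType* chars, size_t char_count) {
      uint32_t hash = 0;
      while (char_count--) {
        hash = hash * 31 + *chars++;
      }
      return static_cast<int32_t>(hash);
    }

    int main() {
      const uint8_t narrow[] = {'a', 'r', 't'};
      const uint16_t wide[] = {'a', 'r', 't'};
      assert(Utf16HashSketch(narrow, 3) == Utf16HashSketch(wide, 3));
      return 0;
    }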
+
 int32_t ComputeUtf16HashFromModifiedUtf8(const char* utf8, size_t utf16_length);
 
 // Compute a hash code of a modified UTF-8 string. Not the standard java hash since it returns a
diff --git a/runtime/utils.h b/runtime/utils.h
index 84079e2..693e0b8 100644
--- a/runtime/utils.h
+++ b/runtime/utils.h
@@ -380,21 +380,7 @@
 NO_RETURN void SleepForever();
 
 inline void FlushInstructionCache(char* begin, char* end) {
-  // Only use __builtin___clear_cache with Clang or with GCC >= 4.3.0
-  // (__builtin___clear_cache was introduced in GCC 4.3.0).
-#if defined(__clang__) || GCC_VERSION >= 40300
   __builtin___clear_cache(begin, end);
-#else
-  // Only warn on non-Intel platforms, as x86 and x86-64 do not need
-  // cache flush instructions, as long as the "code uses the same
-  // linear address for modifying and fetching the instruction". See
-  // "Intel(R) 64 and IA-32 Architectures Software Developer's Manual
-  // Volume 3A: System Programming Guide, Part 1", section 11.6
-  // "Self-Modifying Code".
-#if !defined(__i386__) && !defined(__x86_64__)
-  UNIMPLEMENTED(WARNING) << "cache flush";
-#endif
-#endif
 }
 
 }  // namespace art
diff --git a/runtime/utils/dex_cache_arrays_layout-inl.h b/runtime/utils/dex_cache_arrays_layout-inl.h
index 7733a51..4c63156 100644
--- a/runtime/utils/dex_cache_arrays_layout-inl.h
+++ b/runtime/utils/dex_cache_arrays_layout-inl.h
@@ -23,6 +23,7 @@
 #include "base/logging.h"
 #include "gc_root.h"
 #include "globals.h"
+#include "mirror/dex_cache.h"
 #include "primitive.h"
 
 namespace art {
@@ -45,12 +46,11 @@
     : DexCacheArraysLayout(pointer_size, dex_file->GetHeader()) {
 }
 
-inline size_t DexCacheArraysLayout::Alignment() const {
+inline constexpr size_t DexCacheArraysLayout::Alignment() {
   // GcRoot<> alignment is 4, i.e. lower than or equal to the pointer alignment.
   static_assert(alignof(GcRoot<mirror::Class>) == 4, "Expecting alignof(GcRoot<>) == 4");
-  static_assert(alignof(GcRoot<mirror::String>) == 4, "Expecting alignof(GcRoot<>) == 4");
-  // Pointer alignment is the same as pointer size.
-  return static_cast<size_t>(pointer_size_);
+  static_assert(alignof(mirror::StringDexCacheType) == 8,
+                "Expecting alignof(StringDexCacheType) == 8");
+  return alignof(mirror::StringDexCacheType);
 }
 
 template <typename T>
@@ -87,15 +87,22 @@
 }
 
 inline size_t DexCacheArraysLayout::StringOffset(uint32_t string_idx) const {
-  return strings_offset_ + ElementOffset(GcRootAsPointerSize<mirror::String>(), string_idx);
+  return strings_offset_ + ElementOffset(PointerSize::k64,
+                                         string_idx % mirror::DexCache::kDexCacheStringCacheSize);
 }
 
 inline size_t DexCacheArraysLayout::StringsSize(size_t num_elements) const {
-  return ArraySize(GcRootAsPointerSize<mirror::String>(), num_elements);
+  size_t cache_size = mirror::DexCache::kDexCacheStringCacheSize;
+  if (num_elements < cache_size) {
+    cache_size = num_elements;
+  }
+  return ArraySize(PointerSize::k64, cache_size);
 }
 
 inline size_t DexCacheArraysLayout::StringsAlignment() const {
-  return alignof(GcRoot<mirror::String>);
+  static_assert(alignof(mirror::StringDexCacheType) == 8,
+                "Expecting alignof(StringDexCacheType) == 8");
+  return alignof(mirror::StringDexCacheType);
 }
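
StringOffset above reduces string_idx modulo the cache size, turning the per-dex string array into a fixed-size, direct-mapped cache of 8-byte slots. A sketch of the indexing with assumed values for the cache size and slot width (the real ones are mirror::DexCache::kDexCacheStringCacheSize and sizeof(StringDexCacheType)):

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kCacheSize = 1024;  // assumed kDexCacheStringCacheSize
    constexpr size_t kSlotSize = 8;      // assumed sizeof(StringDexCacheType)

    constexpr size_t StringSlotOffset(uint32_t string_idx) {
      return (string_idx % kCacheSize) * kSlotSize;
    }

    static_assert(StringSlotOffset(5) == StringSlotOffset(5 + kCacheSize),
                  "string ids that collide modulo the cache size share a slot");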
 
 inline size_t DexCacheArraysLayout::FieldOffset(uint32_t field_idx) const {
diff --git a/runtime/utils/dex_cache_arrays_layout.h b/runtime/utils/dex_cache_arrays_layout.h
index f2437fa..20ffa90 100644
--- a/runtime/utils/dex_cache_arrays_layout.h
+++ b/runtime/utils/dex_cache_arrays_layout.h
@@ -52,7 +52,7 @@
     return size_;
   }
 
-  size_t Alignment() const;
+  static constexpr size_t Alignment();
 
   size_t TypesOffset() const {
     return types_offset_;
diff --git a/test/020-string/expected.txt b/test/020-string/expected.txt
index 76b8929..83a0835 100644
--- a/test/020-string/expected.txt
+++ b/test/020-string/expected.txt
@@ -1,6 +1,6 @@
 testStr is 'This is a very nice string'
 This is a very nice string
-Compare result is 32
+Compare result is greater than zero
 Compare unicode: -65302
 Got expected exception
 subStr is 'uick brown fox jumps over the lazy '
diff --git a/test/020-string/src/Main.java b/test/020-string/src/Main.java
index 7108082..ccf94aa 100644
--- a/test/020-string/src/Main.java
+++ b/test/020-string/src/Main.java
@@ -45,7 +45,14 @@
         if (testStr.length() != testStr2.length())
             System.out.println("WARNING: stringTest length mismatch");
 
-        System.out.println("Compare result is " + testStr.compareTo(testStr2));
+        int compareResult = testStr.compareTo(testStr2);
+        if (compareResult > 0) {
+          System.out.println("Compare result is greater than zero");
+        } else if (compareResult == 0) {
+          System.out.println("Compare result is equal to zero");
+        } else {
+          System.out.println("Compare result is less than zero");
+        }
 
         // expected: -65302
         String s1 = "\u0c6d\u0cb6\u0d00\u0000\u0080\u0080\u0080\u0000\u0002\u0002\u0002\u0000\u00e9\u00e9\u00e9";
diff --git a/test/031-class-attributes/expected.txt b/test/031-class-attributes/expected.txt
index de99872..72656ae 100644
--- a/test/031-class-attributes/expected.txt
+++ b/test/031-class-attributes/expected.txt
@@ -84,7 +84,7 @@
   enclosingCon: null
   enclosingMeth: null
   modifiers: 1
-  package: package otherpackage
+  package: package otherpackage, Unknown, version 0.0
   declaredClasses: [0]
   member classes: [0]
   isAnnotation: false
diff --git a/test/439-npe/expected.txt b/test/439-npe/expected.txt
index 271d40d..34855ee 100644
--- a/test/439-npe/expected.txt
+++ b/test/439-npe/expected.txt
@@ -1,18 +1,54 @@
-$opt$setObjectField
-$opt$setIntField
-$opt$setFloatField
-$opt$setLongField
-$opt$setDoubleField
-$opt$setByteField
-$opt$setBooleanField
-$opt$setCharField
-$opt$setShortField
-$opt$getObjectField
-$opt$getIntField
-$opt$getFloatField
-$opt$getLongField
-$opt$getDoubleField
-$opt$getByteField
-$opt$getBooleanField
-$opt$getCharField
-$opt$getShortField
+$opt$noinline$setObjectField
+$opt$noinline$setIntField
+$opt$noinline$setFloatField
+$opt$noinline$setLongField
+$opt$noinline$setDoubleField
+$opt$noinline$setByteField
+$opt$noinline$setBooleanField
+$opt$noinline$setCharField
+$opt$noinline$setShortField
+$opt$noinline$getObjectField
+$opt$noinline$getIntField
+$opt$noinline$getFloatField
+$opt$noinline$getLongField
+$opt$noinline$getDoubleField
+$opt$noinline$getByteField
+$opt$noinline$getBooleanField
+$opt$noinline$getCharField
+$opt$noinline$getShortField
+$opt$noinline$setVolatileObjectField
+$opt$noinline$setVolatileIntField
+$opt$noinline$setVolatileFloatField
+$opt$noinline$setVolatileLongField
+$opt$noinline$setVolatileDoubleField
+$opt$noinline$setVolatileByteField
+$opt$noinline$setVolatileBooleanField
+$opt$noinline$setVolatileCharField
+$opt$noinline$setVolatileShortField
+$opt$noinline$getVolatileObjectField
+$opt$noinline$getVolatileIntField
+$opt$noinline$getVolatileFloatField
+$opt$noinline$getVolatileLongField
+$opt$noinline$getVolatileDoubleField
+$opt$noinline$getVolatileByteField
+$opt$noinline$getVolatileBooleanField
+$opt$noinline$getVolatileCharField
+$opt$noinline$getVolatileShortField
+$opt$noinline$setObjectElement
+$opt$noinline$setIntElement
+$opt$noinline$setFloatElement
+$opt$noinline$setLongElement
+$opt$noinline$setDoubleElement
+$opt$noinline$setByteElement
+$opt$noinline$setBooleanElement
+$opt$noinline$setCharElement
+$opt$noinline$setShortElement
+$opt$noinline$getObjectElement
+$opt$noinline$getIntElement
+$opt$noinline$getFloatElement
+$opt$noinline$getLongElement
+$opt$noinline$getDoubleElement
+$opt$noinline$getByteElement
+$opt$noinline$getBooleanElement
+$opt$noinline$getCharElement
+$opt$noinline$getShortElement
diff --git a/test/439-npe/src/Main.java b/test/439-npe/src/Main.java
index 40c2645..8f66da0 100644
--- a/test/439-npe/src/Main.java
+++ b/test/439-npe/src/Main.java
@@ -15,199 +15,624 @@
  */
 
 public class Main {
+  public static boolean doThrow = false;
 
-  private volatile Object objectField;
-  private volatile int intField;
-  private volatile float floatField;
-  private volatile long longField;
-  private volatile double doubleField;
-  private volatile byte byteField;
-  private volatile boolean booleanField;
-  private volatile char charField;
-  private volatile short shortField;
+  private Object objectField;
+  private int intField;
+  private float floatField;
+  private long longField;
+  private double doubleField;
+  private byte byteField;
+  private boolean booleanField;
+  private char charField;
+  private short shortField;
 
-  public static void $opt$setObjectField(Main m) {
+  private volatile Object volatileObjectField;
+  private volatile int volatileIntField;
+  private volatile float volatileFloatField;
+  private volatile long volatileLongField;
+  private volatile double volatileDoubleField;
+  private volatile byte volatileByteField;
+  private volatile boolean volatileBooleanField;
+  private volatile char volatileCharField;
+  private volatile short volatileShortField;
+
+  public static void $opt$noinline$setObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.objectField = null;
   }
 
-  public static void $opt$setIntField(Main m) {
+  public static void $opt$noinline$setIntField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.intField = 0;
   }
 
-  public static void $opt$setFloatField(Main m) {
+  public static void $opt$noinline$setFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.floatField = 0;
   }
 
-  public static void $opt$setLongField(Main m) {
+  public static void $opt$noinline$setLongField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.longField = 0;
   }
 
-  public static void $opt$setDoubleField(Main m) {
+  public static void $opt$noinline$setDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.doubleField = 0;
   }
 
-  public static void $opt$setByteField(Main m) {
+  public static void $opt$noinline$setByteField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.byteField = 0;
   }
 
-  public static void $opt$setBooleanField(Main m) {
+  public static void $opt$noinline$setBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.booleanField = false;
   }
 
-  public static void $opt$setCharField(Main m) {
+  public static void $opt$noinline$setCharField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.charField = 0;
   }
 
-  public static void $opt$setShortField(Main m) {
+  public static void $opt$noinline$setShortField(Main m) {
+    if (doThrow) { throw new Error(); }
     m.shortField = 0;
   }
 
-  public static Object $opt$getObjectField(Main m) {
+  public static Object $opt$noinline$getObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.objectField;
   }
 
-  public static int $opt$getIntField(Main m) {
+  public static int $opt$noinline$getIntField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.intField;
   }
 
-  public static float $opt$getFloatField(Main m) {
+  public static float $opt$noinline$getFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.floatField;
   }
 
-  public static long $opt$getLongField(Main m) {
+  public static long $opt$noinline$getLongField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.longField;
   }
 
-  public static double $opt$getDoubleField(Main m) {
+  public static double $opt$noinline$getDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.doubleField;
   }
 
-  public static byte $opt$getByteField(Main m) {
+  public static byte $opt$noinline$getByteField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.byteField;
   }
 
-  public static boolean $opt$getBooleanField(Main m) {
+  public static boolean $opt$noinline$getBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.booleanField;
   }
 
-  public static char $opt$getCharField(Main m) {
+  public static char $opt$noinline$getCharField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.charField;
   }
 
-  public static short $opt$getShortField(Main m) {
+  public static short $opt$noinline$getShortField(Main m) {
+    if (doThrow) { throw new Error(); }
     return m.shortField;
   }
 
+  public static void $opt$noinline$setVolatileObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileObjectField = null;
+  }
+
+  public static void $opt$noinline$setVolatileIntField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileIntField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileFloatField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileLongField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileLongField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileDoubleField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileByteField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileByteField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileBooleanField = false;
+  }
+
+  public static void $opt$noinline$setVolatileCharField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileCharField = 0;
+  }
+
+  public static void $opt$noinline$setVolatileShortField(Main m) {
+    if (doThrow) { throw new Error(); }
+    m.volatileShortField = 0;
+  }
+
+  public static Object $opt$noinline$getVolatileObjectField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileObjectField;
+  }
+
+  public static int $opt$noinline$getVolatileIntField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileIntField;
+  }
+
+  public static float $opt$noinline$getVolatileFloatField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileFloatField;
+  }
+
+  public static long $opt$noinline$getVolatileLongField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileLongField;
+  }
+
+  public static double $opt$noinline$getVolatileDoubleField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileDoubleField;
+  }
+
+  public static byte $opt$noinline$getVolatileByteField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileByteField;
+  }
+
+  public static boolean $opt$noinline$getVolatileBooleanField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileBooleanField;
+  }
+
+  public static char $opt$noinline$getVolatileCharField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileCharField;
+  }
+
+  public static short $opt$noinline$getVolatileShortField(Main m) {
+    if (doThrow) { throw new Error(); }
+    return m.volatileShortField;
+  }
+
+  public static void $opt$noinline$setObjectElement(Object[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = null;
+  }
+
+  public static void $opt$noinline$setIntElement(int[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setFloatElement(float[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setLongElement(long[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setDoubleElement(double[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setByteElement(byte[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setBooleanElement(boolean[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = false;
+  }
+
+  public static void $opt$noinline$setCharElement(char[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static void $opt$noinline$setShortElement(short[] a) {
+    if (doThrow) { throw new Error(); }
+    a[0] = 0;
+  }
+
+  public static Object $opt$noinline$getObjectElement(Object[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static int $opt$noinline$getIntElement(int[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static float $opt$noinline$getFloatElement(float[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static long $opt$noinline$getLongElement(long[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static double $opt$noinline$getDoubleElement(double[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static byte $opt$noinline$getByteElement(byte[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static boolean $opt$noinline$getBooleanElement(boolean[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static char $opt$noinline$getCharElement(char[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
+  public static short $opt$noinline$getShortElement(short[] a) {
+    if (doThrow) { throw new Error(); }
+    return a[0];
+  }
+
   public static void main(String[] args) {
-    int methodLine = 30;
-    int thisLine = 103;
+    int methodLine = 42;
+    int thisLine = 312;
     try {
-      $opt$setObjectField(null);
+      $opt$noinline$setObjectField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 2, methodLine, "$opt$setObjectField");
+      check(npe, thisLine += 2, methodLine, "$opt$noinline$setObjectField");
     }
     try {
-      $opt$setIntField(null);
+      $opt$noinline$setIntField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setIntField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setIntField");
     }
     try {
-      $opt$setFloatField(null);
+      $opt$noinline$setFloatField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setFloatField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setFloatField");
     }
     try {
-      $opt$setLongField(null);
+      $opt$noinline$setLongField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setLongField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setLongField");
     }
     try {
-      $opt$setDoubleField(null);
+      $opt$noinline$setDoubleField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setDoubleField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setDoubleField");
     }
     try {
-      $opt$setByteField(null);
+      $opt$noinline$setByteField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setByteField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setByteField");
     }
     try {
-      $opt$setBooleanField(null);
+      $opt$noinline$setBooleanField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setBooleanField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setBooleanField");
     }
     try {
-      $opt$setCharField(null);
+      $opt$noinline$setCharField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setCharField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setCharField");
     }
     try {
-      $opt$setShortField(null);
+      $opt$noinline$setShortField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$setShortField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setShortField");
     }
     try {
-      $opt$getObjectField(null);
+      $opt$noinline$getObjectField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getObjectField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getObjectField");
     }
     try {
-      $opt$getIntField(null);
+      $opt$noinline$getIntField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getIntField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getIntField");
     }
     try {
-      $opt$getFloatField(null);
+      $opt$noinline$getFloatField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getFloatField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getFloatField");
     }
     try {
-      $opt$getLongField(null);
+      $opt$noinline$getLongField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getLongField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getLongField");
     }
     try {
-      $opt$getDoubleField(null);
+      $opt$noinline$getDoubleField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getDoubleField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getDoubleField");
     }
     try {
-      $opt$getByteField(null);
+      $opt$noinline$getByteField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getByteField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getByteField");
     }
     try {
-      $opt$getBooleanField(null);
+      $opt$noinline$getBooleanField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getBooleanField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getBooleanField");
     }
     try {
-      $opt$getCharField(null);
+      $opt$noinline$getCharField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getCharField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getCharField");
     }
     try {
-      $opt$getShortField(null);
+      $opt$noinline$getShortField(null);
       throw new RuntimeException("Failed to throw NullPointerException.");
     } catch (NullPointerException npe) {
-      check(npe, thisLine += 6, methodLine += 4, "$opt$getShortField");
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getShortField");
+    }
+    try {
+      $opt$noinline$setVolatileObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileObjectField");
+    }
+    try {
+      $opt$noinline$setVolatileIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileIntField");
+    }
+    try {
+      $opt$noinline$setVolatileFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileFloatField");
+    }
+    try {
+      $opt$noinline$setVolatileLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileLongField");
+    }
+    try {
+      $opt$noinline$setVolatileDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileDoubleField");
+    }
+    try {
+      $opt$noinline$setVolatileByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileByteField");
+    }
+    try {
+      $opt$noinline$setVolatileBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileBooleanField");
+    }
+    try {
+      $opt$noinline$setVolatileCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileCharField");
+    }
+    try {
+      $opt$noinline$setVolatileShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setVolatileShortField");
+    }
+    try {
+      $opt$noinline$getVolatileObjectField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileObjectField");
+    }
+    try {
+      $opt$noinline$getVolatileIntField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileIntField");
+    }
+    try {
+      $opt$noinline$getVolatileFloatField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileFloatField");
+    }
+    try {
+      $opt$noinline$getVolatileLongField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileLongField");
+    }
+    try {
+      $opt$noinline$getVolatileDoubleField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileDoubleField");
+    }
+    try {
+      $opt$noinline$getVolatileByteField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileByteField");
+    }
+    try {
+      $opt$noinline$getVolatileBooleanField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileBooleanField");
+    }
+    try {
+      $opt$noinline$getVolatileCharField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileCharField");
+    }
+    try {
+      $opt$noinline$getVolatileShortField(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getVolatileShortField");
+    }
+    try {
+      $opt$noinline$setObjectElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setObjectElement");
+    }
+    try {
+      $opt$noinline$setIntElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setIntElement");
+    }
+    try {
+      $opt$noinline$setFloatElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setFloatElement");
+    }
+    try {
+      $opt$noinline$setLongElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setLongElement");
+    }
+    try {
+      $opt$noinline$setDoubleElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setDoubleElement");
+    }
+    try {
+      $opt$noinline$setByteElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setByteElement");
+    }
+    try {
+      $opt$noinline$setBooleanElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setBooleanElement");
+    }
+    try {
+      $opt$noinline$setCharElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setCharElement");
+    }
+    try {
+      $opt$noinline$setShortElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$setShortElement");
+    }
+    try {
+      $opt$noinline$getObjectElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getObjectElement");
+    }
+    try {
+      $opt$noinline$getIntElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getIntElement");
+    }
+    try {
+      $opt$noinline$getFloatElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getFloatElement");
+    }
+    try {
+      $opt$noinline$getLongElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getLongElement");
+    }
+    try {
+      $opt$noinline$getDoubleElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getDoubleElement");
+    }
+    try {
+      $opt$noinline$getByteElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getByteElement");
+    }
+    try {
+      $opt$noinline$getBooleanElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getBooleanElement");
+    }
+    try {
+      $opt$noinline$getCharElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getCharElement");
+    }
+    try {
+      $opt$noinline$getShortElement(null);
+      throw new RuntimeException("Failed to throw NullPointerException.");
+    } catch (NullPointerException npe) {
+      check(npe, thisLine += 6, methodLine += 5, "$opt$noinline$getShortElement");
     }
   }
 
diff --git a/test/537-checker-arraycopy/src/Main.java b/test/537-checker-arraycopy/src/Main.java
index 7c124ca..95a11ca 100644
--- a/test/537-checker-arraycopy/src/Main.java
+++ b/test/537-checker-arraycopy/src/Main.java
@@ -51,10 +51,10 @@
 
   /// CHECK-START-X86_64: void Main.arraycopy() disassembly (after)
   /// CHECK:          InvokeStaticOrDirect intrinsic:SystemArrayCopy
-  /// CHECK-NOT:      test
+  /// CHECK-NOT:      test {{^[^\[].*}}, {{^[^\[].*}}
   /// CHECK-NOT:      call
   /// CHECK:          ReturnVoid
-  // Checks that the call is intrinsified and that there is no test instruction
+  // Checks that the call is intrinsified and that there is no register-to-register test
   // when we know the source and destination are not null.
   public static void arraycopy() {
     Object[] obj = new Object[4];
diff --git a/test/551-implicit-null-checks/expected.txt b/test/551-implicit-null-checks/expected.txt
index e69de29..49b3771 100644
--- a/test/551-implicit-null-checks/expected.txt
+++ b/test/551-implicit-null-checks/expected.txt
@@ -0,0 +1,4 @@
+NPE from GetLong
+NPE from PutLong
+NPE from GetDouble
+NPE from PutDouble
diff --git a/test/551-implicit-null-checks/info.txt b/test/551-implicit-null-checks/info.txt
index bdd066b..bd3ecfd 100644
--- a/test/551-implicit-null-checks/info.txt
+++ b/test/551-implicit-null-checks/info.txt
@@ -1 +1 @@
-Test that implicit null checks are recorded correctly for longs.
\ No newline at end of file
+Test that implicit null checks are recorded correctly for longs and doubles.
diff --git a/test/551-implicit-null-checks/src/Main.java b/test/551-implicit-null-checks/src/Main.java
index 677e8d3..3586a29 100644
--- a/test/551-implicit-null-checks/src/Main.java
+++ b/test/551-implicit-null-checks/src/Main.java
@@ -18,6 +18,7 @@
 
   private class Inner {
     private long i1;
+    private double i2;
   }
   private Inner inst;
 
@@ -26,12 +27,22 @@
     try {
       m.$opt$noinline$testGetLong();
     } catch (NullPointerException ex) {
-      // good
+      System.out.println("NPE from GetLong");
     }
     try {
       m.$opt$noinline$testPutLong(778899112233L);
     } catch (NullPointerException ex) {
-      // good
+      System.out.println("NPE from PutLong");
+    }
+    try {
+      m.$opt$noinline$testGetDouble();
+    } catch (NullPointerException ex) {
+      System.out.println("NPE from GetDouble");
+    }
+    try {
+      m.$opt$noinline$testPutDouble(1.0);
+    } catch (NullPointerException ex) {
+      System.out.println("NPE from PutDouble");
     }
   }
 
@@ -44,4 +55,14 @@
     inst.i1 = a;
     throw new Exception();  // prevent inline
   }
+
+  public void $opt$noinline$testGetDouble() throws Exception {
+    double result = inst.i2;
+    throw new Exception();  // prevent inline
+  }
+
+  public void $opt$noinline$testPutDouble(double a) throws Exception {
+    inst.i2 = a;
+    throw new Exception();  // prevent inline
+  }
 }
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 65debc9..75c4f34 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -225,9 +225,11 @@
 
 # Disable 149-suspend-all-stress, its output is flaky (b/28988206).
 # Disable 577-profile-foreign-dex (b/27454772).
+# Disable 552-checker-sharpening until the compiler component of the new string dex cache is added (@cwadsworth, @vmarko).
 TEST_ART_BROKEN_ALL_TARGET_TESTS := \
   149-suspend-all-stress \
   577-profile-foreign-dex \
+  552-checker-sharpening \
 
 ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
     $(COMPILER_TYPES), $(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
@@ -557,16 +559,25 @@
 #      more parallel moves on x86, thus some Checker assertions may fail.
 # 527: On ARM64 and ARM, the read barrier instrumentation does not support the HIntermediateAddress
 #      instruction yet (b/26601270).
-# 537: Expects an array copy to be intrinsified on x86-64, but calling-on-slowpath intrinsics are
-#      not yet handled in the read barrier configuration.
 TEST_ART_BROKEN_OPTIMIZING_READ_BARRIER_RUN_TESTS := \
   484-checker-register-hints \
-  527-checker-array-access-split \
-  537-checker-arraycopy
+  527-checker-array-access-split
 
 # Tests that should fail in the read barrier configuration with JIT (Optimizing compiler).
 TEST_ART_BROKEN_JIT_READ_BARRIER_RUN_TESTS :=
 
+# Tests failing in non-Baker read barrier configurations with the Optimizing compiler (AOT).
+# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+#      handled in non-Baker read barrier configurations.
+TEST_ART_BROKEN_OPTIMIZING_NON_BAKER_READ_BARRIER_RUN_TESTS := \
+  537-checker-arraycopy
+
+# Tests failing in non-Baker read barrier configurations with JIT (Optimizing compiler).
+# 537: Expects an array copy to be intrinsified, but calling-on-slowpath intrinsics are not yet
+#      handled in non-Baker read barrier configurations.
+TEST_ART_BROKEN_JIT_NON_BAKER_READ_BARRIER_RUN_TESTS := \
+  537-checker-arraycopy
+
 ifeq ($(ART_USE_READ_BARRIER),true)
   ifneq (,$(filter interpreter,$(COMPILER_TYPES)))
     ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
@@ -577,9 +588,15 @@
 
   ifneq (,$(filter $(OPTIMIZING_COMPILER_TYPES),$(COMPILER_TYPES)))
     ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
-        $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
-        $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+        $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES), \
+        $(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
         $(TEST_ART_BROKEN_OPTIMIZING_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    ifneq ($(ART_READ_BARRIER_TYPE),BAKER)
+      ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
+          $(PREBUILD_TYPES),$(OPTIMIZING_COMPILER_TYPES),$(RELOCATE_TYPES),$(TRACE_TYPES), \
+          $(GC_TYPES),$(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+          $(TEST_ART_BROKEN_OPTIMIZING_NON_BAKER_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    endif
   endif
 
   ifneq (,$(filter jit,$(COMPILER_TYPES)))
@@ -587,6 +604,12 @@
         $(PREBUILD_TYPES),jit,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
         $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
         $(TEST_ART_BROKEN_JIT_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    ifneq ($(ART_READ_BARRIER_TYPE),BAKER)
+      ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES), \
+          $(PREBUILD_TYPES),jit,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES), \
+          $(JNI_TYPES),$(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+          $(TEST_ART_BROKEN_JIT_NON_BAKER_READ_BARRIER_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+    endif
   endif
 endif
 
diff --git a/tools/cpp-define-generator/constant_dexcache.def b/tools/cpp-define-generator/constant_dexcache.def
new file mode 100644
index 0000000..fd197f2
--- /dev/null
+++ b/tools/cpp-define-generator/constant_dexcache.def
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(DEFINE_INCLUDE_DEPENDENCIES)
+#include "mirror/dex_cache.h"   // art::mirror::DexCache, StringDexCachePair
+#endif
+
+DEFINE_EXPR(STRING_DEX_CACHE_ELEMENT_SIZE_SHIFT,       int32_t, art::WhichPowerOf2(sizeof(art::mirror::StringDexCachePair)))
+DEFINE_EXPR(STRING_DEX_CACHE_SIZE_MINUS_ONE,           int32_t, art::mirror::DexCache::kDexCacheStringCacheSize - 1)
+DEFINE_EXPR(STRING_DEX_CACHE_HASH_BITS,                int32_t,
+    art::LeastSignificantBit(art::mirror::DexCache::kDexCacheStringCacheSize))
\ No newline at end of file
diff --git a/tools/cpp-define-generator/offset_dexcache.def b/tools/cpp-define-generator/offset_dexcache.def
index 3b26518..4b9d481 100644
--- a/tools/cpp-define-generator/offset_dexcache.def
+++ b/tools/cpp-define-generator/offset_dexcache.def
@@ -19,16 +19,27 @@
 #if defined(DEFINE_INCLUDE_DEPENDENCIES)
 #include "art_method.h"         // art::ArtMethod
 #include "base/enums.h"         // PointerSize
+#include "mirror/dex_cache.h"   // art::DexCache
 #endif
 
-#define DEFINE_ART_METHOD_OFFSET(field_name, method_name) \
+#define DEFINE_ART_METHOD_OFFSET_SIZED(field_name, method_name) \
   DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET_32, int32_t, art::ArtMethod::method_name##Offset(art::PointerSize::k32).Int32Value()) \
   DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET_64, int32_t, art::ArtMethod::method_name##Offset(art::PointerSize::k64).Int32Value())
 
+#define DEFINE_ART_METHOD_OFFSET(field_name, method_name) \
+  DEFINE_EXPR(ART_METHOD_ ## field_name ## _OFFSET, int32_t, art::ArtMethod::method_name##Offset().Int32Value())
+
+#define DEFINE_DECLARING_CLASS_OFFSET(field_name, method_name) \
+  DEFINE_EXPR(DECLARING_CLASS_ ## field_name ## _OFFSET, int32_t, art::mirror::Class::method_name##Offset().Int32Value())
+
 //                         New macro suffix          Method Name (of the Offset method)
-DEFINE_ART_METHOD_OFFSET(DEX_CACHE_METHODS,          DexCacheResolvedMethods)
-DEFINE_ART_METHOD_OFFSET(DEX_CACHE_TYPES,            DexCacheResolvedTypes)
-DEFINE_ART_METHOD_OFFSET(JNI,                        EntryPointFromJni)
-DEFINE_ART_METHOD_OFFSET(QUICK_CODE,                 EntryPointFromQuickCompiledCode)
+DEFINE_ART_METHOD_OFFSET_SIZED(DEX_CACHE_METHODS,    DexCacheResolvedMethods)
+DEFINE_ART_METHOD_OFFSET_SIZED(DEX_CACHE_TYPES,      DexCacheResolvedTypes)
+DEFINE_ART_METHOD_OFFSET_SIZED(JNI,                  EntryPointFromJni)
+DEFINE_ART_METHOD_OFFSET_SIZED(QUICK_CODE,           EntryPointFromQuickCompiledCode)
+DEFINE_ART_METHOD_OFFSET(DECLARING_CLASS,            DeclaringClass)
+DEFINE_DECLARING_CLASS_OFFSET(DEX_CACHE_STRINGS,     DexCacheStrings)
 
 #undef DEFINE_ART_METHOD_OFFSET
+#undef DEFINE_ART_METHOD_OFFSET_SIZED
+#undef DEFINE_DECLARING_CLASS_OFFSET
diff --git a/tools/cpp-define-generator/offsets_all.def b/tools/cpp-define-generator/offsets_all.def
index d2d8777..13371a1 100644
--- a/tools/cpp-define-generator/offsets_all.def
+++ b/tools/cpp-define-generator/offsets_all.def
@@ -48,6 +48,7 @@
 // TODO: MIRROR_*_ARRAY offsets (depends on header size)
 // TODO: MIRROR_STRING offsets (depends on header size)
 #include "offset_dexcache.def"
+#include "constant_dexcache.def"
 #include "constant_heap.def"
 #include "constant_lockword.def"
 #include "constant_globals.def"
diff --git a/tools/javafuzz/README.md b/tools/javafuzz/README.md
index ca8532a..68fc171 100644
--- a/tools/javafuzz/README.md
+++ b/tools/javafuzz/README.md
@@ -1,14 +1,14 @@
 JavaFuzz
 ========
 
-JavaFuzz is tool for generating random Java programs with the objective of
-fuzz testing the ART infrastructure. Each randomly generated Java program
+JavaFuzz is a tool for generating random Java programs with the objective
+of fuzz testing the ART infrastructure. Each randomly generated Java program
 can be run under various modes of execution, such as using the interpreter,
 using the optimizing compiler, using an external reference implementation,
 or using various target architectures. Any difference between the outputs
-(a divergence) may indicate a bug in one of the execution modes.
+(a **divergence**) may indicate a bug in one of the execution modes.
 
-JavaFuzz can be combined with dexfuzz to get multilayered fuzz testing.
+JavaFuzz can be combined with dexfuzz to get multi-layered fuzz testing.
 
 How to run JavaFuzz
 ===================
@@ -36,6 +36,25 @@
     jack -cp ${JACK_CLASSPATH} --output-dex . Test.java
     art -classpath classes.dex Test
 
+How to start the JavaFuzz tests
+===============================
+
+    run_java_fuzz_test.py [--num_tests]
+                          [--device]
+                          [--mode1=mode] [--mode2=mode]
+
+where
+
+    --num_tests : number of tests to run (10000 by default)
+    --device    : target device serial number (passed to adb -s)
+    --mode1     : first execution mode (m1)
+    --mode2     : second execution mode (m2), with m1 != m2; values are one of
+      ri   = reference implementation on host (default for m1)
+      hint = Art interpreter on host
+      hopt = Art optimizing on host (default for m2)
+      tint = Art interpreter on target
+      topt = Art optimizing on target
+
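+For example, a hypothetical invocation that compares the reference
+implementation on the host against Art optimizing on an attached device
+(the serial number is a placeholder) could look like:
+
+    run_java_fuzz_test.py --num_tests=100 --device=SERIAL --mode1=ri --mode2=topt
+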
 Background
 ==========
 
@@ -49,14 +68,15 @@
 and flaws still linger in the system.
 
 Over the years, fuzz testing has gained popularity as a testing technique for
-discovering such lingering bugs, including bugs that can bring down a system in
-an unexpected way. Fuzzing refers to feeding a large amount of random data as
-input to a system in an attempt to find bugs or make it crash. Mutation-based
-fuzz testing is a special form of fuzzing that applies small random changes to
-existing inputs in order to detect shortcomings in a system. Profile-guided or
-coverage-guided fuzzing adds a direction to the way these random changes are
-applied. Multilayer approaches generate random inputs that are subsequently
-mutated at various stages of execution.
+discovering such lingering bugs, including bugs that can bring down a system
+in an unexpected way. Fuzzing refers to feeding a large amount of random data
+as input to a system in an attempt to find bugs or make it crash.
+Generation-based fuzz testing constructs random but properly formatted input
+data. Mutation-based fuzz testing applies small random changes to existing
+inputs in order to detect shortcomings in a system. Profile-guided or
+coverage-guided fuzzing adds a direction to the way these random changes are
+applied. Multi-layered approaches generate random inputs that are subsequently
+mutated at various stages of execution.
 
 The randomness of fuzz testing implies that the size and scope of testing is no
 longer bounded. Every new run can potentially discover bugs and crashes that were
diff --git a/tools/javafuzz/javafuzz.cc b/tools/javafuzz/javafuzz.cc
index 4e6e978..161ae0a 100644
--- a/tools/javafuzz/javafuzz.cc
+++ b/tools/javafuzz/javafuzz.cc
@@ -53,7 +53,9 @@
  * to preserve the property that a given version of JavaFuzz yields the same
  * fuzzed Java program for a deterministic random seed.
  */
-const char* VERSION = "1.0";
+const char* VERSION = "1.1";
+
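+// Maximum array size for each number of dimensions (entry 0 is unused);
+// bounds array_size_ for the randomly chosen array_dim_ in the constructor.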
+static const uint32_t MAX_DIMS[11] = { 0, 1000, 32, 10, 6, 4, 3, 3, 2, 2, 2 };
 
 /**
  * A class that generates a random Java program that compiles correctly. The program
@@ -83,8 +85,8 @@
         fuzz_loop_nest_(loop_nest),
         return_type_(randomType()),
         array_type_(randomType()),
-        array_dim_(random1(3)),
-        array_size_(random1(10)),
+        array_dim_(random1(10)),
+        array_size_(random1(MAX_DIMS[array_dim_])),
         indentation_(0),
         expr_depth_(0),
         stmt_length_(0),
@@ -169,7 +171,7 @@
   // Emit an unary operator (same type in-out).
   void emitUnaryOp(Type tp) {
     if (tp == kBoolean) {
-      fputs("!", out_);
+      fputc('!', out_);
     } else if (isInteger(tp)) {
       EMIT(kIntUnaryOps);
     } else {  // isFP(tp)
@@ -239,16 +241,21 @@
         case 6: fputs("(long)(int)(long)",   out_); return kLong;
       }
     } else if (tp == kFloat) {
-      switch (random1(3)) {
+      switch (random1(4)) {
         case 1: fputs("(float)", out_); return kInt;
         case 2: fputs("(float)", out_); return kLong;
         case 3: fputs("(float)", out_); return kDouble;
+        // Narrowing-widening.
+        case 4: fputs("(float)(int)(float)", out_); return kFloat;
       }
     } else if (tp == kDouble) {
-      switch (random1(3)) {
+      switch (random1(5)) {
         case 1: fputs("(double)", out_); return kInt;
         case 2: fputs("(double)", out_); return kLong;
         case 3: fputs("(double)", out_); return kFloat;
+        // Narrowing-widening.
+        case 4: fputs("(double)(int)(double)",   out_); return kDouble;
+        case 5: fputs("(double)(float)(double)", out_); return kDouble;
       }
     }
     return tp;  // nothing suitable, just keep type
@@ -273,15 +280,17 @@
   // Emit an unary intrinsic (out type given, new suitable in type picked).
   Type emitIntrinsic1(Type tp) {
     if (tp == kBoolean) {
-      switch (random1(4)) {
+      switch (random1(6)) {
         case 1: fputs("Float.isNaN",       out_); return kFloat;
-        case 2: fputs("Float.isInfinite",  out_); return kFloat;
-        case 3: fputs("Double.isNaN",      out_); return kDouble;
-        case 4: fputs("Double.isInfinite", out_); return kDouble;
+        case 2: fputs("Float.isFinite",    out_); return kFloat;
+        case 3: fputs("Float.isInfinite",  out_); return kFloat;
+        case 4: fputs("Double.isNaN",      out_); return kDouble;
+        case 5: fputs("Double.isFinite",   out_); return kDouble;
+        case 6: fputs("Double.isInfinite", out_); return kDouble;
       }
     } else if (isInteger(tp)) {
       const char* prefix = tp == kLong ? "Long" : "Integer";
-      switch (random1(9)) {
+      switch (random1(13)) {
         case 1: fprintf(out_, "%s.highestOneBit",         prefix); break;
         case 2: fprintf(out_, "%s.lowestOneBit",          prefix); break;
         case 3: fprintf(out_, "%s.numberOfLeadingZeros",  prefix); break;
@@ -290,15 +299,27 @@
         case 6: fprintf(out_, "%s.signum",                prefix); break;
         case 7: fprintf(out_, "%s.reverse",               prefix); break;
         case 8: fprintf(out_, "%s.reverseBytes",          prefix); break;
-        case 9: fputs("Math.abs", out_);                           break;
+        case 9:  fputs("Math.incrementExact", out_); break;
+        case 10: fputs("Math.decrementExact", out_); break;
+        case 11: fputs("Math.negateExact",    out_); break;
+        case 12: fputs("Math.abs",            out_); break;
+        case 13: fputs("Math.round", out_);
+                 return tp == kLong ? kDouble : kFloat;
       }
     } else {  // isFP(tp)
-      switch (random1(5)) {
+      switch (random1(6)) {
         case 1: fputs("Math.abs",      out_); break;
         case 2: fputs("Math.ulp",      out_); break;
         case 3: fputs("Math.signum",   out_); break;
         case 4: fputs("Math.nextUp",   out_); break;
         case 5: fputs("Math.nextDown", out_); break;
+        case 6: if (tp == kDouble) {
+                  fputs("Double.longBitsToDouble", out_);
+                  return kLong;
+                } else {
+                  fputs("Float.intBitsToFloat", out_);
+                  return kInt;
+                }
       }
     }
     return tp;  // same type in-out
@@ -314,15 +335,27 @@
       }
     } else if (isInteger(tp)) {
       const char* prefix = tp == kLong ? "Long" : "Integer";
-      switch (random1(3)) {
+      switch (random1(11)) {
         case 1: fprintf(out_, "%s.compare", prefix); break;
-        case 2: fputs("Math.min", out_); break;
-        case 3: fputs("Math.max", out_); break;
+        case 2: fprintf(out_, "%s.sum",     prefix); break;
+        case 3: fprintf(out_, "%s.min",     prefix); break;
+        case 4: fprintf(out_, "%s.max",     prefix); break;
+        case 5:  fputs("Math.min",           out_); break;
+        case 6:  fputs("Math.max",           out_); break;
+        case 7:  fputs("Math.floorDiv",      out_); break;
+        case 8:  fputs("Math.floorMod",      out_); break;
+        case 9:  fputs("Math.addExact",      out_); break;
+        case 10: fputs("Math.subtractExact", out_); break;
+        case 11: fputs("Math.multiplyExact", out_); break;
       }
     } else {  // isFP(tp)
-      switch (random1(2)) {
-        case 1: fputs("Math.min", out_); break;
-        case 2: fputs("Math.max", out_); break;
+      const char* prefix = tp == kDouble ? "Double" : "Float";
+      switch (random1(5)) {
+        case 1: fprintf(out_, "%s.sum", prefix); break;
+        case 2: fprintf(out_, "%s.min", prefix); break;
+        case 3: fprintf(out_, "%s.max", prefix); break;
+        case 4: fputs("Math.min", out_); break;
+        case 5: fputs("Math.max", out_); break;
       }
     }
     return tp;  // same type in-out
@@ -358,12 +391,24 @@
 
   // Emit miscellaneous constructs.
   void emitMisc(Type tp) {
-    switch (tp) {
-      case kBoolean: fputs("this instanceof Test", out_); break;
-      case kInt:     fputs("mArray.length",    out_); break;
-      case kLong:    fputs("Long.MAX_VALUE",   out_); break;
-      case kFloat:   fputs("Float.MAX_VALUE",  out_); break;
-      case kDouble:  fputs("Double.MAX_VALUE", out_); break;
+    if (tp == kBoolean) {
+      fputs("this instanceof Test", out_);
+    } else if (isInteger(tp)) {
+      const char* prefix = tp == kLong ? "Long" : "Integer";
+      switch (random1(2)) {
+        case 1: fprintf(out_, "%s.MIN_VALUE", prefix); break;
+        case 2: fprintf(out_, "%s.MAX_VALUE", prefix); break;
+      }
+    } else {  // isFP(tp)
+      const char* prefix = tp == kDouble ? "Double" : "Float";
+      switch (random1(6)) {
+        case 1: fprintf(out_, "%s.MIN_NORMAL", prefix);        break;
+        case 2: fprintf(out_, "%s.MIN_VALUE", prefix);         break;
+        case 3: fprintf(out_, "%s.MAX_VALUE", prefix);         break;
+        case 4: fprintf(out_, "%s.POSITIVE_INFINITY", prefix); break;
+        case 5: fprintf(out_, "%s.NEGATIVE_INFINITY", prefix); break;
+        case 6: fprintf(out_, "%s.NaN", prefix);               break;
+      }
     }
   }
 
@@ -412,10 +457,10 @@
   void emitLiteral(Type tp) {
     switch (tp) {
       case kBoolean: fputs(random1(2) == 1 ? "true" : "false", out_); break;
-      case kInt:     fprintf(out_, "%d",    random0(100)); break;
-      case kLong:    fprintf(out_, "%dL",   random0(100)); break;
-      case kFloat:   fprintf(out_, "%d.0f", random0(100)); break;
-      case kDouble:  fprintf(out_, "%d.0",  random0(100)); break;
+      case kInt:     fprintf(out_, "%d",    random()); break;
+      case kLong:    fprintf(out_, "%dL",   random()); break;
+      case kFloat:   fprintf(out_, "%d.0f", random()); break;
+      case kDouble:  fprintf(out_, "%d.0",  random()); break;
     }
   }
 
@@ -433,17 +478,6 @@
     return false;
   }
 
-  // Emit a loop variable, if available.
-  bool emitLoopVariable(Type tp) {
-    if (tp == kInt) {
-      if (loop_nest_ > 0) {
-        fprintf(out_, "i%u", random0(loop_nest_));
-        return true;
-      }
-    }
-    return false;
-  }
-
   // Emit a local variable, if available.
   bool emitLocalVariable(Type tp) {
     uint32_t locals = adjustLocal(tp, 0);
@@ -483,10 +517,6 @@
         if (emitLocalVariable(tp))
           return;
         // FALL-THROUGH
-      case 3:
-        if (emitLoopVariable(tp))
-          return;
-        // FALL-THROUGH
       default:
         emitFieldVariable(tp);
         break;
@@ -510,8 +540,9 @@
     fputc('(', out_);
     switch (random1(12)) {  // favor binary operations
       case 1:
-        // Unary operator: ~x
+        // Unary operator: ~ x
         emitUnaryOp(tp);
+        fputc(' ', out_);
         emitExpression(tp);
         break;
       case 2:
@@ -761,7 +792,7 @@
 
     bool mayFollow = false;
     fputs("switch (", out_);
-    emitExpression(kInt);
+    emitArrayIndex();  // restrict its range
     fputs(") {\n", out_);
 
     ++if_nest_;
@@ -771,7 +802,7 @@
     for (uint32_t i = 0; i < 2; i++) {
       emitIndentation();
       if (i == 0) {
-        fprintf(out_, "case %d: {\n", random0(100));
+        fprintf(out_, "case %u: {\n", random0(array_size_));
       } else {
         fprintf(out_, "default: {\n");
       }
@@ -977,6 +1008,11 @@
   // Random integers.
   //
 
+  // Return random integer.
+  int32_t random() {
+    return fuzz_random_engine_();
+  }
+
   // Return random integer in range [0,max).
   uint32_t random0(uint32_t max) {
     std::uniform_int_distribution<uint32_t> gen(0, max - 1);
@@ -1025,7 +1061,7 @@
   // Defaults.
   uint32_t seed = time(NULL);
   uint32_t expr_depth = 1;
-  uint32_t stmt_length = 4;
+  uint32_t stmt_length = 8;
   uint32_t if_nest = 2;
   uint32_t loop_nest = 3;
 
diff --git a/tools/javafuzz/run_java_fuzz_test.py b/tools/javafuzz/run_java_fuzz_test.py
new file mode 100755
index 0000000..5f527b8
--- /dev/null
+++ b/tools/javafuzz/run_java_fuzz_test.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python2
+#
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import abc
+import argparse
+import subprocess
+import sys
+import os
+
+from tempfile import mkdtemp
+from threading import Timer
+
+# Normalized return codes.
+EXIT_SUCCESS = 0
+EXIT_TIMEOUT = 1
+EXIT_NOTCOMPILED = 2
+EXIT_NOTRUN = 3
+
+#
+# Utility methods.
+#
+
+def RunCommand(cmd, args, out, err, timeout = 5):
+  """Executes a command, and returns its return code.
+
+  Args:
+    cmd: string, a command to execute
+    args: string, arguments to pass to command (or None)
+    out: string, file name to open for stdout (or None)
+    err: string, file name to open for stderr (or None)
+    timeout: int, time out in seconds
+  Returns:
+    return code of running command (forced EXIT_TIMEOUT on timeout)
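+
+  Example (hypothetical; the command and file name are placeholders):
+    RunCommand('ls', '-l', out='out.txt', err=None, timeout=10)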
+  """
+  cmd = 'exec ' + cmd  # preserve pid
+  if args != None:
+    cmd = cmd + ' ' + args
+  outf = None
+  if out != None:
+    outf = open(out, mode='w')
+  errf = None
+  if err != None:
+    errf = open(err, mode='w')
+  proc = subprocess.Popen(cmd, stdout=outf, stderr=errf, shell=True)
+  timer = Timer(timeout, proc.kill)  # enforces timeout
+  timer.start()
+  proc.communicate()
+  if timer.is_alive():
+    timer.cancel()
+    returncode = proc.returncode
+  else:
+    returncode = EXIT_TIMEOUT
+  if outf != None:
+    outf.close()
+  if errf != None:
+    errf.close()
+  return returncode
+
+def GetJackClassPath():
+  """Returns Jack's classpath."""
+  top = os.environ.get('ANDROID_BUILD_TOP')
+  if top == None:
+    raise FatalError('Cannot find AOSP build top')
+  libdir = top + '/out/host/common/obj/JAVA_LIBRARIES'
+  return libdir + '/core-libart-hostdex_intermediates/classes.jack:' \
+       + libdir + '/core-oj-hostdex_intermediates/classes.jack'
+
+def GetExecutionModeRunner(device, mode):
+  """Returns a runner for the given execution mode.
+
+  Args:
+    device: string, target device serial number (or None)
+    mode: string, execution mode
+  Returns:
+    TestRunner with given execution mode
+  Raises:
+    FatalError: error for unknown execution mode
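+
+  Example (hypothetical; 'SERIAL' is a placeholder device serial):
+    GetExecutionModeRunner(None, 'hopt')      # Art optimizing on host
+    GetExecutionModeRunner('SERIAL', 'tint')  # Art interpreter on target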
+  """
+  if mode == 'ri':
+    return TestRunnerRIOnHost()
+  if mode == 'hint':
+    return TestRunnerArtOnHost(True)
+  if mode == 'hopt':
+    return TestRunnerArtOnHost(False)
+  if mode == 'tint':
+    return TestRunnerArtOnTarget(device, True)
+  if mode == 'topt':
+    return TestRunnerArtOnTarget(device, False)
+  raise FatalError('Unknown execution mode')
+
+def GetReturnCode(retc):
+  """Returns a string representation of the given normalized return code.
+  Args:
+    retc: int, normalized return code
+  Returns:
+    string representation of normalized return code
+  Raises:
+    FatalError: error for unknown normalized return code
+  """
+  if retc == EXIT_SUCCESS:
+    return 'SUCCESS'
+  if retc == EXIT_TIMEOUT:
+    return 'TIMED-OUT'
+  if retc == EXIT_NOTCOMPILED:
+    return 'NOT-COMPILED'
+  if retc == EXIT_NOTRUN:
+    return 'NOT-RUN'
+  raise FatalError('Unknown normalized return code')
+
+#
+# Execution mode classes.
+#
+
+class TestRunner(object):
+  """Abstraction for running a test in a particular execution mode."""
+  __metaclass__ = abc.ABCMeta
+
+  def GetDescription(self):
+    """Returns a description string of the execution mode."""
+    return self._description
+
+  def GetId(self):
+    """Returns a short string that uniquely identifies the execution mode."""
+    return self._id
+
+  @abc.abstractmethod
+  def CompileAndRunTest(self):
+    """Compile and run the generated test.
+
+    Ensures that the current Test.java in the temporary directory is compiled
+    and executed under the current execution mode. On success, transfers the
+    generated output to the file GetId()_out.txt in the temporary directory.
+    Cleans up after itself.
+
+    Most nonzero return codes are assumed non-divergent, since systems may
+    exit in different ways. This is enforced by normalizing return codes.
+
+    Returns:
+      normalized return code
+    """
+    pass
+
+class TestRunnerRIOnHost(TestRunner):
+  """Concrete test runner of the reference implementation on host."""
+
+  def __init__(self):
+    """Constructor for the RI tester."""
+    self._description = 'RI on host'
+    self._id = 'RI'
+
+  def CompileAndRunTest(self):
+    if RunCommand('javac', 'Test.java',
+                  out=None, err=None, timeout=30) == EXIT_SUCCESS:
+      retc = RunCommand('java', 'Test', 'RI_run_out.txt', err=None)
+      if retc != EXIT_SUCCESS and retc != EXIT_TIMEOUT:
+        retc = EXIT_NOTRUN
+    else:
+      retc = EXIT_NOTCOMPILED
+    # Cleanup and return.
+    RunCommand('rm', '-f Test.class', out=None, err=None)
+    return retc
+
+class TestRunnerArtOnHost(TestRunner):
+  """Concrete test runner of Art on host (interpreter or optimizing)."""
+
+  def __init__(self, interpreter):
+    """Constructor for the Art on host tester.
+
+    Args:
+      interpreter: boolean, selects between interpreter or optimizing
+    """
+    self._art_args = '-cp classes.dex Test'
+    if interpreter:
+      self._description = 'Art interpreter on host'
+      self._id = 'HInt'
+      self._art_args = '-Xint ' + self._art_args
+    else:
+      self._description = 'Art optimizing on host'
+      self._id = 'HOpt'
+    self._jack_args = '-cp ' + GetJackClassPath() + ' --output-dex . Test.java'
+
+  def CompileAndRunTest(self):
+    if RunCommand('jack', self._jack_args,
+                  out=None, err='jackerr.txt', timeout=30) == EXIT_SUCCESS:
+      out = self.GetId() + '_run_out.txt'
+      retc = RunCommand('art', self._art_args, out, 'arterr.txt')
+      if retc != EXIT_SUCCESS and retc != EXIT_TIMEOUT:
+        retc = EXIT_NOTRUN
+    else:
+      retc = EXIT_NOTCOMPILED
+    # Cleanup and return.
+    RunCommand('rm', '-rf classes.dex jackerr.txt arterr.txt android-data*',
+               out=None, err=None)
+    return retc
+
+# TODO: very rough first version without proper cache,
+#       reuse staszkiewicz' module for properly setting up dalvikvm on target.
+class TestRunnerArtOnTarget(TestRunner):
+  """Concrete test runner of Art on target (interpreter or optimizing)."""
+
+  def __init__(self, device, interpreter):
+    """Constructor for the Art on target tester.
+
+    Args:
+      device: string, target device serial number (or None)
+      interpreter: boolean, selects between interpreter or optimizing
+    """
+    self._dalvik_args = 'shell dalvikvm -cp /data/local/tmp/classes.dex Test'
+    if interpreter:
+      self._description = 'Art interpreter on target'
+      self._id = 'TInt'
+      self._dalvik_args = '-Xint ' + self._dalvik_args
+    else:
+      self._description = 'Art optimizing on target'
+      self._id = 'TOpt'
+    self._adb = 'adb'
+    if device != None:
+      self._adb = self._adb + ' -s ' + device
+    self._jack_args = '-cp ' + GetJackClassPath() + ' --output-dex . Test.java'
+
+  def CompileAndRunTest(self):
+    if RunCommand('jack', self._jack_args,
+                  out=None, err='jackerr.txt', timeout=30) == EXIT_SUCCESS:
+      if RunCommand(self._adb, 'push classes.dex /data/local/tmp/',
+                    'adb.txt', err=None) != EXIT_SUCCESS:
+        raise FatalError('Cannot push to target device')
+      out = self.GetId() + '_run_out.txt'
+      retc = RunCommand(self._adb, self._dalvik_args, out, err=None)
+      if retc != EXIT_SUCCESS and retc != EXIT_TIMEOUT:
+        retc = EXIT_NOTRUN
+    else:
+      retc = EXIT_NOTCOMPILED
+    # Cleanup and return.
+    RunCommand('rm', '-f classes.dex jackerr.txt adb.txt',
+               out=None, err=None)
+    RunCommand(self._adb, 'shell rm -f /data/local/tmp/classes.dex',
+               out=None, err=None)
+    return retc
+
+#
+# Tester classes.
+#
+
+class FatalError(Exception):
+  """Fatal error in the tester."""
+  pass
+
+class JavaFuzzTester(object):
+  """Tester that runs JavaFuzz many times and report divergences."""
+
+  def __init__(self, num_tests, device, mode1, mode2):
+    """Constructor for the tester.
+
+    Args:
+      num_tests: int, number of tests to run
+      device: string, target device serial number (or None)
+      mode1: string, execution mode for first runner
+      mode2: string, execution mode for second runner
+    """
+    self._num_tests = num_tests
+    self._device = device
+    self._runner1 = GetExecutionModeRunner(device, mode1)
+    self._runner2 = GetExecutionModeRunner(device, mode2)
+    self._save_dir = None
+    self._tmp_dir = None
+    # Statistics.
+    self._test = 0
+    self._num_success = 0
+    self._num_not_compiled = 0
+    self._num_not_run = 0
+    self._num_timed_out = 0
+    self._num_divergences = 0
+
+  def __enter__(self):
+    """On entry, enters new temp directory after saving current directory.
+
+    Raises:
+      FatalError: error when temp directory cannot be constructed
+    """
+    self._save_dir = os.getcwd()
+    self._tmp_dir = mkdtemp(dir="/tmp/")
+    if self._tmp_dir == None:
+      raise FatalError('Cannot obtain temp directory')
+    os.chdir(self._tmp_dir)
+    return self
+
+  def __exit__(self, etype, evalue, etraceback):
+    """On exit, re-enters previously saved current directory and cleans up."""
+    os.chdir(self._save_dir)
+    if self._num_divergences == 0:
+      RunCommand('rm', '-rf ' + self._tmp_dir, out=None, err=None)
+
+  def Run(self):
+    """Runs JavaFuzz many times and report divergences."""
+    print
+    print '**\n**** JavaFuzz Testing\n**'
+    print
+    print '#Tests    :', self._num_tests
+    print 'Device    :', self._device
+    print 'Directory :', self._tmp_dir
+    print 'Exec-mode1:', self._runner1.GetDescription()
+    print 'Exec-mode2:', self._runner2.GetDescription()
+    print
+    self.ShowStats()
+    for self._test in range(1, self._num_tests + 1):
+      self.RunJavaFuzzTest()
+      self.ShowStats()
+    if self._num_divergences == 0:
+      print '\n\nsuccess (no divergences)\n'
+    else:
+      print '\n\nfailure (divergences)\n'
+
+  def ShowStats(self):
+    """Shows current statistics (on same line) while tester is running."""
+    print '\rTests:', self._test, \
+        'Success:', self._num_success, \
+        'Not-compiled:', self._num_not_compiled, \
+        'Not-run:', self._num_not_run, \
+        'Timed-out:', self._num_timed_out, \
+        'Divergences:', self._num_divergences,
+    sys.stdout.flush()
+
+  def RunJavaFuzzTest(self):
+    """Runs a single JavaFuzz test, comparing two execution modes."""
+    self.ConstructTest()
+    retc1 = self._runner1.CompileAndRunTest()
+    retc2 = self._runner2.CompileAndRunTest()
+    self.CheckForDivergence(retc1, retc2)
+    self.CleanupTest()
+
+  def ConstructTest(self):
+    """Use JavaFuzz to generate next Test.java test.
+
+    Raises:
+      FatalError: error when javafuzz fails
+    """
+    if RunCommand('javafuzz', args=None,
+                  out='Test.java', err=None) != EXIT_SUCCESS:
+      raise FatalError('Unexpected error while running JavaFuzz')
+
+  def CheckForDivergence(self, retc1, retc2):
+    """Checks for divergences and updates statistics.
+
+    Args:
+      retc1: int, normalized return code of first runner
+      retc2: int, normalized return code of second runner
+    """
+    if retc1 == retc2:
+      # Non-divergent in return code.
+      if retc1 == EXIT_SUCCESS:
+        # Both compilations and runs were successful, inspect generated output.
+        args = self._runner1.GetId() + '_run_out.txt ' \
+            + self._runner2.GetId() + '_run_out.txt'
+        if RunCommand('diff', args, out=None, err=None) != EXIT_SUCCESS:
+          self.ReportDivergence('divergence in output')
+        else:
+          self._num_success += 1
+      elif retc1 == EXIT_TIMEOUT:
+        self._num_timed_out += 1
+      elif retc1 == EXIT_NOTCOMPILED:
+        self._num_not_compiled += 1
+      else:
+        self._num_not_run += 1
+    else:
+      # Divergent in return code.
+      self.ReportDivergence('divergence in return code: ' +
+                            GetReturnCode(retc1) + ' vs. ' +
+                            GetReturnCode(retc2))
+
+  def ReportDivergence(self, reason):
+    """Reports and saves a divergence."""
+    self._num_divergences += 1
+    print '\n', self._test, reason
+    # Save.
+    ddir = 'divergence' + str(self._test)
+    RunCommand('mkdir', ddir, out=None, err=None)
+    RunCommand('mv', 'Test.java *.txt ' + ddir, out=None, err=None)
+
+  def CleanupTest(self):
+    """Cleans up after a single test run."""
+    RunCommand('rm', '-f Test.java *.txt', out=None, err=None)
+
+
+def main():
+  # Handle arguments.
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--num_tests', default=10000,
+                      type=int, help='number of tests to run')
+  parser.add_argument('--device', help='target device serial number')
+  parser.add_argument('--mode1', default='ri',
+                      help='execution mode 1 (default: ri)')
+  parser.add_argument('--mode2', default='hopt',
+                      help='execution mode 2 (default: hopt)')
+  args = parser.parse_args()
+  if args.mode1 == args.mode2:
+    raise FatalError("Identical execution modes given")
+  # Run the JavaFuzz tester.
+  with JavaFuzzTester(args.num_tests, args.device,
+                      args.mode1, args.mode2) as fuzzer:
+    fuzzer.Run()
+
+if __name__ == "__main__":
+  main()
diff --git a/tools/libcore_failures.txt b/tools/libcore_failures.txt
index cbb6e1d..6472c8d 100644
--- a/tools/libcore_failures.txt
+++ b/tools/libcore_failures.txt
@@ -220,13 +220,6 @@
   names: [ "libcore.java.io.FileTest#testJavaIoTmpdirMutable" ]
 },
 {
-  description: "Made for extending, shouldn't be run",
-  result: EXEC_FAILED,
-  names: ["jsr166.CollectionTest#testEmptyMeansEmpty",
-          "jsr166.Collection8Test#testForEach",
-          "jsr166.Collection8Test#testForEachConcurrentStressTest"]
-},
-{
   description: "Flaky test",
   result: EXEC_FAILED,
   bug: 30107038,