Merge "libcore_failures: Remove obsolete entries."
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 7a34683..c787dc5 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -39,6 +39,7 @@
 #include "gc/accounting/card_table-inl.h"
 #include "gc/accounting/heap_bitmap.h"
 #include "gc/accounting/space_bitmap-inl.h"
+#include "gc/collector/concurrent_copying.h"
 #include "gc/heap.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
@@ -1823,6 +1824,11 @@
   const auto it = saved_hashcode_map_.find(obj);
   dst->SetLockWord(it != saved_hashcode_map_.end() ?
       LockWord::FromHashCode(it->second, 0u) : LockWord::Default(), false);
+  if (kUseBakerReadBarrier && gc::collector::ConcurrentCopying::kGrayDirtyImmuneObjects) {
+    // Treat all of the objects in the image as marked to avoid unnecessary dirty pages. This is
+    // safe since we mark all of the objects that may reference non-immune objects as gray.
+    CHECK(dst->AtomicSetMarkBit(0, 1));
+  }
   FixupObject(obj, dst);
 }
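The hunk above pre-marks every image object so that Baker read-barrier fast paths can treat image objects as already marked without ever writing to (and dirtying) their pages. A minimal standalone sketch of the bit-setting idea follows; the bit position and the bare std::atomic lock word are assumptions for illustration, not ART's actual LockWord layout.

    #include <atomic>
    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kMarkBitShift = 29;               // assumed position, for illustration only
    constexpr uint32_t kMarkBitMaskShifted = 1u << kMarkBitShift;

    // Returns true iff the bit went from 0 to 1, mirroring CHECK(dst->AtomicSetMarkBit(0, 1)).
    bool AtomicSetMarkBit(std::atomic<uint32_t>& lock_word) {
      uint32_t old_word = lock_word.load(std::memory_order_relaxed);
      do {
        if ((old_word & kMarkBitMaskShifted) != 0) {
          return false;                                  // already marked
        }
      } while (!lock_word.compare_exchange_weak(old_word, old_word | kMarkBitMaskShifted,
                                                std::memory_order_relaxed));
      return true;
    }

    int main() {
      std::atomic<uint32_t> lock_word{0};
      assert(AtomicSetMarkBit(lock_word));               // first caller sets the bit
      assert(!AtomicSetMarkBit(lock_word));              // second caller sees it already set
    }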
 
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 626a975..7d13656 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -217,8 +217,7 @@
   // uint32 = typeof(lockword_)
   // Subtract read barrier bits since we want these to remain 0, or else it may result in DCHECK
   // failures due to invalid read barrier bits during object field reads.
-  static const size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits -
-      LockWord::kReadBarrierStateSize;
+  static const size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits - LockWord::kGCStateSize;
   // 111000.....0
   static const size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;
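A quick worked example of the new shift, using assumed values (kBinBits = 3, LockWord::kGCStateSize = 2; the real constants live in image_writer.h and lock_word.h):

    #include <cstddef>
    #include <cstdio>

    constexpr size_t kBinBits = 3;       // assumption for illustration
    constexpr size_t kGCStateSize = 2;   // assumption for illustration
    constexpr size_t kBinShift = 32 - kBinBits - kGCStateSize;                             // 27
    constexpr size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;   // 0x38000000

    int main() {
      std::printf("kBinShift=%zu kBinMask=%#zx\n", kBinShift, kBinMask);
    }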
 
diff --git a/compiler/linker/arm/relative_patcher_arm_base.cc b/compiler/linker/arm/relative_patcher_arm_base.cc
index d4dd978..2471f79 100644
--- a/compiler/linker/arm/relative_patcher_arm_base.cc
+++ b/compiler/linker/arm/relative_patcher_arm_base.cc
@@ -31,10 +31,6 @@
 }
 
 uint32_t ArmBaseRelativePatcher::ReserveSpaceEnd(uint32_t offset) {
-  // NOTE: The final thunk can be reserved from InitCodeMethodVisitor::EndClass() while it
-  // may be written early by WriteCodeMethodVisitor::VisitMethod() for a deduplicated chunk
-  // of code. To avoid any alignment discrepancies for the final chunk, we always align the
-  // offset after reserving of writing any chunk.
   uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
   bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset,
                                                 MethodReference(nullptr, 0u),
@@ -46,7 +42,7 @@
     unprocessed_patches_.clear();
 
     thunk_locations_.push_back(aligned_offset);
-    offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
+    offset = aligned_offset + thunk_code_.size();
   }
   return offset;
 }
@@ -65,13 +61,7 @@
     if (UNLIKELY(!WriteRelCallThunk(out, ArrayRef<const uint8_t>(thunk_code_)))) {
       return 0u;
     }
-    uint32_t thunk_end_offset = aligned_offset + thunk_code_.size();
-    // Align after writing chunk, see the ReserveSpace() above.
-    offset = CompiledMethod::AlignCode(thunk_end_offset, instruction_set_);
-    aligned_code_delta = offset - thunk_end_offset;
-    if (aligned_code_delta != 0u && !WriteCodeAlignment(out, aligned_code_delta)) {
-      return 0u;
-    }
+    offset = aligned_offset + thunk_code_.size();
   }
   return offset;
 }
@@ -92,7 +82,7 @@
                                                       MethodReference method_ref,
                                                       uint32_t max_extra_space) {
   uint32_t quick_code_size = compiled_method->GetQuickCode().size();
-  uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
+  uint32_t quick_code_offset = compiled_method->AlignCode(offset + sizeof(OatQuickMethodHeader));
   uint32_t next_aligned_offset = compiled_method->AlignCode(quick_code_offset + quick_code_size);
   // Adjust for extra space required by the subclass.
   next_aligned_offset = compiled_method->AlignCode(next_aligned_offset + max_extra_space);
@@ -106,9 +96,9 @@
     if (needs_thunk) {
       // A single thunk will cover all pending patches.
       unprocessed_patches_.clear();
-      uint32_t thunk_location = compiled_method->AlignCode(offset);
+      uint32_t thunk_location = CompiledMethod::AlignCode(offset, instruction_set_);
       thunk_locations_.push_back(thunk_location);
-      offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), instruction_set_);
+      offset = thunk_location + thunk_code_.size();
     }
   }
   for (const LinkerPatch& patch : compiled_method->GetPatches()) {
diff --git a/compiler/linker/arm/relative_patcher_thumb2_test.cc b/compiler/linker/arm/relative_patcher_thumb2_test.cc
index a8078e3..eace3d4 100644
--- a/compiler/linker/arm/relative_patcher_thumb2_test.cc
+++ b/compiler/linker/arm/relative_patcher_thumb2_test.cc
@@ -48,18 +48,18 @@
                              const ArrayRef<const LinkerPatch>& method3_patches,
                              uint32_t distance_without_thunks) {
     CHECK_EQ(distance_without_thunks % kArmAlignment, 0u);
-    const uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kThumb2) + sizeof(OatQuickMethodHeader);
+    uint32_t method1_offset =
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
 
     // We want to put the method3 at a very precise offset.
     const uint32_t method3_offset = method1_offset + distance_without_thunks;
-    CHECK_ALIGNED(method3_offset - sizeof(OatQuickMethodHeader), kArmAlignment);
+    CHECK_ALIGNED(method3_offset, kArmAlignment);
 
     // Calculate size of method2 so that we put method3 at the correct place.
+    const uint32_t method1_end = method1_offset + method1_code.size();
     const uint32_t method2_offset =
-        CompiledCode::AlignCode(method1_offset + method1_code.size(), kThumb2) +
-        sizeof(OatQuickMethodHeader);
+        method1_end + CodeAlignmentSize(method1_end) + sizeof(OatQuickMethodHeader);
     const uint32_t method2_size = (method3_offset - sizeof(OatQuickMethodHeader) - method2_offset);
     std::vector<uint8_t> method2_raw_code(method2_size);
     ArrayRef<const uint8_t> method2_code(method2_raw_code);
@@ -78,8 +78,11 @@
     if (result3.second == method3_offset + 1 /* thumb mode */) {
       return false;  // No thunk.
     } else {
-      uint32_t aligned_thunk_size = CompiledCode::AlignCode(ThunkSize(), kThumb2);
-      CHECK_EQ(result3.second, method3_offset + aligned_thunk_size + 1 /* thumb mode */);
+      uint32_t thunk_end =
+          CompiledCode::AlignCode(method3_offset - sizeof(OatQuickMethodHeader), kThumb2) +
+          ThunkSize();
+      uint32_t header_offset = thunk_end + CodeAlignmentSize(thunk_end);
+      CHECK_EQ(result3.second, header_offset + sizeof(OatQuickMethodHeader) + 1 /* thumb mode */);
       return true;   // Thunk present.
     }
   }
@@ -352,9 +355,12 @@
 
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t method3_offset = GetMethodOffset(3u);
+  ASSERT_TRUE(IsAligned<kArmAlignment>(method3_offset));
   uint32_t method3_header_offset = method3_offset - sizeof(OatQuickMethodHeader);
-  ASSERT_TRUE(IsAligned<kArmAlignment>(method3_header_offset));
-  uint32_t thunk_offset = method3_header_offset - CompiledCode::AlignCode(ThunkSize(), kThumb2);
+  uint32_t thunk_offset =
+      RoundDown(method3_header_offset - ThunkSize(), GetInstructionSetAlignment(kThumb2));
+  DCHECK_EQ(thunk_offset + ThunkSize() + CodeAlignmentSize(thunk_offset + ThunkSize()),
+            method3_header_offset);
   ASSERT_TRUE(IsAligned<kArmAlignment>(thunk_offset));
   uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1 + 4u /* PC adjustment */);
   ASSERT_EQ(diff & 1u, 0u);
diff --git a/compiler/linker/arm64/relative_patcher_arm64.cc b/compiler/linker/arm64/relative_patcher_arm64.cc
index fdd14be..4c8788e 100644
--- a/compiler/linker/arm64/relative_patcher_arm64.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64.cc
@@ -83,7 +83,7 @@
 
   // Now that we have the actual offset where the code will be placed, locate the ADRP insns
   // that actually require the thunk.
-  uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
+  uint32_t quick_code_offset = compiled_method->AlignCode(offset + sizeof(OatQuickMethodHeader));
   ArrayRef<const uint8_t> code = compiled_method->GetQuickCode();
   uint32_t thunk_offset = compiled_method->AlignCode(quick_code_offset + code.size());
   DCHECK(compiled_method != nullptr);
diff --git a/compiler/linker/arm64/relative_patcher_arm64_test.cc b/compiler/linker/arm64/relative_patcher_arm64_test.cc
index 09729fd..573de73 100644
--- a/compiler/linker/arm64/relative_patcher_arm64_test.cc
+++ b/compiler/linker/arm64/relative_patcher_arm64_test.cc
@@ -67,36 +67,39 @@
                                  const ArrayRef<const LinkerPatch>& last_method_patches,
                                  uint32_t distance_without_thunks) {
     CHECK_EQ(distance_without_thunks % kArm64Alignment, 0u);
-    const uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+    uint32_t method1_offset =
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     AddCompiledMethod(MethodRef(1u), method1_code, method1_patches);
-    const uint32_t gap_start =
-        CompiledCode::AlignCode(method1_offset + method1_code.size(), kArm64);
+    const uint32_t gap_start = method1_offset + method1_code.size();
 
     // We want to put the method3 at a very precise offset.
     const uint32_t last_method_offset = method1_offset + distance_without_thunks;
+    CHECK_ALIGNED(last_method_offset, kArm64Alignment);
     const uint32_t gap_end = last_method_offset - sizeof(OatQuickMethodHeader);
-    CHECK_ALIGNED(gap_end, kArm64Alignment);
 
-    // Fill the gap with intermediate methods in chunks of 2MiB and the last in [2MiB, 4MiB).
+    // Fill the gap with intermediate methods in chunks of 2MiB and the first in [2MiB, 4MiB).
     // (This allows deduplicating the small chunks to avoid using 256MiB of memory for +-128MiB
-    // offsets by this test.)
+    // offsets by this test. Making the first chunk bigger makes it easy to give all intermediate
+    // methods the same alignment at the end, so the thunk insertion adds a predictable size as
+    // long as it's after the first chunk.)
     uint32_t method_idx = 2u;
     constexpr uint32_t kSmallChunkSize = 2 * MB;
     std::vector<uint8_t> gap_code;
-    size_t gap_size = gap_end - gap_start;
-    for (; gap_size >= 2u * kSmallChunkSize; gap_size -= kSmallChunkSize) {
-      uint32_t chunk_code_size = kSmallChunkSize - sizeof(OatQuickMethodHeader);
+    uint32_t gap_size = gap_end - gap_start;
+    uint32_t num_small_chunks = std::max(gap_size / kSmallChunkSize, 1u) - 1u;
+    uint32_t chunk_start = gap_start;
+    uint32_t chunk_size = gap_size - num_small_chunks * kSmallChunkSize;
+    for (uint32_t i = 0; i <= num_small_chunks; ++i) {  // num_small_chunks+1 iterations.
+      uint32_t chunk_code_size =
+          chunk_size - CodeAlignmentSize(chunk_start) - sizeof(OatQuickMethodHeader);
       gap_code.resize(chunk_code_size, 0u);
       AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
                         ArrayRef<const LinkerPatch>());
       method_idx += 1u;
+      chunk_start += chunk_size;
+      chunk_size = kSmallChunkSize;  // For all but the first chunk.
+      DCHECK_EQ(CodeAlignmentSize(gap_end), CodeAlignmentSize(chunk_start));
     }
-    uint32_t chunk_code_size = gap_size - sizeof(OatQuickMethodHeader);
-    gap_code.resize(chunk_code_size, 0u);
-    AddCompiledMethod(MethodRef(method_idx), ArrayRef<const uint8_t>(gap_code),
-                      ArrayRef<const LinkerPatch>());
-    method_idx += 1u;
 
     // Add the last method and link
     AddCompiledMethod(MethodRef(method_idx), last_method_code, last_method_patches);
@@ -109,8 +112,9 @@
     // There may be a thunk before method2.
     if (last_result.second != last_method_offset) {
       // Thunk present. Check that there's only one.
-      uint32_t aligned_thunk_size = CompiledCode::AlignCode(ThunkSize(), kArm64);
-      CHECK_EQ(last_result.second, last_method_offset + aligned_thunk_size);
+      uint32_t thunk_end = CompiledCode::AlignCode(gap_end, kArm64) + ThunkSize();
+      uint32_t header_offset = thunk_end + CodeAlignmentSize(thunk_end);
+      CHECK_EQ(last_result.second, header_offset + sizeof(OatQuickMethodHeader));
     }
     return method_idx;
   }
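The chunking arithmetic above can be sanity-checked standalone; the 5 MiB gap below is an assumed value chosen to show the first chunk landing in [2 MiB, 4 MiB) with the remainder in exact 2 MiB chunks.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
      constexpr uint32_t MB = 1024 * 1024;
      constexpr uint32_t kSmallChunkSize = 2 * MB;
      const uint32_t gap_size = 5 * MB;  // assumed gap, for illustration
      const uint32_t num_small_chunks = std::max<uint32_t>(gap_size / kSmallChunkSize, 1u) - 1u;  // 1
      const uint32_t first_chunk_size = gap_size - num_small_chunks * kSmallChunkSize;            // 3 MiB
      std::printf("first chunk = %u MiB, then %u small chunk(s) of 2 MiB\n",
                  first_chunk_size / MB, num_small_chunks);
    }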
@@ -341,7 +345,7 @@
                         uint32_t dex_cache_arrays_begin,
                         uint32_t element_offset) {
     uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
     CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
@@ -391,7 +395,7 @@
                         bool has_thunk,
                         uint32_t string_offset) {
     uint32_t method1_offset =
-        CompiledCode::AlignCode(kTrampolineSize, kArm64) + sizeof(OatQuickMethodHeader);
+        kTrampolineSize + CodeAlignmentSize(kTrampolineSize) + sizeof(OatQuickMethodHeader);
     ASSERT_LT(method1_offset, adrp_offset);
     CHECK_ALIGNED(adrp_offset, 4u);
     uint32_t num_nops = (adrp_offset - method1_offset) / 4u;
@@ -614,10 +618,12 @@
 
   uint32_t method1_offset = GetMethodOffset(1u);
   uint32_t last_method_offset = GetMethodOffset(last_method_idx);
+  ASSERT_TRUE(IsAligned<kArm64Alignment>(last_method_offset));
   uint32_t last_method_header_offset = last_method_offset - sizeof(OatQuickMethodHeader);
-  ASSERT_TRUE(IsAligned<kArm64Alignment>(last_method_header_offset));
-  uint32_t thunk_offset = last_method_header_offset - CompiledCode::AlignCode(ThunkSize(), kArm64);
-  ASSERT_TRUE(IsAligned<kArm64Alignment>(thunk_offset));
+  uint32_t thunk_offset =
+      RoundDown(last_method_header_offset - ThunkSize(), GetInstructionSetAlignment(kArm64));
+  DCHECK_EQ(thunk_offset + ThunkSize() + CodeAlignmentSize(thunk_offset + ThunkSize()),
+            last_method_header_offset);
   uint32_t diff = thunk_offset - (method1_offset + bl_offset_in_method1);
   CHECK_ALIGNED(diff, 4u);
   ASSERT_LT(diff, 128 * MB);
diff --git a/compiler/linker/relative_patcher_test.h b/compiler/linker/relative_patcher_test.h
index ec69107..d21f33e 100644
--- a/compiler/linker/relative_patcher_test.h
+++ b/compiler/linker/relative_patcher_test.h
@@ -98,6 +98,14 @@
         patches));
   }
 
+  uint32_t CodeAlignmentSize(uint32_t header_offset_to_align) {
+    // We want to align the code rather than the preheader.
+    uint32_t unaligned_code_offset = header_offset_to_align + sizeof(OatQuickMethodHeader);
+    uint32_t aligned_code_offset =
+        CompiledMethod::AlignCode(unaligned_code_offset, instruction_set_);
+    return aligned_code_offset - unaligned_code_offset;
+  }
+
   void Link() {
     // Reserve space.
     static_assert(kTrampolineOffset == 0u, "Unexpected trampoline offset.");
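A standalone sketch of what this helper computes: pad so that the code following the OatQuickMethodHeader, rather than the header itself, lands on the instruction-set boundary. The 32-byte header and 16-byte alignment below are assumed sizes for illustration only.

    #include <cstdint>
    #include <cstdio>

    constexpr uint32_t kHeaderSize = 32;  // assumed sizeof(OatQuickMethodHeader)
    constexpr uint32_t kAlignment = 16;   // assumed instruction-set alignment

    uint32_t RoundUp(uint32_t x, uint32_t n) { return (x + n - 1) & ~(n - 1); }

    uint32_t CodeAlignmentSize(uint32_t header_offset_to_align) {
      // We want to align the code rather than the preheader.
      uint32_t unaligned_code_offset = header_offset_to_align + kHeaderSize;
      return RoundUp(unaligned_code_offset, kAlignment) - unaligned_code_offset;
    }

    int main() {
      // Header at 100: code would start at 132, so pad by 12 to push the code to 144.
      std::printf("%u\n", CodeAlignmentSize(100u));  // prints 12
    }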
@@ -106,9 +114,8 @@
     for (auto& compiled_method : compiled_methods_) {
       offset = patcher_->ReserveSpace(offset, compiled_method.get(), compiled_method_refs_[idx]);
 
-      uint32_t aligned_offset = compiled_method->AlignCode(offset);
-      uint32_t aligned_code_delta = aligned_offset - offset;
-      offset += aligned_code_delta;
+      uint32_t alignment_size = CodeAlignmentSize(offset);
+      offset += alignment_size;
 
       offset += sizeof(OatQuickMethodHeader);
       uint32_t quick_code_offset = offset + compiled_method->CodeDelta();
@@ -136,11 +143,10 @@
     for (auto& compiled_method : compiled_methods_) {
       offset = patcher_->WriteThunks(&out_, offset);
 
-      uint32_t aligned_offset = compiled_method->AlignCode(offset);
-      uint32_t aligned_code_delta = aligned_offset - offset;
-      CHECK_LE(aligned_code_delta, sizeof(kPadding));
-      out_.WriteFully(kPadding, aligned_code_delta);
-      offset += aligned_code_delta;
+      uint32_t alignment_size = CodeAlignmentSize(offset);
+      CHECK_LE(alignment_size, sizeof(kPadding));
+      out_.WriteFully(kPadding, alignment_size);
+      offset += alignment_size;
 
       out_.WriteFully(dummy_header, sizeof(OatQuickMethodHeader));
       offset += sizeof(OatQuickMethodHeader);
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index f20c715..8273b15 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -87,6 +87,13 @@
   OatHeader* const oat_header_;
 };
 
+inline uint32_t CodeAlignmentSize(uint32_t header_offset, const CompiledMethod& compiled_method) {
+  // We want to align the code rather than the preheader.
+  uint32_t unaligned_code_offset = header_offset + sizeof(OatQuickMethodHeader);
+  uint32_t aligned_code_offset = compiled_method.AlignCode(unaligned_code_offset);
+  return aligned_code_offset - unaligned_code_offset;
+}
+
 }  // anonymous namespace
 
 // Defines the location of the raw dex file to write.
@@ -817,8 +824,8 @@
                               uint32_t thumb_offset) {
     offset_ = writer_->relative_patcher_->ReserveSpace(
         offset_, compiled_method, MethodReference(dex_file_, it.GetMemberIndex()));
-    offset_ = compiled_method->AlignCode(offset_);
-    DCHECK_ALIGNED_PARAM(offset_,
+    offset_ += CodeAlignmentSize(offset_, *compiled_method);
+    DCHECK_ALIGNED_PARAM(offset_ + sizeof(OatQuickMethodHeader),
                          GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
     return offset_ + sizeof(OatQuickMethodHeader) + thumb_offset;
   }
@@ -1011,17 +1018,16 @@
           ReportWriteFailure("relative call thunk", it);
           return false;
         }
-        uint32_t aligned_offset = compiled_method->AlignCode(offset_);
-        uint32_t aligned_code_delta = aligned_offset - offset_;
-        if (aligned_code_delta != 0) {
-          if (!writer_->WriteCodeAlignment(out, aligned_code_delta)) {
+        uint32_t alignment_size = CodeAlignmentSize(offset_, *compiled_method);
+        if (alignment_size != 0) {
+          if (!writer_->WriteCodeAlignment(out, alignment_size)) {
             ReportWriteFailure("code alignment padding", it);
             return false;
           }
-          offset_ += aligned_code_delta;
+          offset_ += alignment_size;
           DCHECK_OFFSET_();
         }
-        DCHECK_ALIGNED_PARAM(offset_,
+        DCHECK_ALIGNED_PARAM(offset_ + sizeof(OatQuickMethodHeader),
                              GetInstructionSetAlignment(compiled_method->GetInstructionSet()));
         DCHECK_EQ(method_offsets.code_offset_,
                   offset_ + sizeof(OatQuickMethodHeader) + compiled_method->CodeDelta())
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index 115cee6..bccde49 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -609,6 +609,8 @@
     DCHECK_NE(obj_.reg(), LR);
     DCHECK_NE(obj_.reg(), WSP);
     DCHECK_NE(obj_.reg(), WZR);
+    // WIP0 is used by the slow path as a temp; it cannot be the object register.
+    DCHECK_NE(obj_.reg(), IP0);
     DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg();
     // "Compact" slow path, saving two moves.
     //
@@ -751,10 +753,7 @@
                (instruction_->AsInvoke()->GetIntrinsic() == Intrinsics::kUnsafeGetObjectVolatile))
             << instruction_->AsInvoke()->GetIntrinsic();
         DCHECK_EQ(offset_, 0U);
-        DCHECK(index_.IsRegisterPair());
-        // UnsafeGet's offset location is a register pair, the low
-        // part contains the correct offset.
-        index = index_.ToLow();
+        DCHECK(index_.IsRegister());
       }
     }
 
@@ -1284,17 +1283,21 @@
       UseScratchRegisterScope temps(GetVIXLAssembler());
       HConstant* src_cst = source.GetConstant();
       CPURegister temp;
-      if (src_cst->IsIntConstant() || src_cst->IsNullConstant()) {
-        temp = temps.AcquireW();
-      } else if (src_cst->IsLongConstant()) {
-        temp = temps.AcquireX();
-      } else if (src_cst->IsFloatConstant()) {
-        temp = temps.AcquireS();
+      if (src_cst->IsZeroBitPattern()) {
+        temp = (src_cst->IsLongConstant() || src_cst->IsDoubleConstant()) ? xzr : wzr;
       } else {
-        DCHECK(src_cst->IsDoubleConstant());
-        temp = temps.AcquireD();
+        if (src_cst->IsIntConstant()) {
+          temp = temps.AcquireW();
+        } else if (src_cst->IsLongConstant()) {
+          temp = temps.AcquireX();
+        } else if (src_cst->IsFloatConstant()) {
+          temp = temps.AcquireS();
+        } else {
+          DCHECK(src_cst->IsDoubleConstant());
+          temp = temps.AcquireD();
+        }
+        MoveConstant(temp, src_cst);
       }
-      MoveConstant(temp, src_cst);
       __ Str(temp, StackOperandFrom(destination));
     } else {
       DCHECK(source.IsStackSlot() || source.IsDoubleStackSlot());
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 240936c..1b5fa85 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -243,7 +243,7 @@
   }
 
   Arm64Assembler* GetAssembler() const { return assembler_; }
-  vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; }
+  vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); }
 
  private:
   void GenerateClassInitializationCheck(SlowPathCodeARM64* slow_path,
@@ -364,7 +364,7 @@
  private:
   Arm64Assembler* GetAssembler() const;
   vixl::aarch64::MacroAssembler* GetVIXLAssembler() const {
-    return GetAssembler()->vixl_masm_;
+    return GetAssembler()->GetVIXLAssembler();
   }
 
   CodeGeneratorARM64* const codegen_;
@@ -413,7 +413,7 @@
   HGraphVisitor* GetInstructionVisitor() OVERRIDE { return &instruction_visitor_; }
   Arm64Assembler* GetAssembler() OVERRIDE { return &assembler_; }
   const Arm64Assembler& GetAssembler() const OVERRIDE { return assembler_; }
-  vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->vixl_masm_; }
+  vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return GetAssembler()->GetVIXLAssembler(); }
 
   // Emit a write barrier.
   void MarkGCCard(vixl::aarch64::Register object,
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index 0b4c569..89d80cc 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -298,6 +298,12 @@
         stream << constant->AsIntConstant()->GetValue();
       } else if (constant->IsLongConstant()) {
         stream << constant->AsLongConstant()->GetValue();
+      } else if (constant->IsFloatConstant()) {
+        stream << constant->AsFloatConstant()->GetValue();
+      } else if (constant->IsDoubleConstant()) {
+        stream << constant->AsDoubleConstant()->GetValue();
+      } else if (constant->IsNullConstant()) {
+        stream << "null";
       }
     } else if (location.IsInvalid()) {
       stream << "invalid";
diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc
index 06d1148..e3a9d27 100644
--- a/compiler/optimizing/intrinsics_arm64.cc
+++ b/compiler/optimizing/intrinsics_arm64.cc
@@ -26,7 +26,6 @@
 #include "mirror/string.h"
 #include "thread.h"
 #include "utils/arm64/assembler_arm64.h"
-#include "utils/arm64/constants_arm64.h"
 
 using namespace vixl::aarch64;  // NOLINT(build/namespaces)
 
@@ -62,14 +61,14 @@
 }  // namespace
 
 MacroAssembler* IntrinsicCodeGeneratorARM64::GetVIXLAssembler() {
-  return codegen_->GetAssembler()->vixl_masm_;
+  return codegen_->GetVIXLAssembler();
 }
 
 ArenaAllocator* IntrinsicCodeGeneratorARM64::GetAllocator() {
   return codegen_->GetGraph()->GetArena();
 }
 
-#define __ codegen->GetAssembler()->vixl_masm_->
+#define __ codegen->GetVIXLAssembler()->
 
 static void MoveFromReturnRegister(Location trg,
                                    Primitive::Type type,
@@ -782,7 +781,7 @@
   DCHECK((type == Primitive::kPrimInt) ||
          (type == Primitive::kPrimLong) ||
          (type == Primitive::kPrimNot));
-  MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
   Location base_loc = locations->InAt(1);
   Register base = WRegisterFrom(base_loc);      // Object pointer.
   Location offset_loc = locations->InAt(2);
@@ -916,7 +915,7 @@
                          bool is_volatile,
                          bool is_ordered,
                          CodeGeneratorARM64* codegen) {
-  MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
 
   Register base = WRegisterFrom(locations->InAt(1));    // Object pointer.
   Register offset = XRegisterFrom(locations->InAt(2));  // Long offset.
@@ -1035,7 +1034,7 @@
 }
 
 static void GenCas(LocationSummary* locations, Primitive::Type type, CodeGeneratorARM64* codegen) {
-  MacroAssembler* masm = codegen->GetAssembler()->vixl_masm_;
+  MacroAssembler* masm = codegen->GetVIXLAssembler();
 
   Register out = WRegisterFrom(locations->Out());                  // Boolean result.
 
diff --git a/compiler/utils/arm64/assembler_arm64.cc b/compiler/utils/arm64/assembler_arm64.cc
index d82caf5..dc1f24a 100644
--- a/compiler/utils/arm64/assembler_arm64.cc
+++ b/compiler/utils/arm64/assembler_arm64.cc
@@ -28,7 +28,7 @@
 #ifdef ___
 #error "ARM64 Assembler macro already defined."
 #else
-#define ___   vixl_masm_->
+#define ___   vixl_masm_.
 #endif
 
 void Arm64Assembler::FinalizeCode() {
@@ -39,16 +39,16 @@
 }
 
 size_t Arm64Assembler::CodeSize() const {
-  return vixl_masm_->GetBufferCapacity() - vixl_masm_->GetRemainingBufferSpace();
+  return vixl_masm_.GetBufferCapacity() - vixl_masm_.GetRemainingBufferSpace();
 }
 
 const uint8_t* Arm64Assembler::CodeBufferBaseAddress() const {
-  return vixl_masm_->GetStartAddress<uint8_t*>();
+  return vixl_masm_.GetStartAddress<uint8_t*>();
 }
 
 void Arm64Assembler::FinalizeInstructions(const MemoryRegion& region) {
   // Copy the instructions from the buffer.
-  MemoryRegion from(vixl_masm_->GetStartAddress<void*>(), CodeSize());
+  MemoryRegion from(vixl_masm_.GetStartAddress<void*>(), CodeSize());
   region.CopyFrom(0, from);
 }
 
@@ -86,7 +86,7 @@
   } else {
     // temp = rd + value
     // rd = cond ? temp : rn
-    UseScratchRegisterScope temps(vixl_masm_);
+    UseScratchRegisterScope temps(&vixl_masm_);
     temps.Exclude(reg_x(rd), reg_x(rn));
     Register temp = temps.AcquireX();
     ___ Add(temp, reg_x(rn), value);
@@ -183,7 +183,7 @@
 }
 
 void Arm64Assembler::StoreStackPointerToThread64(ThreadOffset64 tr_offs) {
-  UseScratchRegisterScope temps(vixl_masm_);
+  UseScratchRegisterScope temps(&vixl_masm_);
   Register temp = temps.AcquireX();
   ___ Mov(temp, reg_x(SP));
   ___ Str(temp, MEM_OP(reg_x(TR), tr_offs.Int32Value()));
@@ -207,7 +207,7 @@
     // temp = value
     // rd = cond ? temp : rd
     if (value != 0) {
-      UseScratchRegisterScope temps(vixl_masm_);
+      UseScratchRegisterScope temps(&vixl_masm_);
       temps.Exclude(reg_x(dest));
       Register temp = temps.AcquireX();
       ___ Mov(temp, value);
@@ -314,7 +314,7 @@
   Arm64ManagedRegister base = m_base.AsArm64();
   CHECK(dst.IsXRegister() && base.IsXRegister());
   // Remove dst and base form the temp list - higher level API uses IP1, IP0.
-  UseScratchRegisterScope temps(vixl_masm_);
+  UseScratchRegisterScope temps(&vixl_masm_);
   temps.Exclude(reg_x(dst.AsXRegister()), reg_x(base.AsXRegister()));
   ___ Ldr(reg_x(dst.AsXRegister()), MEM_OP(reg_x(base.AsXRegister()), offs.Int32Value()));
 }
@@ -528,7 +528,7 @@
   CHECK(base.IsXRegister()) << base;
   CHECK(scratch.IsXRegister()) << scratch;
   // Remove base and scratch form the temp list - higher level API uses IP1, IP0.
-  UseScratchRegisterScope temps(vixl_masm_);
+  UseScratchRegisterScope temps(&vixl_masm_);
   temps.Exclude(reg_x(base.AsXRegister()), reg_x(scratch.AsXRegister()));
   ___ Ldr(reg_x(scratch.AsXRegister()), MEM_OP(reg_x(base.AsXRegister()), offs.Int32Value()));
   ___ Br(reg_x(scratch.AsXRegister()));
@@ -621,7 +621,7 @@
 }
 
 void Arm64Assembler::EmitExceptionPoll(Arm64Exception *exception) {
-  UseScratchRegisterScope temps(vixl_masm_);
+  UseScratchRegisterScope temps(&vixl_masm_);
   temps.Exclude(reg_x(exception->scratch_.AsXRegister()));
   Register temp = temps.AcquireX();
 
@@ -653,7 +653,7 @@
 
 void Arm64Assembler::SpillRegisters(CPURegList registers, int offset) {
   int size = registers.GetRegisterSizeInBytes();
-  const Register sp = vixl_masm_->StackPointer();
+  const Register sp = vixl_masm_.StackPointer();
   // Since we are operating on register pairs, we would like to align on
   // double the standard size; on the other hand, we don't want to insert
   // an extra store, which will happen if the number of registers is even.
@@ -681,7 +681,7 @@
 
 void Arm64Assembler::UnspillRegisters(CPURegList registers, int offset) {
   int size = registers.GetRegisterSizeInBytes();
-  const Register sp = vixl_masm_->StackPointer();
+  const Register sp = vixl_masm_.StackPointer();
   // Be consistent with the logic for spilling registers.
   if (!IsAlignedParam(offset, 2 * size) && registers.GetCount() % 2 != 0) {
     const CPURegister& dst0 = registers.PopLowestIndex();
diff --git a/compiler/utils/arm64/assembler_arm64.h b/compiler/utils/arm64/assembler_arm64.h
index 24b7982..b8434b9 100644
--- a/compiler/utils/arm64/assembler_arm64.h
+++ b/compiler/utils/arm64/assembler_arm64.h
@@ -23,7 +23,6 @@
 
 #include "base/arena_containers.h"
 #include "base/logging.h"
-#include "constants_arm64.h"
 #include "utils/arm64/managed_register_arm64.h"
 #include "utils/assembler.h"
 #include "offsets.h"
@@ -84,16 +83,13 @@
 
 class Arm64Assembler FINAL : public Assembler {
  public:
-  // We indicate the size of the initial code generation buffer to the VIXL
-  // assembler. From there we it will automatically manage the buffer.
   explicit Arm64Assembler(ArenaAllocator* arena)
       : Assembler(arena),
-        exception_blocks_(arena->Adapter(kArenaAllocAssembler)),
-        vixl_masm_(new vixl::aarch64::MacroAssembler(kArm64BaseBufferSize)) {}
+        exception_blocks_(arena->Adapter(kArenaAllocAssembler)) {}
 
-  virtual ~Arm64Assembler() {
-    delete vixl_masm_;
-  }
+  virtual ~Arm64Assembler() {}
+
+  vixl::aarch64::MacroAssembler* GetVIXLAssembler() { return &vixl_masm_; }
 
   // Finalize the code.
   void FinalizeCode() OVERRIDE;
@@ -287,9 +283,8 @@
   // List of exception blocks to generate at the end of the code cache.
   ArenaVector<std::unique_ptr<Arm64Exception>> exception_blocks_;
 
- public:
-  // Vixl assembler.
-  vixl::aarch64::MacroAssembler* const vixl_masm_;
+  // VIXL assembler.
+  vixl::aarch64::MacroAssembler vixl_masm_;
 
   // Used for testing.
   friend class Arm64ManagedRegister_VixlRegisters_Test;
diff --git a/compiler/utils/arm64/constants_arm64.h b/compiler/utils/arm64/constants_arm64.h
deleted file mode 100644
index 01e8be9..0000000
--- a/compiler/utils/arm64/constants_arm64.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (C) 2014 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_UTILS_ARM64_CONSTANTS_ARM64_H_
-#define ART_COMPILER_UTILS_ARM64_CONSTANTS_ARM64_H_
-
-#include <stdint.h>
-#include <iosfwd>
-#include "arch/arm64/registers_arm64.h"
-#include "base/casts.h"
-#include "base/logging.h"
-#include "globals.h"
-
-// TODO: Extend this file by adding missing functionality.
-
-namespace art {
-namespace arm64 {
-
-constexpr size_t kArm64BaseBufferSize = 4096;
-
-}  // namespace arm64
-}  // namespace art
-
-#endif  // ART_COMPILER_UTILS_ARM64_CONSTANTS_ARM64_H_
diff --git a/compiler/utils/arm64/managed_register_arm64.h b/compiler/utils/arm64/managed_register_arm64.h
index f7d74d2..7378a0a 100644
--- a/compiler/utils/arm64/managed_register_arm64.h
+++ b/compiler/utils/arm64/managed_register_arm64.h
@@ -17,8 +17,8 @@
 #ifndef ART_COMPILER_UTILS_ARM64_MANAGED_REGISTER_ARM64_H_
 #define ART_COMPILER_UTILS_ARM64_MANAGED_REGISTER_ARM64_H_
 
+#include "arch/arm64/registers_arm64.h"
 #include "base/logging.h"
-#include "constants_arm64.h"
 #include "debug/dwarf/register.h"
 #include "utils/managed_register.h"
 
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 34d3158..8828dce 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -520,7 +520,7 @@
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
     mov    r3, r1
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     cbnz   r3, .Lnot_unlocked         @ already thin locked
     @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
     orr    r2, r1, r2                 @ r2 holds thread id with count of 0 with preserved read barrier bits
@@ -536,9 +536,9 @@
     cbnz   r2, .Lslow_lock            @ lock word and self thread id's match -> recursive lock
                                       @ else contention, go to slow path
     mov    r3, r1                     @ copy the lock word to check count overflow.
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits.
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits.
     add    r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count in lock word placing in r2 to check overflow
-    lsr    r3, r2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  @ if either of the upper two bits (28-29) are set, we overflowed.
+    lsr    r3, r2, #LOCK_WORD_GC_STATE_SHIFT    @ if the first gc state bit is set, we overflowed.
     cbnz   r3, .Lslow_lock            @ if we overflow the count go slow path
     add    r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count for real
     strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
@@ -581,17 +581,17 @@
     cbnz   r2, .Lslow_unlock          @ if either of the top two bits are set, go slow path
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     mov    r3, r1                     @ copy lock word to check thread id equality
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     eor    r3, r3, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r3, r3                     @ zero top 16 bits
     cbnz   r3, .Lslow_unlock          @ do lock word and self thread id's match?
     mov    r3, r1                     @ copy lock word to detect transition to unlocked
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     cmp    r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     @ transition to unlocked
     mov    r3, r1
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK  @ r3: zero except for the preserved read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  @ r3: zero except for the preserved gc bits
     dmb    ish                        @ full (LoadStore|StoreStore) memory barrier
 #ifndef USE_READ_BARRIER
     str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
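The renamed constants reflect the new lock word split: the old two read-barrier bits become a two-bit "GC state" (read-barrier state plus the new mark bit), which the thin-lock fast paths must mask out before count arithmetic and preserve on the store. A minimal sketch of that masking pattern; the bit positions (GC state assumed at bits 28-29, a 16-bit owner, the count above it) are stated assumptions rather than values taken from lock_word.h.

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kGCStateShift = 28;                           // assumed
    constexpr uint32_t kGCStateMaskShifted = 3u << kGCStateShift;
    constexpr uint32_t kGCStateMaskShiftedToggled = ~kGCStateMaskShifted;
    constexpr uint32_t kThinLockCountOne = 1u << 16;                 // assumed count position

    int main() {
      uint32_t lock_word = (1u << kGCStateShift) | 0x1234u;          // gc bit set, owner 0x1234, count 0
      uint32_t gc_bits_cleared = lock_word & kGCStateMaskShiftedToggled;   // "zero the gc bits"
      uint32_t bumped = gc_bits_cleared + kThinLockCountOne;
      // Overflow check from the fast path: a carry into the gc state bits means the count overflowed.
      assert((bumped >> kGCStateShift) == 0u);
      // The store that actually happens keeps the original gc bits by adding to the untouched word.
      uint32_t new_word = lock_word + kThinLockCountOne;
      assert((new_word & kGCStateMaskShifted) == (lock_word & kGCStateMaskShifted));
    }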
@@ -1772,6 +1772,20 @@
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
+    // Null check so that we can load the lock word.
+    cmp \reg, #0
+    beq .Lret_rb_\name
+    // Check the lock word for the mark bit; if marked, return.
+    push {r0}
+    ldr r0, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    and r0, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    cbz r0, .Lslow_rb_\name
+    // Restore r0 and return.
+    pop   {r0}
+    bx    lr
+
+.Lslow_rb_\name:
+    pop   {r0}
     push  {r0-r4, r9, r12, lr}          @ save return address and core caller-save registers
     .cfi_adjust_cfa_offset 32
     .cfi_rel_offset r0, 0
@@ -1831,6 +1845,8 @@
       .endif
     .endif
     pop   {r0-r4, r9, r12, pc}          @ restore caller-save registers and return
+.Lret_rb_\name:
+    bx lr
 END \name
 .endm
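All four architectures gain the same fast path in their read-barrier mark entrypoints: return immediately for null references and for objects whose mark bit is already set, and only spill registers for the genuine slow path. A C++-level sketch of that control flow; the bit position and the SlowPathMark stand-in are assumptions for illustration.

    #include <cstdint>

    struct Object { uint32_t lock_word; };

    constexpr uint32_t kMarkBitShift = 29;      // assumed, for illustration only

    // Stand-in for the register-saving slow path that calls the runtime's mark entrypoint.
    Object* SlowPathMark(Object* ref) {
      ref->lock_word |= 1u << kMarkBitShift;
      return ref;
    }

    Object* ReadBarrierMarkReg(Object* ref) {
      if (ref == nullptr) {
        return ref;                             // null: nothing to mark, return right away
      }
      if (((ref->lock_word >> kMarkBitShift) & 1u) != 0) {
        return ref;                             // mark bit already set: fast return, no spills
      }
      return SlowPathMark(ref);                 // otherwise take the full slow path
    }

    int main() {
      Object o{0};
      ReadBarrierMarkReg(&o);        // slow path, sets the bit
      ReadBarrierMarkReg(&o);        // returns through the new fast path
      ReadBarrierMarkReg(nullptr);   // null fast path
    }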
 
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index a5be52d..51094d5 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1090,7 +1090,7 @@
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
     ldxr   w1, [x4]
     mov    x3, x1
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cbnz   w3, .Lnot_unlocked         // already thin locked
     // unlocked case - x1: original lock word that's zero except for the read barrier bits.
     orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
@@ -1106,9 +1106,9 @@
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
     mov    x3, x1                     // copy the lock word to check count overflow.
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits.
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
     add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
-    lsr    w3, w2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  // if either of the upper two bits (28-29) are set, we overflowed.
+    lsr    w3, w2, #LOCK_WORD_GC_STATE_SHIFT     // if the first gc state bit is set, we overflowed.
     cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
     add    w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count for real
     stxr   w3, w2, [x4]
@@ -1152,17 +1152,17 @@
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
     mov    x3, x1                     // copy lock word to check thread id equality
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
     mov    x3, x1                     // copy lock word to detect transition to unlocked
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     // transition to unlocked
     mov    x3, x1
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK  // w3: zero except for the preserved read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved gc bits
     dmb    ish                        // full (LoadStore|StoreStore) memory barrier
 #ifndef USE_READ_BARRIER
     str    w3, [x4]
@@ -1791,12 +1791,20 @@
     ldr    x2, [x1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_64]    // Load dex cache resolved types array
                                                               // Load the class (x2)
     ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
-                                                              // Read barrier for class load.
+
+    // Most common case: GC is not marking.
     ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
-    cbnz   x3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+    cbnz   x3, .Lart_quick_alloc_object_region_tlab_marking
+.Lart_quick_alloc_object_region_tlab_do_allocation:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+.Lart_quick_alloc_object_region_tlab_marking:
+    // GC is marking; check the lock word of the class for the mark bit.
+    // If the class is null, go to the slow path. The check is required to read the lock word.
+    cbz    w2, .Lart_quick_alloc_object_region_tlab_slow_path
+    // Class is not null; check the mark bit in the lock word.
+    ldr    w3, [x2, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    // If the bit is not zero, do the allocation.
+    tbnz   w3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_alloc_object_region_tlab_do_allocation
                                                               // The read barrier slow path. Mark
                                                               // the class.
     stp    x0, x1, [sp, #-32]!                                // Save registers (x0, x1, lr).
@@ -1807,7 +1815,7 @@
     ldp    x0, x1, [sp, #0]                                   // Restore registers.
     ldr    xLR, [sp, #16]
     add    sp, sp, #32
-    b      .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    b      .Lart_quick_alloc_object_region_tlab_do_allocation
 .Lart_quick_alloc_object_region_tlab_slow_path:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          // Save callee saves in case of GC.
     mov    x2, xSELF                           // Pass Thread::Current.
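The allocation path above only pays for the class read barrier while the GC is actively marking, and even then skips it when the class's mark bit is already set. A C++-level sketch of the same flow; the names, the mark-bit position and the stand-in functions are assumptions for illustration.

    #include <cstdint>

    struct Class { uint32_t lock_word; };
    constexpr uint32_t kMarkBitShift = 29;                       // assumed

    bool gc_is_marking = true;                                   // models THREAD_IS_GC_MARKING

    Class* ReadBarrierMarkClass(Class* klass) { return klass; }  // stand-in for the RB slow path
    void* AllocSlowPath(Class* /*klass*/) { return nullptr; }    // stand-in for the runtime call
    void* AllocFromTlab(Class* /*klass*/) { return nullptr; }    // stand-in for the TLAB bump

    void* AllocObjectRegionTlab(Class* klass) {
      if (gc_is_marking) {
        if (klass == nullptr) {
          return AllocSlowPath(klass);                           // can't even read the lock word
        }
        if (((klass->lock_word >> kMarkBitShift) & 1u) == 0) {
          klass = ReadBarrierMarkClass(klass);                   // mark the class, then allocate
        }
      }
      return AllocFromTlab(klass);                               // fast-path allocation
    }

    int main() {
      Class c{1u << kMarkBitShift};
      AllocObjectRegionTlab(&c);
    }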
@@ -2265,6 +2273,8 @@
      */
 .macro READ_BARRIER_MARK_REG name, wreg, xreg
 ENTRY \name
+    // Reference is null; no work to do at all.
+    cbz \wreg, .Lret_rb_\name
     /*
      * Allocate 46 stack slots * 8 = 368 bytes:
      * - 20 slots for core registers X0-X19
@@ -2272,6 +2282,11 @@
      * -  1 slot for return address register XLR
      * -  1 padding slot for 16-byte stack alignment
      */
+    // Use wIP0 as temp and check the mark bit of the reference. wIP0 is not used by the compiler.
+    ldr   wIP0, [\xreg, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_path_rb_\name
+    ret
+.Lslow_path_rb_\name:
     // Save all potentially live caller-save core registers.
     stp   x0, x1,   [sp, #-368]!
     .cfi_adjust_cfa_offset 368
@@ -2360,6 +2375,7 @@
     .cfi_restore x30
     add sp, sp, #368
     .cfi_adjust_cfa_offset -368
+.Lret_rb_\name:
     ret
 END \name
 .endm
diff --git a/runtime/arch/mips/thread_mips.cc b/runtime/arch/mips/thread_mips.cc
index 06d6211..0a9ab7a 100644
--- a/runtime/arch/mips/thread_mips.cc
+++ b/runtime/arch/mips/thread_mips.cc
@@ -25,7 +25,7 @@
 void Thread::InitCpu() {
   CHECK_EQ(THREAD_FLAGS_OFFSET, ThreadFlagsOffset<PointerSize::k32>().Int32Value());
   CHECK_EQ(THREAD_CARD_TABLE_OFFSET, CardTableOffset<PointerSize::k32>().Int32Value());
-  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<PointerSize::k64>().Int32Value());
+  CHECK_EQ(THREAD_EXCEPTION_OFFSET, ExceptionOffset<PointerSize::k32>().Int32Value());
 }
 
 void Thread::CleanupCpu() {
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 77e04e7..6c9239f 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -1028,7 +1028,13 @@
     movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
                                                                // Read barrier for class load.
     cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Null check so that we can load the lock word.
+    testl %edx, %edx
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Check the mark bit; if it is 1, skip the read barrier slow path.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1065,7 +1071,7 @@
     test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // test the 2 high bits.
     jne  .Lslow_lock                      // slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // lock word contains a thin lock
     // unlocked case - edx: original lock word, eax: obj.
@@ -1081,9 +1087,9 @@
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
     movl %edx, %ecx                       // copy the lock word to check count overflow.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count for overflow check.
-    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set.
+    test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // overflowed if the first gc state bit is set.
     jne  .Lslow_lock                      // count overflowed so go slow
     movl %eax, %ecx                       // save obj to use eax for cmpxchg.
     movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
@@ -1137,13 +1143,13 @@
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     movl %ecx, %edx                       // copy the lock word to detect new count of 0.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
     cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
     // update lockword, cmpxchg necessary for read barrier bits.
     movl %eax, %edx                       // edx: obj
     movl %ecx, %eax                       // eax: old lock word.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original gc bits.
 #ifndef USE_READ_BARRIER
     movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
 #else
@@ -1923,6 +1929,14 @@
 //   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Null check so that we can load the lock word.
+    test REG_VAR(reg), REG_VAR(reg)
+    jz .Lret_rb_\name
+    // Check the mark bit; if it is 1, return.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+    jz .Lslow_rb_\name
+    ret
+.Lslow_rb_\name:
     // Save all potentially live caller-save core registers.
     PUSH eax
     PUSH ecx
@@ -1970,6 +1984,7 @@
     POP_REG_NE edx, RAW_VAR(reg)
     POP_REG_NE ecx, RAW_VAR(reg)
     POP_REG_NE eax, RAW_VAR(reg)
+.Lret_rb_\name:
     ret
     END_FUNCTION VAR(name)
 END_MACRO
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 784ec39..127d7db 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -989,7 +989,13 @@
                                                                // Load the class
     movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
     cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Null check so that we can load the lock word.
+    testl %edx, %edx
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Check the mark bit; if it is 1, skip the read barrier slow path.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1022,7 +1028,7 @@
     test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // Test the 2 high bits.
     jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // Lock word contains a thin lock.
     // unlocked case - edx: original lock word, edi: obj.
@@ -1037,9 +1043,9 @@
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
     movl %edx, %ecx                       // copy the lock word to check count overflow.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count
-    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if the upper bit (28) is set
     jne  .Lslow_lock                      // count overflowed so go slow
     movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx   // increment recursion count again for real.
@@ -1074,12 +1080,12 @@
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     movl %ecx, %edx                       // copy the lock word to detect new count of 0.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
     cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
     // update lockword, cmpxchg necessary for read barrier bits.
     movl %ecx, %eax                       // eax: old lock word.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original gc bits.
 #ifndef USE_READ_BARRIER
     movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
 #else
@@ -1833,6 +1839,14 @@
 //   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Null check so that we can load the lock word.
+    testq REG_VAR(reg), REG_VAR(reg)
+    jz .Lret_rb_\name
+    // Check the mark bit; if it is 1, return.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+    jz .Lslow_rb_\name
+    ret
+.Lslow_rb_\name:
     // Save all potentially live caller-save core registers.
     PUSH rax
     PUSH rcx
@@ -1897,6 +1911,7 @@
     POP_REG_NE rdx, RAW_VAR(reg)
     POP_REG_NE rcx, RAW_VAR(reg)
     POP_REG_NE rax, RAW_VAR(reg)
+.Lret_rb_\name:
     ret
     END_FUNCTION VAR(name)
 END_MACRO
diff --git a/runtime/dex_file.cc b/runtime/dex_file.cc
index 061babd..a6eb5f6 100644
--- a/runtime/dex_file.cc
+++ b/runtime/dex_file.cc
@@ -338,6 +338,11 @@
     *error_code = ZipOpenErrorCode::kEntryNotFound;
     return nullptr;
   }
+  if (zip_entry->GetUncompressedLength() == 0) {
+    *error_msg = StringPrintf("Dex file '%s' has zero length", location.c_str());
+    *error_code = ZipOpenErrorCode::kDexFileError;
+    return nullptr;
+  }
   std::unique_ptr<MemMap> map(zip_entry->ExtractToMemMap(location.c_str(), entry_name, error_msg));
   if (map.get() == nullptr) {
     *error_msg = StringPrintf("Failed to extract '%s' from '%s': %s", entry_name, location.c_str(),
@@ -435,6 +440,8 @@
                                                    MemMap* mem_map,
                                                    const OatDexFile* oat_dex_file,
                                                    std::string* error_msg) {
+  DCHECK(base != nullptr);
+  DCHECK_NE(size, 0U);
   CHECK_ALIGNED(base, 4);  // various dex file structures must be word aligned
   std::unique_ptr<DexFile> dex_file(
       new DexFile(base, size, location, location_checksum, mem_map, oat_dex_file));
diff --git a/runtime/dex_file_test.cc b/runtime/dex_file_test.cc
index 616c2a0..2704d8a 100644
--- a/runtime/dex_file_test.cc
+++ b/runtime/dex_file_test.cc
@@ -166,6 +166,12 @@
   "uAAAAAYAAAABAAAA0AAAAAEgAAACAAAA8AAAAAEQAAABAAAAHAEAAAIgAAAIAAAAIgEAAAMgAAAC"
   "AAAAcwEAAAAgAAABAAAAfgEAAAAQAAABAAAAjAEAAA==";
 
+static const char kRawDexZeroLength[] =
+  "UEsDBAoAAAAAAOhxAkkAAAAAAAAAAAAAAAALABwAY2xhc3Nlcy5kZXhVVAkAA2QNoVdnDaFXdXgL"
+  "AAEE5AMBAASIEwAAUEsBAh4DCgAAAAAA6HECSQAAAAAAAAAAAAAAAAsAGAAAAAAAAAAAAKCBAAAA"
+  "AGNsYXNzZXMuZGV4VVQFAANkDaFXdXgLAAEE5AMBAASIEwAAUEsFBgAAAAABAAEAUQAAAEUAAAAA"
+  "AA==";
+
 static void DecodeAndWriteDexFile(const char* base64, const char* location) {
   // decode base64
   CHECK(base64 != nullptr);
@@ -254,6 +260,18 @@
   ASSERT_FALSE(DexFile::Open(location, location, kVerifyChecksum, &error_msg, &dex_files));
 }
 
+TEST_F(DexFileTest, ZeroLengthDexRejected) {
+  ScratchFile tmp;
+  const char* location = tmp.GetFilename().c_str();
+  DecodeAndWriteDexFile(kRawDexZeroLength, location);
+
+  ScopedObjectAccess soa(Thread::Current());
+  static constexpr bool kVerifyChecksum = true;
+  std::string error_msg;
+  std::vector<std::unique_ptr<const DexFile>> dex_files;
+  ASSERT_FALSE(DexFile::Open(location, location, kVerifyChecksum, &error_msg, &dex_files));
+}
+
 TEST_F(DexFileTest, GetLocationChecksum) {
   ScopedObjectAccess soa(Thread::Current());
   std::unique_ptr<const DexFile> raw(OpenTestDexFile("Main"));
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 4019a5b..fb774a4 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -154,11 +154,30 @@
 }
 
 inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
+  mirror::Object* ret;
+  // TODO: Delete the GetMarkBit check when all of the callers properly check the bit. The
+  // remaining caller is array allocation.
+  if (from_ref == nullptr || from_ref->GetMarkBit()) {
+    return from_ref;
+  }
   // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
   if (UNLIKELY(mark_from_read_barrier_measurements_)) {
-    return MarkFromReadBarrierWithMeasurements(from_ref);
+    ret = MarkFromReadBarrierWithMeasurements(from_ref);
+  } else {
+    ret = Mark(from_ref);
   }
-  return Mark(from_ref);
+  // Only set the mark bit for the Baker read barrier.
+  if (kUseBakerReadBarrier && LIKELY(!rb_mark_bit_stack_full_ && ret->AtomicSetMarkBit(0, 1))) {
+    // If the mark stack is full, the object may temporarily go from marked back to unmarked.
+    // Seeing either value is OK since the only race is doing an unnecessary Mark.
+    if (!rb_mark_bit_stack_->AtomicPushBack(ret)) {
+      // Mark stack is full, set the bit back to zero.
+      CHECK(ret->AtomicSetMarkBit(1, 0));
+      // Set rb_mark_bit_stack_full_; this is racy but OK since AtomicPushBack is thread safe.
+      rb_mark_bit_stack_full_ = true;
+    }
+  }
+  return ret;
 }
 
 inline mirror::Object* ConcurrentCopying::GetFwdPtr(mirror::Object* from_ref) {
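
The new MarkFromReadBarrier fast path above skips objects whose mark bit is already set, records newly marked objects on rb_mark_bit_stack_ so their bits can be cleared at the end of the collection, and reverts the bit once the stack fills up. A minimal standalone sketch of that pattern, assuming a toy Obj with a plain atomic flag and a bounded vector in place of mirror::Object and accounting::ObjectStack (illustrative names, not ART APIs):

#include <atomic>
#include <cstddef>
#include <cstdio>
#include <vector>

// Toy object with just a mark flag, standing in for the mark bit in mirror::Object's lock word.
struct Obj {
  std::atomic<bool> marked{false};
};

// Bounded stack standing in for rb_mark_bit_stack_: records objects whose flag was set so the
// flag can be cleared again at the end of the collection.
struct MarkBitStack {
  std::vector<Obj*> slots;
  std::size_t capacity;
  explicit MarkBitStack(std::size_t cap) : capacity(cap) { slots.reserve(cap); }
  bool PushBack(Obj* obj) {
    if (slots.size() == capacity) {
      return false;  // Full; the caller reverts the flag and stops recording.
    }
    slots.push_back(obj);
    return true;
  }
};

// Fast path sketch: skip already-marked objects, otherwise mark, record, and fall back
// gracefully when the overflow stack fills up.
Obj* MarkFromReadBarrier(Obj* ref, MarkBitStack* stack, bool* stack_full) {
  if (ref == nullptr || ref->marked.load(std::memory_order_relaxed)) {
    return ref;  // Fast path: nothing to do.
  }
  // ... the real slow-path Mark(from_ref) would run here ...
  if (!*stack_full && !ref->marked.exchange(true, std::memory_order_relaxed)) {
    if (!stack->PushBack(ref)) {
      ref->marked.store(false, std::memory_order_relaxed);  // Stack full: revert the flag.
      *stack_full = true;
    }
  }
  return ref;
}

int main() {
  MarkBitStack stack(2);
  bool full = false;
  Obj a, b, c;
  MarkFromReadBarrier(&a, &stack, &full);
  MarkFromReadBarrier(&b, &stack, &full);
  MarkFromReadBarrier(&c, &stack, &full);  // Stack already holds a and b, so c is reverted.
  std::printf("recorded=%zu full=%d c_marked=%d\n",
              stack.slots.size(), full ? 1 : 0, c.marked.load() ? 1 : 0);
  return 0;
}

The benign race noted in the original comment shows up here too: once the stack is full, an object briefly reads as marked and then unmarked again, which at worst costs one extra Mark call.
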
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index d7221e4..071537d 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -42,9 +42,6 @@
 namespace collector {
 
 static constexpr size_t kDefaultGcMarkStackSize = 2 * MB;
-// If kGrayDirtyImmuneObjects is true then we gray dirty objects in the GC pause to prevent dirty
-// pages.
-static constexpr bool kGrayDirtyImmuneObjects = true;
 // If kFilterModUnionCards then we attempt to filter cards that don't need to be dirty in the mod
 // union table. Disabled since it does not seem to help the pause much.
 static constexpr bool kFilterModUnionCards = kIsDebugBuild;
@@ -52,6 +49,9 @@
 // ConcurrentCopying::Scan. May be used to diagnose possibly unnecessary read barriers.
 // Only enabled for kIsDebugBuild to avoid performance hit.
 static constexpr bool kDisallowReadBarrierDuringScan = kIsDebugBuild;
+// Slow path mark stack size; increase this if the stack is getting full and it is causing
+// performance problems.
+static constexpr size_t kReadBarrierMarkStackSize = 512 * KB;
 
 ConcurrentCopying::ConcurrentCopying(Heap* heap,
                                      const std::string& name_prefix,
@@ -63,6 +63,10 @@
       gc_mark_stack_(accounting::ObjectStack::Create("concurrent copying gc mark stack",
                                                      kDefaultGcMarkStackSize,
                                                      kDefaultGcMarkStackSize)),
+      rb_mark_bit_stack_(accounting::ObjectStack::Create("rb copying gc mark stack",
+                                                         kReadBarrierMarkStackSize,
+                                                         kReadBarrierMarkStackSize)),
+      rb_mark_bit_stack_full_(false),
       mark_stack_lock_("concurrent copying mark stack lock", kMarkSweepMarkStackLock),
       thread_running_gc_(nullptr),
       is_marking_(false), is_active_(false), is_asserting_to_space_invariant_(false),
@@ -187,6 +191,7 @@
     CHECK(false_gray_stack_.empty());
   }
 
+  rb_mark_bit_stack_full_ = false;
   mark_from_read_barrier_measurements_ = measure_read_barrier_slow_path_;
   if (measure_read_barrier_slow_path_) {
     rb_slow_path_ns_.StoreRelaxed(0);
@@ -914,9 +919,9 @@
     }
     collector_->AssertToSpaceInvariant(nullptr, MemberOffset(0), ref);
     if (kUseBakerReadBarrier) {
-      CHECK(ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr())
+      CHECK_EQ(ref->GetReadBarrierPointer(), ReadBarrier::WhitePtr())
           << "Ref " << ref << " " << PrettyTypeOf(ref)
-          << " has non-white rb_ptr " << ref->GetReadBarrierPointer();
+          << " has non-white rb_ptr ";
     }
   }
 
@@ -982,7 +987,7 @@
     VerifyNoFromSpaceRefsFieldVisitor visitor(collector);
     obj->VisitReferences(visitor, visitor);
     if (kUseBakerReadBarrier) {
-      CHECK(obj->GetReadBarrierPointer() == ReadBarrier::WhitePtr())
+      CHECK_EQ(obj->GetReadBarrierPointer(), ReadBarrier::WhitePtr())
           << "obj=" << obj << " non-white rb_ptr " << obj->GetReadBarrierPointer();
     }
   }
@@ -2243,6 +2248,15 @@
         }
       }
     }
+    if (kUseBakerReadBarrier) {
+      TimingLogger::ScopedTiming split("EmptyRBMarkBitStack", GetTimings());
+      DCHECK(rb_mark_bit_stack_.get() != nullptr);
+      const auto* limit = rb_mark_bit_stack_->End();
+      for (StackReference<mirror::Object>* it = rb_mark_bit_stack_->Begin(); it != limit; ++it) {
+        CHECK(it->AsMirrorPtr()->AtomicSetMarkBit(1, 0));
+      }
+      rb_mark_bit_stack_->Reset();
+    }
   }
   if (measure_read_barrier_slow_path_) {
     MutexLock mu(self, rb_slow_path_histogram_lock_);
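
The EmptyRBMarkBitStack step above drains rb_mark_bit_stack_ during the pause, clearing the mark bit of every recorded object before the stack is reset for the next cycle. A tiny standalone sketch of that drain, assuming a hypothetical 32-bit lock word with the mark bit at bit 29 (not ART code):

#include <atomic>
#include <cstdio>

int main() {
  // Hypothetical 32-bit lock words of objects recorded on the mark bit stack during the GC,
  // each with the mark bit (bit 29 in the new layout) set.
  constexpr unsigned kMarkBitMask = 1u << 29;
  std::atomic<unsigned> recorded[3] = {{kMarkBitMask}, {kMarkBitMask}, {kMarkBitMask}};

  // Pause-time drain: clear the mark bit on every recorded entry, then the stack would be reset.
  int drained = 0;
  for (std::atomic<unsigned>& lock_word : recorded) {
    unsigned old_word = lock_word.fetch_and(~kMarkBitMask, std::memory_order_relaxed);
    if ((old_word & kMarkBitMask) == 0) {
      std::printf("unexpected: mark bit was already clear\n");  // Analogue of the CHECK above.
    }
    ++drained;
  }
  std::printf("drained %d entries\n", drained);
  return 0;
}
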
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 72112fa..a862802 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -57,6 +57,9 @@
   static constexpr bool kEnableFromSpaceAccountingCheck = kIsDebugBuild;
   // Enable verbose mode.
   static constexpr bool kVerboseMode = false;
+  // If kGrayDirtyImmuneObjects is true then we gray dirty objects in the GC pause to prevent dirty
+  // pages.
+  static constexpr bool kGrayDirtyImmuneObjects = true;
 
   ConcurrentCopying(Heap* heap,
                     const std::string& name_prefix = "",
@@ -230,6 +233,8 @@
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
   std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+  std::unique_ptr<accounting::ObjectStack> rb_mark_bit_stack_;
+  bool rb_mark_bit_stack_full_;
   std::vector<mirror::Object*> false_gray_stack_ GUARDED_BY(mark_stack_lock_);
   Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   std::vector<accounting::ObjectStack*> revoked_mark_stacks_
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a92cb24..5485cd2 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -2538,6 +2538,17 @@
   AddSpace(zygote_space_);
   non_moving_space_->SetFootprintLimit(non_moving_space_->Capacity());
   AddSpace(non_moving_space_);
+  if (kUseBakerReadBarrier && gc::collector::ConcurrentCopying::kGrayDirtyImmuneObjects) {
+    // Treat all of the objects in the zygote as marked to avoid unnecessary dirty pages. This is
+    // safe since we mark all of the objects that may reference non-immune objects as gray.
+    zygote_space_->GetLiveBitmap()->VisitMarkedRange(
+        reinterpret_cast<uintptr_t>(zygote_space_->Begin()),
+        reinterpret_cast<uintptr_t>(zygote_space_->Limit()),
+        [](mirror::Object* obj) SHARED_REQUIRES(Locks::mutator_lock_) {
+      CHECK(obj->AtomicSetMarkBit(0, 1));
+    });
+  }
+
   // Create the zygote space mod union table.
   accounting::ModUnionTable* mod_union_table =
       new accounting::ModUnionTableCardCache("zygote space mod-union table", this,
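
The zygote pass above walks the live bitmap and sets the mark bit on every object so the whole space starts out marked and stays clean. The sketch below imitates that shape with a toy bitmap and a VisitMarkedRange-style visitor; the class, alignment, and addresses are assumptions for illustration, not ART's space bitmap API:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy live bitmap over an address range: one bit per 8-byte aligned "object" slot. This is a
// hypothetical stand-in for ART's space bitmap, shaped only to show the VisitMarkedRange pattern.
class ToyLiveBitmap {
 public:
  ToyLiveBitmap(std::uintptr_t begin, std::uintptr_t end)
      : begin_(begin), bits_((end - begin) / kAlignment, false) {}

  void Set(std::uintptr_t addr) { bits_[(addr - begin_) / kAlignment] = true; }

  // Visit every marked address in [visit_begin, visit_end), like the call above.
  template <typename Visitor>
  void VisitMarkedRange(std::uintptr_t visit_begin, std::uintptr_t visit_end,
                        Visitor&& visitor) const {
    for (std::uintptr_t addr = visit_begin; addr < visit_end; addr += kAlignment) {
      if (bits_[(addr - begin_) / kAlignment]) {
        visitor(addr);
      }
    }
  }

 private:
  static constexpr std::uintptr_t kAlignment = 8;  // Matches kObjectAlignment above.
  std::uintptr_t begin_;
  std::vector<bool> bits_;
};

int main() {
  ToyLiveBitmap bitmap(/*begin=*/0x1000, /*end=*/0x1040);
  bitmap.Set(0x1008);
  bitmap.Set(0x1030);
  std::size_t visited = 0;
  // Analogue of the zygote pass: treat every live object as marked.
  bitmap.VisitMarkedRange(0x1000, 0x1040, [&visited](std::uintptr_t addr) {
    std::printf("would set the mark bit on the object at 0x%zx\n", static_cast<std::size_t>(addr));
    ++visited;
  });
  std::printf("visited %zu live objects\n", visited);
  return 0;
}
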
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 5d62b59..f5d5181 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -74,12 +74,22 @@
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
 #define LOCK_WORD_READ_BARRIER_STATE_SHIFT 28
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_READ_BARRIER_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kReadBarrierStateShift)))
-#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x30000000
+#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x10000000
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShifted)))
-#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xcfffffff
+#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xefffffff
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
 #define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_GC_STATE_MASK_SHIFTED 0x30000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShifted)))
+#define LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED 0xcfffffff
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShiftedToggled)))
+#define LOCK_WORD_GC_STATE_SHIFT 28
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kGCStateShift)))
+#define LOCK_WORD_MARK_BIT_SHIFT 29
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_MARK_BIT_SHIFT), (static_cast<int32_t>(art::LockWord::kMarkBitStateShift)))
+#define LOCK_WORD_MARK_BIT_MASK_SHIFTED 0x20000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_MARK_BIT_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kMarkBitStateMaskShifted)))
 #define OBJECT_ALIGNMENT_MASK 0x7
 DEFINE_CHECK_EQ(static_cast<size_t>(OBJECT_ALIGNMENT_MASK), (static_cast<size_t>(art::kObjectAlignment - 1)))
 #define OBJECT_ALIGNMENT_MASK_TOGGLED 0xfffffff8
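
The regenerated constants above follow directly from the new bit assignments: a 1-bit read barrier state at bit 28 and a mark bit at bit 29, with the 2-bit GC state covering both. A small standalone sketch that re-derives the values and checks them at compile time (the real header is generated from art::LockWord; this is only an illustration):

#include <cstdint>

// Re-derives the regenerated constants from the new bit positions: a 1-bit read barrier state at
// bit 28 and a mark bit at bit 29, together forming the 2-bit GC state. Illustrative only.
namespace {
constexpr uint32_t kReadBarrierStateShift = 28;
constexpr uint32_t kReadBarrierStateSize = 1;
constexpr uint32_t kMarkBitShift = kReadBarrierStateShift + kReadBarrierStateSize;  // 29

constexpr uint32_t kReadBarrierStateMaskShifted =
    ((1u << kReadBarrierStateSize) - 1) << kReadBarrierStateShift;               // 0x10000000
constexpr uint32_t kMarkBitMaskShifted = 1u << kMarkBitShift;                    // 0x20000000
constexpr uint32_t kGCStateMaskShifted =
    kReadBarrierStateMaskShifted | kMarkBitMaskShifted;                          // 0x30000000

static_assert(kReadBarrierStateMaskShifted == 0x10000000u, "rb state mask");
static_assert(static_cast<uint32_t>(~kReadBarrierStateMaskShifted) == 0xefffffffu,
              "rb state mask toggled");
static_assert(kMarkBitMaskShifted == 0x20000000u, "mark bit mask");
static_assert(kGCStateMaskShifted == 0x30000000u, "gc state mask");
static_assert(static_cast<uint32_t>(~kGCStateMaskShifted) == 0xcfffffffu, "gc state mask toggled");
}  // namespace

int main() { return 0; }
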
diff --git a/runtime/globals.h b/runtime/globals.h
index 0b44c47..9045d40 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -47,7 +47,8 @@
 }
 
 // Required object alignment
-static constexpr size_t kObjectAlignment = 8;
+static constexpr size_t kObjectAlignmentShift = 3;
+static constexpr size_t kObjectAlignment = 1u << kObjectAlignmentShift;
 static constexpr size_t kLargeObjectAlignment = kPageSize;
 
 // Whether or not this is a debug build. Useful in conditionals where NDEBUG isn't.
diff --git a/runtime/lock_word-inl.h b/runtime/lock_word-inl.h
index 341501b..4a2a293 100644
--- a/runtime/lock_word-inl.h
+++ b/runtime/lock_word-inl.h
@@ -43,17 +43,15 @@
 
 inline size_t LockWord::ForwardingAddress() const {
   DCHECK_EQ(GetState(), kForwardingAddress);
-  return value_ << kStateSize;
+  return value_ << kForwardingAddressShift;
 }
 
 inline LockWord::LockWord() : value_(0) {
   DCHECK_EQ(GetState(), kUnlocked);
 }
 
-inline LockWord::LockWord(Monitor* mon, uint32_t rb_state)
-    : value_(mon->GetMonitorId() | (rb_state << kReadBarrierStateShift) |
-             (kStateFat << kStateShift)) {
-  DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
+inline LockWord::LockWord(Monitor* mon, uint32_t gc_state)
+    : value_(mon->GetMonitorId() | (gc_state << kGCStateShift) | (kStateFat << kStateShift)) {
 #ifndef __LP64__
   DCHECK_ALIGNED(mon, kMonitorIdAlignment);
 #endif
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 5d0d204..538b6eb 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -35,27 +35,27 @@
  * the state. The four possible states are fat locked, thin/unlocked, hash code, and forwarding
 * address. When the lock word is in the "thin" state, its bits are formatted as follows:
  *
- *  |33|22|222222221111|1111110000000000|
- *  |10|98|765432109876|5432109876543210|
- *  |00|rb| lock count |thread id owner |
+ *  |33|2|2|222222221111|1111110000000000|
+ *  |10|9|8|765432109876|5432109876543210|
+ *  |00|m|r| lock count |thread id owner |
  *
 * When the lock word is in the "fat" state, its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |01|rb| MonitorId                  |
+ *  |33|2|2|2222222211111111110000000000|
+ *  |10|9|8|7654321098765432109876543210|
+ *  |01|m|r| MonitorId                  |
  *
 * When the lock word is in the hash state, its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |10|rb| HashCode                   |
+ *  |33|2|2|2222222211111111110000000000|
+ *  |10|9|8|7654321098765432109876543210|
+ *  |10|m|r| HashCode                   |
  *
- * When the lock word is in fowarding address state and its bits are formatted as follows:
+ * When the lock word is in the forwarding address state, its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |11| ForwardingAddress             |
+ *  |33|2|22222222211111111110000000000|
+ *  |10|9|87654321098765432109876543210|
+ *  |11|0| ForwardingAddress           |
  *
 * The r bit stores the read barrier state and the m bit stores the mark bit.
  */
@@ -64,11 +64,13 @@
   enum SizeShiftsAndMasks {  // private marker to avoid generate-operator-out.py from processing.
     // Number of bits to encode the state, currently just fat or thin/unlocked or hash code.
     kStateSize = 2,
-    kReadBarrierStateSize = 2,
+    kReadBarrierStateSize = 1,
+    kMarkBitStateSize = 1,
     // Number of bits to encode the thin lock owner.
     kThinLockOwnerSize = 16,
     // Remaining bits are the recursive lock count.
-    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize,
+    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize -
+        kMarkBitStateSize,
     // Thin lock bits. Owner in lowest bits.
 
     kThinLockOwnerShift = 0,
@@ -81,25 +83,43 @@
     kThinLockCountOne = 1 << kThinLockCountShift,  // == 65536 (0x10000)
 
     // State in the highest bits.
-    kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift,
+    kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift +
+        kMarkBitStateSize,
     kStateMask = (1 << kStateSize) - 1,
     kStateMaskShifted = kStateMask << kStateShift,
     kStateThinOrUnlocked = 0,
     kStateFat = 1,
     kStateHash = 2,
     kStateForwardingAddress = 3,
+
+    // Read barrier bit.
     kReadBarrierStateShift = kThinLockCountSize + kThinLockCountShift,
     kReadBarrierStateMask = (1 << kReadBarrierStateSize) - 1,
     kReadBarrierStateMaskShifted = kReadBarrierStateMask << kReadBarrierStateShift,
     kReadBarrierStateMaskShiftedToggled = ~kReadBarrierStateMaskShifted,
 
+    // Mark bit.
+    kMarkBitStateShift = kReadBarrierStateSize + kReadBarrierStateShift,
+    kMarkBitStateMask = (1 << kMarkBitStateSize) - 1,
+    kMarkBitStateMaskShifted = kMarkBitStateMask << kMarkBitStateShift,
+    kMarkBitStateMaskShiftedToggled = ~kMarkBitStateMaskShifted,
+
+    // GC state is mark bit and read barrier state.
+    kGCStateSize = kReadBarrierStateSize + kMarkBitStateSize,
+    kGCStateShift = kReadBarrierStateShift,
+    kGCStateMaskShifted = kReadBarrierStateMaskShifted | kMarkBitStateMaskShifted,
+    kGCStateMaskShiftedToggled = ~kGCStateMaskShifted,
+
     // When the state is kHashCode, the non-state bits hold the hashcode.
     // Note Object.hashCode() has the hash code layout hardcoded.
     kHashShift = 0,
-    kHashSize = 32 - kStateSize - kReadBarrierStateSize,
+    kHashSize = 32 - kStateSize - kReadBarrierStateSize - kMarkBitStateSize,
     kHashMask = (1 << kHashSize) - 1,
     kMaxHash = kHashMask,
 
+    // Forwarding address shift.
+    kForwardingAddressShift = kObjectAlignmentShift,
+
     kMonitorIdShift = kHashShift,
     kMonitorIdSize = kHashSize,
     kMonitorIdMask = kHashMask,
@@ -108,31 +128,31 @@
     kMaxMonitorId = kMaxHash
   };
 
-  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count, uint32_t rb_state) {
+  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count, uint32_t gc_state) {
     CHECK_LE(thread_id, static_cast<uint32_t>(kThinLockMaxOwner));
     CHECK_LE(count, static_cast<uint32_t>(kThinLockMaxCount));
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
-    return LockWord((thread_id << kThinLockOwnerShift) | (count << kThinLockCountShift) |
-                    (rb_state << kReadBarrierStateShift) |
+    // DCHECK_EQ(gc_state & ~(kGCStateMaskShifted >> kGCStateShift), 0U);
+    return LockWord((thread_id << kThinLockOwnerShift) |
+                    (count << kThinLockCountShift) |
+                    (gc_state << kGCStateShift) |
                     (kStateThinOrUnlocked << kStateShift));
   }
 
   static LockWord FromForwardingAddress(size_t target) {
     DCHECK_ALIGNED(target, (1 << kStateSize));
-    return LockWord((target >> kStateSize) | (kStateForwardingAddress << kStateShift));
+    return LockWord((target >> kForwardingAddressShift) | (kStateForwardingAddress << kStateShift));
   }
 
-  static LockWord FromHashCode(uint32_t hash_code, uint32_t rb_state) {
+  static LockWord FromHashCode(uint32_t hash_code, uint32_t gc_state) {
     CHECK_LE(hash_code, static_cast<uint32_t>(kMaxHash));
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
+    // DCHECK_EQ(gc_state & ~(kGCStateMaskShifted >> kGCStateShift), 0U);
     return LockWord((hash_code << kHashShift) |
-                    (rb_state << kReadBarrierStateShift) |
+                    (gc_state << kGCStateShift) |
                     (kStateHash << kStateShift));
   }
 
-  static LockWord FromDefault(uint32_t rb_state) {
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
-    return LockWord(rb_state << kReadBarrierStateShift);
+  static LockWord FromDefault(uint32_t gc_state) {
+    return LockWord(gc_state << kGCStateShift);
   }
 
   static bool IsDefault(LockWord lw) {
@@ -154,7 +174,7 @@
   LockState GetState() const {
     CheckReadBarrierState();
     if ((!kUseReadBarrier && UNLIKELY(value_ == 0)) ||
-        (kUseReadBarrier && UNLIKELY((value_ & kReadBarrierStateMaskShiftedToggled) == 0))) {
+        (kUseReadBarrier && UNLIKELY((value_ & kGCStateMaskShiftedToggled) == 0))) {
       return kUnlocked;
     } else {
       uint32_t internal_state = (value_ >> kStateShift) & kStateMask;
@@ -176,6 +196,10 @@
     return (value_ >> kReadBarrierStateShift) & kReadBarrierStateMask;
   }
 
+  uint32_t GCState() const {
+    return (value_ & kGCStateMaskShifted) >> kGCStateShift;
+  }
+
   void SetReadBarrierState(uint32_t rb_state) {
     DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
     DCHECK_NE(static_cast<uint32_t>(GetState()), static_cast<uint32_t>(kForwardingAddress));
@@ -184,6 +208,19 @@
     value_ |= (rb_state & kReadBarrierStateMask) << kReadBarrierStateShift;
   }
 
+
+  uint32_t MarkBitState() const {
+    return (value_ >> kMarkBitStateShift) & kMarkBitStateMask;
+  }
+
+  void SetMarkBitState(uint32_t mark_bit) {
+    DCHECK_EQ(mark_bit & ~kMarkBitStateMask, 0U);
+    DCHECK_NE(static_cast<uint32_t>(GetState()), static_cast<uint32_t>(kForwardingAddress));
+    // Clear the mark bit, then OR in the new value.
+    value_ &= kMarkBitStateMaskShiftedToggled;
+    value_ |= mark_bit << kMarkBitStateShift;
+  }
+
   // Return the owner thin lock thread id.
   uint32_t ThinLockOwner() const;
 
@@ -197,7 +234,7 @@
   size_t ForwardingAddress() const;
 
   // Construct a lock word for inflation to use a Monitor.
-  LockWord(Monitor* mon, uint32_t rb_state);
+  LockWord(Monitor* mon, uint32_t gc_state);
 
   // Return the hash code stored in the lock word, must be kHashCode state.
   int32_t GetHashCode() const;
@@ -207,7 +244,7 @@
     if (kIncludeReadBarrierState) {
       return lw1.GetValue() == lw2.GetValue();
     }
-    return lw1.GetValueWithoutReadBarrierState() == lw2.GetValueWithoutReadBarrierState();
+    return lw1.GetValueWithoutGCState() == lw2.GetValueWithoutGCState();
   }
 
   void Dump(std::ostream& os) {
@@ -248,9 +285,9 @@
     return value_;
   }
 
-  uint32_t GetValueWithoutReadBarrierState() const {
+  uint32_t GetValueWithoutGCState() const {
     CheckReadBarrierState();
-    return value_ & ~(kReadBarrierStateMask << kReadBarrierStateShift);
+    return value_ & kGCStateMaskShiftedToggled;
   }
 
   // Only Object should be converting LockWords to/from uints.
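
After this change the 32-bit lock word is laid out as |state:2|mark:1|rb:1|payload:28|. The standalone sketch below re-derives the shifts from the field sizes and packs and unpacks a hash-state word the same way the enum arithmetic above does; the helper names mirror LockWord but the code is an illustration, not art::LockWord:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Standalone re-derivation of the new lock word field layout described above:
// |state:2|mark:1|rb:1|payload:28|. The names mirror LockWord, but this is a sketch,
// not art::LockWord.
namespace {
constexpr uint32_t kStateSize = 2;
constexpr uint32_t kReadBarrierStateSize = 1;
constexpr uint32_t kMarkBitStateSize = 1;
constexpr uint32_t kPayloadSize =
    32 - kStateSize - kReadBarrierStateSize - kMarkBitStateSize;                 // 28

constexpr uint32_t kReadBarrierStateShift = kPayloadSize;                        // 28
constexpr uint32_t kMarkBitStateShift = kReadBarrierStateShift + kReadBarrierStateSize;  // 29
constexpr uint32_t kGCStateShift = kReadBarrierStateShift;                       // 28
constexpr uint32_t kGCStateSize = kReadBarrierStateSize + kMarkBitStateSize;     // 2
constexpr uint32_t kStateShift = kMarkBitStateShift + kMarkBitStateSize;         // 30

constexpr uint32_t kStateHash = 2;

uint32_t FromHashCode(uint32_t hash, uint32_t gc_state) {
  return hash | (gc_state << kGCStateShift) | (kStateHash << kStateShift);
}
uint32_t GCState(uint32_t value) { return (value >> kGCStateShift) & ((1u << kGCStateSize) - 1); }
uint32_t MarkBitState(uint32_t value) { return (value >> kMarkBitStateShift) & 1u; }
uint32_t ReadBarrierState(uint32_t value) { return (value >> kReadBarrierStateShift) & 1u; }
}  // namespace

int main() {
  // Pack a hash-state word whose GC state has the mark bit set (0b10) and the rb bit clear.
  uint32_t word = FromHashCode(0x1234u, /*gc_state=*/0x2u);
  assert(GCState(word) == 0x2u);
  assert(MarkBitState(word) == 1u);
  assert(ReadBarrierState(word) == 0u);
  std::printf("word=0x%08x state=%u\n", word, word >> kStateShift);
  return 0;
}
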
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 0592c6c..0495c95 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -147,10 +147,20 @@
 #endif
 }
 
+inline uint32_t Object::GetMarkBit() {
+#ifdef USE_READ_BARRIER
+  return GetLockWord(false).MarkBitState();
+#else
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
+#endif
+}
+
 inline void Object::SetReadBarrierPointer(Object* rb_ptr) {
 #ifdef USE_BAKER_READ_BARRIER
   DCHECK(kUseBakerReadBarrier);
   DCHECK_EQ(reinterpret_cast<uint64_t>(rb_ptr) >> 32, 0U);
+  DCHECK_NE(rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
   LockWord lw = GetLockWord(false);
   lw.SetReadBarrierState(static_cast<uint32_t>(reinterpret_cast<uintptr_t>(rb_ptr)));
   SetLockWord(lw, false);
@@ -173,6 +183,8 @@
   DCHECK(kUseBakerReadBarrier);
   DCHECK_EQ(reinterpret_cast<uint64_t>(expected_rb_ptr) >> 32, 0U);
   DCHECK_EQ(reinterpret_cast<uint64_t>(rb_ptr) >> 32, 0U);
+  DCHECK_NE(expected_rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
+  DCHECK_NE(rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
   LockWord expected_lw;
   LockWord new_lw;
   do {
@@ -216,6 +228,24 @@
 #endif
 }
 
+inline bool Object::AtomicSetMarkBit(uint32_t expected_mark_bit, uint32_t mark_bit) {
+  LockWord expected_lw;
+  LockWord new_lw;
+  do {
+    LockWord lw = GetLockWord(false);
+    if (UNLIKELY(lw.MarkBitState() != expected_mark_bit)) {
+      // Lost the race.
+      return false;
+    }
+    expected_lw = lw;
+    new_lw = lw;
+    new_lw.SetMarkBitState(mark_bit);
+    // Since this is only set from the mutator, we can use the non-release CAS.
+  } while (!CasLockWordWeakRelaxed(expected_lw, new_lw));
+  return true;
+}
+
+
 inline void Object::AssertReadBarrierPointer() const {
   if (kUseBakerReadBarrier) {
     Object* obj = const_cast<Object*>(this);
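
Object::AtomicSetMarkBit above only succeeds if the current mark bit matches the caller's expected value, retrying with a weak relaxed CAS when the lock word changes underneath it. A standalone equivalent on a std::atomic<uint32_t> lock word, with the mark bit assumed at bit 29 as in the new layout (a sketch, not ART's CasLockWordWeakRelaxed):

#include <atomic>
#include <cstdint>
#include <cstdio>

constexpr uint32_t kMarkBitShift = 29;  // Mark bit position from the new lock word layout.
constexpr uint32_t kMarkBitMask = 1u << kMarkBitShift;

// Flip the mark bit from expected_mark_bit to mark_bit; fail if another thread changed it first.
bool AtomicSetMarkBit(std::atomic<uint32_t>& lock_word,
                      uint32_t expected_mark_bit,
                      uint32_t mark_bit) {
  uint32_t old_word = lock_word.load(std::memory_order_relaxed);
  do {
    if (((old_word >> kMarkBitShift) & 1u) != expected_mark_bit) {
      return false;  // Lost the race.
    }
    // Relaxed ordering, matching the "non-release CAS" note in the code above.
  } while (!lock_word.compare_exchange_weak(
      old_word,
      (old_word & ~kMarkBitMask) | (mark_bit << kMarkBitShift),
      std::memory_order_relaxed));
  return true;
}

int main() {
  std::atomic<uint32_t> lock_word{0};
  std::printf("set 0->1: %d\n", AtomicSetMarkBit(lock_word, 0, 1) ? 1 : 0);        // succeeds
  std::printf("set 0->1 again: %d\n", AtomicSetMarkBit(lock_word, 0, 1) ? 1 : 0);  // already 1
  std::printf("clear 1->0: %d\n", AtomicSetMarkBit(lock_word, 1, 0) ? 1 : 0);      // succeeds
  return 0;
}
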
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 701c600..13c536e 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -163,8 +163,7 @@
       case LockWord::kUnlocked: {
         // Try to compare and swap in a new hash, if we succeed we will return the hash on the next
         // loop iteration.
-        LockWord hash_word = LockWord::FromHashCode(GenerateIdentityHashCode(),
-                                                    lw.ReadBarrierState());
+        LockWord hash_word = LockWord::FromHashCode(GenerateIdentityHashCode(), lw.GCState());
         DCHECK_EQ(hash_word.GetState(), LockWord::kHashCode);
         if (const_cast<Object*>(this)->CasLockWordWeakRelaxed(lw, hash_word)) {
           return hash_word.GetHashCode();
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index a4bdbad..5b129bf 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -93,6 +93,7 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   void SetClass(Class* new_klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // TODO: Clean this up and change to return int32_t
   Object* GetReadBarrierPointer() SHARED_REQUIRES(Locks::mutator_lock_);
 
 #ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
@@ -103,6 +104,12 @@
   template<bool kCasRelease = false>
   ALWAYS_INLINE bool AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* rb_ptr)
       SHARED_REQUIRES(Locks::mutator_lock_);
+
+  ALWAYS_INLINE uint32_t GetMarkBit() SHARED_REQUIRES(Locks::mutator_lock_);
+
+  ALWAYS_INLINE bool AtomicSetMarkBit(uint32_t expected_mark_bit, uint32_t mark_bit)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   void AssertReadBarrierPointer() const SHARED_REQUIRES(Locks::mutator_lock_);
 
   // The verifier treats all interfaces as java.lang.Object and relies on runtime checks in
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index bf9f931..e863ea9 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -155,7 +155,7 @@
       return false;
     }
   }
-  LockWord fat(this, lw.ReadBarrierState());
+  LockWord fat(this, lw.GCState());
   // Publish the updated lock word, which may race with other threads.
   bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
   // Lock profiling.
@@ -774,20 +774,21 @@
         return false;
       }
       // Deflate to a thin lock.
-      LockWord new_lw = LockWord::FromThinLockId(owner->GetThreadId(), monitor->lock_count_,
-                                                 lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromThinLockId(owner->GetThreadId(),
+                                                 monitor->lock_count_,
+                                                 lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to thin lock " << owner->GetTid() << " / "
           << monitor->lock_count_;
     } else if (monitor->HasHashCode()) {
-      LockWord new_lw = LockWord::FromHashCode(monitor->GetHashCode(), lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromHashCode(monitor->GetHashCode(), lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to hash monitor " << monitor->GetHashCode();
     } else {
       // No lock and no hash, just put an empty lock word inside the object.
-      LockWord new_lw = LockWord::FromDefault(lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromDefault(lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to empty lock word";
@@ -876,7 +877,7 @@
     LockWord lock_word = h_obj->GetLockWord(true);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
-        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.ReadBarrierState()));
+        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.GCState()));
         if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
           AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
           // CasLockWord enforces more than the acquire ordering we need here.
@@ -890,8 +891,9 @@
           // We own the lock, increase the recursion count.
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
-            LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count,
-                                                          lock_word.ReadBarrierState()));
+            LockWord thin_locked(LockWord::FromThinLockId(thread_id,
+                                                          new_count,
+                                                          lock_word.GCState()));
             if (!kUseReadBarrier) {
               h_obj->SetLockWord(thin_locked, true);
               AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
@@ -975,9 +977,9 @@
           LockWord new_lw = LockWord::Default();
           if (lock_word.ThinLockCount() != 0) {
             uint32_t new_count = lock_word.ThinLockCount() - 1;
-            new_lw = LockWord::FromThinLockId(thread_id, new_count, lock_word.ReadBarrierState());
+            new_lw = LockWord::FromThinLockId(thread_id, new_count, lock_word.GCState());
           } else {
-            new_lw = LockWord::FromDefault(lock_word.ReadBarrierState());
+            new_lw = LockWord::FromDefault(lock_word.GCState());
           }
           if (!kUseReadBarrier) {
             DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index 42e959c..5d32c09 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -99,8 +99,9 @@
   // Note: These couldn't be constexpr pointers as reinterpret_cast isn't compatible with them.
   static constexpr uintptr_t white_ptr_ = 0x0;    // Not marked.
   static constexpr uintptr_t gray_ptr_ = 0x1;     // Marked, but not marked through. On mark stack.
+  // TODO: black_ptr_ is unused; we should remove it.
   static constexpr uintptr_t black_ptr_ = 0x2;    // Marked through. Used for non-moving objects.
-  static constexpr uintptr_t rb_ptr_mask_ = 0x3;  // The low 2 bits for white|gray|black.
+  static constexpr uintptr_t rb_ptr_mask_ = 0x1;  // The low bit for white|gray.
 };
 
 }  // namespace art
diff --git a/test/614-checker-dump-constant-location/expected.txt b/test/614-checker-dump-constant-location/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/614-checker-dump-constant-location/expected.txt
diff --git a/test/614-checker-dump-constant-location/info.txt b/test/614-checker-dump-constant-location/info.txt
new file mode 100644
index 0000000..4a94ffa
--- /dev/null
+++ b/test/614-checker-dump-constant-location/info.txt
@@ -0,0 +1,2 @@
+Test that the graph visualizer outputs useful information for constant
+locations in parallel moves.
diff --git a/test/614-checker-dump-constant-location/src/Main.java b/test/614-checker-dump-constant-location/src/Main.java
new file mode 100644
index 0000000..f6bc063
--- /dev/null
+++ b/test/614-checker-dump-constant-location/src/Main.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static int array_int[] = { 0 };
+  public static long array_long[] = { 0 };
+  public static float array_float[] = { 0.0f };
+  public static double array_double[] = { 0.0 };
+
+  // The code used to print constant locations in parallel moves is architecture
+  // independent. We only test on ARM and ARM64 because it is easy there: 'store'
+  // instructions only take registers as sources.
+
+  /// CHECK-START-ARM: void Main.store_to_arrays() register (after)
+  /// CHECK:    ParallelMove {{.*#1->.*#2->.*#3\.3->.*#4\.4->.*}}
+
+  /// CHECK-START-ARM64: void Main.store_to_arrays() register (after)
+  /// CHECK:    ParallelMove {{.*#1->.*#2->.*#3\.3->.*#4\.4->.*}}
+
+  public void store_to_arrays() {
+    array_int[0] = 1;
+    array_long[0] = 2;
+    array_float[0] = 3.3f;
+    array_double[0] = 4.4;
+  }
+
+  public static void main(String args[]) {}
+}
diff --git a/test/615-checker-arm64-zr-parallel-move/expected.txt b/test/615-checker-arm64-zr-parallel-move/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/expected.txt
diff --git a/test/615-checker-arm64-zr-parallel-move/info.txt b/test/615-checker-arm64-zr-parallel-move/info.txt
new file mode 100644
index 0000000..199755d
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/info.txt
@@ -0,0 +1 @@
+Checker test to verify we correctly use wzr and xzr to synthesize zero constants.
diff --git a/test/615-checker-arm64-zr-parallel-move/src/Main.java b/test/615-checker-arm64-zr-parallel-move/src/Main.java
new file mode 100644
index 0000000..5024f28
--- /dev/null
+++ b/test/615-checker-arm64-zr-parallel-move/src/Main.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  public static boolean doThrow = false;
+
+  public void $noinline$foo(int in_w1,
+                            int in_w2,
+                            int in_w3,
+                            int in_w4,
+                            int in_w5,
+                            int in_w6,
+                            int in_w7,
+                            int on_stack_int,
+                            long on_stack_long,
+                            float in_s0,
+                            float in_s1,
+                            float in_s2,
+                            float in_s3,
+                            float in_s4,
+                            float in_s5,
+                            float in_s6,
+                            float in_s7,
+                            float on_stack_float,
+                            double on_stack_double) {
+    if (doThrow) throw new Error();
+  }
+
+  // We expect a parallel move that moves the zero constant four times to stack locations.
+  /// CHECK-START-ARM64: void Main.bar() register (after)
+  /// CHECK:             ParallelMove {{.*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*#0->[0-9x]+\(sp\).*}}
+
+  // Those four moves should generate four 'store' instructions using directly the zero register.
+  /// CHECK-START-ARM64: void Main.bar() disassembly (after)
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} wzr, [sp, #{{[0-9]+}}]
+  /// CHECK-DAG:         {{(str|stur)}} xzr, [sp, #{{[0-9]+}}]
+
+  public void bar() {
+    $noinline$foo(1, 2, 3, 4, 5, 6, 7,     // Integral values in registers.
+                  0, 0L,                   // Integral values on the stack.
+                  1, 2, 3, 4, 5, 6, 7, 8,  // Floating-point values in registers.
+                  0.0f, 0.0);              // Floating-point values on the stack.
+  }
+
+  public static void main(String args[]) {}
+}
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 8f8b667..8d7d70d 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -26,7 +26,8 @@
 
 # The path where build only targets will be output, e.g.
 # out/target/product/generic_x86_64/obj/PACKAGING/art-run-tests_intermediates/DATA
-art_run_tests_dir := $(call intermediates-dir-for,PACKAGING,art-run-tests)/DATA
+art_run_tests_build_dir := $(call intermediates-dir-for,JAVA_LIBRARIES,art-run-tests)/DATA
+art_run_tests_install_dir := $(call intermediates-dir-for,PACKAGING,art-run-tests)/DATA
 
 # A generated list of prerequisites that call 'run-test --build-only', the actual prerequisite is
 # an empty file touched in the intermediate directory.
@@ -49,7 +50,8 @@
 # Helper to create individual build targets for tests. Must be called with $(eval).
 # $(1): the test number
 define define-build-art-run-test
-  dmart_target := $(art_run_tests_dir)/art-run-tests/$(1)/touch
+  dmart_target := $(art_run_tests_build_dir)/art-run-tests/$(1)/touch
+  dmart_install_target := $(art_run_tests_install_dir)/art-run-tests/$(1)/touch
   run_test_options = --build-only
   ifeq ($(ART_TEST_QUIET),true)
     run_test_options += --quiet
@@ -67,8 +69,13 @@
 	  $(LOCAL_PATH)/run-test $$(PRIVATE_RUN_TEST_OPTIONS) --output-path $$(abspath $$(dir $$@)) $(1)
 	$(hide) touch $$@
 
-  TEST_ART_RUN_TEST_BUILD_RULES += $$(dmart_target)
+$$(dmart_install_target): $$(dmart_target)
+	$(hide) rm -rf $$(dir $$@) && mkdir -p $$(dir $$@)
+	$(hide) cp $$(dir $$<)/* $$(dir $$@)/
+
+  TEST_ART_RUN_TEST_BUILD_RULES += $$(dmart_install_target)
   dmart_target :=
+  dmart_install_target :=
   run_test_options :=
 endef
 $(foreach test, $(TEST_ART_RUN_TESTS), $(eval $(call define-build-art-run-test,$(test))))
@@ -78,12 +85,13 @@
 LOCAL_MODULE := art-run-tests
 LOCAL_ADDITIONAL_DEPENDENCIES := $(TEST_ART_RUN_TEST_BUILD_RULES)
 # The build system uses this flag to pick up files generated by declare-make-art-run-test.
-LOCAL_PICKUP_FILES := $(art_run_tests_dir)
+LOCAL_PICKUP_FILES := $(art_run_tests_install_dir)
 
 include $(BUILD_PHONY_PACKAGE)
 
 # Clear temp vars.
-art_run_tests_dir :=
+art_run_tests_build_dir :=
+art_run_tests_install_dir :=
 define-build-art-run-test :=
 TEST_ART_RUN_TEST_BUILD_RULES :=
 
diff --git a/test/common/runtime_state.cc b/test/common/runtime_state.cc
index 806e130..ee2ee1a 100644
--- a/test/common/runtime_state.cc
+++ b/test/common/runtime_state.cc
@@ -130,18 +130,18 @@
     return;
   }
 
-  ScopedObjectAccess soa(Thread::Current());
+  ArtMethod* method = nullptr;
+  {
+    ScopedObjectAccess soa(Thread::Current());
 
-  ScopedUtfChars chars(env, method_name);
-  CHECK(chars.c_str() != nullptr);
-
-  mirror::Class* klass = soa.Decode<mirror::Class*>(cls);
-  ArtMethod* method = klass->FindDeclaredDirectMethodByName(chars.c_str(), kRuntimePointerSize);
+    ScopedUtfChars chars(env, method_name);
+    CHECK(chars.c_str() != nullptr);
+    method = soa.Decode<mirror::Class*>(cls)->FindDeclaredDirectMethodByName(
+        chars.c_str(), kRuntimePointerSize);
+  }
 
   jit::JitCodeCache* code_cache = jit->GetCodeCache();
   OatQuickMethodHeader* header = nullptr;
-  // Make sure there is a profiling info, required by the compiler.
-  ProfilingInfo::Create(soa.Self(), method, /* retry_allocation */ true);
   while (true) {
     header = OatQuickMethodHeader::FromEntryPoint(method->GetEntryPointFromQuickCompiledCode());
     if (code_cache->ContainsPc(header->GetCode())) {
@@ -149,6 +149,9 @@
     } else {
       // Sleep to yield to the compiler thread.
       usleep(1000);
+      ScopedObjectAccess soa(Thread::Current());
+      // Make sure there is a profiling info, required by the compiler.
+      ProfilingInfo::Create(soa.Self(), method, /* retry_allocation */ true);
       // Will either ensure it's compiled or do the compilation itself.
       jit->CompileMethod(method, soa.Self(), /* osr */ false);
     }
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 64bf4f3..c6c9380 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -553,12 +553,10 @@
     if [ "$TIME_OUT" = "timeout" ]; then
       # Add timeout command if time out is desired.
       #
-      # Note: We use nested timeouts. The inner timeout sends SIGRTMIN+2 (usually 36) to ART, which
-      #       will induce a full thread dump before abort. However, dumping threads might deadlock,
-      #       so the outer timeout sends the regular SIGTERM after an additional minute to ensure
-      #       termination (without dumping all threads).
-      TIME_PLUS_ONE=$(($TIME_OUT_VALUE + 60))
-      cmdline="timeout ${TIME_PLUS_ONE}s timeout -s SIGRTMIN+2 ${TIME_OUT_VALUE}s $cmdline"
+      # Note: We first send SIGRTMIN+2 (usually 36) to ART, which will induce a full thread dump
+      #       before abort. However, dumping threads might deadlock, so we also use the "-k"
+      #       option to definitely kill the child.
+      cmdline="timeout -k 120s -s SIGRTMIN+2 ${TIME_OUT_VALUE}s $cmdline"
     fi
 
     if [ "$DEV_MODE" = "y" ]; then
diff --git a/tools/ahat/src/InstanceUtils.java b/tools/ahat/src/InstanceUtils.java
index 3cdb40c..8769d11 100644
--- a/tools/ahat/src/InstanceUtils.java
+++ b/tools/ahat/src/InstanceUtils.java
@@ -95,9 +95,7 @@
       return null;
     }
 
-    // TODO: When perflib provides a better way to get the length of the
-    // array, we should use that here.
-    int numChars = chars.getValues().length;
+    int numChars = chars.getLength();
     int count = getIntField(inst, "count", numChars);
     if (count == 0) {
       return "";
diff --git a/tools/cpp-define-generator/constant_lockword.def b/tools/cpp-define-generator/constant_lockword.def
index c1e6099..67ed5b5 100644
--- a/tools/cpp-define-generator/constant_lockword.def
+++ b/tools/cpp-define-generator/constant_lockword.def
@@ -30,5 +30,12 @@
 DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK_TOGGLED, uint32_t, kReadBarrierStateMaskShiftedToggled)
 DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE,       int32_t,  kThinLockCountOne)
 
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED,   uint32_t,  kGCStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED_TOGGLED, uint32_t, kGCStateMaskShiftedToggled)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT,   int32_t,  kGCStateShift)
+
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT, int32_t, kMarkBitStateShift)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED, uint32_t, kMarkBitStateMaskShifted)
+
 #undef DEFINE_LOCK_WORD_EXPR