Merge "Implement single-precision round intrinsic in x86"
diff --git a/compiler/image_writer.cc b/compiler/image_writer.cc
index 7a34683..7c87a60 100644
--- a/compiler/image_writer.cc
+++ b/compiler/image_writer.cc
@@ -39,6 +39,7 @@
 #include "gc/accounting/card_table-inl.h"
 #include "gc/accounting/heap_bitmap.h"
 #include "gc/accounting/space_bitmap-inl.h"
+#include "gc/collector/concurrent_copying.h"
 #include "gc/heap.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
@@ -1377,6 +1378,8 @@
       runtime->GetCalleeSaveMethod(Runtime::kRefsOnly);
   image_methods_[ImageHeader::kRefsAndArgsSaveMethod] =
       runtime->GetCalleeSaveMethod(Runtime::kRefsAndArgs);
+  image_methods_[ImageHeader::kSaveEverythingMethod] =
+      runtime->GetCalleeSaveMethod(Runtime::kSaveEverything);
   // Visit image methods first to have the main runtime methods in the first image.
   for (auto* m : image_methods_) {
     CHECK(m != nullptr);
@@ -1823,6 +1826,11 @@
   const auto it = saved_hashcode_map_.find(obj);
   dst->SetLockWord(it != saved_hashcode_map_.end() ?
       LockWord::FromHashCode(it->second, 0u) : LockWord::Default(), false);
+  if (kUseBakerReadBarrier && gc::collector::ConcurrentCopying::kGrayDirtyImmuneObjects) {
+    // Treat all of the objects in the image as marked to avoid unnecessary dirty pages. This is
+    // safe since we mark all of the objects that may reference non-immune objects as gray.
+    CHECK(dst->AtomicSetMarkBit(0, 1));
+  }
   FixupObject(obj, dst);
 }
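
Note: pre-setting the mark bit here leans on the Baker read-barrier lock-word layout. Objects whose mark bit is already set are treated as marked by the concurrent copying collector, so it never has to write their lock words and boot-image pages stay clean. A minimal sketch of the test the collector effectively performs, reusing the LOCK_WORD_MARK_BIT_MASK_SHIFTED constant from the assembly changes below (not ART's exact C++ API):

    // An object whose shifted mark bit is set counts as "already marked";
    // image objects get the bit pre-set so marking never dirties their pages.
    inline bool IsPreMarked(uint32_t lock_word) {
      return (lock_word & LOCK_WORD_MARK_BIT_MASK_SHIFTED) != 0;
    }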
 
diff --git a/compiler/image_writer.h b/compiler/image_writer.h
index 626a975..7d13656 100644
--- a/compiler/image_writer.h
+++ b/compiler/image_writer.h
@@ -217,8 +217,7 @@
   // uint32 = typeof(lockword_)
   // Subtract read barrier bits since we want these to remain 0, or else it may result in DCHECK
   // failures due to invalid read barrier bits during object field reads.
-  static const size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits -
-      LockWord::kReadBarrierStateSize;
+  static const size_t kBinShift = BitSizeOf<uint32_t>() - kBinBits - LockWord::kGCStateSize;
   // 111000.....0
   static const size_t kBinMask = ((static_cast<size_t>(1) << kBinBits) - 1) << kBinShift;
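
For concreteness, if kBinBits were 3 (the "111000.....0" comment suggests a three-bit field) and LockWord::kGCStateSize is 2 (read-barrier state plus the new mark bit), the new constants work out as:

    static const size_t kBinShift = 32 - 3 - 2;                     // = 27
    static const size_t kBinMask  = ((1u << 3) - 1) << kBinShift;   // = 7 << 27 = 0x38000000
    // The kGCStateSize bits above the bin field are left zero, which is what
    // the comment above means by keeping the (now wider) GC bits clear.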
 
diff --git a/compiler/jni/jni_cfi_test_expected.inc b/compiler/jni/jni_cfi_test_expected.inc
index 16b4386..da72c75 100644
--- a/compiler/jni/jni_cfi_test_expected.inc
+++ b/compiler/jni/jni_cfi_test_expected.inc
@@ -1,8 +1,7 @@
 static constexpr uint8_t expected_asm_kThumb2[] = {
     0x2D, 0xE9, 0xE0, 0x4D, 0x2D, 0xED, 0x10, 0x8A, 0x89, 0xB0, 0x00, 0x90,
-    0xCD, 0xF8, 0x84, 0x10, 0x8D, 0xED, 0x22, 0x0A, 0xCD, 0xF8, 0x8C, 0x20,
-    0xCD, 0xF8, 0x90, 0x30, 0x88, 0xB0, 0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC,
-    0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x8D,
+    0x21, 0x91, 0x8D, 0xED, 0x22, 0x0A, 0x23, 0x92, 0x24, 0x93, 0x88, 0xB0,
+    0x08, 0xB0, 0x09, 0xB0, 0xBD, 0xEC, 0x10, 0x8A, 0xBD, 0xE8, 0xE0, 0x8D,
 };
 static constexpr uint8_t expected_cfi_kThumb2[] = {
     0x44, 0x0E, 0x1C, 0x85, 0x07, 0x86, 0x06, 0x87, 0x05, 0x88, 0x04, 0x8A,
@@ -11,7 +10,7 @@
     0x55, 0x12, 0x05, 0x56, 0x11, 0x05, 0x57, 0x10, 0x05, 0x58, 0x0F, 0x05,
     0x59, 0x0E, 0x05, 0x5A, 0x0D, 0x05, 0x5B, 0x0C, 0x05, 0x5C, 0x0B, 0x05,
     0x5D, 0x0A, 0x05, 0x5E, 0x09, 0x05, 0x5F, 0x08, 0x42, 0x0E, 0x80, 0x01,
-    0x54, 0x0E, 0xA0, 0x01, 0x42, 0x0E, 0x80, 0x01, 0x0A, 0x42, 0x0E, 0x5C,
+    0x4E, 0x0E, 0xA0, 0x01, 0x42, 0x0E, 0x80, 0x01, 0x0A, 0x42, 0x0E, 0x5C,
     0x44, 0x0E, 0x1C, 0x06, 0x50, 0x06, 0x51, 0x06, 0x52, 0x06, 0x53, 0x06,
     0x54, 0x06, 0x55, 0x06, 0x56, 0x06, 0x57, 0x06, 0x58, 0x06, 0x59, 0x06,
     0x5A, 0x06, 0x5B, 0x06, 0x5C, 0x06, 0x5D, 0x06, 0x5E, 0x06, 0x5F, 0x44,
@@ -47,38 +46,38 @@
 // 0x00000008: sub sp, sp, #36
 // 0x0000000a: .cfi_def_cfa_offset: 128
 // 0x0000000a: str r0, [sp, #0]
-// 0x0000000c: str.w r1, [sp, #132]
-// 0x00000010: vstr.f32 s0, [sp, #136]
-// 0x00000014: str.w r2, [sp, #140]
-// 0x00000018: str.w r3, [sp, #144]
-// 0x0000001c: sub sp, sp, #32
-// 0x0000001e: .cfi_def_cfa_offset: 160
-// 0x0000001e: add sp, sp, #32
-// 0x00000020: .cfi_def_cfa_offset: 128
-// 0x00000020: .cfi_remember_state
-// 0x00000020: add sp, sp, #36
-// 0x00000022: .cfi_def_cfa_offset: 92
-// 0x00000022: vpop.f32 {s16-s31}
-// 0x00000026: .cfi_def_cfa_offset: 28
-// 0x00000026: .cfi_restore_extended: r80
-// 0x00000026: .cfi_restore_extended: r81
-// 0x00000026: .cfi_restore_extended: r82
-// 0x00000026: .cfi_restore_extended: r83
-// 0x00000026: .cfi_restore_extended: r84
-// 0x00000026: .cfi_restore_extended: r85
-// 0x00000026: .cfi_restore_extended: r86
-// 0x00000026: .cfi_restore_extended: r87
-// 0x00000026: .cfi_restore_extended: r88
-// 0x00000026: .cfi_restore_extended: r89
-// 0x00000026: .cfi_restore_extended: r90
-// 0x00000026: .cfi_restore_extended: r91
-// 0x00000026: .cfi_restore_extended: r92
-// 0x00000026: .cfi_restore_extended: r93
-// 0x00000026: .cfi_restore_extended: r94
-// 0x00000026: .cfi_restore_extended: r95
-// 0x00000026: pop {r5, r6, r7, r8, r10, r11, pc}
-// 0x0000002a: .cfi_restore_state
-// 0x0000002a: .cfi_def_cfa_offset: 128
+// 0x0000000c: str r1, [sp, #132]
+// 0x0000000e: vstr.f32 s0, [sp, #136]
+// 0x00000012: str r2, [sp, #140]
+// 0x00000014: str r3, [sp, #144]
+// 0x00000016: sub sp, sp, #32
+// 0x00000018: .cfi_def_cfa_offset: 160
+// 0x00000018: add sp, sp, #32
+// 0x0000001a: .cfi_def_cfa_offset: 128
+// 0x0000001a: .cfi_remember_state
+// 0x0000001a: add sp, sp, #36
+// 0x0000001c: .cfi_def_cfa_offset: 92
+// 0x0000001c: vpop.f32 {s16-s31}
+// 0x00000020: .cfi_def_cfa_offset: 28
+// 0x00000020: .cfi_restore_extended: r80
+// 0x00000020: .cfi_restore_extended: r81
+// 0x00000020: .cfi_restore_extended: r82
+// 0x00000020: .cfi_restore_extended: r83
+// 0x00000020: .cfi_restore_extended: r84
+// 0x00000020: .cfi_restore_extended: r85
+// 0x00000020: .cfi_restore_extended: r86
+// 0x00000020: .cfi_restore_extended: r87
+// 0x00000020: .cfi_restore_extended: r88
+// 0x00000020: .cfi_restore_extended: r89
+// 0x00000020: .cfi_restore_extended: r90
+// 0x00000020: .cfi_restore_extended: r91
+// 0x00000020: .cfi_restore_extended: r92
+// 0x00000020: .cfi_restore_extended: r93
+// 0x00000020: .cfi_restore_extended: r94
+// 0x00000020: .cfi_restore_extended: r95
+// 0x00000020: pop {r5, r6, r7, r8, r10, r11, pc}
+// 0x00000024: .cfi_restore_state
+// 0x00000024: .cfi_def_cfa_offset: 128
 
 static constexpr uint8_t expected_asm_kArm64[] = {
     0xFF, 0x03, 0x03, 0xD1, 0xF3, 0x53, 0x06, 0xA9, 0xF5, 0x5B, 0x07, 0xA9,
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 4a4b98c..a5493ab 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -765,16 +765,24 @@
   LocationSummary* locations = instruction->GetLocations();
 
   uint32_t register_mask = locations->GetRegisterMask();
-  if (locations->OnlyCallsOnSlowPath()) {
-    // In case of slow path, we currently set the location of caller-save registers
-    // to register (instead of their stack location when pushed before the slow-path
-    // call). Therefore register_mask contains both callee-save and caller-save
-    // registers that hold objects. We must remove the caller-save from the mask, since
-    // they will be overwritten by the callee.
-    register_mask &= core_callee_save_mask_;
+  if (instruction->IsSuspendCheck()) {
+    // The suspend check has a special ABI that saves the caller-save registers in the callee,
+    // so we want to emit stack maps containing those registers.
+    // TODO: Register allocator still reserves space for the caller-save registers.
+    // We should add slow-path-specific caller-save information into LocationSummary
+    // and refactor the code here as well as in the register allocator to use it.
+  } else {
+    if (locations->OnlyCallsOnSlowPath()) {
+      // In case of slow path, we currently set the location of caller-save registers
+      // to register (instead of their stack location when pushed before the slow-path
+      // call). Therefore register_mask contains both callee-save and caller-save
+      // registers that hold objects. We must remove the caller-save from the mask, since
+      // they will be overwritten by the callee.
+      register_mask &= core_callee_save_mask_;
+    }
+    // The register mask must be a subset of callee-save registers.
+    DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
   }
-  // The register mask must be a subset of callee-save registers.
-  DCHECK_EQ(register_mask & core_callee_save_mask_, register_mask);
   stack_map_stream_.BeginStackMapEntry(outer_dex_pc,
                                        native_pc,
                                        register_mask,
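
The retained DCHECK states that register_mask must be a subset of the callee-save registers, via the standard bit-set trick; the suspend-check branch skips it precisely because the kSaveEverything frame lets caller-save registers stay live in the stack map. The trick in isolation:

    // A is a subset of B (as bit sets) exactly when masking A by B changes nothing.
    constexpr bool IsSubsetOf(uint32_t a, uint32_t b) {
      return (a & b) == a;
    }
    static_assert(IsSubsetOf(0b0101u, 0b1101u), "0101 is covered by 1101");
    static_assert(!IsSubsetOf(0b0101u, 0b1001u), "bit 2 is not covered by 1001");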
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index cd7a90e..c105940 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -119,11 +119,9 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorARM* arm_codegen = down_cast<CodeGeneratorARM*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ b(GetReturnLabel());
     } else {
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index f3a09fd..54c9efc 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -398,11 +398,9 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorARM64* arm64_codegen = down_cast<CodeGeneratorARM64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     arm64_codegen->InvokeRuntime(
         QUICK_ENTRY_POINT(pTestSuspend), instruction_, instruction_->GetDexPc(), this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ B(GetReturnLabel());
     } else {
@@ -609,6 +607,8 @@
     DCHECK_NE(obj_.reg(), LR);
     DCHECK_NE(obj_.reg(), WSP);
     DCHECK_NE(obj_.reg(), WZR);
+    // WIP0 is used by the slow path as a temp; it cannot be the object register.
+    DCHECK_NE(obj_.reg(), IP0);
     DCHECK(0 <= obj_.reg() && obj_.reg() < kNumberOfWRegisters) << obj_.reg();
     // "Compact" slow path, saving two moves.
     //
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 8dd82ef..59e103a 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -351,14 +351,12 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorMIPS* mips_codegen = down_cast<CodeGeneratorMIPS*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     mips_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
                                 instruction_,
                                 instruction_->GetDexPc(),
                                 this,
                                 IsDirectEntrypoint(kQuickTestSuspend));
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ B(GetReturnLabel());
     } else {
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 3472830..fe1fddc 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -300,13 +300,11 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorMIPS64* mips64_codegen = down_cast<CodeGeneratorMIPS64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     mips64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ Bc(GetReturnLabel());
     } else {
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index a2fa245..ade2117 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -192,13 +192,11 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorX86* x86_codegen = down_cast<CodeGeneratorX86*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     x86_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
                                instruction_,
                                instruction_->GetDexPc(),
                                this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5d5fa85..eadb431 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -149,13 +149,11 @@
   void EmitNativeCode(CodeGenerator* codegen) OVERRIDE {
     CodeGeneratorX86_64* x86_64_codegen = down_cast<CodeGeneratorX86_64*>(codegen);
     __ Bind(GetEntryLabel());
-    SaveLiveRegisters(codegen, instruction_->GetLocations());
     x86_64_codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend),
                                   instruction_,
                                   instruction_->GetDexPc(),
                                   this);
     CheckEntrypointTypes<kQuickTestSuspend, void, void>();
-    RestoreLiveRegisters(codegen, instruction_->GetLocations());
     if (successor_ == nullptr) {
       __ jmp(GetReturnLabel());
     } else {
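
The same two-line deletion repeats in the ARM, ARM64, MIPS, MIPS64, x86, and x86-64 slow paths: per-site spilling is no longer needed because the kSaveEverything frame set up inside art_quick_test_suspend itself preserves every core and floating-point register. Schematically:

    // Before: every SuspendCheckSlowPathXXX::EmitNativeCode() did
    //   SaveLiveRegisters(codegen, instruction_->GetLocations());
    //   codegen->InvokeRuntime(QUICK_ENTRY_POINT(pTestSuspend), ...);
    //   RestoreLiveRegisters(codegen, instruction_->GetLocations());
    // After: only the InvokeRuntime() call remains; the entrypoint's
    // SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME / RESTORE_... pair takes over.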
diff --git a/compiler/utils/arm/assembler_thumb2.cc b/compiler/utils/arm/assembler_thumb2.cc
index 353c729..4be7aae 100644
--- a/compiler/utils/arm/assembler_thumb2.cc
+++ b/compiler/utils/arm/assembler_thumb2.cc
@@ -2325,7 +2325,7 @@
   }
 
   Register rn = ad.GetRegister();
-  if (IsHighRegister(rn) && rn != SP && rn != PC) {
+  if (IsHighRegister(rn) && (byte || half || (rn != SP && rn != PC))) {
     must_be_32bit = true;
   }
 
@@ -2337,24 +2337,24 @@
     // Immediate offset
     int32_t offset = ad.GetOffset();
 
-    // The 16 bit SP relative instruction can only have a 10 bit offset.
-    if (rn == SP && offset >= (1 << 10)) {
-      must_be_32bit = true;
-    }
-
     if (byte) {
       // 5 bit offset, no shift.
-      if (offset >= (1 << 5)) {
+      if ((offset & ~0x1f) != 0) {
         must_be_32bit = true;
       }
     } else if (half) {
-      // 6 bit offset, shifted by 1.
-      if (offset >= (1 << 6)) {
+      // 5 bit offset, shifted by 1.
+      if ((offset & ~(0x1f << 1)) != 0) {
+        must_be_32bit = true;
+      }
+    } else if (rn == SP || rn == PC) {
+      // The 16 bit SP/PC relative instruction can only have an (imm8 << 2) offset.
+      if ((offset & ~(0xff << 2)) != 0) {
         must_be_32bit = true;
       }
     } else {
-      // 7 bit offset, shifted by 2.
-      if (offset >= (1 << 7)) {
+      // 5 bit offset, shifted by 2.
+      if ((offset & ~(0x1f << 2)) != 0) {
         must_be_32bit = true;
       }
     }
@@ -2370,7 +2370,7 @@
     } else {
       // 16 bit thumb1.
       uint8_t opA = 0;
-      bool sp_relative = false;
+      bool sp_or_pc_relative = false;
 
       if (byte) {
         opA = 7U /* 0b0111 */;
@@ -2379,7 +2379,10 @@
       } else {
         if (rn == SP) {
           opA = 9U /* 0b1001 */;
-          sp_relative = true;
+          sp_or_pc_relative = true;
+        } else if (rn == PC) {
+          opA = 4U;
+          sp_or_pc_relative = true;
         } else {
           opA = 6U /* 0b0110 */;
         }
@@ -2388,7 +2391,7 @@
           (load ? B11 : 0);
 
       CHECK_GE(offset, 0);
-      if (sp_relative) {
-        // SP relative, 10 bit offset.
+      if (sp_or_pc_relative) {
+        // SP/PC relative, 10 bit offset.
         CHECK_LT(offset, (1 << 10));
         CHECK_ALIGNED(offset, 4);
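
The rewritten checks fold range and alignment into a single mask test: each 16-bit encoding holds an unsigned imm5 (imm8 for SP/PC-relative words) scaled by the access size, so any offset bit outside the scaled field forces the 32-bit form. A sketch of the predicate, mirroring the masks above; the boundaries it implies (31/32 for bytes, 62/64 for halfwords, 124/128 and 1020/1024 for words) are exactly what the new tests in assembler_thumb2_test.cc pin down:

    // True when `offset` fits the 16-bit Thumb load/store encoding.
    // (offset & ~field) != 0 rejects out-of-range and misaligned offsets alike.
    bool Fits16BitThumb(int32_t offset, bool byte, bool half, bool sp_or_pc_relative) {
      if (byte) return (offset & ~0x1f) == 0;                       // imm5, unscaled
      if (half) return (offset & ~(0x1f << 1)) == 0;                // imm5 << 1
      if (sp_or_pc_relative) return (offset & ~(0xff << 2)) == 0;   // imm8 << 2
      return (offset & ~(0x1f << 2)) == 0;                          // imm5 << 2
    }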
diff --git a/compiler/utils/arm/assembler_thumb2_test.cc b/compiler/utils/arm/assembler_thumb2_test.cc
index abb09f7..3ca3714 100644
--- a/compiler/utils/arm/assembler_thumb2_test.cc
+++ b/compiler/utils/arm/assembler_thumb2_test.cc
@@ -279,6 +279,148 @@
   DriverStr(expected, "smull");
 }
 
+TEST_F(AssemblerThumb2Test, LoadByteFromThumbOffset) {
+  arm::LoadOperandType type = arm::kLoadUnsignedByte;
+
+  __ LoadFromOffset(type, arm::R0, arm::R7, 0);
+  __ LoadFromOffset(type, arm::R1, arm::R7, 31);
+  __ LoadFromOffset(type, arm::R2, arm::R7, 32);
+  __ LoadFromOffset(type, arm::R3, arm::R7, 4095);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 0);
+
+  const char* expected =
+      "ldrb r0, [r7, #0]\n"
+      "ldrb r1, [r7, #31]\n"
+      "ldrb.w r2, [r7, #32]\n"
+      "ldrb.w r3, [r7, #4095]\n"
+      "ldrb.w r4, [sp, #0]\n";
+  DriverStr(expected, "LoadByteFromThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreByteToThumbOffset) {
+  arm::StoreOperandType type = arm::kStoreByte;
+
+  __ StoreToOffset(type, arm::R0, arm::R7, 0);
+  __ StoreToOffset(type, arm::R1, arm::R7, 31);
+  __ StoreToOffset(type, arm::R2, arm::R7, 32);
+  __ StoreToOffset(type, arm::R3, arm::R7, 4095);
+  __ StoreToOffset(type, arm::R4, arm::SP, 0);
+
+  const char* expected =
+      "strb r0, [r7, #0]\n"
+      "strb r1, [r7, #31]\n"
+      "strb.w r2, [r7, #32]\n"
+      "strb.w r3, [r7, #4095]\n"
+      "strb.w r4, [sp, #0]\n";
+  DriverStr(expected, "StoreByteToThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadHalfFromThumbOffset) {
+  arm::LoadOperandType type = arm::kLoadUnsignedHalfword;
+
+  __ LoadFromOffset(type, arm::R0, arm::R7, 0);
+  __ LoadFromOffset(type, arm::R1, arm::R7, 62);
+  __ LoadFromOffset(type, arm::R2, arm::R7, 64);
+  __ LoadFromOffset(type, arm::R3, arm::R7, 4094);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 0);
+  __ LoadFromOffset(type, arm::R5, arm::R7, 1);  // Unaligned
+
+  const char* expected =
+      "ldrh r0, [r7, #0]\n"
+      "ldrh r1, [r7, #62]\n"
+      "ldrh.w r2, [r7, #64]\n"
+      "ldrh.w r3, [r7, #4094]\n"
+      "ldrh.w r4, [sp, #0]\n"
+      "ldrh.w r5, [r7, #1]\n";
+  DriverStr(expected, "LoadHalfFromThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreHalfToThumbOffset) {
+  arm::StoreOperandType type = arm::kStoreHalfword;
+
+  __ StoreToOffset(type, arm::R0, arm::R7, 0);
+  __ StoreToOffset(type, arm::R1, arm::R7, 62);
+  __ StoreToOffset(type, arm::R2, arm::R7, 64);
+  __ StoreToOffset(type, arm::R3, arm::R7, 4094);
+  __ StoreToOffset(type, arm::R4, arm::SP, 0);
+  __ StoreToOffset(type, arm::R5, arm::R7, 1);  // Unaligned
+
+  const char* expected =
+      "strh r0, [r7, #0]\n"
+      "strh r1, [r7, #62]\n"
+      "strh.w r2, [r7, #64]\n"
+      "strh.w r3, [r7, #4094]\n"
+      "strh.w r4, [sp, #0]\n"
+      "strh.w r5, [r7, #1]\n";
+  DriverStr(expected, "StoreHalfToThumbOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadWordFromSpPlusOffset) {
+  arm::LoadOperandType type = arm::kLoadWord;
+
+  __ LoadFromOffset(type, arm::R0, arm::SP, 0);
+  __ LoadFromOffset(type, arm::R1, arm::SP, 124);
+  __ LoadFromOffset(type, arm::R2, arm::SP, 128);
+  __ LoadFromOffset(type, arm::R3, arm::SP, 1020);
+  __ LoadFromOffset(type, arm::R4, arm::SP, 1024);
+  __ LoadFromOffset(type, arm::R5, arm::SP, 4092);
+  __ LoadFromOffset(type, arm::R6, arm::SP, 1);  // Unaligned
+
+  const char* expected =
+      "ldr r0, [sp, #0]\n"
+      "ldr r1, [sp, #124]\n"
+      "ldr r2, [sp, #128]\n"
+      "ldr r3, [sp, #1020]\n"
+      "ldr.w r4, [sp, #1024]\n"
+      "ldr.w r5, [sp, #4092]\n"
+      "ldr.w r6, [sp, #1]\n";
+  DriverStr(expected, "LoadWordFromSpPlusOffset");
+}
+
+TEST_F(AssemblerThumb2Test, StoreWordToSpPlusOffset) {
+  arm::StoreOperandType type = arm::kStoreWord;
+
+  __ StoreToOffset(type, arm::R0, arm::SP, 0);
+  __ StoreToOffset(type, arm::R1, arm::SP, 124);
+  __ StoreToOffset(type, arm::R2, arm::SP, 128);
+  __ StoreToOffset(type, arm::R3, arm::SP, 1020);
+  __ StoreToOffset(type, arm::R4, arm::SP, 1024);
+  __ StoreToOffset(type, arm::R5, arm::SP, 4092);
+  __ StoreToOffset(type, arm::R6, arm::SP, 1);  // Unaligned
+
+  const char* expected =
+      "str r0, [sp, #0]\n"
+      "str r1, [sp, #124]\n"
+      "str r2, [sp, #128]\n"
+      "str r3, [sp, #1020]\n"
+      "str.w r4, [sp, #1024]\n"
+      "str.w r5, [sp, #4092]\n"
+      "str.w r6, [sp, #1]\n";
+  DriverStr(expected, "StoreWordToSpPlusOffset");
+}
+
+TEST_F(AssemblerThumb2Test, LoadWordFromPcPlusOffset) {
+  arm::LoadOperandType type = arm::kLoadWord;
+
+  __ LoadFromOffset(type, arm::R0, arm::PC, 0);
+  __ LoadFromOffset(type, arm::R1, arm::PC, 124);
+  __ LoadFromOffset(type, arm::R2, arm::PC, 128);
+  __ LoadFromOffset(type, arm::R3, arm::PC, 1020);
+  __ LoadFromOffset(type, arm::R4, arm::PC, 1024);
+  __ LoadFromOffset(type, arm::R5, arm::PC, 4092);
+  __ LoadFromOffset(type, arm::R6, arm::PC, 1);  // Unaligned
+
+  const char* expected =
+      "ldr r0, [pc, #0]\n"
+      "ldr r1, [pc, #124]\n"
+      "ldr r2, [pc, #128]\n"
+      "ldr r3, [pc, #1020]\n"
+      "ldr.w r4, [pc, #1024]\n"
+      "ldr.w r5, [pc, #4092]\n"
+      "ldr.w r6, [pc, #1]\n";
+  DriverStr(expected, "LoadWordFromPcPlusOffset");
+}
+
 TEST_F(AssemblerThumb2Test, StoreWordToThumbOffset) {
   arm::StoreOperandType type = arm::kStoreWord;
   int32_t offset = 4092;
diff --git a/oatdump/oatdump.cc b/oatdump/oatdump.cc
index 8c3c5e5..a0def61 100644
--- a/oatdump/oatdump.cc
+++ b/oatdump/oatdump.cc
@@ -76,6 +76,7 @@
   "kCalleeSaveMethod",
   "kRefsOnlySaveMethod",
   "kRefsAndArgsSaveMethod",
+  "kSaveEverythingMethod",
 };
 
 const char* image_roots_descriptions_[] = {
diff --git a/runtime/arch/arch_test.cc b/runtime/arch/arch_test.cc
index ee31c58..6d80eb6 100644
--- a/runtime/arch/arch_test.cc
+++ b/runtime/arch/arch_test.cc
@@ -69,7 +69,9 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace arm
 
 namespace arm64 {
 #include "arch/arm64/asm_support_arm64.h"
@@ -79,7 +81,9 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace arm64
 
 namespace mips {
 #include "arch/mips/asm_support_mips.h"
@@ -89,7 +93,9 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace mips
 
 namespace mips64 {
 #include "arch/mips64/asm_support_mips64.h"
@@ -99,7 +105,9 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace mips64
 
 namespace x86 {
 #include "arch/x86/asm_support_x86.h"
@@ -109,7 +117,9 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace x86
 
 namespace x86_64 {
 #include "arch/x86_64/asm_support_x86_64.h"
@@ -119,13 +129,18 @@
 #undef FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
 static constexpr size_t kFrameSizeRefsAndArgsCalleeSave = FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE;
 #undef FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE
-}
+static constexpr size_t kFrameSizeSaveEverythingCalleeSave = FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE;
+#undef FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE
+}  // namespace x86_64
 
 // Check architecture specific constants are sound.
 TEST_F(ArchTest, ARM) {
   CheckFrameSize(InstructionSet::kArm, Runtime::kSaveAll, arm::kFrameSizeSaveAllCalleeSave);
   CheckFrameSize(InstructionSet::kArm, Runtime::kRefsOnly, arm::kFrameSizeRefsOnlyCalleeSave);
   CheckFrameSize(InstructionSet::kArm, Runtime::kRefsAndArgs, arm::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kArm,
+                 Runtime::kSaveEverything,
+                 arm::kFrameSizeSaveEverythingCalleeSave);
 }
 
 
@@ -134,33 +149,51 @@
   CheckFrameSize(InstructionSet::kArm64, Runtime::kRefsOnly, arm64::kFrameSizeRefsOnlyCalleeSave);
   CheckFrameSize(InstructionSet::kArm64, Runtime::kRefsAndArgs,
                  arm64::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kArm64,
+                 Runtime::kSaveEverything,
+                 arm64::kFrameSizeSaveEverythingCalleeSave);
 }
 
 TEST_F(ArchTest, MIPS) {
   CheckFrameSize(InstructionSet::kMips, Runtime::kSaveAll, mips::kFrameSizeSaveAllCalleeSave);
   CheckFrameSize(InstructionSet::kMips, Runtime::kRefsOnly, mips::kFrameSizeRefsOnlyCalleeSave);
-  CheckFrameSize(InstructionSet::kMips, Runtime::kRefsAndArgs,
+  CheckFrameSize(InstructionSet::kMips,
+                 Runtime::kRefsAndArgs,
                  mips::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kMips,
+                 Runtime::kSaveEverything,
+                 mips::kFrameSizeSaveEverythingCalleeSave);
 }
 
 TEST_F(ArchTest, MIPS64) {
   CheckFrameSize(InstructionSet::kMips64, Runtime::kSaveAll, mips64::kFrameSizeSaveAllCalleeSave);
   CheckFrameSize(InstructionSet::kMips64, Runtime::kRefsOnly, mips64::kFrameSizeRefsOnlyCalleeSave);
-  CheckFrameSize(InstructionSet::kMips64, Runtime::kRefsAndArgs,
+  CheckFrameSize(InstructionSet::kMips64,
+                 Runtime::kRefsAndArgs,
                  mips64::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kMips64,
+                 Runtime::kSaveEverything,
+                 mips64::kFrameSizeSaveEverythingCalleeSave);
 }
 
 TEST_F(ArchTest, X86) {
   CheckFrameSize(InstructionSet::kX86, Runtime::kSaveAll, x86::kFrameSizeSaveAllCalleeSave);
   CheckFrameSize(InstructionSet::kX86, Runtime::kRefsOnly, x86::kFrameSizeRefsOnlyCalleeSave);
   CheckFrameSize(InstructionSet::kX86, Runtime::kRefsAndArgs, x86::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kX86,
+                 Runtime::kSaveEverything,
+                 x86::kFrameSizeSaveEverythingCalleeSave);
 }
 
 TEST_F(ArchTest, X86_64) {
   CheckFrameSize(InstructionSet::kX86_64, Runtime::kSaveAll, x86_64::kFrameSizeSaveAllCalleeSave);
   CheckFrameSize(InstructionSet::kX86_64, Runtime::kRefsOnly, x86_64::kFrameSizeRefsOnlyCalleeSave);
-  CheckFrameSize(InstructionSet::kX86_64, Runtime::kRefsAndArgs,
+  CheckFrameSize(InstructionSet::kX86_64,
+                 Runtime::kRefsAndArgs,
                  x86_64::kFrameSizeRefsAndArgsCalleeSave);
+  CheckFrameSize(InstructionSet::kX86_64,
+                 Runtime::kSaveEverything,
+                 x86_64::kFrameSizeSaveEverythingCalleeSave);
 }
 
 }  // namespace art
diff --git a/runtime/arch/arm/asm_support_arm.h b/runtime/arch/arm/asm_support_arm.h
index 1fa566b..67f6f7a 100644
--- a/runtime/arch/arm/asm_support_arm.h
+++ b/runtime/arch/arm/asm_support_arm.h
@@ -22,6 +22,7 @@
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 112
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 112
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE 192
 
 // Flag for enabling R4 optimization in arm runtime
 // #define ARM_R4_SUSPEND_FLAG
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index 34d3158..42418ad 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -168,6 +168,65 @@
     .cfi_adjust_cfa_offset -40
 .endm
 
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     */
+.macro SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME rTemp
+    push {r0-r12, lr}                   @ 14 words of callee saves and args.
+    .cfi_adjust_cfa_offset 56
+    .cfi_rel_offset r0, 0
+    .cfi_rel_offset r1, 4
+    .cfi_rel_offset r2, 8
+    .cfi_rel_offset r3, 12
+    .cfi_rel_offset r4, 16
+    .cfi_rel_offset r5, 20
+    .cfi_rel_offset r6, 24
+    .cfi_rel_offset r7, 28
+    .cfi_rel_offset r8, 32
+    .cfi_rel_offset r9, 36
+    .cfi_rel_offset r10, 40
+    .cfi_rel_offset r11, 44
+    .cfi_rel_offset ip, 48
+    .cfi_rel_offset lr, 52
+    vpush {s0-s31}                      @ 32 words of float args.
+    .cfi_adjust_cfa_offset 128
+    sub sp, #8                          @ 2 words of space, alignment padding and Method*
+    .cfi_adjust_cfa_offset 8
+    RUNTIME_CURRENT1 \rTemp             @ Load Runtime::Current into rTemp.
+    @ Load kSaveEverything Method* to rTemp.
+    ldr \rTemp, [\rTemp, #RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET]
+    str \rTemp, [sp, #0]                @ Store kSaveEverything Method* to the bottom of the stack.
+    str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
+
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 56 + 128 + 8)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(ARM) size not as expected."
+#endif
+.endm
+
+.macro RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    add  sp, #8                         @ rewind sp
+    .cfi_adjust_cfa_offset -8
+    vpop {s0-s31}
+    .cfi_adjust_cfa_offset -128
+    pop {r0-r12, lr}                    @ 14 words of callee saves and args.
+    .cfi_restore r0
+    .cfi_restore r1
+    .cfi_restore r2
+    .cfi_restore r3
+    .cfi_restore r4
+    .cfi_restore r5
+    .cfi_restore r6
+    .cfi_restore r7
+    .cfi_restore r8
+    .cfi_restore r9
+    .cfi_restore r10
+    .cfi_restore r11
+    .cfi_restore r12
+    .cfi_restore lr
+    .cfi_adjust_cfa_offset -56
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz   r0, 1f              @ result non-zero branch over
     bx     lr                  @ return
@@ -520,7 +579,7 @@
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     ldrex  r1, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
     mov    r3, r1
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     cbnz   r3, .Lnot_unlocked         @ already thin locked
     @ unlocked case - r1: original lock word that's zero except for the read barrier bits.
     orr    r2, r1, r2                 @ r2 holds thread id with count of 0 with preserved read barrier bits
@@ -536,9 +595,9 @@
     cbnz   r2, .Lslow_lock            @ lock word and self thread id's match -> recursive lock
                                       @ else contention, go to slow path
     mov    r3, r1                     @ copy the lock word to check count overflow.
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits.
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits.
     add    r2, r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count in lock word placing in r2 to check overflow
-    lsr    r3, r2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  @ if either of the upper two bits (28-29) are set, we overflowed.
+    lsr    r3, r2, #LOCK_WORD_GC_STATE_SHIFT    @ if the first gc state bit is set, we overflowed.
     cbnz   r3, .Lslow_lock            @ if we overflow the count go slow path
     add    r2, r1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  @ increment count for real
     strex  r3, r2, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET] @ strex necessary for read barrier bits
@@ -581,17 +640,17 @@
     cbnz   r2, .Lslow_unlock          @ if either of the top two bits are set, go slow path
     ldr    r2, [r9, #THREAD_ID_OFFSET]
     mov    r3, r1                     @ copy lock word to check thread id equality
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     eor    r3, r3, r2                 @ lock_word.ThreadId() ^ self->ThreadId()
     uxth   r3, r3                     @ zero top 16 bits
     cbnz   r3, .Lslow_unlock          @ do lock word and self thread id's match?
     mov    r3, r1                     @ copy lock word to detect transition to unlocked
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  @ zero the read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  @ zero the gc bits
     cmp    r3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     @ transition to unlocked
     mov    r3, r1
-    and    r3, #LOCK_WORD_READ_BARRIER_STATE_MASK  @ r3: zero except for the preserved read barrier bits
+    and    r3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  @ r3: zero except for the preserved gc bits
     dmb    ish                        @ full (LoadStore|StoreStore) memory barrier
 #ifndef USE_READ_BARRIER
     str    r3, [r0, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
@@ -1212,17 +1271,18 @@
     .extern artTestSuspendFromCode
 ENTRY art_quick_test_suspend
 #ifdef ARM_R4_SUSPEND_FLAG
-    ldrh   r0, [rSELF, #THREAD_FLAGS_OFFSET]
-    mov    rSUSPEND, #SUSPEND_CHECK_INTERVAL  @ reset rSUSPEND to SUSPEND_CHECK_INTERVAL
-    cbnz   r0, 1f                             @ check Thread::Current()->suspend_count_ == 0
-    bx     lr                                 @ return if suspend_count_ == 0
+    ldrh   rSUSPEND, [rSELF, #THREAD_FLAGS_OFFSET]
+    cbnz   rSUSPEND, 1f                         @ check Thread::Current()->suspend_count_ == 0
+    mov    rSUSPEND, #SUSPEND_CHECK_INTERVAL    @ reset rSUSPEND to SUSPEND_CHECK_INTERVAL
+    bx     lr                                   @ return if suspend_count_ == 0
 1:
+    mov    rSUSPEND, #SUSPEND_CHECK_INTERVAL    @ reset rSUSPEND to SUSPEND_CHECK_INTERVAL
 #endif
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME r0  @ save everything for GC stack crawl
     mov    r0, rSELF
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME r1      @ save callee saves for GC stack crawl
-    @ TODO: save FPRs to enable access in the debugger?
-    bl     artTestSuspendFromCode             @ (Thread*)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+    bl     artTestSuspendFromCode               @ (Thread*)
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    bx     lr
 END art_quick_test_suspend
 
 ENTRY art_quick_implicit_suspend
@@ -1772,6 +1832,20 @@
      */
 .macro READ_BARRIER_MARK_REG name, reg
 ENTRY \name
+    // Null check so that we can load the lock word.
+    cmp \reg, #0
+    beq .Lret_rb_\name
+    // Check lock word for mark bit, if marked return.
+    push {r0}
+    ldr r0, [\reg, MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    and r0, #LOCK_WORD_MARK_BIT_MASK_SHIFTED
+    cbz r0, .Lslow_rb_\name
+    // Restore r0 and return.
+    pop   {r0}
+    bx    lr
+
+.Lslow_rb_\name:
+    pop   {r0}
     push  {r0-r4, r9, r12, lr}          @ save return address and core caller-save registers
     .cfi_adjust_cfa_offset 32
     .cfi_rel_offset r0, 0
@@ -1831,6 +1905,8 @@
       .endif
     .endif
     pop   {r0-r4, r9, r12, pc}          @ restore caller-save registers and return
+.Lret_rb_\name:
+    bx lr
 END \name
 .endm
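
These mark entrypoints now take two fast exits before spilling anything: a null reference returns immediately, and a reference whose lock-word mark bit is already set is returned unchanged; only unmarked non-null references pay for the full register save and runtime call. In pseudo-C (illustrative names, not ART's exact signatures):

    // Fast path of READ_BARRIER_MARK_REG, sketched.
    mirror::Object* MarkReg(mirror::Object* ref) {
      if (ref == nullptr) {
        return nullptr;                                  // .Lret_rb_\name: bx lr
      }
      uint32_t lw = LoadLockWord(ref);                   // one scratch-register load
      if ((lw & LOCK_WORD_MARK_BIT_MASK_SHIFTED) != 0) {
        return ref;                                      // already marked, done
      }
      return SlowPathMark(ref);                          // spill caller-saves, call runtime
    }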
 
diff --git a/runtime/arch/arm/quick_method_frame_info_arm.h b/runtime/arch/arm/quick_method_frame_info_arm.h
index 0fb8a63..c474d2e 100644
--- a/runtime/arch/arm/quick_method_frame_info_arm.h
+++ b/runtime/arch/arm/quick_method_frame_info_arm.h
@@ -34,6 +34,9 @@
     (1 << art::arm::R1) | (1 << art::arm::R2) | (1 << art::arm::R3);
 static constexpr uint32_t kArmCalleeSaveAllSpills =
     (1 << art::arm::R4) | (1 << art::arm::R9);
+static constexpr uint32_t kArmCalleeSaveEverythingSpills =
+    (1 << art::arm::R0) | (1 << art::arm::R1) | (1 << art::arm::R2) | (1 << art::arm::R3) |
+    (1 << art::arm::R4) | (1 << art::arm::R9) | (1 << art::arm::R12);
 
 static constexpr uint32_t kArmCalleeSaveFpAlwaysSpills = 0;
 static constexpr uint32_t kArmCalleeSaveFpRefSpills = 0;
@@ -47,17 +50,21 @@
     (1 << art::arm::S20) | (1 << art::arm::S21) | (1 << art::arm::S22) | (1 << art::arm::S23) |
     (1 << art::arm::S24) | (1 << art::arm::S25) | (1 << art::arm::S26) | (1 << art::arm::S27) |
     (1 << art::arm::S28) | (1 << art::arm::S29) | (1 << art::arm::S30) | (1 << art::arm::S31);
+static constexpr uint32_t kArmCalleeSaveFpEverythingSpills =
+    kArmCalleeSaveFpArgSpills | kArmCalleeSaveFpAllSpills;
 
 constexpr uint32_t ArmCalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kArmCalleeSaveAlwaysSpills | kArmCalleeSaveRefSpills |
       (type == Runtime::kRefsAndArgs ? kArmCalleeSaveArgSpills : 0) |
-      (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0);
+      (type == Runtime::kSaveAll ? kArmCalleeSaveAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kArmCalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t ArmCalleeSaveFpSpills(Runtime::CalleeSaveType type) {
   return kArmCalleeSaveFpAlwaysSpills | kArmCalleeSaveFpRefSpills |
       (type == Runtime::kRefsAndArgs ? kArmCalleeSaveFpArgSpills: 0) |
-      (type == Runtime::kSaveAll ? kArmCalleeSaveFpAllSpills : 0);
+      (type == Runtime::kSaveAll ? kArmCalleeSaveFpAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kArmCalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t ArmCalleeSaveFrameSize(Runtime::CalleeSaveType type) {
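
Assuming the usual ART sizing rule of one slot per spilled register plus one for the ArtMethod*, rounded up to the 16-byte stack alignment, these masks reproduce the 192-byte FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE declared in asm_support_arm.h:

    // cores: always/ref spills (lr, r5-r8, r10, r11) + everything (r0-r4, r9, r12) = 14
    // fprs:  arg spills (s0-s15) + all spills (s16-s31)                            = 32
    //   (14 + 32 + 1) * 4 = 188;  RoundUp(188, 16) = 192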
diff --git a/runtime/arch/arm64/asm_support_arm64.h b/runtime/arch/arm64/asm_support_arm64.h
index 989ecc6..68d12e9 100644
--- a/runtime/arch/arm64/asm_support_arm64.h
+++ b/runtime/arch/arm64/asm_support_arm64.h
@@ -22,5 +22,6 @@
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 176
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 96
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 224
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE 512
 
 #endif  // ART_RUNTIME_ARCH_ARM64_ASM_SUPPORT_ARM64_H_
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index a5be52d..415bb71 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -316,6 +316,204 @@
     .cfi_adjust_cfa_offset -224
 .endm
 
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     */
+.macro SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    sub sp, sp, #512
+    .cfi_adjust_cfa_offset 512
+
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 512)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(ARM64) size not as expected."
+#endif
+
+    // Save FP registers.
+    stp d0, d1,   [sp, #8]
+    stp d2, d3,   [sp, #24]
+    stp d4, d5,   [sp, #40]
+    stp d6, d7,   [sp, #56]
+    stp d8, d9,   [sp, #72]
+    stp d10, d11, [sp, #88]
+    stp d12, d13, [sp, #104]
+    stp d14, d15, [sp, #120]
+    stp d16, d17, [sp, #136]
+    stp d18, d19, [sp, #152]
+    stp d20, d21, [sp, #168]
+    stp d22, d23, [sp, #184]
+    stp d24, d25, [sp, #200]
+    stp d26, d27, [sp, #216]
+    stp d28, d29, [sp, #232]
+    stp d30, d31, [sp, #248]
+
+    // Save core registers.
+    str x0,       [sp, #264]
+    .cfi_rel_offset x0, 264
+
+    stp x1, x2,   [sp, #272]
+    .cfi_rel_offset x1, 272
+    .cfi_rel_offset x2, 280
+
+    stp x3, x4,   [sp, #288]
+    .cfi_rel_offset x3, 288
+    .cfi_rel_offset x4, 296
+
+    stp x5, x6,   [sp, #304]
+    .cfi_rel_offset x5, 304
+    .cfi_rel_offset x6, 312
+
+    stp x7, x8,   [sp, #320]
+    .cfi_rel_offset x7, 320
+    .cfi_rel_offset x8, 328
+
+    stp x9, x10,  [sp, #336]
+    .cfi_rel_offset x9, 336
+    .cfi_rel_offset x10, 344
+
+    stp x11, x12, [sp, #352]
+    .cfi_rel_offset x11, 352
+    .cfi_rel_offset x12, 360
+
+    stp x13, x14, [sp, #368]
+    .cfi_rel_offset x13, 368
+    .cfi_rel_offset x14, 376
+
+    stp x15, x16, [sp, #384]
+    .cfi_rel_offset x15, 384
+    .cfi_rel_offset x16, 392
+
+    stp x17, x18, [sp, #400]
+    .cfi_rel_offset x17, 400
+    .cfi_rel_offset x18, 408
+
+    stp x19, x20, [sp, #416]
+    .cfi_rel_offset x19, 416
+    .cfi_rel_offset x20, 424
+
+    stp x21, x22, [sp, #432]
+    .cfi_rel_offset x21, 432
+    .cfi_rel_offset x22, 440
+
+    stp x23, x24, [sp, #448]
+    .cfi_rel_offset x23, 448
+    .cfi_rel_offset x24, 456
+
+    stp x25, x26, [sp, #464]
+    .cfi_rel_offset x25, 464
+    .cfi_rel_offset x26, 472
+
+    stp x27, x28, [sp, #480]
+    .cfi_rel_offset x27, 480
+    .cfi_rel_offset x28, 488
+
+    stp x29, xLR, [sp, #496]
+    .cfi_rel_offset x29, 496
+    .cfi_rel_offset x30, 504
+
+    adrp xIP0, :got:_ZN3art7Runtime9instance_E
+    ldr xIP0, [xIP0, #:got_lo12:_ZN3art7Runtime9instance_E]
+
+    ldr xIP0, [xIP0]  // xIP0 = & (art::Runtime * art::Runtime.instance_) .
+
+    // xIP0 = (ArtMethod*) Runtime.instance_.callee_save_methods[kSaveEverything]  .
+    // Loads appropriate callee-save-method.
+    ldr xIP0, [xIP0, RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET]
+
+    // Store ArtMethod* Runtime::callee_save_methods_[kSaveEverything].
+    str xIP0, [sp]
+    // Place sp in Thread::Current()->top_quick_frame.
+    mov xIP0, sp
+    str xIP0, [xSELF, #THREAD_TOP_QUICK_FRAME_OFFSET]
+.endm
+
+.macro RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    // Restore FP registers.
+    ldp d0, d1,   [sp, #8]
+    ldp d2, d3,   [sp, #24]
+    ldp d4, d5,   [sp, #40]
+    ldp d6, d7,   [sp, #56]
+    ldp d8, d9,   [sp, #72]
+    ldp d10, d11, [sp, #88]
+    ldp d12, d13, [sp, #104]
+    ldp d14, d15, [sp, #120]
+    ldp d16, d17, [sp, #136]
+    ldp d18, d19, [sp, #152]
+    ldp d20, d21, [sp, #168]
+    ldp d22, d23, [sp, #184]
+    ldp d24, d25, [sp, #200]
+    ldp d26, d27, [sp, #216]
+    ldp d28, d29, [sp, #232]
+    ldp d30, d31, [sp, #248]
+
+    // Restore core registers.
+    ldr x0,       [sp, #264]
+    .cfi_restore x0
+
+    ldp x1, x2,   [sp, #272]
+    .cfi_restore x1
+    .cfi_restore x2
+
+    ldp x3, x4,   [sp, #288]
+    .cfi_restore x3
+    .cfi_restore x4
+
+    ldp x5, x6,   [sp, #304]
+    .cfi_restore x5
+    .cfi_restore x6
+
+    ldp x7, x8,   [sp, #320]
+    .cfi_restore x7
+    .cfi_restore x8
+
+    ldp x9, x10,  [sp, #336]
+    .cfi_restore x9
+    .cfi_restore x10
+
+    ldp x11, x12, [sp, #352]
+    .cfi_restore x11
+    .cfi_restore x12
+
+    ldp x13, x14, [sp, #368]
+    .cfi_restore x13
+    .cfi_restore x14
+
+    ldp x15, x16, [sp, #384]
+    .cfi_restore x15
+    .cfi_restore x16
+
+    ldp x17, x18, [sp, #400]
+    .cfi_restore x17
+    .cfi_restore x18
+
+    ldp x19, x20, [sp, #416]
+    .cfi_restore x19
+    .cfi_restore x20
+
+    ldp x21, x22, [sp, #432]
+    .cfi_restore x21
+    .cfi_restore x22
+
+    ldp x23, x24, [sp, #448]
+    .cfi_restore x23
+    .cfi_restore x24
+
+    ldp x25, x26, [sp, #464]
+    .cfi_restore x25
+    .cfi_restore x26
+
+    ldp x27, x28, [sp, #480]
+    .cfi_restore x27
+    .cfi_restore x28
+
+    ldp x29, xLR, [sp, #496]
+    .cfi_restore x29
+    .cfi_restore x30
+
+    add sp, sp, #512
+    .cfi_adjust_cfa_offset -512
+.endm
+
 .macro RETURN_IF_RESULT_IS_ZERO
     cbnz x0, 1f                // result non-zero branch over
     ret                        // return
@@ -1090,7 +1288,7 @@
     ldr    w2, [xSELF, #THREAD_ID_OFFSET] // TODO: Can the thread ID really change during the loop?
     ldxr   w1, [x4]
     mov    x3, x1
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cbnz   w3, .Lnot_unlocked         // already thin locked
     // unlocked case - x1: original lock word that's zero except for the read barrier bits.
     orr    x2, x1, x2                 // x2 holds thread id with count of 0 with preserved read barrier bits
@@ -1106,9 +1304,9 @@
     cbnz   w2, .Lslow_lock            // lock word and self thread id's match -> recursive lock
                                       // else contention, go to slow path
     mov    x3, x1                     // copy the lock word to check count overflow.
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits.
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits.
     add    w2, w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count in lock word placing in w2 to check overflow
-    lsr    w3, w2, LOCK_WORD_READ_BARRIER_STATE_SHIFT  // if either of the upper two bits (28-29) are set, we overflowed.
+    lsr    w3, w2, #LOCK_WORD_GC_STATE_SHIFT     // if the first gc state bit is set, we overflowed.
     cbnz   w3, .Lslow_lock            // if we overflow the count go slow path
     add    w2, w1, #LOCK_WORD_THIN_LOCK_COUNT_ONE  // increment count for real
     stxr   w3, w2, [x4]
@@ -1152,17 +1350,17 @@
     cbnz   w2, .Lslow_unlock          // if either of the top two bits are set, go slow path
     ldr    w2, [xSELF, #THREAD_ID_OFFSET]
     mov    x3, x1                     // copy lock word to check thread id equality
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     eor    w3, w3, w2                 // lock_word.ThreadId() ^ self->ThreadId()
     uxth   w3, w3                     // zero top 16 bits
     cbnz   w3, .Lslow_unlock          // do lock word and self thread id's match?
     mov    x3, x1                     // copy lock word to detect transition to unlocked
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED  // zero the read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED  // zero the gc bits
     cmp    w3, #LOCK_WORD_THIN_LOCK_COUNT_ONE
     bpl    .Lrecursive_thin_unlock
     // transition to unlocked
     mov    x3, x1
-    and    w3, w3, #LOCK_WORD_READ_BARRIER_STATE_MASK  // w3: zero except for the preserved read barrier bits
+    and    w3, w3, #LOCK_WORD_GC_STATE_MASK_SHIFTED  // w3: zero except for the preserved gc bits
     dmb    ish                        // full (LoadStore|StoreStore) memory barrier
 #ifndef USE_READ_BARRIER
     str    w3, [x4]
@@ -1791,12 +1989,20 @@
     ldr    x2, [x1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_64]    // Load dex cache resolved types array
                                                               // Load the class (x2)
     ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
-                                                              // Read barrier for class load.
+
+    // Most common case: GC is not marking.
     ldr    w3, [xSELF, #THREAD_IS_GC_MARKING_OFFSET]
-    cbnz   x3, .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
+    cbnz   x3, .Lart_quick_alloc_object_region_tlab_marking
+.Lart_quick_alloc_object_region_tlab_do_allocation:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
-.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
+.Lart_quick_alloc_object_region_tlab_marking:
+    // GC is marking, check the lock word of the class for the mark bit.
+    // If the class is null, go slow path. The check is required to read the lock word.
+    cbz    w2, .Lart_quick_alloc_object_region_tlab_slow_path
+    // Class is not null, check mark bit in lock word.
+    ldr    w3, [x2, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    // If the bit is not zero, do the allocation.
+    tbnz    w3, #LOCK_WORD_MARK_BIT_SHIFT, .Lart_quick_alloc_object_region_tlab_do_allocation
                                                               // The read barrier slow path. Mark
                                                               // the class.
     stp    x0, x1, [sp, #-32]!                                // Save registers (x0, x1, lr).
@@ -1807,7 +2013,7 @@
     ldp    x0, x1, [sp, #0]                                   // Restore registers.
     ldr    xLR, [sp, #16]
     add    sp, sp, #32
-    b      .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    b      .Lart_quick_alloc_object_region_tlab_do_allocation
 .Lart_quick_alloc_object_region_tlab_slow_path:
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          // Save callee saves in case of GC.
     mov    x2, xSELF                           // Pass Thread::Current.
@@ -1821,14 +2027,11 @@
      */
     .extern artTestSuspendFromCode
 ENTRY art_quick_test_suspend
-    ldrh   w0, [xSELF, #THREAD_FLAGS_OFFSET]  // get xSELF->state_and_flags.as_struct.flags
-    cbnz   w0, .Lneed_suspend                 // check flags == 0
-    ret                                       // return if flags == 0
-.Lneed_suspend:
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME   // save callee saves for stack crawl
     mov    x0, xSELF
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          // save callee saves for stack crawl
     bl     artTestSuspendFromCode             // (Thread*)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    ret
 END art_quick_test_suspend
 
 ENTRY art_quick_implicit_suspend
@@ -2265,6 +2468,8 @@
      */
 .macro READ_BARRIER_MARK_REG name, wreg, xreg
 ENTRY \name
+    // Reference is null, no work to do at all.
+    cbz \wreg, .Lret_rb_\name
     /*
      * Allocate 46 stack slots * 8 = 368 bytes:
      * - 20 slots for core registers X0-X19
@@ -2272,6 +2477,11 @@
      * -  1 slot for return address register XLR
      * -  1 padding slot for 16-byte stack alignment
      */
+    // Use wIP0 as a temp and check the mark bit of the reference; wIP0 is not used by the compiler.
+    ldr   wIP0, [\xreg, #MIRROR_OBJECT_LOCK_WORD_OFFSET]
+    tbz   wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_path_rb_\name
+    ret
+.Lslow_path_rb_\name:
     // Save all potentially live caller-save core registers.
     stp   x0, x1,   [sp, #-368]!
     .cfi_adjust_cfa_offset 368
@@ -2360,6 +2570,7 @@
     .cfi_restore x30
     add sp, sp, #368
     .cfi_adjust_cfa_offset -368
+.Lret_rb_\name:
     ret
 END \name
 .endm
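
This fast path is also why code_generator_arm64.cc above gains DCHECK_NE(obj_.reg(), IP0): wIP0 is clobbered as a scratch register before any registers have been saved, so the reference being marked must never live there:

    // cbz  wN, .Lret_rb_\name                                     // null: nothing clobbered
    // ldr  wIP0, [xN, #MIRROR_OBJECT_LOCK_WORD_OFFSET]            // wIP0 trashed pre-save,
    // tbz  wIP0, #LOCK_WORD_MARK_BIT_SHIFT, .Lslow_path_rb_\name  // hence N must not be IP0
    // ret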
diff --git a/runtime/arch/arm64/quick_method_frame_info_arm64.h b/runtime/arch/arm64/quick_method_frame_info_arm64.h
index b3d250b..188e46e 100644
--- a/runtime/arch/arm64/quick_method_frame_info_arm64.h
+++ b/runtime/arch/arm64/quick_method_frame_info_arm64.h
@@ -29,7 +29,7 @@
 static constexpr uint32_t kArm64CalleeSaveAlwaysSpills =
-    // Note: ArtMethod::GetReturnPcOffsetInBytes() rely on the assumption that
-    // LR is always saved on the top of the frame for all targets.
-    // That is, lr = *(sp + framesize - pointsize).
+    // Note: ArtMethod::GetReturnPcOffsetInBytes() relies on the assumption that
+    // LR is always saved on the top of the frame for all targets.
+    // That is, lr = *(sp + framesize - pointer_size).
     (1 << art::arm64::LR);
 // Callee saved registers
 static constexpr uint32_t kArm64CalleeSaveRefSpills =
@@ -44,6 +44,14 @@
     (1 << art::arm64::X7);
 static constexpr uint32_t kArm64CalleeSaveAllSpills =
     (1 << art::arm64::X19);
+static constexpr uint32_t kArm64CalleeSaveEverythingSpills =
+    (1 << art::arm64::X0) | (1 << art::arm64::X1) | (1 << art::arm64::X2) |
+    (1 << art::arm64::X3) | (1 << art::arm64::X4) | (1 << art::arm64::X5) |
+    (1 << art::arm64::X6) | (1 << art::arm64::X7) | (1 << art::arm64::X8) |
+    (1 << art::arm64::X9) | (1 << art::arm64::X10) | (1 << art::arm64::X11) |
+    (1 << art::arm64::X12) | (1 << art::arm64::X13) | (1 << art::arm64::X14) |
+    (1 << art::arm64::X15) | (1 << art::arm64::X16) | (1 << art::arm64::X17) |
+    (1 << art::arm64::X18) | (1 << art::arm64::X19);
 
 static constexpr uint32_t kArm64CalleeSaveFpAlwaysSpills = 0;
 static constexpr uint32_t kArm64CalleeSaveFpRefSpills = 0;
@@ -55,17 +63,31 @@
     (1 << art::arm64::D8)  | (1 << art::arm64::D9)  | (1 << art::arm64::D10) |
     (1 << art::arm64::D11)  | (1 << art::arm64::D12)  | (1 << art::arm64::D13) |
     (1 << art::arm64::D14)  | (1 << art::arm64::D15);
+static constexpr uint32_t kArm64CalleeSaveFpEverythingSpills =
+    (1 << art::arm64::D0) | (1 << art::arm64::D1) | (1 << art::arm64::D2) |
+    (1 << art::arm64::D3) | (1 << art::arm64::D4) | (1 << art::arm64::D5) |
+    (1 << art::arm64::D6) | (1 << art::arm64::D7) | (1 << art::arm64::D8) |
+    (1 << art::arm64::D9) | (1 << art::arm64::D10) | (1 << art::arm64::D11) |
+    (1 << art::arm64::D12) | (1 << art::arm64::D13) | (1 << art::arm64::D14) |
+    (1 << art::arm64::D15) | (1 << art::arm64::D16) | (1 << art::arm64::D17) |
+    (1 << art::arm64::D18) | (1 << art::arm64::D19) | (1 << art::arm64::D20) |
+    (1 << art::arm64::D21) | (1 << art::arm64::D22) | (1 << art::arm64::D23) |
+    (1 << art::arm64::D24) | (1 << art::arm64::D25) | (1 << art::arm64::D26) |
+    (1 << art::arm64::D27) | (1 << art::arm64::D28) | (1 << art::arm64::D29) |
+    (1 << art::arm64::D30) | (1 << art::arm64::D31);
 
 constexpr uint32_t Arm64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kArm64CalleeSaveAlwaysSpills | kArm64CalleeSaveRefSpills |
       (type == Runtime::kRefsAndArgs ? kArm64CalleeSaveArgSpills : 0) |
-      (type == Runtime::kSaveAll ? kArm64CalleeSaveAllSpills : 0);
+      (type == Runtime::kSaveAll ? kArm64CalleeSaveAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kArm64CalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t Arm64CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
   return kArm64CalleeSaveFpAlwaysSpills | kArm64CalleeSaveFpRefSpills |
       (type == Runtime::kRefsAndArgs ? kArm64CalleeSaveFpArgSpills: 0) |
-      (type == Runtime::kSaveAll ? kArm64CalleeSaveFpAllSpills : 0);
+      (type == Runtime::kSaveAll ? kArm64CalleeSaveFpAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kArm64CalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t Arm64CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
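
The same sizing rule checks out against the 512-byte arm64 frame:

    // cores: x0-x29 plus lr (matching the stores in the SETUP macro) = 31
    // fprs:  d0-d31                                                  = 32
    //   (31 + 32 + 1) * 8 = 512, already 16-byte aligned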
diff --git a/runtime/arch/mips/asm_support_mips.h b/runtime/arch/mips/asm_support_mips.h
index 453056d..2ef45f5 100644
--- a/runtime/arch/mips/asm_support_mips.h
+++ b/runtime/arch/mips/asm_support_mips.h
@@ -22,5 +22,6 @@
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 96
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 48
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 80
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE 256
 
 #endif  // ART_RUNTIME_ARCH_MIPS_ASM_SUPPORT_MIPS_H_
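
The 256-byte figure matches the layout in the new SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME macro below:

    // 28 GPRs ($at, $v0-$v1, $a0-$a3, $t0-$t9, $s0-$s7, $gp, $fp, $ra) at 144..252
    // 32 FPRs ($f0-$f31) at 16..143
    // 3 words of padding plus the Method* slot at 0..15
    //   28*4 + 32*4 + 4*4 = 112 + 128 + 16 = 256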
diff --git a/runtime/arch/mips/quick_entrypoints_mips.S b/runtime/arch/mips/quick_entrypoints_mips.S
index c1b8044..b926bdf 100644
--- a/runtime/arch/mips/quick_entrypoints_mips.S
+++ b/runtime/arch/mips/quick_entrypoints_mips.S
@@ -277,6 +277,197 @@
 .endm
 
     /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything).
+     * Callee-save: $at, $v0-$v1, $a0-$a3, $t0-$t7, $s0-$s7, $t8-$t9, $gp, $fp, $ra, $f0-$f31;
+     *              28 (GPR) + 32 (FPR) + 3 words for padding and 1 word for Method*
+     * Clobbers $t0 and $t1.
+     * Allocates ARG_SLOT_SIZE bytes at the bottom of the stack for arg slots.
+     * Reserves FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE + ARG_SLOT_SIZE bytes on the stack.
+     * This macro sets up $gp; entrypoints using it should start with ENTRY_NO_GP.
+     */
+.macro SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    addiu  $sp, $sp, -256
+    .cfi_adjust_cfa_offset 256
+
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 256)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(MIPS) size not as expected."
+#endif
+
+    sw     $ra, 252($sp)
+    .cfi_rel_offset 31, 252
+    sw     $fp, 248($sp)
+    .cfi_rel_offset 30, 248
+    sw     $gp, 244($sp)
+    .cfi_rel_offset 28, 244
+    sw     $t9, 240($sp)
+    .cfi_rel_offset 25, 240
+    sw     $t8, 236($sp)
+    .cfi_rel_offset 24, 236
+    sw     $s7, 232($sp)
+    .cfi_rel_offset 23, 232
+    sw     $s6, 228($sp)
+    .cfi_rel_offset 22, 228
+    sw     $s5, 224($sp)
+    .cfi_rel_offset 21, 224
+    sw     $s4, 220($sp)
+    .cfi_rel_offset 20, 220
+    sw     $s3, 216($sp)
+    .cfi_rel_offset 19, 216
+    sw     $s2, 212($sp)
+    .cfi_rel_offset 18, 212
+    sw     $s1, 208($sp)
+    .cfi_rel_offset 17, 208
+    sw     $s0, 204($sp)
+    .cfi_rel_offset 16, 204
+    sw     $t7, 200($sp)
+    .cfi_rel_offset 15, 200
+    sw     $t6, 196($sp)
+    .cfi_rel_offset 14, 196
+    sw     $t5, 192($sp)
+    .cfi_rel_offset 13, 192
+    sw     $t4, 188($sp)
+    .cfi_rel_offset 12, 188
+    sw     $t3, 184($sp)
+    .cfi_rel_offset 11, 184
+    sw     $t2, 180($sp)
+    .cfi_rel_offset 10, 180
+    sw     $t1, 176($sp)
+    .cfi_rel_offset 9, 176
+    sw     $t0, 172($sp)
+    .cfi_rel_offset 8, 172
+    sw     $a3, 168($sp)
+    .cfi_rel_offset 7, 168
+    sw     $a2, 164($sp)
+    .cfi_rel_offset 6, 164
+    sw     $a1, 160($sp)
+    .cfi_rel_offset 5, 160
+    sw     $a0, 156($sp)
+    .cfi_rel_offset 4, 156
+    sw     $v1, 152($sp)
+    .cfi_rel_offset 3, 152
+    sw     $v0, 148($sp)
+    .cfi_rel_offset 2, 148
+
+    // Set up $gp, clobbering $ra and using the branch delay slot for a useful instruction.
+    bal 1f
+    sw     $at, 144($sp)
+    .cfi_rel_offset 1, 144
+1:
+    .cpload $ra
+
+    SDu $f30, $f31, 136, $sp, $t1
+    SDu $f28, $f29, 128, $sp, $t1
+    SDu $f26, $f27, 120, $sp, $t1
+    SDu $f24, $f25, 112, $sp, $t1
+    SDu $f22, $f23, 104, $sp, $t1
+    SDu $f20, $f21, 96,  $sp, $t1
+    SDu $f18, $f19, 88,  $sp, $t1
+    SDu $f16, $f17, 80,  $sp, $t1
+    SDu $f14, $f15, 72,  $sp, $t1
+    SDu $f12, $f13, 64,  $sp, $t1
+    SDu $f10, $f11, 56,  $sp, $t1
+    SDu $f8, $f9, 48,  $sp, $t1
+    SDu $f6, $f7, 40,  $sp, $t1
+    SDu $f4, $f5, 32,  $sp, $t1
+    SDu $f2, $f3, 24,  $sp, $t1
+    SDu $f0, $f1, 16,  $sp, $t1
+
+    # 3 words padding and 1 word for holding Method*
+
+    lw $t0, %got(_ZN3art7Runtime9instance_E)($gp)
+    lw $t0, 0($t0)
+    lw $t0, RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET($t0)
+    sw $t0, 0($sp)                                # Place Method* at bottom of stack.
+    sw $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)  # Place sp in Thread::Current()->top_quick_frame.
+    addiu  $sp, $sp, -ARG_SLOT_SIZE               # reserve argument slots on the stack
+    .cfi_adjust_cfa_offset ARG_SLOT_SIZE
+.endm
+
+.macro RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    addiu  $sp, $sp, ARG_SLOT_SIZE                # remove argument slots on the stack
+    .cfi_adjust_cfa_offset -ARG_SLOT_SIZE
+
+    LDu $f30, $f31, 136, $sp, $t1
+    LDu $f28, $f29, 128, $sp, $t1
+    LDu $f26, $f27, 120, $sp, $t1
+    LDu $f24, $f25, 112, $sp, $t1
+    LDu $f22, $f23, 104, $sp, $t1
+    LDu $f20, $f21, 96,  $sp, $t1
+    LDu $f18, $f19, 88,  $sp, $t1
+    LDu $f16, $f17, 80,  $sp, $t1
+    LDu $f14, $f15, 72,  $sp, $t1
+    LDu $f12, $f13, 64,  $sp, $t1
+    LDu $f10, $f11, 56,  $sp, $t1
+    LDu $f8, $f9, 48,  $sp, $t1
+    LDu $f6, $f7, 40,  $sp, $t1
+    LDu $f4, $f5, 32,  $sp, $t1
+    LDu $f2, $f3, 24,  $sp, $t1
+    LDu $f0, $f1, 16,  $sp, $t1
+
+    lw     $ra, 252($sp)
+    .cfi_restore 31
+    lw     $fp, 248($sp)
+    .cfi_restore 30
+    lw     $gp, 244($sp)
+    .cfi_restore 28
+    lw     $t9, 240($sp)
+    .cfi_restore 25
+    lw     $t8, 236($sp)
+    .cfi_restore 24
+    lw     $s7, 232($sp)
+    .cfi_restore 23
+    lw     $s6, 228($sp)
+    .cfi_restore 22
+    lw     $s5, 224($sp)
+    .cfi_restore 21
+    lw     $s4, 220($sp)
+    .cfi_restore 20
+    lw     $s3, 216($sp)
+    .cfi_restore 19
+    lw     $s2, 212($sp)
+    .cfi_restore 18
+    lw     $s1, 208($sp)
+    .cfi_restore 17
+    lw     $s0, 204($sp)
+    .cfi_restore 16
+    lw     $t7, 200($sp)
+    .cfi_restore 15
+    lw     $t6, 196($sp)
+    .cfi_restore 14
+    lw     $t5, 192($sp)
+    .cfi_restore 13
+    lw     $t4, 188($sp)
+    .cfi_restore 12
+    lw     $t3, 184($sp)
+    .cfi_restore 11
+    lw     $t2, 180($sp)
+    .cfi_restore 10
+    lw     $t1, 176($sp)
+    .cfi_restore 9
+    lw     $t0, 172($sp)
+    .cfi_restore 8
+    lw     $a3, 168($sp)
+    .cfi_restore 7
+    lw     $a2, 164($sp)
+    .cfi_restore 6
+    lw     $a1, 160($sp)
+    .cfi_restore 5
+    lw     $a0, 156($sp)
+    .cfi_restore 4
+    lw     $v1, 152($sp)
+    .cfi_restore 3
+    lw     $v0, 148($sp)
+    .cfi_restore 2
+    lw     $at, 144($sp)
+    .cfi_restore 1
+
+    addiu  $sp, $sp, 256            # pop frame
+    .cfi_adjust_cfa_offset -256
+.endm
+
+    /*
     * Macro that sets up a call through to artDeliverPendingExceptionFromCode, where the pending
     * exception is Thread::Current()->exception_.
      */
@@ -1652,18 +1843,20 @@
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
      */
     .extern artTestSuspendFromCode
-ENTRY art_quick_test_suspend
-    lh     $a0, THREAD_FLAGS_OFFSET(rSELF)
-    bnez   $a0, 1f
+ENTRY_NO_GP art_quick_test_suspend
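+    // Note: the flags are loaded into rSUSPEND rather than $a0 so that $a0 still holds its
+    // caller value when the save-everything frame below captures it.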
+    lh     rSUSPEND, THREAD_FLAGS_OFFSET(rSELF)
+    bnez   rSUSPEND, 1f
     addiu  rSUSPEND, $zero, SUSPEND_CHECK_INTERVAL   # reset rSUSPEND to SUSPEND_CHECK_INTERVAL
     jalr   $zero, $ra
     nop
 1:
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME          # save callee saves for stack crawl
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME          # save everything for stack crawl
     la     $t9, artTestSuspendFromCode
-    jalr   $t9                                 # (Thread*)
+    jalr   $t9                                       # (Thread*)
     move   $a0, rSELF
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    jalr   $zero, $ra
+    nop
 END art_quick_test_suspend
 
     /*
diff --git a/runtime/arch/mips/quick_method_frame_info_mips.h b/runtime/arch/mips/quick_method_frame_info_mips.h
index 7b0623b..170513d 100644
--- a/runtime/arch/mips/quick_method_frame_info_mips.h
+++ b/runtime/arch/mips/quick_method_frame_info_mips.h
@@ -34,6 +34,12 @@
     (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3);
 static constexpr uint32_t kMipsCalleeSaveAllSpills =
     (1 << art::mips::S0) | (1 << art::mips::S1);
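+// Note: $s2-$s7, $gp, $fp and $ra are already covered by kMipsCalleeSaveRefSpills and
+// kMipsCalleeSaveAlwaysSpills, which MipsCalleeSaveCoreSpills() always ORs in.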
+static constexpr uint32_t kMipsCalleeSaveEverythingSpills =
+    (1 << art::mips::AT) | (1 << art::mips::V0) | (1 << art::mips::V1) |
+    (1 << art::mips::A0) | (1 << art::mips::A1) | (1 << art::mips::A2) | (1 << art::mips::A3) |
+    (1 << art::mips::T0) | (1 << art::mips::T1) | (1 << art::mips::T2) | (1 << art::mips::T3) |
+    (1 << art::mips::T4) | (1 << art::mips::T5) | (1 << art::mips::T6) | (1 << art::mips::T7) |
+    (1 << art::mips::S0) | (1 << art::mips::S1) | (1 << art::mips::T8) | (1 << art::mips::T9);
 
 static constexpr uint32_t kMipsCalleeSaveFpAlwaysSpills = 0;
 static constexpr uint32_t kMipsCalleeSaveFpRefSpills = 0;
@@ -43,17 +49,28 @@
     (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
     (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
     (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
+static constexpr uint32_t kMipsCalleeSaveFpEverythingSpills =
+    (1 << art::mips::F0) | (1 << art::mips::F1) | (1 << art::mips::F2) | (1 << art::mips::F3) |
+    (1 << art::mips::F4) | (1 << art::mips::F5) | (1 << art::mips::F6) | (1 << art::mips::F7) |
+    (1 << art::mips::F8) | (1 << art::mips::F9) | (1 << art::mips::F10) | (1 << art::mips::F11) |
+    (1 << art::mips::F12) | (1 << art::mips::F13) | (1 << art::mips::F14) | (1 << art::mips::F15) |
+    (1 << art::mips::F16) | (1 << art::mips::F17) | (1 << art::mips::F18) | (1 << art::mips::F19) |
+    (1 << art::mips::F20) | (1 << art::mips::F21) | (1 << art::mips::F22) | (1 << art::mips::F23) |
+    (1 << art::mips::F24) | (1 << art::mips::F25) | (1 << art::mips::F26) | (1 << art::mips::F27) |
+    (1 << art::mips::F28) | (1 << art::mips::F29) | (1 << art::mips::F30) | (1 << art::mips::F31);
 
 constexpr uint32_t MipsCalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
   return kMipsCalleeSaveAlwaysSpills | kMipsCalleeSaveRefSpills |
       (type == Runtime::kRefsAndArgs ? kMipsCalleeSaveArgSpills : 0) |
-      (type == Runtime::kSaveAll ? kMipsCalleeSaveAllSpills : 0);
+      (type == Runtime::kSaveAll ? kMipsCalleeSaveAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kMipsCalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t MipsCalleeSaveFPSpills(Runtime::CalleeSaveType type) {
   return kMipsCalleeSaveFpAlwaysSpills | kMipsCalleeSaveFpRefSpills |
       (type == Runtime::kRefsAndArgs ? kMipsCalleeSaveFpArgSpills : 0) |
-      (type == Runtime::kSaveAll ? kMipsCalleeSaveAllFPSpills : 0);
+      (type == Runtime::kSaveAll ? kMipsCalleeSaveAllFPSpills : 0) |
+      (type == Runtime::kSaveEverything ? kMipsCalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t MipsCalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/arch/mips64/asm_support_mips64.h b/runtime/arch/mips64/asm_support_mips64.h
index 995fcf3..2c16c25 100644
--- a/runtime/arch/mips64/asm_support_mips64.h
+++ b/runtime/arch/mips64/asm_support_mips64.h
@@ -25,5 +25,7 @@
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 80
 // $f12-$f19, $a1-$a7, $s2-$s7 + $gp + $s8 + $ra, 16 total + 1x8 bytes padding + method*
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE 208
+// $f0-$f31, $at, $v0-$v1, $a0-$a7, $t0-$t3, $s0-$s7, $t8-$t9, $gp, $s8, $ra + padding + method*
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE 496
 
 #endif  // ART_RUNTIME_ARCH_MIPS64_ASM_SUPPORT_MIPS64_H_
diff --git a/runtime/arch/mips64/quick_entrypoints_mips64.S b/runtime/arch/mips64/quick_entrypoints_mips64.S
index ae69620..0a37909 100644
--- a/runtime/arch/mips64/quick_entrypoints_mips64.S
+++ b/runtime/arch/mips64/quick_entrypoints_mips64.S
@@ -314,6 +314,227 @@
 .endm
 
     /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything).
+     * callee-save: $at + $v0-$v1 + $a0-$a7 + $t0-$t3 + $s0-$s7 + $t8-$t9 + $gp + $s8 + $ra,
+     *              $f0-$f31; 28(GPR) + 32(FPR) + 1x8 bytes padding + method*
+     * This macro sets up $gp; entrypoints using it should start with ENTRY_NO_GP.
+     */
+.macro SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    daddiu $sp, $sp, -496
+    .cfi_adjust_cfa_offset 496
+
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 496)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(MIPS64) size not as expected."
+#endif
+
+    // Save core registers.
+    sd     $ra, 488($sp)
+    .cfi_rel_offset 31, 488
+    sd     $s8, 480($sp)
+    .cfi_rel_offset 30, 480
+    sd     $gp, 472($sp)
+    .cfi_rel_offset 28, 472
+    sd     $t9, 464($sp)
+    .cfi_rel_offset 25, 464
+    sd     $t8, 456($sp)
+    .cfi_rel_offset 24, 456
+    sd     $s7, 448($sp)
+    .cfi_rel_offset 23, 448
+    sd     $s6, 440($sp)
+    .cfi_rel_offset 22, 440
+    sd     $s5, 432($sp)
+    .cfi_rel_offset 21, 432
+    sd     $s4, 424($sp)
+    .cfi_rel_offset 20, 424
+    sd     $s3,  416($sp)
+    .cfi_rel_offset 19, 416
+    sd     $s2,  408($sp)
+    .cfi_rel_offset 18, 408
+    sd     $s1,  400($sp)
+    .cfi_rel_offset 17, 400
+    sd     $s0,  392($sp)
+    .cfi_rel_offset 16, 392
+    sd     $t3,  384($sp)
+    .cfi_rel_offset 15, 384
+    sd     $t2,  376($sp)
+    .cfi_rel_offset 14, 376
+    sd     $t1,  368($sp)
+    .cfi_rel_offset 13, 368
+    sd     $t0,  360($sp)
+    .cfi_rel_offset 12, 360
+    sd     $a7, 352($sp)
+    .cfi_rel_offset 11, 352
+    sd     $a6, 344($sp)
+    .cfi_rel_offset 10, 344
+    sd     $a5, 336($sp)
+    .cfi_rel_offset 9, 336
+    sd     $a4, 328($sp)
+    .cfi_rel_offset 8, 328
+    sd     $a3,  320($sp)
+    .cfi_rel_offset 7, 320
+    sd     $a2,  312($sp)
+    .cfi_rel_offset 6, 312
+    sd     $a1,  304($sp)
+    .cfi_rel_offset 5, 304
+    sd     $a0,  296($sp)
+    .cfi_rel_offset 4, 296
+    sd     $v1,  288($sp)
+    .cfi_rel_offset 3, 288
+    sd     $v0,  280($sp)
+    .cfi_rel_offset 2, 280
+
+    // Set up $gp, clobbering $ra and using the branch delay slot for a useful instruction.
+    bal 1f
+    sd     $at,  272($sp)
+    .cfi_rel_offset 1, 272
+1:
+    // TODO: Can we avoid the unnecessary move $t8<-$gp?
+    .cpsetup $ra, $t8, 1b
+
+    // Save FP registers.
+    s.d    $f31, 264($sp)
+    s.d    $f30, 256($sp)
+    s.d    $f29, 248($sp)
+    s.d    $f28, 240($sp)
+    s.d    $f27, 232($sp)
+    s.d    $f26, 224($sp)
+    s.d    $f25, 216($sp)
+    s.d    $f24, 208($sp)
+    s.d    $f23, 200($sp)
+    s.d    $f22, 192($sp)
+    s.d    $f21, 184($sp)
+    s.d    $f20, 176($sp)
+    s.d    $f19, 168($sp)
+    s.d    $f18, 160($sp)
+    s.d    $f17, 152($sp)
+    s.d    $f16, 144($sp)
+    s.d    $f15, 136($sp)
+    s.d    $f14, 128($sp)
+    s.d    $f13, 120($sp)
+    s.d    $f12, 112($sp)
+    s.d    $f11, 104($sp)
+    s.d    $f10, 96($sp)
+    s.d    $f9, 88($sp)
+    s.d    $f8, 80($sp)
+    s.d    $f7, 72($sp)
+    s.d    $f6, 64($sp)
+    s.d    $f5, 56($sp)
+    s.d    $f4, 48($sp)
+    s.d    $f3, 40($sp)
+    s.d    $f2, 32($sp)
+    s.d    $f1, 24($sp)
+    s.d    $f0, 16($sp)
+
+    # load the appropriate callee-save method
+    ld      $t1, %got(_ZN3art7Runtime9instance_E)($gp)
+    ld      $t1, 0($t1)
+    ld      $t1, RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET($t1)
+    sd      $t1, 0($sp)                                # Place ArtMethod* at bottom of stack.
+    # Place sp in Thread::Current()->top_quick_frame.
+    sd      $sp, THREAD_TOP_QUICK_FRAME_OFFSET(rSELF)
+.endm
+
+.macro RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    // Restore FP registers.
+    l.d    $f31, 264($sp)
+    l.d    $f30, 256($sp)
+    l.d    $f29, 248($sp)
+    l.d    $f28, 240($sp)
+    l.d    $f27, 232($sp)
+    l.d    $f26, 224($sp)
+    l.d    $f25, 216($sp)
+    l.d    $f24, 208($sp)
+    l.d    $f23, 200($sp)
+    l.d    $f22, 192($sp)
+    l.d    $f21, 184($sp)
+    l.d    $f20, 176($sp)
+    l.d    $f19, 168($sp)
+    l.d    $f18, 160($sp)
+    l.d    $f17, 152($sp)
+    l.d    $f16, 144($sp)
+    l.d    $f15, 136($sp)
+    l.d    $f14, 128($sp)
+    l.d    $f13, 120($sp)
+    l.d    $f12, 112($sp)
+    l.d    $f11, 104($sp)
+    l.d    $f10, 96($sp)
+    l.d    $f9, 88($sp)
+    l.d    $f8, 80($sp)
+    l.d    $f7, 72($sp)
+    l.d    $f6, 64($sp)
+    l.d    $f5, 56($sp)
+    l.d    $f4, 48($sp)
+    l.d    $f3, 40($sp)
+    l.d    $f2, 32($sp)
+    l.d    $f1, 24($sp)
+    l.d    $f0, 16($sp)
+
+    // Restore core registers.
+    ld     $ra, 488($sp)
+    .cfi_restore 31
+    ld     $s8, 480($sp)
+    .cfi_restore 30
+    ld     $gp, 472($sp)
+    .cfi_restore 28
+    ld     $t9, 464($sp)
+    .cfi_restore 25
+    ld     $t8, 456($sp)
+    .cfi_restore 24
+    ld     $s7, 448($sp)
+    .cfi_restore 23
+    ld     $s6, 440($sp)
+    .cfi_restore 22
+    ld     $s5, 432($sp)
+    .cfi_restore 21
+    ld     $s4, 424($sp)
+    .cfi_restore 20
+    ld     $s3,  416($sp)
+    .cfi_restore 19
+    ld     $s2,  408($sp)
+    .cfi_restore 18
+    ld     $s1,  400($sp)
+    .cfi_restore 17
+    ld     $s0,  392($sp)
+    .cfi_restore 16
+    ld     $t3,  384($sp)
+    .cfi_restore 15
+    ld     $t2,  376($sp)
+    .cfi_restore 14
+    ld     $t1,  368($sp)
+    .cfi_restore 13
+    ld     $t0,  360($sp)
+    .cfi_restore 12
+    ld     $a7, 352($sp)
+    .cfi_restore 11
+    ld     $a6, 344($sp)
+    .cfi_restore 10
+    ld     $a5, 336($sp)
+    .cfi_restore 9
+    ld     $a4, 328($sp)
+    .cfi_restore 8
+    ld     $a3,  320($sp)
+    .cfi_restore 7
+    ld     $a2,  312($sp)
+    .cfi_restore 6
+    ld     $a1,  304($sp)
+    .cfi_restore 5
+    ld     $a0,  296($sp)
+    .cfi_restore 4
+    ld     $v1,  288($sp)
+    .cfi_restore 3
+    ld     $v0,  280($sp)
+    .cfi_restore 2
+    ld     $at,  272($sp)
+    .cfi_restore 1
+
+    .cpreturn
+    daddiu $sp, $sp, 496
+    .cfi_adjust_cfa_offset -496
+.endm
+
+    /*
     * Macro that sets up a call through to artDeliverPendingExceptionFromCode, where the
     * pending exception is Thread::Current()->exception_.
@@ -1673,17 +1894,19 @@
      * Called by managed code when the value in rSUSPEND has been decremented to 0.
      */
     .extern artTestSuspendFromCode
-ENTRY art_quick_test_suspend
-    lh     $a0, THREAD_FLAGS_OFFSET(rSELF)
-    bne    $a0, $zero, 1f
+ENTRY_NO_GP art_quick_test_suspend
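+    // Note: the flags are loaded into rSUSPEND rather than $a0 so that $a0 still holds its
+    // caller value when the save-everything frame below captures it.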
+    lh     rSUSPEND, THREAD_FLAGS_OFFSET(rSELF)
+    bne    rSUSPEND, $zero, 1f
     daddiu rSUSPEND, $zero, SUSPEND_CHECK_INTERVAL   # reset rSUSPEND to SUSPEND_CHECK_INTERVAL
     jalr   $zero, $ra
-    .cpreturn                                 # Restore gp from t8 in branch delay slot.
+    nop
 1:
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME         # save callee saves for stack crawl
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME   # save everything for stack crawl
     jal    artTestSuspendFromCode             # (Thread*)
     move   $a0, rSELF
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME
+    jalr   $zero, $ra
+    nop
 END art_quick_test_suspend
 
     /*
diff --git a/runtime/arch/mips64/quick_method_frame_info_mips64.h b/runtime/arch/mips64/quick_method_frame_info_mips64.h
index b7dc57f..d52945f 100644
--- a/runtime/arch/mips64/quick_method_frame_info_mips64.h
+++ b/runtime/arch/mips64/quick_method_frame_info_mips64.h
@@ -25,6 +25,8 @@
 namespace art {
 namespace mips64 {
 
+static constexpr uint32_t kMips64CalleeSaveAlwaysSpills =
+    (1 << art::mips64::RA);
 static constexpr uint32_t kMips64CalleeSaveRefSpills =
     (1 << art::mips64::S2) | (1 << art::mips64::S3) | (1 << art::mips64::S4) |
     (1 << art::mips64::S5) | (1 << art::mips64::S6) | (1 << art::mips64::S7) |
@@ -35,6 +37,14 @@
     (1 << art::mips64::A7);
 static constexpr uint32_t kMips64CalleeSaveAllSpills =
     (1 << art::mips64::S0) | (1 << art::mips64::S1);
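+// Note: $s2-$s7, $gp, $s8 and $ra are already covered by kMips64CalleeSaveRefSpills and
+// kMips64CalleeSaveAlwaysSpills, which Mips64CalleeSaveCoreSpills() always ORs in.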
+static constexpr uint32_t kMips64CalleeSaveEverythingSpills =
+    (1 << art::mips64::AT) | (1 << art::mips64::V0) | (1 << art::mips64::V1) |
+    (1 << art::mips64::A0) | (1 << art::mips64::A1) | (1 << art::mips64::A2) |
+    (1 << art::mips64::A3) | (1 << art::mips64::A4) | (1 << art::mips64::A5) |
+    (1 << art::mips64::A6) | (1 << art::mips64::A7) | (1 << art::mips64::T0) |
+    (1 << art::mips64::T1) | (1 << art::mips64::T2) | (1 << art::mips64::T3) |
+    (1 << art::mips64::S0) | (1 << art::mips64::S1) | (1 << art::mips64::T8) |
+    (1 << art::mips64::T9);
 
 static constexpr uint32_t kMips64CalleeSaveFpRefSpills = 0;
 static constexpr uint32_t kMips64CalleeSaveFpArgSpills =
@@ -46,17 +56,31 @@
     (1 << art::mips64::F24) | (1 << art::mips64::F25) | (1 << art::mips64::F26) |
     (1 << art::mips64::F27) | (1 << art::mips64::F28) | (1 << art::mips64::F29) |
     (1 << art::mips64::F30) | (1 << art::mips64::F31);
+static constexpr uint32_t kMips64CalleeSaveFpEverythingSpills =
+    (1 << art::mips64::F0) | (1 << art::mips64::F1) | (1 << art::mips64::F2) |
+    (1 << art::mips64::F3) | (1 << art::mips64::F4) | (1 << art::mips64::F5) |
+    (1 << art::mips64::F6) | (1 << art::mips64::F7) | (1 << art::mips64::F8) |
+    (1 << art::mips64::F9) | (1 << art::mips64::F10) | (1 << art::mips64::F11) |
+    (1 << art::mips64::F12) | (1 << art::mips64::F13) | (1 << art::mips64::F14) |
+    (1 << art::mips64::F15) | (1 << art::mips64::F16) | (1 << art::mips64::F17) |
+    (1 << art::mips64::F18) | (1 << art::mips64::F19) | (1 << art::mips64::F20) |
+    (1 << art::mips64::F21) | (1 << art::mips64::F22) | (1 << art::mips64::F23) |
+    (1 << art::mips64::F24) | (1 << art::mips64::F25) | (1 << art::mips64::F26) |
+    (1 << art::mips64::F27) | (1 << art::mips64::F28) | (1 << art::mips64::F29) |
+    (1 << art::mips64::F30) | (1 << art::mips64::F31);
 
 constexpr uint32_t Mips64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
-  return kMips64CalleeSaveRefSpills |
+  return kMips64CalleeSaveAlwaysSpills | kMips64CalleeSaveRefSpills |
       (type == Runtime::kRefsAndArgs ? kMips64CalleeSaveArgSpills : 0) |
-      (type == Runtime::kSaveAll ? kMips64CalleeSaveAllSpills : 0) | (1 << art::mips64::RA);
+      (type == Runtime::kSaveAll ? kMips64CalleeSaveAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kMips64CalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t Mips64CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
   return kMips64CalleeSaveFpRefSpills |
       (type == Runtime::kRefsAndArgs ? kMips64CalleeSaveFpArgSpills: 0) |
-      (type == Runtime::kSaveAll ? kMips64CalleeSaveFpAllSpills : 0);
+      (type == Runtime::kSaveAll ? kMips64CalleeSaveFpAllSpills : 0) |
+      (type == Runtime::kSaveEverything ? kMips64CalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t Mips64CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/arch/x86/asm_support_x86.h b/runtime/arch/x86/asm_support_x86.h
index b0a6017..ba5fd99 100644
--- a/runtime/arch/x86/asm_support_x86.h
+++ b/runtime/arch/x86/asm_support_x86.h
@@ -21,8 +21,7 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE 32
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE 32
-
-// 32 bytes for GPRs and 32 bytes for FPRs.
 #define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE (32 + 32)
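+// 48 = 7 GPRs * 4 + Method* + return address + 12 bytes of padding; 64 = 8 XMM registers * 8.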
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE (48 + 64)
 
 #endif  // ART_RUNTIME_ARCH_X86_ASM_SUPPORT_X86_H_
diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S
index 77e04e7..68ba0cf 100644
--- a/runtime/arch/x86/quick_entrypoints_x86.S
+++ b/runtime/arch/x86/quick_entrypoints_x86.S
@@ -222,6 +222,74 @@
 END_MACRO
 
     /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     */
+MACRO2(SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME, got_reg, temp_reg)
+    // Save core registers.
+    PUSH edi
+    PUSH esi
+    PUSH ebp
+    PUSH ebx
+    PUSH edx
+    PUSH ecx
+    PUSH eax
+    // Create space for the FPRs and stack alignment padding.
+    subl MACRO_LITERAL(12 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(12 + 8 * 8)
+    // Save FPRs.
+    movsd %xmm0, 12(%esp)
+    movsd %xmm1, 20(%esp)
+    movsd %xmm2, 28(%esp)
+    movsd %xmm3, 36(%esp)
+    movsd %xmm4, 44(%esp)
+    movsd %xmm5, 52(%esp)
+    movsd %xmm6, 60(%esp)
+    movsd %xmm7, 68(%esp)
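+    // Note: once the Method* is pushed below, these FPR slots end up at 16(%esp)..72(%esp),
+    // which is where RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME reloads them from.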
+
+    SETUP_GOT_NOSAVE RAW_VAR(got_reg)
+    // Load Runtime::instance_ from GOT.
+    movl SYMBOL(_ZN3art7Runtime9instance_E)@GOT(REG_VAR(got_reg)), REG_VAR(temp_reg)
+    movl (REG_VAR(temp_reg)), REG_VAR(temp_reg)
+    // Push save everything callee-save method.
+    pushl RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET(REG_VAR(temp_reg))
+    CFI_ADJUST_CFA_OFFSET(4)
+    // Store esp as the top quick frame.
+    movl %esp, %fs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Ugly compile-time check, but we only have the preprocessor.
+    // Last +4: implicit return address pushed on stack when caller made call.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 7*4 + 8*8 + 12 + 4 + 4)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(X86) size not as expected."
+#endif
+END_MACRO
+
+MACRO0(RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME)
+    // Restore FPRs. The Method* and padding are still on the stack.
+    movsd 16(%esp), %xmm0
+    movsd 24(%esp), %xmm1
+    movsd 32(%esp), %xmm2
+    movsd 40(%esp), %xmm3
+    movsd 48(%esp), %xmm4
+    movsd 56(%esp), %xmm5
+    movsd 64(%esp), %xmm6
+    movsd 72(%esp), %xmm7
+
+    // Remove save everything callee save method, stack alignment padding and FPRs.
+    addl MACRO_LITERAL(16 + 8 * 8), %esp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 8 * 8))
+
+    // Restore core registers.
+    POP eax
+    POP ecx
+    POP edx
+    POP ebx
+    POP ebp
+    POP esi
+    POP edi
+END_MACRO
+
+    /*
      * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
      * exception is Thread::Current()->exception_.
      */
@@ -661,22 +729,6 @@
     ret
 END_FUNCTION art_quick_invoke_static_stub
 
-MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
-    DEFINE_FUNCTION VAR(c_name)
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME ebx, ebx  // save ref containing registers for GC
-    // Outgoing argument set up
-    subl MACRO_LITERAL(12), %esp                // push padding
-    CFI_ADJUST_CFA_OFFSET(12)
-    pushl %fs:THREAD_SELF_OFFSET                // pass Thread::Current()
-    CFI_ADJUST_CFA_OFFSET(4)
-    call CALLVAR(cxx_name)                      // cxx_name(Thread*)
-    addl MACRO_LITERAL(16), %esp                // pop arguments
-    CFI_ADJUST_CFA_OFFSET(-16)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME         // restore frame up to return address
-    CALL_MACRO(return_macro)                    // return or deliver exception
-    END_FUNCTION VAR(c_name)
-END_MACRO
-
 MACRO3(ONE_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name)
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  ebx, ebx  // save ref containing registers for GC
@@ -1028,7 +1080,13 @@
     movl 0(%edx, %eax, COMPRESSED_REFERENCE_SIZE), %edx
                                                                // Read barrier for class load.
     cmpl LITERAL(0), %fs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Null check so that we can load the lock word.
+    testl %edx, %edx
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Check the mark bit; if it is 1, the class is already marked, so skip the slow path.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1065,7 +1123,7 @@
     test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // test the 2 high bits.
     jne  .Lslow_lock                      // slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // lock word contains a thin lock
     // unlocked case - edx: original lock word, eax: obj.
@@ -1081,9 +1139,9 @@
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
     movl %edx, %ecx                       // copy the lock word to check count overflow.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count for overflow check.
-    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set.
+    test LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // overflowed if the first gc state bit is set.
     jne  .Lslow_lock                      // count overflowed so go slow
     movl %eax, %ecx                       // save obj to use eax for cmpxchg.
     movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
@@ -1137,13 +1195,13 @@
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     movl %ecx, %edx                       // copy the lock word to detect new count of 0.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
     cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
     // update lockword, cmpxchg necessary for read barrier bits.
     movl %eax, %edx                       // edx: obj
     movl %ecx, %eax                       // eax: old lock word.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original gc bits.
 #ifndef USE_READ_BARRIER
     movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
 #else
@@ -1397,7 +1455,19 @@
     ret
 END_FUNCTION art_quick_memcpy
 
-NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
+DEFINE_FUNCTION art_quick_test_suspend
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME ebx, ebx  // save everything for GC
+    // Outgoing argument set up
+    subl MACRO_LITERAL(12), %esp                      // push padding
+    CFI_ADJUST_CFA_OFFSET(12)
+    pushl %fs:THREAD_SELF_OFFSET                      // pass Thread::Current()
+    CFI_ADJUST_CFA_OFFSET(4)
+    call SYMBOL(artTestSuspendFromCode)               // (Thread*)
+    addl MACRO_LITERAL(16), %esp                      // pop arguments
+    CFI_ADJUST_CFA_OFFSET(-16)
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME         // restore frame up to return address
+    ret                                               // return
+END_FUNCTION art_quick_test_suspend
 
 DEFINE_FUNCTION art_quick_d2l
     subl LITERAL(12), %esp        // alignment padding, room for argument
@@ -1923,6 +1993,14 @@
 //   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Null check so that we can load the lock word.
+    test REG_VAR(reg), REG_VAR(reg)
+    jz .Lret_rb_\name
+    // Check the mark bit; if it is 1, the object is already marked, so return it as-is.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+    jz .Lslow_rb_\name
+    ret
+.Lslow_rb_\name:
     // Save all potentially live caller-save core registers.
     PUSH eax
     PUSH ecx
@@ -1970,6 +2048,7 @@
     POP_REG_NE edx, RAW_VAR(reg)
     POP_REG_NE ecx, RAW_VAR(reg)
     POP_REG_NE eax, RAW_VAR(reg)
+.Lret_rb_\name:
     ret
     END_FUNCTION VAR(name)
 END_MACRO
diff --git a/runtime/arch/x86/quick_method_frame_info_x86.h b/runtime/arch/x86/quick_method_frame_info_x86.h
index 24c671c..a1612c3 100644
--- a/runtime/arch/x86/quick_method_frame_info_x86.h
+++ b/runtime/arch/x86/quick_method_frame_info_x86.h
@@ -36,21 +36,33 @@
   XMM7 = 7,
 };
 
+static constexpr uint32_t kX86CalleeSaveAlwaysSpills =
+    (1 << art::x86::kNumberOfCpuRegisters);  // Fake return address callee save.
 static constexpr uint32_t kX86CalleeSaveRefSpills =
     (1 << art::x86::EBP) | (1 << art::x86::ESI) | (1 << art::x86::EDI);
 static constexpr uint32_t kX86CalleeSaveArgSpills =
     (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
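+// Note: EBP, ESI and EDI are already covered by kX86CalleeSaveRefSpills, which
+// X86CalleeSaveCoreSpills() always ORs in.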
+static constexpr uint32_t kX86CalleeSaveEverythingSpills =
+    (1 << art::x86::EAX) | (1 << art::x86::ECX) | (1 << art::x86::EDX) | (1 << art::x86::EBX);
+
 static constexpr uint32_t kX86CalleeSaveFpArgSpills =
     (1 << art::x86::XMM0) | (1 << art::x86::XMM1) |
     (1 << art::x86::XMM2) | (1 << art::x86::XMM3);
+static constexpr uint32_t kX86CalleeSaveFpEverythingSpills =
+    (1 << art::x86::XMM0) | (1 << art::x86::XMM1) |
+    (1 << art::x86::XMM2) | (1 << art::x86::XMM3) |
+    (1 << art::x86::XMM4) | (1 << art::x86::XMM5) |
+    (1 << art::x86::XMM6) | (1 << art::x86::XMM7);
 
 constexpr uint32_t X86CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
-  return kX86CalleeSaveRefSpills | (type == Runtime::kRefsAndArgs ? kX86CalleeSaveArgSpills : 0) |
-      (1 << art::x86::kNumberOfCpuRegisters);  // fake return address callee save
+  return kX86CalleeSaveAlwaysSpills | kX86CalleeSaveRefSpills |
+      (type == Runtime::kRefsAndArgs ? kX86CalleeSaveArgSpills : 0) |
+      (type == Runtime::kSaveEverything ? kX86CalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t X86CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
-    return type == Runtime::kRefsAndArgs ? kX86CalleeSaveFpArgSpills : 0;
+    return (type == Runtime::kRefsAndArgs ? kX86CalleeSaveFpArgSpills : 0) |
+        (type == Runtime::kSaveEverything ? kX86CalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t X86CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/arch/x86_64/asm_support_x86_64.h b/runtime/arch/x86_64/asm_support_x86_64.h
index 48bec73..58dc2fe 100644
--- a/runtime/arch/x86_64/asm_support_x86_64.h
+++ b/runtime/arch/x86_64/asm_support_x86_64.h
@@ -21,6 +21,7 @@
 
 #define FRAME_SIZE_SAVE_ALL_CALLEE_SAVE (64 + 4*8)
 #define FRAME_SIZE_REFS_ONLY_CALLEE_SAVE (64 + 4*8)
-#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE (176 + 4*8)
+#define FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE (112 + 12*8)
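+// 144 = 15 GPRs * 8 + Method* + return address + 8 bytes of padding; 16*8 covers XMM0-XMM15.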
+#define FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE (144 + 16*8)
 
 #endif  // ART_RUNTIME_ARCH_X86_64_ASM_SUPPORT_X86_64_H_
diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
index 784ec39..3048404 100644
--- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S
+++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S
@@ -165,8 +165,8 @@
     PUSH rdx  // Quick arg 2.
     PUSH rcx  // Quick arg 3.
     // Create space for FPR args and create 2 slots for ArtMethod*.
-    subq MACRO_LITERAL(80 + 4 * 8), %rsp
-    CFI_ADJUST_CFA_OFFSET(80 + 4 * 8)
+    subq MACRO_LITERAL(16 + 12 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(16 + 12 * 8)
     // R10 := ArtMethod* for ref and args callee save frame method.
     movq RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET(%r10), %r10
     // Save FPRs.
@@ -189,7 +189,7 @@
 
     // Ugly compile-time check, but we only have the preprocessor.
     // Last +8: implicit return address pushed on stack when caller made call.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11 * 8 + 4 * 8 + 80 + 8)
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 11 * 8 + 12 * 8 + 16 + 8)
 #error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(X86_64) size not as expected."
 #endif
 #endif  // __APPLE__
@@ -260,6 +260,108 @@
     POP r15
 END_MACRO
 
+    /*
+     * Macro that sets up the callee save frame to conform with
+     * Runtime::CreateCalleeSaveMethod(kSaveEverything)
+     */
+MACRO0(SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME)
+#if defined(__APPLE__)
+    int3
+    int3
+#else
+    // Save core registers from highest to lowest to agree with core spills bitmap.
+    PUSH r15
+    PUSH r14
+    PUSH r13
+    PUSH r12
+    PUSH r11
+    PUSH r10
+    PUSH r9
+    PUSH r8
+    PUSH rdi
+    PUSH rsi
+    PUSH rbp
+    PUSH rbx
+    PUSH rdx
+    PUSH rcx
+    PUSH rax
+    // Create space for FPRs and stack alignment padding.
+    subq MACRO_LITERAL(8 + 16 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(8 + 16 * 8)
+    // R10 := Runtime::Current()
+    movq _ZN3art7Runtime9instance_E@GOTPCREL(%rip), %r10
+    movq (%r10), %r10
+    // Save FPRs.
+    movq %xmm0, 8(%rsp)
+    movq %xmm1, 16(%rsp)
+    movq %xmm2, 24(%rsp)
+    movq %xmm3, 32(%rsp)
+    movq %xmm4, 40(%rsp)
+    movq %xmm5, 48(%rsp)
+    movq %xmm6, 56(%rsp)
+    movq %xmm7, 64(%rsp)
+    movq %xmm8, 72(%rsp)
+    movq %xmm9, 80(%rsp)
+    movq %xmm10, 88(%rsp)
+    movq %xmm11, 96(%rsp)
+    movq %xmm12, 104(%rsp)
+    movq %xmm13, 112(%rsp)
+    movq %xmm14, 120(%rsp)
+    movq %xmm15, 128(%rsp)
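+    // Note: once the ArtMethod* is pushed below, these FPR slots end up at 16(%rsp)..136(%rsp),
+    // matching the loads in RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME.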
+    // Push ArtMethod* for save everything frame method.
+    pushq RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET(%r10)
+    CFI_ADJUST_CFA_OFFSET(8)
+    // Store rsp as the top quick frame.
+    movq %rsp, %gs:THREAD_TOP_QUICK_FRAME_OFFSET
+
+    // Ugly compile-time check, but we only have the preprocessor.
+    // Last +8: implicit return address pushed on stack when caller made call.
+#if (FRAME_SIZE_SAVE_EVERYTHING_CALLEE_SAVE != 15 * 8 + 16 * 8 + 16 + 8)
+#error "SAVE_EVERYTHING_CALLEE_SAVE_FRAME(X86_64) size not as expected."
+#endif
+#endif  // __APPLE__
+END_MACRO
+
+MACRO0(RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME)
+    // Restore FPRs. The Method* and padding are still on the stack.
+    movq 16(%rsp), %xmm0
+    movq 24(%rsp), %xmm1
+    movq 32(%rsp), %xmm2
+    movq 40(%rsp), %xmm3
+    movq 48(%rsp), %xmm4
+    movq 56(%rsp), %xmm5
+    movq 64(%rsp), %xmm6
+    movq 72(%rsp), %xmm7
+    movq 80(%rsp), %xmm8
+    movq 88(%rsp), %xmm9
+    movq 96(%rsp), %xmm10
+    movq 104(%rsp), %xmm11
+    movq 112(%rsp), %xmm12
+    movq 120(%rsp), %xmm13
+    movq 128(%rsp), %xmm14
+    movq 136(%rsp), %xmm15
+
+    // Remove save everything callee save method, stack alignment padding and FPRs.
+    addq MACRO_LITERAL(16 + 16 * 8), %rsp
+    CFI_ADJUST_CFA_OFFSET(-(16 + 16 * 8))
+    // Restore core registers, in reverse push order, to agree with the core spills bitmap.
+    POP rax
+    POP rcx
+    POP rdx
+    POP rbx
+    POP rbp
+    POP rsi
+    POP rdi
+    POP r8
+    POP r9
+    POP r10
+    POP r11
+    POP r12
+    POP r13
+    POP r14
+    POP r15
+END_MACRO
+
 
     /*
      * Macro that set calls through to artDeliverPendingExceptionFromCode, where the pending
@@ -702,17 +804,6 @@
 #endif  // __APPLE__
 END_FUNCTION art_quick_do_long_jump
 
-MACRO3(NO_ARG_DOWNCALL, c_name, cxx_name, return_macro)
-    DEFINE_FUNCTION VAR(c_name)
-    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    // save ref containing registers for GC
-    // Outgoing argument set up
-    movq %gs:THREAD_SELF_OFFSET, %rdi    // pass Thread::Current()
-    call VAR(cxx_name)                   // cxx_name(Thread*)
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME  // restore frame up to return address
-    CALL_MACRO(return_macro)             // return or deliver exception
-    END_FUNCTION VAR(c_name)
-END_MACRO
-
 MACRO3(ONE_ARG_DOWNCALL, c_name, cxx_name, return_macro)
     DEFINE_FUNCTION VAR(c_name)
     SETUP_REFS_ONLY_CALLEE_SAVE_FRAME    // save ref containing registers for GC
@@ -989,7 +1080,13 @@
                                                                // Load the class
     movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx
     cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET
-    jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Null check so that we can load the lock word.
+    testl %edx, %edx
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit
+    // Check the mark bit; if it is 1, the class is already marked, so skip the slow path.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx)
+    jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit:
     ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path
 .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path:
@@ -1022,7 +1119,7 @@
     test LITERAL(LOCK_WORD_STATE_MASK), %ecx         // Test the 2 high bits.
     jne  .Lslow_lock                      // Slow path if either of the two high bits are set.
     movl %ecx, %edx                       // save lock word (edx) to keep read barrier bits.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     test %ecx, %ecx
     jnz  .Lalready_thin                   // Lock word contains a thin lock.
     // unlocked case - edx: original lock word, edi: obj.
@@ -1037,9 +1134,9 @@
     cmpw %cx, %dx                         // do we hold the lock already?
     jne  .Lslow_lock
     movl %edx, %ecx                       // copy the lock word to check count overflow.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %ecx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %ecx  // zero the gc bits.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %ecx  // increment recursion count
-    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if either of the upper two bits (28-29) are set
+    test LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // overflowed if the upper bit (28) is set
     jne  .Lslow_lock                      // count overflowed so go slow
     movl %edx, %eax                       // copy the lock word as the old val for cmpxchg.
     addl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx   // increment recursion count again for real.
@@ -1074,12 +1171,12 @@
     cmpw %cx, %dx                         // does the thread id match?
     jne  .Lslow_unlock
     movl %ecx, %edx                       // copy the lock word to detect new count of 0.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), %edx  // zero the read barrier bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), %edx  // zero the gc bits.
     cmpl LITERAL(LOCK_WORD_THIN_LOCK_COUNT_ONE), %edx
     jae  .Lrecursive_thin_unlock
     // update lockword, cmpxchg necessary for read barrier bits.
     movl %ecx, %eax                       // eax: old lock word.
-    andl LITERAL(LOCK_WORD_READ_BARRIER_STATE_MASK), %ecx  // ecx: new lock word zero except original rb bits.
+    andl LITERAL(LOCK_WORD_GC_STATE_MASK_SHIFTED), %ecx  // ecx: new lock word zero except original gc bits.
 #ifndef USE_READ_BARRIER
     movl %ecx, MIRROR_OBJECT_LOCK_WORD_OFFSET(%edi)
 #else
@@ -1329,7 +1426,14 @@
     ret
 END_FUNCTION art_quick_memcpy
 
-NO_ARG_DOWNCALL art_quick_test_suspend, artTestSuspendFromCode, ret
+DEFINE_FUNCTION art_quick_test_suspend
+    SETUP_SAVE_EVERYTHING_CALLEE_SAVE_FRAME     // save everything for GC
+    // Outgoing argument set up
+    movq %gs:THREAD_SELF_OFFSET, %rdi           // pass Thread::Current()
+    call SYMBOL(artTestSuspendFromCode)         // (Thread*)
+    RESTORE_SAVE_EVERYTHING_CALLEE_SAVE_FRAME   // restore frame up to return address
+    ret
+END_FUNCTION art_quick_test_suspend
 
 UNIMPLEMENTED art_quick_ldiv
 UNIMPLEMENTED art_quick_lmod
@@ -1833,6 +1937,14 @@
 //   convention (e.g. standard callee-save registers are preserved).
 MACRO2(READ_BARRIER_MARK_REG, name, reg)
     DEFINE_FUNCTION VAR(name)
+    // Null check so that we can load the lock word.
+    testq REG_VAR(reg), REG_VAR(reg)
+    jz .Lret_rb_\name
+    // Check the mark bit; if it is 1, the object is already marked, so return it as-is.
+    testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(REG_VAR(reg))
+    jz .Lslow_rb_\name
+    ret
+.Lslow_rb_\name:
     // Save all potentially live caller-save core registers.
     PUSH rax
     PUSH rcx
@@ -1897,6 +2009,7 @@
     POP_REG_NE rdx, RAW_VAR(reg)
     POP_REG_NE rcx, RAW_VAR(reg)
     POP_REG_NE rax, RAW_VAR(reg)
+.Lret_rb_\name:
     ret
     END_FUNCTION VAR(name)
 END_MACRO
diff --git a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
index 37eff83..aa75b56 100644
--- a/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
+++ b/runtime/arch/x86_64/quick_method_frame_info_x86_64.h
@@ -25,12 +25,19 @@
 namespace art {
 namespace x86_64 {
 
+static constexpr uint32_t kX86_64CalleeSaveAlwaysSpills =
+    (1 << art::x86_64::kNumberOfCpuRegisters);  // Fake return address callee save.
 static constexpr uint32_t kX86_64CalleeSaveRefSpills =
     (1 << art::x86_64::RBX) | (1 << art::x86_64::RBP) | (1 << art::x86_64::R12) |
     (1 << art::x86_64::R13) | (1 << art::x86_64::R14) | (1 << art::x86_64::R15);
 static constexpr uint32_t kX86_64CalleeSaveArgSpills =
     (1 << art::x86_64::RSI) | (1 << art::x86_64::RDX) | (1 << art::x86_64::RCX) |
     (1 << art::x86_64::R8) | (1 << art::x86_64::R9);
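+// Note: RBX, RBP and R12-R15 are already covered by kX86_64CalleeSaveRefSpills, which
+// X86_64CalleeSaveCoreSpills() always ORs in.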
+static constexpr uint32_t kX86_64CalleeSaveEverythingSpills =
+    (1 << art::x86_64::RAX) | (1 << art::x86_64::RCX) | (1 << art::x86_64::RDX) |
+    (1 << art::x86_64::RSI) | (1 << art::x86_64::RDI) | (1 << art::x86_64::R8) |
+    (1 << art::x86_64::R9) | (1 << art::x86_64::R10) | (1 << art::x86_64::R11);
+
 static constexpr uint32_t kX86_64CalleeSaveFpArgSpills =
     (1 << art::x86_64::XMM0) | (1 << art::x86_64::XMM1) | (1 << art::x86_64::XMM2) |
     (1 << art::x86_64::XMM3) | (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) |
@@ -38,16 +45,24 @@
 static constexpr uint32_t kX86_64CalleeSaveFpSpills =
     (1 << art::x86_64::XMM12) | (1 << art::x86_64::XMM13) |
     (1 << art::x86_64::XMM14) | (1 << art::x86_64::XMM15);
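+// Note: XMM12-XMM15 are already covered by kX86_64CalleeSaveFpSpills, which is always included.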
+static constexpr uint32_t kX86_64CalleeSaveFpEverythingSpills =
+    (1 << art::x86_64::XMM0) | (1 << art::x86_64::XMM1) |
+    (1 << art::x86_64::XMM2) | (1 << art::x86_64::XMM3) |
+    (1 << art::x86_64::XMM4) | (1 << art::x86_64::XMM5) |
+    (1 << art::x86_64::XMM6) | (1 << art::x86_64::XMM7) |
+    (1 << art::x86_64::XMM8) | (1 << art::x86_64::XMM9) |
+    (1 << art::x86_64::XMM10) | (1 << art::x86_64::XMM11);
 
 constexpr uint32_t X86_64CalleeSaveCoreSpills(Runtime::CalleeSaveType type) {
-  return kX86_64CalleeSaveRefSpills |
+  return kX86_64CalleeSaveAlwaysSpills | kX86_64CalleeSaveRefSpills |
       (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveArgSpills : 0) |
-      (1 << art::x86_64::kNumberOfCpuRegisters);  // fake return address callee save;
+      (type == Runtime::kSaveEverything ? kX86_64CalleeSaveEverythingSpills : 0);
 }
 
 constexpr uint32_t X86_64CalleeSaveFpSpills(Runtime::CalleeSaveType type) {
   return kX86_64CalleeSaveFpSpills |
-      (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0);
+      (type == Runtime::kRefsAndArgs ? kX86_64CalleeSaveFpArgSpills : 0) |
+      (type == Runtime::kSaveEverything ? kX86_64CalleeSaveFpEverythingSpills : 0);
 }
 
 constexpr uint32_t X86_64CalleeSaveFrameSize(Runtime::CalleeSaveType type) {
diff --git a/runtime/gc/collector/concurrent_copying-inl.h b/runtime/gc/collector/concurrent_copying-inl.h
index 4019a5b..fb774a4 100644
--- a/runtime/gc/collector/concurrent_copying-inl.h
+++ b/runtime/gc/collector/concurrent_copying-inl.h
@@ -154,11 +154,30 @@
 }
 
 inline mirror::Object* ConcurrentCopying::MarkFromReadBarrier(mirror::Object* from_ref) {
+  mirror::Object* ret;
+  // TODO: Delete the GetMarkBit check once all of the callers properly check the bit. The
+  // remaining caller is array allocation.
+  if (from_ref == nullptr || from_ref->GetMarkBit()) {
+    return from_ref;
+  }
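+  // At this point the reference is unmarked: take the regular mark path, then publish the
+  // mark bit below so later read barriers on this object can use the fast path above.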
   // TODO: Consider removing this check when we are done investigating slow paths. b/30162165
   if (UNLIKELY(mark_from_read_barrier_measurements_)) {
-    return MarkFromReadBarrierWithMeasurements(from_ref);
+    ret = MarkFromReadBarrierWithMeasurements(from_ref);
+  } else {
+    ret = Mark(from_ref);
   }
-  return Mark(from_ref);
+  // Only set the mark bit for the Baker read barrier.
+  if (kUseBakerReadBarrier && LIKELY(!rb_mark_bit_stack_full_ && ret->AtomicSetMarkBit(0, 1))) {
+    // If the mark stack is full, we may temporarily go to marked and back to unmarked. Seeing
+    // either value is OK since the only race is doing an unnecessary Mark.
+    if (!rb_mark_bit_stack_->AtomicPushBack(ret)) {
+      // Mark stack is full, set the bit back to zero.
+      CHECK(ret->AtomicSetMarkBit(1, 0));
+      // Set rb_mark_bit_stack_full_; this is racy but OK since AtomicPushBack is thread-safe.
+      rb_mark_bit_stack_full_ = true;
+    }
+  }
+  return ret;
 }
 
 inline mirror::Object* ConcurrentCopying::GetFwdPtr(mirror::Object* from_ref) {
diff --git a/runtime/gc/collector/concurrent_copying.cc b/runtime/gc/collector/concurrent_copying.cc
index d7221e4..071537d 100644
--- a/runtime/gc/collector/concurrent_copying.cc
+++ b/runtime/gc/collector/concurrent_copying.cc
@@ -42,9 +42,6 @@
 namespace collector {
 
 static constexpr size_t kDefaultGcMarkStackSize = 2 * MB;
-// If kGrayDirtyImmuneObjects is true then we gray dirty objects in the GC pause to prevent dirty
-// pages.
-static constexpr bool kGrayDirtyImmuneObjects = true;
 // If kFilterModUnionCards then we attempt to filter cards that don't need to be dirty in the mod
 // union table. Disabled since it does not seem to help the pause much.
 static constexpr bool kFilterModUnionCards = kIsDebugBuild;
@@ -52,6 +49,9 @@
 // ConcurrentCopying::Scan. May be used to diagnose possibly unnecessary read barriers.
 // Only enabled for kIsDebugBuild to avoid performance hit.
 static constexpr bool kDisallowReadBarrierDuringScan = kIsDebugBuild;
+// Slow path mark stack size; increase this if the stack is getting full and causing
+// performance problems.
+static constexpr size_t kReadBarrierMarkStackSize = 512 * KB;
 
 ConcurrentCopying::ConcurrentCopying(Heap* heap,
                                      const std::string& name_prefix,
@@ -63,6 +63,10 @@
       gc_mark_stack_(accounting::ObjectStack::Create("concurrent copying gc mark stack",
                                                      kDefaultGcMarkStackSize,
                                                      kDefaultGcMarkStackSize)),
+      rb_mark_bit_stack_(accounting::ObjectStack::Create("rb copying gc mark stack",
+                                                         kReadBarrierMarkStackSize,
+                                                         kReadBarrierMarkStackSize)),
+      rb_mark_bit_stack_full_(false),
       mark_stack_lock_("concurrent copying mark stack lock", kMarkSweepMarkStackLock),
       thread_running_gc_(nullptr),
       is_marking_(false), is_active_(false), is_asserting_to_space_invariant_(false),
@@ -187,6 +191,7 @@
     CHECK(false_gray_stack_.empty());
   }
 
+  rb_mark_bit_stack_full_ = false;
   mark_from_read_barrier_measurements_ = measure_read_barrier_slow_path_;
   if (measure_read_barrier_slow_path_) {
     rb_slow_path_ns_.StoreRelaxed(0);
@@ -914,9 +919,9 @@
     }
     collector_->AssertToSpaceInvariant(nullptr, MemberOffset(0), ref);
     if (kUseBakerReadBarrier) {
-      CHECK(ref->GetReadBarrierPointer() == ReadBarrier::WhitePtr())
+      CHECK_EQ(ref->GetReadBarrierPointer(), ReadBarrier::WhitePtr())
           << "Ref " << ref << " " << PrettyTypeOf(ref)
-          << " has non-white rb_ptr " << ref->GetReadBarrierPointer();
+          << " has non-white rb_ptr ";
     }
   }
 
@@ -982,7 +987,7 @@
     VerifyNoFromSpaceRefsFieldVisitor visitor(collector);
     obj->VisitReferences(visitor, visitor);
     if (kUseBakerReadBarrier) {
-      CHECK(obj->GetReadBarrierPointer() == ReadBarrier::WhitePtr())
+      CHECK_EQ(obj->GetReadBarrierPointer(), ReadBarrier::WhitePtr())
           << "obj=" << obj << " non-white rb_ptr " << obj->GetReadBarrierPointer();
     }
   }
@@ -2243,6 +2248,15 @@
         }
       }
     }
+    if (kUseBakerReadBarrier) {
+      TimingLogger::ScopedTiming split("EmptyRBMarkBitStack", GetTimings());
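+      // Clear the mark bits that the read barrier fast path set during this GC cycle.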
+      DCHECK(rb_mark_bit_stack_.get() != nullptr);
+      const auto* limit = rb_mark_bit_stack_->End();
+      for (StackReference<mirror::Object>* it = rb_mark_bit_stack_->Begin(); it != limit; ++it) {
+        CHECK(it->AsMirrorPtr()->AtomicSetMarkBit(1, 0));
+      }
+      rb_mark_bit_stack_->Reset();
+    }
   }
   if (measure_read_barrier_slow_path_) {
     MutexLock mu(self, rb_slow_path_histogram_lock_);
diff --git a/runtime/gc/collector/concurrent_copying.h b/runtime/gc/collector/concurrent_copying.h
index 72112fa..a862802 100644
--- a/runtime/gc/collector/concurrent_copying.h
+++ b/runtime/gc/collector/concurrent_copying.h
@@ -57,6 +57,9 @@
   static constexpr bool kEnableFromSpaceAccountingCheck = kIsDebugBuild;
   // Enable verbose mode.
   static constexpr bool kVerboseMode = false;
+  // If kGrayDirtyImmuneObjects is true then we gray dirty objects in the GC pause to prevent dirty
+  // pages.
+  static constexpr bool kGrayDirtyImmuneObjects = true;
 
   ConcurrentCopying(Heap* heap,
                     const std::string& name_prefix = "",
@@ -230,6 +233,8 @@
   space::RegionSpace* region_space_;      // The underlying region space.
   std::unique_ptr<Barrier> gc_barrier_;
   std::unique_ptr<accounting::ObjectStack> gc_mark_stack_;
+  std::unique_ptr<accounting::ObjectStack> rb_mark_bit_stack_;
+  bool rb_mark_bit_stack_full_;
   std::vector<mirror::Object*> false_gray_stack_ GUARDED_BY(mark_stack_lock_);
   Mutex mark_stack_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
   std::vector<accounting::ObjectStack*> revoked_mark_stacks_
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a92cb24..5485cd2 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -2538,6 +2538,17 @@
   AddSpace(zygote_space_);
   non_moving_space_->SetFootprintLimit(non_moving_space_->Capacity());
   AddSpace(non_moving_space_);
+  if (kUseBakerReadBarrier && gc::collector::ConcurrentCopying::kGrayDirtyImmuneObjects) {
+    // Treat all of the objects in the zygote as marked to avoid unnecessary dirty pages. This is
+    // safe since we mark all of the objects that may reference non-immune objects as gray.
+    zygote_space_->GetLiveBitmap()->VisitMarkedRange(
+        reinterpret_cast<uintptr_t>(zygote_space_->Begin()),
+        reinterpret_cast<uintptr_t>(zygote_space_->Limit()),
+        [](mirror::Object* obj) SHARED_REQUIRES(Locks::mutator_lock_) {
+      CHECK(obj->AtomicSetMarkBit(0, 1));
+    });
+  }
+
   // Create the zygote space mod union table.
   accounting::ModUnionTable* mod_union_table =
       new accounting::ModUnionTableCardCache("zygote space mod-union table", this,
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index d140b75..8ade185 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -1436,6 +1436,8 @@
              image_header->GetImageMethod(ImageHeader::kRefsOnlySaveMethod));
     CHECK_EQ(runtime->GetCalleeSaveMethod(Runtime::kRefsAndArgs),
              image_header->GetImageMethod(ImageHeader::kRefsAndArgsSaveMethod));
+    CHECK_EQ(runtime->GetCalleeSaveMethod(Runtime::kSaveEverything),
+             image_header->GetImageMethod(ImageHeader::kSaveEverythingMethod));
   } else if (!runtime->HasResolutionMethod()) {
     runtime->SetInstructionSet(space->oat_file_non_owned_->GetOatHeader().GetInstructionSet());
     runtime->SetResolutionMethod(image_header->GetImageMethod(ImageHeader::kResolutionMethod));
@@ -1448,6 +1450,8 @@
         image_header->GetImageMethod(ImageHeader::kRefsOnlySaveMethod), Runtime::kRefsOnly);
     runtime->SetCalleeSaveMethod(
         image_header->GetImageMethod(ImageHeader::kRefsAndArgsSaveMethod), Runtime::kRefsAndArgs);
+    runtime->SetCalleeSaveMethod(
+        image_header->GetImageMethod(ImageHeader::kSaveEverythingMethod), Runtime::kSaveEverything);
   }
 
   VLOG(image) << "ImageSpace::Init exiting " << *space.get();
diff --git a/runtime/generated/asm_support_gen.h b/runtime/generated/asm_support_gen.h
index 5d62b59..c66029d 100644
--- a/runtime/generated/asm_support_gen.h
+++ b/runtime/generated/asm_support_gen.h
@@ -32,6 +32,8 @@
 DEFINE_CHECK_EQ(static_cast<size_t>(RUNTIME_REFS_ONLY_CALLEE_SAVE_FRAME_OFFSET), (static_cast<size_t>(art::Runtime::GetCalleeSaveMethodOffset(art::Runtime:: kRefsOnly))))
 #define RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET 0x10
 DEFINE_CHECK_EQ(static_cast<size_t>(RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET), (static_cast<size_t>(art::Runtime::GetCalleeSaveMethodOffset(art::Runtime:: kRefsAndArgs))))
+#define RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET 0x18
+DEFINE_CHECK_EQ(static_cast<size_t>(RUNTIME_SAVE_EVERYTHING_CALLEE_SAVE_FRAME_OFFSET), (static_cast<size_t>(art::Runtime::GetCalleeSaveMethodOffset(art::Runtime:: kSaveEverything))))
 #define THREAD_FLAGS_OFFSET 0
 DEFINE_CHECK_EQ(static_cast<int32_t>(THREAD_FLAGS_OFFSET), (static_cast<int32_t>(art::Thread:: ThreadFlagsOffset<art::kRuntimePointerSize>().Int32Value())))
 #define THREAD_ID_OFFSET 12
@@ -74,12 +76,22 @@
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kStateMaskShifted)))
 #define LOCK_WORD_READ_BARRIER_STATE_SHIFT 28
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_READ_BARRIER_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kReadBarrierStateShift)))
-#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x30000000
+#define LOCK_WORD_READ_BARRIER_STATE_MASK 0x10000000
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShifted)))
-#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xcfffffff
+#define LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED 0xefffffff
 DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_READ_BARRIER_STATE_MASK_TOGGLED), (static_cast<uint32_t>(art::LockWord::kReadBarrierStateMaskShiftedToggled)))
 #define LOCK_WORD_THIN_LOCK_COUNT_ONE 65536
 DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_THIN_LOCK_COUNT_ONE), (static_cast<int32_t>(art::LockWord::kThinLockCountOne)))
+#define LOCK_WORD_GC_STATE_MASK_SHIFTED 0x30000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShifted)))
+#define LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED 0xcfffffff
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_GC_STATE_MASK_SHIFTED_TOGGLED), (static_cast<uint32_t>(art::LockWord::kGCStateMaskShiftedToggled)))
+#define LOCK_WORD_GC_STATE_SHIFT 28
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_GC_STATE_SHIFT), (static_cast<int32_t>(art::LockWord::kGCStateShift)))
+#define LOCK_WORD_MARK_BIT_SHIFT 29
+DEFINE_CHECK_EQ(static_cast<int32_t>(LOCK_WORD_MARK_BIT_SHIFT), (static_cast<int32_t>(art::LockWord::kMarkBitStateShift)))
+#define LOCK_WORD_MARK_BIT_MASK_SHIFTED 0x20000000
+DEFINE_CHECK_EQ(static_cast<uint32_t>(LOCK_WORD_MARK_BIT_MASK_SHIFTED), (static_cast<uint32_t>(art::LockWord::kMarkBitStateMaskShifted)))
 #define OBJECT_ALIGNMENT_MASK 0x7
 DEFINE_CHECK_EQ(static_cast<size_t>(OBJECT_ALIGNMENT_MASK), (static_cast<size_t>(art::kObjectAlignment - 1)))
 #define OBJECT_ALIGNMENT_MASK_TOGGLED 0xfffffff8
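
Sanity check for the regenerated block above: every new or changed lock word constant follows from just the two bit positions. A standalone sketch (plain C++, not generated code) that recomputes them:

    #include <cstdint>

    // Recompute the lock word constants from the two bit positions checked above.
    constexpr uint32_t kRbShift = 28;    // LOCK_WORD_READ_BARRIER_STATE_SHIFT
    constexpr uint32_t kMarkShift = 29;  // LOCK_WORD_MARK_BIT_SHIFT
    constexpr uint32_t kRbMask = 1u << kRbShift;
    constexpr uint32_t kMarkMask = 1u << kMarkShift;
    constexpr uint32_t kGCStateMask = kRbMask | kMarkMask;

    static_assert(kRbMask == 0x10000000u, "READ_BARRIER_STATE_MASK");
    static_assert(~kRbMask == 0xefffffffu, "READ_BARRIER_STATE_MASK_TOGGLED");
    static_assert(kMarkMask == 0x20000000u, "MARK_BIT_MASK_SHIFTED");
    static_assert(kGCStateMask == 0x30000000u, "GC_STATE_MASK_SHIFTED");
    static_assert(~kGCStateMask == 0xcfffffffu, "GC_STATE_MASK_SHIFTED_TOGGLED");
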
diff --git a/runtime/globals.h b/runtime/globals.h
index 0b44c47..9045d40 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -47,7 +47,8 @@
 }
 
 // Required object alignment
-static constexpr size_t kObjectAlignment = 8;
+static constexpr size_t kObjectAlignmentShift = 3;
+static constexpr size_t kObjectAlignment = 1u << kObjectAlignmentShift;
 static constexpr size_t kLargeObjectAlignment = kPageSize;
 
 // Whether or not this is a debug build. Useful in conditionals where NDEBUG isn't.
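
Expressing the alignment as a power-of-two shift is what lets lock_word.h (below) define its forwarding-address shift directly from kObjectAlignmentShift. A quick standalone check (not ART code) that the derived masks still match the generated OBJECT_ALIGNMENT_MASK constants earlier:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t kObjectAlignmentShift = 3;
    constexpr size_t kObjectAlignment = 1u << kObjectAlignmentShift;

    static_assert(kObjectAlignment == 8, "objects stay 8-byte aligned");
    static_assert((kObjectAlignment - 1) == 0x7, "OBJECT_ALIGNMENT_MASK");
    static_assert(static_cast<uint32_t>(~(kObjectAlignment - 1)) == 0xfffffff8u,
                  "OBJECT_ALIGNMENT_MASK_TOGGLED");
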
diff --git a/runtime/image.h b/runtime/image.h
index a98cea1..207a818 100644
--- a/runtime/image.h
+++ b/runtime/image.h
@@ -186,6 +186,7 @@
     kCalleeSaveMethod,
     kRefsOnlySaveMethod,
     kRefsAndArgsSaveMethod,
+    kSaveEverythingMethod,
     kImageMethodsCount,  // Number of elements in enum.
   };
 
diff --git a/runtime/lock_word-inl.h b/runtime/lock_word-inl.h
index 341501b..4a2a293 100644
--- a/runtime/lock_word-inl.h
+++ b/runtime/lock_word-inl.h
@@ -43,17 +43,15 @@
 
 inline size_t LockWord::ForwardingAddress() const {
   DCHECK_EQ(GetState(), kForwardingAddress);
-  return value_ << kStateSize;
+  return value_ << kForwardingAddressShift;
 }
 
 inline LockWord::LockWord() : value_(0) {
   DCHECK_EQ(GetState(), kUnlocked);
 }
 
-inline LockWord::LockWord(Monitor* mon, uint32_t rb_state)
-    : value_(mon->GetMonitorId() | (rb_state << kReadBarrierStateShift) |
-             (kStateFat << kStateShift)) {
-  DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
+inline LockWord::LockWord(Monitor* mon, uint32_t gc_state)
+    : value_(mon->GetMonitorId() | (gc_state << kGCStateShift) | (kStateFat << kStateShift)) {
 #ifndef __LP64__
   DCHECK_ALIGNED(mon, kMonitorIdAlignment);
 #endif
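
Worth spelling out what the new shift buys: forwarding addresses are now stored shifted by the 3-bit object alignment rather than the 2-bit state size, so one more address bit fits in the lock word. A standalone round-trip sketch (hard-coded constants, not ART code; the real 32-bit implementation lets the state bits fall off the top of the shift instead of masking them):

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kForwardingAddressShift = 3;  // == kObjectAlignmentShift
    constexpr uint32_t kStateShift = 30;
    constexpr uint32_t kStateForwardingAddress = 3;
    constexpr uint32_t kStateMaskShifted = 3u << kStateShift;

    uint32_t FromForwardingAddress(uint32_t target) {
      // Lossless only because objects are 8-byte aligned (low three bits zero).
      assert((target & ((1u << kForwardingAddressShift) - 1)) == 0);
      return (target >> kForwardingAddressShift) |
             (kStateForwardingAddress << kStateShift);
    }

    uint32_t ForwardingAddress(uint32_t value) {
      // Mask off the state bits before undoing the shift.
      return (value & ~kStateMaskShifted) << kForwardingAddressShift;
    }

    int main() {
      uint32_t addr = 0x12345678u;  // 8-byte aligned.
      assert(ForwardingAddress(FromForwardingAddress(addr)) == addr);
      return 0;
    }
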
diff --git a/runtime/lock_word.h b/runtime/lock_word.h
index 5d0d204..538b6eb 100644
--- a/runtime/lock_word.h
+++ b/runtime/lock_word.h
@@ -35,27 +35,27 @@
  * the state. The four possible states are fat locked, thin/unlocked, hash code, and forwarding
  * address. When the lock word is in the "thin" state and its bits are formatted as follows:
  *
- *  |33|22|222222221111|1111110000000000|
- *  |10|98|765432109876|5432109876543210|
- *  |00|rb| lock count |thread id owner |
+ *  |33|2|2|222222221111|1111110000000000|
+ *  |10|9|8|765432109876|5432109876543210|
+ *  |00|m|r| lock count |thread id owner |
  *
  * When the lock word is in the "fat" state and its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |01|rb| MonitorId                  |
+ *  |33|2|2|2222222211111111110000000000|
+ *  |10|9|8|7654321098765432109876543210|
+ *  |01|m|r| MonitorId                  |
  *
  * When the lock word is in hash state and its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |10|rb| HashCode                   |
+ *  |33|2|2|2222222211111111110000000000|
+ *  |10|9|8|7654321098765432109876543210|
+ *  |10|m|r| HashCode                   |
  *
- * When the lock word is in fowarding address state and its bits are formatted as follows:
+ * When the lock word is in forwarding address state and its bits are formatted as follows:
  *
- *  |33|22|2222222211111111110000000000|
- *  |10|98|7654321098765432109876543210|
- *  |11| ForwardingAddress             |
+ *  |33|2|22222222211111111110000000000|
+ *  |10|9|87654321098765432109876543210|
+ *  |11|0| ForwardingAddress           |
  *
- * The rb bits store the read barrier state.
+ * The r bit stores the read barrier state. The m bit stores the mark bit state.
  */
@@ -64,11 +64,13 @@
-  enum SizeShiftsAndMasks {  // private marker to avoid generate-operator-out.py from processing.
+  enum SizeShiftsAndMasks {  // private marker to keep generate-operator-out.py from processing it.
     // Number of bits to encode the state, currently just fat or thin/unlocked or hash code.
     kStateSize = 2,
-    kReadBarrierStateSize = 2,
+    kReadBarrierStateSize = 1,
+    kMarkBitStateSize = 1,
     // Number of bits to encode the thin lock owner.
     kThinLockOwnerSize = 16,
     // Remaining bits are the recursive lock count.
-    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize,
+    kThinLockCountSize = 32 - kThinLockOwnerSize - kStateSize - kReadBarrierStateSize -
+        kMarkBitStateSize,
     // Thin lock bits. Owner in lowest bits.
 
     kThinLockOwnerShift = 0,
@@ -81,25 +83,43 @@
     kThinLockCountOne = 1 << kThinLockCountShift,  // == 65536 (0x10000)
 
     // State in the highest bits.
-    kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift,
+    kStateShift = kReadBarrierStateSize + kThinLockCountSize + kThinLockCountShift +
+        kMarkBitStateSize,
     kStateMask = (1 << kStateSize) - 1,
     kStateMaskShifted = kStateMask << kStateShift,
     kStateThinOrUnlocked = 0,
     kStateFat = 1,
     kStateHash = 2,
     kStateForwardingAddress = 3,
+
+    // Read barrier bit.
     kReadBarrierStateShift = kThinLockCountSize + kThinLockCountShift,
     kReadBarrierStateMask = (1 << kReadBarrierStateSize) - 1,
     kReadBarrierStateMaskShifted = kReadBarrierStateMask << kReadBarrierStateShift,
     kReadBarrierStateMaskShiftedToggled = ~kReadBarrierStateMaskShifted,
 
+    // Mark bit.
+    kMarkBitStateShift = kReadBarrierStateSize + kReadBarrierStateShift,
+    kMarkBitStateMask = (1 << kMarkBitStateSize) - 1,
+    kMarkBitStateMaskShifted = kMarkBitStateMask << kMarkBitStateShift,
+    kMarkBitStateMaskShiftedToggled = ~kMarkBitStateMaskShifted,
+
+    // GC state is mark bit and read barrier state.
+    kGCStateSize = kReadBarrierStateSize + kMarkBitStateSize,
+    kGCStateShift = kReadBarrierStateShift,
+    kGCStateMaskShifted = kReadBarrierStateMaskShifted | kMarkBitStateMaskShifted,
+    kGCStateMaskShiftedToggled = ~kGCStateMaskShifted,
+
     // When the state is kHashCode, the non-state bits hold the hashcode.
     // Note Object.hashCode() has the hash code layout hardcoded.
     kHashShift = 0,
-    kHashSize = 32 - kStateSize - kReadBarrierStateSize,
+    kHashSize = 32 - kStateSize - kReadBarrierStateSize - kMarkBitStateSize,
     kHashMask = (1 << kHashSize) - 1,
     kMaxHash = kHashMask,
 
+    // Forwarding address shift.
+    kForwardingAddressShift = kObjectAlignmentShift,
+
     kMonitorIdShift = kHashShift,
     kMonitorIdSize = kHashSize,
     kMonitorIdMask = kHashMask,
@@ -108,31 +128,31 @@
     kMaxMonitorId = kMaxHash
   };
 
-  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count, uint32_t rb_state) {
+  static LockWord FromThinLockId(uint32_t thread_id, uint32_t count, uint32_t gc_state) {
     CHECK_LE(thread_id, static_cast<uint32_t>(kThinLockMaxOwner));
     CHECK_LE(count, static_cast<uint32_t>(kThinLockMaxCount));
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
-    return LockWord((thread_id << kThinLockOwnerShift) | (count << kThinLockCountShift) |
-                    (rb_state << kReadBarrierStateShift) |
+    // DCHECK_EQ(gc_state & ~((1u << kGCStateSize) - 1), 0U);
+    return LockWord((thread_id << kThinLockOwnerShift) |
+                    (count << kThinLockCountShift) |
+                    (gc_state << kGCStateShift) |
                     (kStateThinOrUnlocked << kStateShift));
   }
 
   static LockWord FromForwardingAddress(size_t target) {
-    DCHECK_ALIGNED(target, (1 << kStateSize));
+    DCHECK_ALIGNED(target, (1 << kForwardingAddressShift));
-    return LockWord((target >> kStateSize) | (kStateForwardingAddress << kStateShift));
+    return LockWord((target >> kForwardingAddressShift) | (kStateForwardingAddress << kStateShift));
   }
 
-  static LockWord FromHashCode(uint32_t hash_code, uint32_t rb_state) {
+  static LockWord FromHashCode(uint32_t hash_code, uint32_t gc_state) {
     CHECK_LE(hash_code, static_cast<uint32_t>(kMaxHash));
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
+    // DCHECK_EQ(gc_state & ~((1u << kGCStateSize) - 1), 0U);
     return LockWord((hash_code << kHashShift) |
-                    (rb_state << kReadBarrierStateShift) |
+                    (gc_state << kGCStateShift) |
                     (kStateHash << kStateShift));
   }
 
-  static LockWord FromDefault(uint32_t rb_state) {
-    DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
-    return LockWord(rb_state << kReadBarrierStateShift);
+  static LockWord FromDefault(uint32_t gc_state) {
+    return LockWord(gc_state << kGCStateShift);
   }
 
   static bool IsDefault(LockWord lw) {
@@ -154,7 +174,7 @@
   LockState GetState() const {
     CheckReadBarrierState();
     if ((!kUseReadBarrier && UNLIKELY(value_ == 0)) ||
-        (kUseReadBarrier && UNLIKELY((value_ & kReadBarrierStateMaskShiftedToggled) == 0))) {
+        (kUseReadBarrier && UNLIKELY((value_ & kGCStateMaskShiftedToggled) == 0))) {
       return kUnlocked;
     } else {
       uint32_t internal_state = (value_ >> kStateShift) & kStateMask;
@@ -176,6 +196,10 @@
     return (value_ >> kReadBarrierStateShift) & kReadBarrierStateMask;
   }
 
+  uint32_t GCState() const {
+    return (value_ & kGCStateMaskShifted) >> kGCStateShift;
+  }
+
   void SetReadBarrierState(uint32_t rb_state) {
     DCHECK_EQ(rb_state & ~kReadBarrierStateMask, 0U);
     DCHECK_NE(static_cast<uint32_t>(GetState()), static_cast<uint32_t>(kForwardingAddress));
@@ -184,6 +208,18 @@
     value_ |= (rb_state & kReadBarrierStateMask) << kReadBarrierStateShift;
   }
 
+  uint32_t MarkBitState() const {
+    return (value_ >> kMarkBitStateShift) & kMarkBitStateMask;
+  }
+
+  void SetMarkBitState(uint32_t mark_bit) {
+    DCHECK_EQ(mark_bit & ~kMarkBitStateMask, 0U);
+    DCHECK_NE(static_cast<uint32_t>(GetState()), static_cast<uint32_t>(kForwardingAddress));
+    // Clear the mark bit, then or in the new value.
+    value_ &= kMarkBitStateMaskShiftedToggled;
+    value_ |= mark_bit << kMarkBitStateShift;
+  }
+
   // Return the owner thin lock thread id.
   uint32_t ThinLockOwner() const;
 
@@ -197,7 +234,7 @@
   size_t ForwardingAddress() const;
 
-  // Constructor a lock word for inflation to use a Monitor.
-  LockWord(Monitor* mon, uint32_t rb_state);
+  // Construct a lock word for inflation to use a Monitor.
+  LockWord(Monitor* mon, uint32_t gc_state);
 
   // Return the hash code stored in the lock word, must be kHashCode state.
   int32_t GetHashCode() const;
@@ -207,7 +244,7 @@
     if (kIncludeReadBarrierState) {
       return lw1.GetValue() == lw2.GetValue();
     }
-    return lw1.GetValueWithoutReadBarrierState() == lw2.GetValueWithoutReadBarrierState();
+    return lw1.GetValueWithoutGCState() == lw2.GetValueWithoutGCState();
   }
 
   void Dump(std::ostream& os) {
@@ -248,9 +285,9 @@
     return value_;
   }
 
-  uint32_t GetValueWithoutReadBarrierState() const {
+  uint32_t GetValueWithoutGCState() const {
     CheckReadBarrierState();
-    return value_ & ~(kReadBarrierStateMask << kReadBarrierStateShift);
+    return value_ & kGCStateMaskShiftedToggled;
   }
 
   // Only Object should be converting LockWords to/from uints.
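
The net effect of the lock_word.h changes: bits 29 (mark) and 28 (read barrier) now act as a single two-bit GC state field that every lock word transition must preserve. A standalone sketch (constants hard-coded from the layout above, not ART code) of the two reads that changed:

    #include <cstdint>

    constexpr uint32_t kGCStateShift = 28;
    constexpr uint32_t kGCStateMaskShifted = 0x30000000u;  // mark bit | rb bit

    // GCState(): both GC bits as a two-bit value (bit 1 = mark, bit 0 = rb).
    uint32_t GCState(uint32_t value) {
      return (value & kGCStateMaskShifted) >> kGCStateShift;
    }

    // GetState()'s unlocked test: with read barriers enabled, "unlocked" must
    // ignore the GC bits, since a default lock word may legitimately carry a
    // set mark or rb bit.
    bool IsUnlockedUnderReadBarrier(uint32_t value) {
      return (value & ~kGCStateMaskShifted) == 0;
    }
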
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index 0592c6c..0495c95 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -147,10 +147,20 @@
 #endif
 }
 
+inline uint32_t Object::GetMarkBit() {
+#ifdef USE_READ_BARRIER
+  return GetLockWord(false).MarkBitState();
+#else
+  LOG(FATAL) << "Unreachable";
+  UNREACHABLE();
+#endif
+}
+
 inline void Object::SetReadBarrierPointer(Object* rb_ptr) {
 #ifdef USE_BAKER_READ_BARRIER
   DCHECK(kUseBakerReadBarrier);
   DCHECK_EQ(reinterpret_cast<uint64_t>(rb_ptr) >> 32, 0U);
+  DCHECK_NE(rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
   LockWord lw = GetLockWord(false);
   lw.SetReadBarrierState(static_cast<uint32_t>(reinterpret_cast<uintptr_t>(rb_ptr)));
   SetLockWord(lw, false);
@@ -173,6 +183,8 @@
   DCHECK(kUseBakerReadBarrier);
   DCHECK_EQ(reinterpret_cast<uint64_t>(expected_rb_ptr) >> 32, 0U);
   DCHECK_EQ(reinterpret_cast<uint64_t>(rb_ptr) >> 32, 0U);
+  DCHECK_NE(expected_rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
+  DCHECK_NE(rb_ptr, ReadBarrier::BlackPtr()) << "Setting to black is not supported";
   LockWord expected_lw;
   LockWord new_lw;
   do {
@@ -216,6 +228,23 @@
 #endif
 }
 
+inline bool Object::AtomicSetMarkBit(uint32_t expected_mark_bit, uint32_t mark_bit) {
+  LockWord expected_lw;
+  LockWord new_lw;
+  do {
+    LockWord lw = GetLockWord(false);
+    if (UNLIKELY(lw.MarkBitState() != expected_mark_bit)) {
+      // Lost the race.
+      return false;
+    }
+    expected_lw = lw;
+    new_lw = lw;
+    new_lw.SetMarkBitState(mark_bit);
+    // Since this is only set from the mutator, we can use the non release Cas.
+  } while (!CasLockWordWeakRelaxed(expected_lw, new_lw));
+  return true;
+}
+
 inline void Object::AssertReadBarrierPointer() const {
   if (kUseBakerReadBarrier) {
     Object* obj = const_cast<Object*>(this);
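
The retry loop above is the standard lock word CAS shape: re-read, give up if another thread already flipped the bit, retry on spurious failure. The same pattern over a plain std::atomic, as a standalone sketch (not ART's Object API):

    #include <atomic>
    #include <cstdint>

    constexpr uint32_t kMarkBitStateShift = 29;
    constexpr uint32_t kMarkBitMaskShifted = 1u << kMarkBitStateShift;

    // Returns true if this call transitioned the mark bit from expected_mark_bit
    // to mark_bit; returns false if another thread got there first.
    bool AtomicSetMarkBit(std::atomic<uint32_t>& lock_word,
                          uint32_t expected_mark_bit, uint32_t mark_bit) {
      uint32_t old_value = lock_word.load(std::memory_order_relaxed);
      while (true) {
        if (((old_value >> kMarkBitStateShift) & 1u) != expected_mark_bit) {
          return false;  // Lost the race.
        }
        uint32_t new_value = (old_value & ~kMarkBitMaskShifted) |
                             (mark_bit << kMarkBitStateShift);
        // On failure, compare_exchange_weak refreshes old_value and we re-check.
        if (lock_word.compare_exchange_weak(old_value, new_value,
                                            std::memory_order_relaxed)) {
          return true;
        }
      }
    }
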
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 701c600..13c536e 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -163,8 +163,7 @@
       case LockWord::kUnlocked: {
         // Try to compare and swap in a new hash, if we succeed we will return the hash on the next
         // loop iteration.
-        LockWord hash_word = LockWord::FromHashCode(GenerateIdentityHashCode(),
-                                                    lw.ReadBarrierState());
+        LockWord hash_word = LockWord::FromHashCode(GenerateIdentityHashCode(), lw.GCState());
         DCHECK_EQ(hash_word.GetState(), LockWord::kHashCode);
         if (const_cast<Object*>(this)->CasLockWordWeakRelaxed(lw, hash_word)) {
           return hash_word.GetHashCode();
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index a4bdbad..5b129bf 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -93,6 +93,7 @@
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   void SetClass(Class* new_klass) SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // TODO: Clean this up and change to return int32_t
   Object* GetReadBarrierPointer() SHARED_REQUIRES(Locks::mutator_lock_);
 
 #ifndef USE_BAKER_OR_BROOKS_READ_BARRIER
@@ -103,6 +104,12 @@
   template<bool kCasRelease = false>
   ALWAYS_INLINE bool AtomicSetReadBarrierPointer(Object* expected_rb_ptr, Object* rb_ptr)
       SHARED_REQUIRES(Locks::mutator_lock_);
+
+  ALWAYS_INLINE uint32_t GetMarkBit() SHARED_REQUIRES(Locks::mutator_lock_);
+
+  ALWAYS_INLINE bool AtomicSetMarkBit(uint32_t expected_mark_bit, uint32_t mark_bit)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   void AssertReadBarrierPointer() const SHARED_REQUIRES(Locks::mutator_lock_);
 
   // The verifier treats all interfaces as java.lang.Object and relies on runtime checks in
diff --git a/runtime/monitor.cc b/runtime/monitor.cc
index bf9f931..e863ea9 100644
--- a/runtime/monitor.cc
+++ b/runtime/monitor.cc
@@ -155,7 +155,7 @@
       return false;
     }
   }
-  LockWord fat(this, lw.ReadBarrierState());
+  LockWord fat(this, lw.GCState());
   // Publish the updated lock word, which may race with other threads.
   bool success = GetObject()->CasLockWordWeakSequentiallyConsistent(lw, fat);
   // Lock profiling.
@@ -774,20 +774,21 @@
         return false;
       }
       // Deflate to a thin lock.
-      LockWord new_lw = LockWord::FromThinLockId(owner->GetThreadId(), monitor->lock_count_,
-                                                 lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromThinLockId(owner->GetThreadId(),
+                                                 monitor->lock_count_,
+                                                 lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to thin lock " << owner->GetTid() << " / "
           << monitor->lock_count_;
     } else if (monitor->HasHashCode()) {
-      LockWord new_lw = LockWord::FromHashCode(monitor->GetHashCode(), lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromHashCode(monitor->GetHashCode(), lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated " << obj << " to hash monitor " << monitor->GetHashCode();
     } else {
       // No lock and no hash, just put an empty lock word inside the object.
-      LockWord new_lw = LockWord::FromDefault(lw.ReadBarrierState());
+      LockWord new_lw = LockWord::FromDefault(lw.GCState());
       // Assume no concurrent read barrier state changes as mutators are suspended.
       obj->SetLockWord(new_lw, false);
       VLOG(monitor) << "Deflated" << obj << " to empty lock word";
@@ -876,7 +877,7 @@
     LockWord lock_word = h_obj->GetLockWord(true);
     switch (lock_word.GetState()) {
       case LockWord::kUnlocked: {
-        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.ReadBarrierState()));
+        LockWord thin_locked(LockWord::FromThinLockId(thread_id, 0, lock_word.GCState()));
         if (h_obj->CasLockWordWeakSequentiallyConsistent(lock_word, thin_locked)) {
           AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
           // CasLockWord enforces more than the acquire ordering we need here.
@@ -890,8 +891,9 @@
           // We own the lock, increase the recursion count.
           uint32_t new_count = lock_word.ThinLockCount() + 1;
           if (LIKELY(new_count <= LockWord::kThinLockMaxCount)) {
-            LockWord thin_locked(LockWord::FromThinLockId(thread_id, new_count,
-                                                          lock_word.ReadBarrierState()));
+            LockWord thin_locked(LockWord::FromThinLockId(thread_id,
+                                                          new_count,
+                                                          lock_word.GCState()));
             if (!kUseReadBarrier) {
               h_obj->SetLockWord(thin_locked, true);
               AtraceMonitorLock(self, h_obj.Get(), false /* is_wait */);
@@ -975,9 +977,9 @@
           LockWord new_lw = LockWord::Default();
           if (lock_word.ThinLockCount() != 0) {
             uint32_t new_count = lock_word.ThinLockCount() - 1;
-            new_lw = LockWord::FromThinLockId(thread_id, new_count, lock_word.ReadBarrierState());
+            new_lw = LockWord::FromThinLockId(thread_id, new_count, lock_word.GCState());
           } else {
-            new_lw = LockWord::FromDefault(lock_word.ReadBarrierState());
+            new_lw = LockWord::FromDefault(lock_word.GCState());
           }
           if (!kUseReadBarrier) {
             DCHECK_EQ(new_lw.ReadBarrierState(), 0U);
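
The common thread in the monitor.cc changes: inflation, deflation, thin locking, and unlocking all carry both GC bits across via GCState() rather than just the read barrier bit, so a lock transition can no longer drop a concurrent mark. A minimal standalone sketch of one such transition (hard-coded constants, not ART code; the monitor id is assumed to fit the 28 payload bits):

    #include <cstdint>

    constexpr uint32_t kGCStateShift = 28;
    constexpr uint32_t kGCStateMaskShifted = 0x30000000u;
    constexpr uint32_t kStateShift = 30;
    constexpr uint32_t kStateFat = 1;

    // Extract the two GC bits from a thin, fat, or hash-state lock word.
    uint32_t GCState(uint32_t lock_word) {
      return (lock_word & kGCStateMaskShifted) >> kGCStateShift;
    }

    // Inflate to a fat lock, carrying the GC bits across unchanged.
    uint32_t Inflate(uint32_t lock_word, uint32_t monitor_id) {
      return monitor_id | (GCState(lock_word) << kGCStateShift) |
             (kStateFat << kStateShift);
    }
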
diff --git a/runtime/oat.h b/runtime/oat.h
index 2c5c3e6..7c84fe9 100644
--- a/runtime/oat.h
+++ b/runtime/oat.h
@@ -32,7 +32,7 @@
 class PACKED(4) OatHeader {
  public:
   static constexpr uint8_t kOatMagic[] = { 'o', 'a', 't', '\n' };
-  static constexpr uint8_t kOatVersion[] = { '0', '8', '5', '\0' };
+  static constexpr uint8_t kOatVersion[] = { '0', '8', '6', '\0' };
 
   static constexpr const char* kImageLocationKey = "image-location";
   static constexpr const char* kDex2OatCmdLineKey = "dex2oat-cmdline";
diff --git a/runtime/read_barrier.h b/runtime/read_barrier.h
index 42e959c..5d32c09 100644
--- a/runtime/read_barrier.h
+++ b/runtime/read_barrier.h
@@ -99,8 +99,9 @@
   // Note: These couldn't be constexpr pointers as reinterpret_cast isn't compatible with them.
   static constexpr uintptr_t white_ptr_ = 0x0;    // Not marked.
   static constexpr uintptr_t gray_ptr_ = 0x1;     // Marked, but not marked through. On mark stack.
+  // TODO: black_ptr_ is unused, we should remove it.
   static constexpr uintptr_t black_ptr_ = 0x2;    // Marked through. Used for non-moving objects.
-  static constexpr uintptr_t rb_ptr_mask_ = 0x3;  // The low 2 bits for white|gray|black.
+  static constexpr uintptr_t rb_ptr_mask_ = 0x1;  // The low bit for white|gray.
 };
 
 }  // namespace art
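
With the black state retired, the read barrier state is a single bit and the barrier fast path reduces to one AND. A standalone sketch using the values above (not ART code):

    #include <cstdint>

    constexpr uintptr_t white_ptr_ = 0x0;    // Not marked.
    constexpr uintptr_t gray_ptr_ = 0x1;     // Marked, but not marked through.
    constexpr uintptr_t rb_ptr_mask_ = 0x1;  // Only one bit is live now.

    // The narrower mask makes the gray test a single-bit check.
    bool IsGray(uintptr_t rb_state) {
      return (rb_state & rb_ptr_mask_) == gray_ptr_;
    }
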
diff --git a/runtime/runtime-inl.h b/runtime/runtime-inl.h
index bfa8c54..265587d 100644
--- a/runtime/runtime-inl.h
+++ b/runtime/runtime-inl.h
@@ -45,9 +45,11 @@
     return GetCalleeSaveMethodFrameInfo(Runtime::kRefsAndArgs);
   } else if (method == GetCalleeSaveMethodUnchecked(Runtime::kSaveAll)) {
     return GetCalleeSaveMethodFrameInfo(Runtime::kSaveAll);
-  } else {
-    DCHECK_EQ(method, GetCalleeSaveMethodUnchecked(Runtime::kRefsOnly));
+  } else if (method == GetCalleeSaveMethodUnchecked(Runtime::kRefsOnly)) {
     return GetCalleeSaveMethodFrameInfo(Runtime::kRefsOnly);
+  } else {
+    DCHECK_EQ(method, GetCalleeSaveMethodUnchecked(Runtime::kSaveEverything));
+    return GetCalleeSaveMethodFrameInfo(Runtime::kSaveEverything);
   }
 }
 
diff --git a/runtime/runtime.h b/runtime/runtime.h
index afa8e48..7e269af 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -387,9 +387,10 @@
 
   // Returns a special method that describes all callee saves being spilled to the stack.
   enum CalleeSaveType {
-    kSaveAll,
+    kSaveAll,            // All callee-save registers.
     kRefsOnly,
     kRefsAndArgs,
+    kSaveEverything,     // Even caller-save registers.
     kLastCalleeSaveType  // Value used for iteration
   };
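
Because kSaveEverything is inserted ahead of the kLastCalleeSaveType sentinel, existing loops over the callee-save kinds pick it up with no further changes. A standalone sketch with a mock of the enum (not ART code):

    #include <cstddef>
    #include <cstdio>

    // Mock of the CalleeSaveType enum above.
    enum CalleeSaveType {
      kSaveAll,
      kRefsOnly,
      kRefsAndArgs,
      kSaveEverything,
      kLastCalleeSaveType  // Value used for iteration.
    };

    int main() {
      // Visits all four kinds, kSaveEverything included, with no code change.
      for (size_t i = 0; i < static_cast<size_t>(kLastCalleeSaveType); ++i) {
        printf("visiting callee-save type %zu\n", i);
      }
      return 0;
    }
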
 
diff --git a/tools/ahat/src/InstanceUtils.java b/tools/ahat/src/InstanceUtils.java
index 3cdb40c..8769d11 100644
--- a/tools/ahat/src/InstanceUtils.java
+++ b/tools/ahat/src/InstanceUtils.java
@@ -95,9 +95,7 @@
       return null;
     }
 
-    // TODO: When perflib provides a better way to get the length of the
-    // array, we should use that here.
-    int numChars = chars.getValues().length;
+    int numChars = chars.getLength();
     int count = getIntField(inst, "count", numChars);
     if (count == 0) {
       return "";
diff --git a/tools/cpp-define-generator/constant_lockword.def b/tools/cpp-define-generator/constant_lockword.def
index c1e6099..67ed5b5 100644
--- a/tools/cpp-define-generator/constant_lockword.def
+++ b/tools/cpp-define-generator/constant_lockword.def
@@ -30,5 +30,12 @@
 DEFINE_LOCK_WORD_EXPR(READ_BARRIER_STATE_MASK_TOGGLED, uint32_t, kReadBarrierStateMaskShiftedToggled)
 DEFINE_LOCK_WORD_EXPR(THIN_LOCK_COUNT_ONE,       int32_t,  kThinLockCountOne)
 
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED,   uint32_t,  kGCStateMaskShifted)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_MASK_SHIFTED_TOGGLED, uint32_t, kGCStateMaskShiftedToggled)
+DEFINE_LOCK_WORD_EXPR(GC_STATE_SHIFT,   int32_t,  kGCStateShift)
+
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_SHIFT, int32_t, kMarkBitStateShift)
+DEFINE_LOCK_WORD_EXPR(MARK_BIT_MASK_SHIFTED, uint32_t, kMarkBitStateMaskShifted)
+
 #undef DEFINE_LOCK_WORD_EXPR
 
diff --git a/tools/cpp-define-generator/offset_runtime.def b/tools/cpp-define-generator/offset_runtime.def
index b327ca3..123992f 100644
--- a/tools/cpp-define-generator/offset_runtime.def
+++ b/tools/cpp-define-generator/offset_runtime.def
@@ -34,6 +34,8 @@
 DEFINE_RUNTIME_CALLEE_SAVE_OFFSET(REFS_ONLY,     kRefsOnly)
 // Offset of field Runtime::callee_save_methods_[kRefsAndArgs]
 DEFINE_RUNTIME_CALLEE_SAVE_OFFSET(REFS_AND_ARGS, kRefsAndArgs)
+// Offset of field Runtime::callee_save_methods_[kSaveEverything]
+DEFINE_RUNTIME_CALLEE_SAVE_OFFSET(SAVE_EVERYTHING, kSaveEverything)
 
 #undef DEFINE_RUNTIME_CALLEE_SAVE_OFFSET
 #include "common_undef.def"  // undef DEFINE_OFFSET_EXPR
diff --git a/tools/libcore_failures.txt b/tools/libcore_failures.txt
index bf8d12b..8d87e4f 100644
--- a/tools/libcore_failures.txt
+++ b/tools/libcore_failures.txt
@@ -36,6 +36,15 @@
   names: ["libcore.io.OsTest#testUnixDomainSockets_in_file_system"]
 },
 {
+  description: "TCP_USER_TIMEOUT is not defined on host's tcp.h (glibc-2.15-4.8).",
+  result: EXEC_FAILED,
+  modes: [host],
+  names: ["libcore.android.system.OsConstantsTest#testTcpUserTimeoutIsDefined",
+          "libcore.io.OsTest#test_socket_tcpUserTimeout_setAndGet",
+          "libcore.io.OsTest#test_socket_tcpUserTimeout_doesNotWorkOnDatagramSocket"],
+  bug: 30402085
+},
+{
   description: "Issue with incorrect device time (1970)",
   result: EXEC_FAILED,
   modes: [device],
@@ -174,38 +183,7 @@
   description: "Failing tests after OpenJDK move.",
   result: EXEC_FAILED,
   bug: 26326992,
-  names: ["libcore.icu.RelativeDateTimeFormatterTest#test_getRelativeDateTimeStringDST",
-          "libcore.java.lang.OldSystemTest#test_load",
-          "libcore.java.text.NumberFormatTest#test_currencyWithPatternDigits",
-          "libcore.java.text.NumberFormatTest#test_setCurrency",
-          "libcore.java.text.OldNumberFormatTest#test_getIntegerInstanceLjava_util_Locale",
-          "libcore.java.util.CalendarTest#testAddOneDayAndOneDayOver30MinuteDstForwardAdds48Hours",
-          "libcore.java.util.CalendarTest#testNewCalendarKoreaIsSelfConsistent",
-          "libcore.java.util.CalendarTest#testSetTimeInZoneWhereDstIsNoLongerUsed",
-          "libcore.java.util.CalendarTest#test_nullLocale",
-          "libcore.java.util.FormatterTest#test_numberLocalization",
-          "libcore.java.util.FormatterTest#test_uppercaseConversions",
-          "libcore.javax.crypto.CipherTest#testCipher_getInstance_WrongType_Failure",
-          "libcore.javax.crypto.CipherTest#testDecryptBufferZeroSize_mustDecodeToEmptyString",
-          "libcore.javax.security.auth.x500.X500PrincipalTest#testExceptionsForWrongDNs",
-          "org.apache.harmony.luni.tests.java.net.URLConnectionTest#test_getDate",
-          "org.apache.harmony.luni.tests.java.net.URLConnectionTest#test_getExpiration",
-          "org.apache.harmony.regex.tests.java.util.regex.PatternSyntaxExceptionTest#testPatternSyntaxException",
-          "org.apache.harmony.tests.java.lang.FloatTest#test_parseFloat_LString_Harmony6261",
-          "org.apache.harmony.tests.java.lang.ThreadTest#test_isDaemon",
-          "org.apache.harmony.tests.java.text.DecimalFormatSymbolsTest#test_setInternationalCurrencySymbolLjava_lang_String",
-          "org.apache.harmony.tests.java.text.DecimalFormatTest#testSerializationHarmonyRICompatible",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parseLjava_lang_StringLjava_text_ParsePosition",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_W_w_dd_MMMM_yyyy_EEEE",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_dayOfYearPatterns",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_h_m_z",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_h_z_2DigitOffsetFromGMT",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_h_z_4DigitOffsetFromGMT",
-          "org.apache.harmony.tests.java.text.SimpleDateFormatTest#test_parse_h_z_4DigitOffsetNoGMT",
-          "org.apache.harmony.tests.java.util.jar.JarFileTest#test_getInputStreamLjava_util_jar_JarEntry_subtest0",
-          "libcore.java.util.CalendarTest#test_clear_45877",
-          "org.apache.harmony.crypto.tests.javax.crypto.spec.SecretKeySpecTest#testGetFormat",
-          "org.apache.harmony.tests.java.util.TimerTaskTest#test_scheduledExecutionTime"]
+  names: ["libcore.java.lang.OldSystemTest#test_load"]
 },
 {
   description: "Missing resource in classpath",
@@ -262,10 +240,12 @@
   names: ["org.apache.harmony.tests.java.lang.ProcessTest#test_destroyForcibly"]
 },
 {
-  description: "Flaky failure, possibly caused by a kernel bug accessing /proc/",
+  description: "Flaky failure, native crash in the runtime.
+                Unclear if this relates to the tests running sh as a child process.",
   result: EXEC_FAILED,
-  bug: 27464570,
+  bug: 30657148,
   modes: [device],
-  names: ["libcore.java.lang.ProcessBuilderTest#testRedirectInherit"]
+  names: ["libcore.java.lang.ProcessBuilderTest#testRedirectInherit",
+          "libcore.java.lang.ProcessBuilderTest#testRedirect_nullStreams"]
 }
 ]