Merge "Fix function names in CHECK_NON_NULL_ARGUMENT."
diff --git a/Android.mk b/Android.mk
index 8e43879..2a575f5 100644
--- a/Android.mk
+++ b/Android.mk
@@ -144,7 +144,7 @@
 	@echo test-art-host-interpreter PASSED
 
 .PHONY: test-art-host-dependencies
-test-art-host-dependencies: $(ART_HOST_TEST_DEPENDENCIES) $(HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) $(HOST_CORE_DEX_LOCATIONS)
+test-art-host-dependencies: $(ART_HOST_TEST_DEPENDENCIES) $(HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) $(HOST_CORE_DEX_LOCATIONS) $(HOST_OUT_EXECUTABLES)/jasmin
 
 .PHONY: test-art-host-gtest
 test-art-host-gtest: $(ART_HOST_TEST_TARGETS)
@@ -210,7 +210,7 @@
 	@echo test-art-target PASSED
 
 .PHONY: test-art-target-dependencies
-test-art-target-dependencies: $(ART_TARGET_TEST_DEPENDENCIES) $(ART_TEST_OUT)/libarttest.so
+test-art-target-dependencies: $(ART_TARGET_TEST_DEPENDENCIES) $(ART_TEST_OUT)/libarttest.so $(HOST_OUT_EXECUTABLES)/jasmin
 
 .PHONY: test-art-target-sync
 test-art-target-sync: test-art-target-dependencies
diff --git a/compiler/dex/quick/mir_to_lir-inl.h b/compiler/dex/quick/mir_to_lir-inl.h
index f567b5c..c2d12f6 100644
--- a/compiler/dex/quick/mir_to_lir-inl.h
+++ b/compiler/dex/quick/mir_to_lir-inl.h
@@ -98,6 +98,16 @@
   return insn;
 }
 
+inline LIR* Mir2Lir::NewLIR2NoDest(int opcode, int src, int info) {
+  DCHECK(IsPseudoLirOp(opcode) || (GetTargetInstFlags(opcode) & IS_UNARY_OP))
+      << GetTargetInstName(opcode) << " " << opcode << " "
+      << PrettyMethod(cu_->method_idx, *cu_->dex_file) << " "
+      << current_dalvik_offset_;
+  LIR* insn = RawLIR(current_dalvik_offset_, opcode, src, info);
+  AppendLIR(insn);
+  return insn;
+}
+
 inline LIR* Mir2Lir::NewLIR3(int opcode, int dest, int src1, int src2) {
   DCHECK(IsPseudoLirOp(opcode) || (GetTargetInstFlags(opcode) & IS_TERTIARY_OP))
       << GetTargetInstName(opcode) << " " << opcode << " "
diff --git a/compiler/dex/quick/mir_to_lir.h b/compiler/dex/quick/mir_to_lir.h
index 6115953..5d4439f 100644
--- a/compiler/dex/quick/mir_to_lir.h
+++ b/compiler/dex/quick/mir_to_lir.h
@@ -401,6 +401,7 @@
     LIR* NewLIR0(int opcode);
     LIR* NewLIR1(int opcode, int dest);
     LIR* NewLIR2(int opcode, int dest, int src1);
+    LIR* NewLIR2NoDest(int opcode, int src, int info);
     LIR* NewLIR3(int opcode, int dest, int src1, int src2);
     LIR* NewLIR4(int opcode, int dest, int src1, int src2, int info);
     LIR* NewLIR5(int opcode, int dest, int src1, int src2, int info1, int info2);
@@ -480,6 +481,7 @@
     virtual void ResetDefLocWide(RegLocation rl);
     void ResetDefTracking();
     void ClobberAllRegs();
+    void FlushSpecificReg(RegisterInfo* info);
     void FlushAllRegsBody(RegisterInfo* info, int num_regs);
     void FlushAllRegs();
     bool RegClassMatches(int reg_class, int reg);
diff --git a/compiler/dex/quick/ralloc_util.cc b/compiler/dex/quick/ralloc_util.cc
index eb70d8c..0a65171 100644
--- a/compiler/dex/quick/ralloc_util.cc
+++ b/compiler/dex/quick/ralloc_util.cc
@@ -545,15 +545,19 @@
   }
 }
 
+void Mir2Lir::FlushSpecificReg(RegisterInfo* info) {
+  if (info->pair) {
+    FlushRegWide(info->reg, info->partner);
+  } else {
+    FlushReg(info->reg);
+  }
+}
+
 // Make sure nothing is live and dirty
 void Mir2Lir::FlushAllRegsBody(RegisterInfo* info, int num_regs) {
   for (int i = 0; i < num_regs; i++) {
     if (info[i].live && info[i].dirty) {
-      if (info[i].pair) {
-        FlushRegWide(info[i].reg, info[i].partner);
-      } else {
-        FlushReg(info[i].reg);
-      }
+      FlushSpecificReg(&info[i]);
     }
   }
 }
diff --git a/compiler/dex/quick/x86/assemble_x86.cc b/compiler/dex/quick/x86/assemble_x86.cc
index 321c6a7..6481589 100644
--- a/compiler/dex/quick/x86/assemble_x86.cc
+++ b/compiler/dex/quick/x86/assemble_x86.cc
@@ -295,7 +295,11 @@
   { kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1 }, "PsrlqRI", "!0r,!1d" },
   { kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1 }, "PsllqRI", "!0r,!1d" },
   { kX86SqrtsdRR, kRegReg, IS_BINARY_OP | REG_DEF0_USE1, { 0xF2, 0, 0x0F, 0x51, 0, 0, 0, 0 }, "SqrtsdRR", "!0r,!1r" },
-  { kX86FstpdM, kMem, IS_STORE | IS_BINARY_OP | REG_USE0, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0 }, "FstpdM", "[!0r,!1d]" },
+
+  { kX86Fild32M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xDB, 0x00, 0, 0, 0, 0 }, "Fild32M", "[!0r,!1d]" },
+  { kX86Fild64M, kMem, IS_LOAD | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xDF, 0x00, 0, 5, 0, 0 }, "Fild64M", "[!0r,!1d]" },
+  { kX86Fstp32M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xD9, 0x00, 0, 3, 0, 0 }, "Fstp32M", "[!0r,!1d]" },
+  { kX86Fstp64M, kMem, IS_STORE | IS_UNARY_OP | REG_USE0, { 0x0, 0, 0xDD, 0x00, 0, 3, 0, 0 }, "Fstp64M", "[!0r,!1d]" },
 
   EXT_0F_ENCODING_MAP(Movups,    0x0, 0x10, REG_DEF0),
   { kX86MovupsMR, kMemReg,      IS_STORE | IS_TERTIARY_OP | REG_USE02,  { 0x0, 0, 0x0F, 0x11, 0, 0, 0, 0 }, "MovupsMR", "[!0r+!1d],!2r" },
diff --git a/compiler/dex/quick/x86/codegen_x86.h b/compiler/dex/quick/x86/codegen_x86.h
index 22e36d5..70263d8 100644
--- a/compiler/dex/quick/x86/codegen_x86.h
+++ b/compiler/dex/quick/x86/codegen_x86.h
@@ -452,6 +452,7 @@
      */
     LIR* OpCmpMemImmBranch(ConditionCode cond, int temp_reg, int base_reg,
                            int offset, int check_value, LIR* target);
+
     /*
      * Can this operation be using core registers without temporaries?
      * @param rl_lhs Left hand operand.
@@ -460,6 +461,14 @@
      */
     bool IsOperationSafeWithoutTemps(RegLocation rl_lhs, RegLocation rl_rhs);
 
+    /**
+     * @brief Generates inline code for conversion of long to FP by using the x87 stack.
+     * @param rl_dest The destination of the FP result.
+     * @param rl_src The source long value.
+     * @param is_double 'true' if dealing with double, 'false' for float.
+     */
+    void GenLongToFP(RegLocation rl_dest, RegLocation rl_src, bool is_double);
+
     /*
      * @brief Perform MIR analysis before compiling method.
      * @note Invokes Mir2LiR::Materialize after analysis.
diff --git a/compiler/dex/quick/x86/fp_x86.cc b/compiler/dex/quick/x86/fp_x86.cc
index 006fe76..4c2ecc0 100644
--- a/compiler/dex/quick/x86/fp_x86.cc
+++ b/compiler/dex/quick/x86/fp_x86.cc
@@ -130,6 +130,70 @@
   StoreValueWide(rl_dest, rl_result);
 }
 
+void X86Mir2Lir::GenLongToFP(RegLocation rl_dest, RegLocation rl_src, bool is_double) {
+  // Compute the offsets of the source and destination VRs on the stack.
+  int src_v_reg_offset = SRegOffset(rl_src.s_reg_low);
+  int dest_v_reg_offset = SRegOffset(rl_dest.s_reg_low);
+
+  // Update the in-register state of source.
+  rl_src = UpdateLocWide(rl_src);
+
+  // If the source is in a physical register, then put it in its location on the stack.
+  if (rl_src.location == kLocPhysReg) {
+    RegisterInfo* lo_info = GetRegInfo(rl_src.low_reg);
+
+    if (lo_info != nullptr && lo_info->is_temp) {
+      // Calling FlushSpecificReg because it will only write back VR if it is dirty.
+      FlushSpecificReg(lo_info);
+    } else {
+      // It must have been register promoted if it is not a temp but is still in a physical
+      // register. Since we need it to be in memory to convert, we place it there now.
+      StoreBaseDispWide(TargetReg(kSp), src_v_reg_offset, rl_src.low_reg, rl_src.high_reg);
+    }
+  }
+
+  // Push the source virtual register onto the x87 stack.
+  LIR *fild64 = NewLIR2NoDest(kX86Fild64M, TargetReg(kSp), src_v_reg_offset + LOWORD_OFFSET);
+  AnnotateDalvikRegAccess(fild64, (src_v_reg_offset + LOWORD_OFFSET) >> 2,
+      true /* is_load */, true /* is64bit */);
+
+  // Now pop off x87 stack and store it in the destination VR's stack location.
+  int opcode = is_double ? kX86Fstp64M : kX86Fstp32M;
+  int displacement = is_double ? dest_v_reg_offset + LOWORD_OFFSET : dest_v_reg_offset;
+  LIR *fstp = NewLIR2NoDest(opcode, TargetReg(kSp), displacement);
+  AnnotateDalvikRegAccess(fstp, displacement >> 2, false /* is_load */, is_double);
+
+  /*
+   * The result is in a physical register if it was in a temp or was register
+   * promoted. For that reason it is enough to check if it is in physical
+   * register. If it is, then we must do all of the bookkeeping necessary to
+   * invalidate temp (if needed) and load in promoted register (if needed).
+   * If the result's location is in memory, then we do not need to do anything
+   * more since the fstp has already placed the correct value in memory.
+   */
+  RegLocation rl_result = is_double ? UpdateLocWide(rl_dest) : UpdateLoc(rl_dest);
+  if (rl_result.location == kLocPhysReg) {
+    /*
+     * We already know that the result is in a physical register but do not know if it is the
+     * right class. So we call EvalLoc(Wide) first which will ensure that it will get moved to the
+     * correct register class.
+     */
+    if (is_double) {
+      rl_result = EvalLocWide(rl_dest, kFPReg, true);
+
+      LoadBaseDispWide(TargetReg(kSp), dest_v_reg_offset, rl_result.low_reg, rl_result.high_reg, INVALID_SREG);
+
+      StoreValueWide(rl_dest, rl_result);
+    } else {
+      rl_result = EvalLoc(rl_dest, kFPReg, true);
+
+      LoadWordDisp(TargetReg(kSp), dest_v_reg_offset, rl_result.low_reg);
+
+      StoreValue(rl_dest, rl_result);
+    }
+  }
+}
+
 void X86Mir2Lir::GenConversion(Instruction::Code opcode, RegLocation rl_dest,
                                RegLocation rl_src) {
   RegisterClass rcSrc = kFPReg;
@@ -198,11 +262,10 @@
       return;
     }
     case Instruction::LONG_TO_DOUBLE:
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2d), rl_dest, rl_src);
+      GenLongToFP(rl_dest, rl_src, true /* is_double */);
       return;
     case Instruction::LONG_TO_FLOAT:
-      // TODO: inline by using memory as a 64-bit source. Be careful about promoted registers.
-      GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pL2f), rl_dest, rl_src);
+      GenLongToFP(rl_dest, rl_src, false /* is_double */);
       return;
     case Instruction::FLOAT_TO_LONG:
       GenConversionCall(QUICK_ENTRYPOINT_OFFSET(pF2l), rl_dest, rl_src);
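
The inlined LONG_TO_DOUBLE/LONG_TO_FLOAT path above goes through memory because x87 fild only takes memory operands, and 32-bit x86 has no direct SSE conversion from a 64-bit integer. As a rough standalone illustration of the fild/fstp pair the backend emits — a sketch only, with a hypothetical helper name, assuming a GCC/Clang toolchain, AT&T inline-assembly syntax, and an x86 host:

    #include <cstdint>

    // Push a 64-bit integer onto the x87 stack with fild, pop it back out as a
    // 64-bit double with fstp. Both instructions address memory, mirroring how
    // the compiler routes the value through virtual-register slots on the frame.
    double LongToDoubleX87(int64_t value) {
      double result;
      __asm__("fildq %1\n\t"
              "fstpl %0"
              : "=m"(result)
              : "m"(value));
      return result;
    }
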
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 9dd6116..1df9254 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -456,10 +456,20 @@
   RegLocation rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed,
                           r2, INVALID_REG, INVALID_SREG, INVALID_SREG};
 
-  // handle 0x80000000 / -1 special case.
-  LIR *minint_branch = 0;
-  if (imm == -1) {
+  // handle div/rem by 1 special case.
+  if (imm == 1) {
     if (is_div) {
+      // x / 1 == x.
+      StoreValue(rl_result, rl_src);
+    } else {
+      // x % 1 == 0.
+      LoadConstantNoClobber(r0, 0);
+      // For this case, return the result in EAX.
+      rl_result.low_reg = r0;
+    }
+  } else if (imm == -1) {  // handle 0x80000000 / -1 special case.
+    if (is_div) {
+      LIR *minint_branch = 0;
       LoadValueDirectFixed(rl_src, r0);
       OpRegImm(kOpCmp, r0, 0x80000000);
       minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
@@ -479,7 +489,7 @@
     // For this case, return the result in EAX.
     rl_result.low_reg = r0;
   } else {
-    DCHECK(imm <= -2 || imm >= 2);
+    CHECK(imm <= -2 || imm >= 2);
     // Use H.S.Warren's Hacker's Delight Chapter 10 and
     // T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
     int magic, shift;
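
Promoting the DCHECK to a CHECK makes the precondition of the magic-number path explicit: the Hacker's Delight sequence is only derived for |imm| >= 2, so division by 1 and by -1 must be peeled off above it. The -1 case is not just an optimization: IDIV of 0x80000000 by -1 faults on x86, while Java defines the result as 0x80000000. A small host-side sketch of the guards (hypothetical helper, not part of the compiler; divisor assumed non-zero since the zero case is handled by a separate check):

    #include <cstdint>

    // Mirrors the special cases the code generator peels off before falling
    // back to the magic-number sequence.
    int32_t JavaStyleDiv(int32_t dividend, int32_t divisor) {
      if (divisor == 1) {
        return dividend;  // x / 1 == x, no IDIV needed.
      }
      if (divisor == -1) {
        // INT32_MIN / -1 overflows IDIV; Java semantics wrap it to INT32_MIN.
        return dividend == INT32_MIN ? INT32_MIN : -dividend;
      }
      return dividend / divisor;  // |divisor| >= 2: magic-number path.
    }
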
diff --git a/compiler/dex/quick/x86/x86_lir.h b/compiler/dex/quick/x86/x86_lir.h
index 8c385a1..480d5f5 100644
--- a/compiler/dex/quick/x86/x86_lir.h
+++ b/compiler/dex/quick/x86/x86_lir.h
@@ -355,7 +355,10 @@
   kX86PsrlqRI,                  // right shift of floating point registers
   kX86PsllqRI,                  // left shift of floating point registers
   kX86SqrtsdRR,                 // sqrt of floating point register
-  kX86FstpdM,                   // Store and pop top x87 fp stack
+  kX86Fild32M,                  // push 32-bit integer on x87 stack
+  kX86Fild64M,                  // push 64-bit integer on x87 stack
+  kX86Fstp32M,                  // pop top x87 fp stack and do 32-bit store
+  kX86Fstp64M,                  // pop top x87 fp stack and do 64-bit store
   Binary0fOpCode(kX86Movups),   // load unaligned packed single FP values from xmm2/m128 to xmm1
   kX86MovupsMR, kX86MovupsAR,   // store unaligned packed single FP values from xmm1 to m128
   Binary0fOpCode(kX86Movaps),   // load aligned packed single FP values from xmm2/m128 to xmm1
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 61e9fbb..5b22817 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -980,12 +980,12 @@
 }
 
 static mirror::ArtField* ComputeFieldReferencedFromCompilingMethod(
-    ScopedObjectAccess& soa, const DexCompilationUnit* mUnit, uint32_t field_idx)
+    ScopedObjectAccess& soa, const DexCompilationUnit* mUnit, uint32_t field_idx, bool is_static)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   SirtRef<mirror::DexCache> dex_cache(soa.Self(), mUnit->GetClassLinker()->FindDexCache(*mUnit->GetDexFile()));
   SirtRef<mirror::ClassLoader> class_loader(soa.Self(), soa.Decode<mirror::ClassLoader*>(mUnit->GetClassLoader()));
   return mUnit->GetClassLinker()->ResolveField(*mUnit->GetDexFile(), field_idx, dex_cache,
-                                               class_loader, false);
+                                               class_loader, is_static);
 }
 
 static mirror::ArtMethod* ComputeMethodReferencedFromCompilingMethod(ScopedObjectAccess& soa,
@@ -1006,7 +1006,7 @@
   uint32_t method_idx = verifier->GetMethodReference().dex_method_index;
   mirror::ArtMethod* method = dex_cache->GetResolvedMethod(method_idx);
   mirror::ArtField* field = dex_cache->GetResolvedField(field_idx);
-  if (method == nullptr || field == nullptr) {
+  if (method == nullptr || field == nullptr || field->IsStatic()) {
     return false;
   }
   mirror::Class* method_class = method->GetDeclaringClass();
@@ -1030,7 +1030,8 @@
   *field_offset = -1;
   *is_volatile = true;
   // Try to resolve field and ignore if an Incompatible Class Change Error (ie is static).
-  mirror::ArtField* resolved_field = ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx);
+  mirror::ArtField* resolved_field =
+      ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx, false);
   if (resolved_field != NULL && !resolved_field->IsStatic()) {
     SirtRef<mirror::DexCache> dex_cache(soa.Self(),
                                         resolved_field->GetDeclaringClass()->GetDexCache());
@@ -1070,7 +1071,8 @@
   *is_volatile = true;
   *is_initialized = false;
   // Try to resolve field and ignore if an Incompatible Class Change Error (ie isn't static).
-  mirror::ArtField* resolved_field = ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx);
+  mirror::ArtField* resolved_field =
+      ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx, true);
   if (resolved_field != NULL && resolved_field->IsStatic()) {
     SirtRef<mirror::DexCache> dex_cache(soa.Self(), resolved_field->GetDeclaringClass()->GetDexCache());
     mirror::Class* referrer_class =
@@ -2109,6 +2111,7 @@
   "Ljava/net/Inet6Address;",  // Sub-class of InetAddress.
   "Ljava/net/InetUnixAddress;",  // Sub-class of InetAddress.
   "Ljava/net/NetworkInterface;",  // Calls to Random.<init> -> System.currentTimeMillis -> OsConstants.initConstants.
+  "Ljava/net/StandardSocketOptions;",  // Call System.identityHashCode.
   "Ljava/nio/charset/Charset;",  // Calls Charset.getDefaultCharset -> System.getProperty -> OsConstants.initConstants.
   "Ljava/nio/charset/CharsetICU;",  // Sub-class of Charset.
   "Ljava/nio/charset/Charsets;",  // Calls Charset.forName.
diff --git a/disassembler/disassembler_x86.cc b/disassembler/disassembler_x86.cc
index 903d755..b6ddc95 100644
--- a/disassembler/disassembler_x86.cc
+++ b/disassembler/disassembler_x86.cc
@@ -672,6 +672,13 @@
     has_modrm = true;
     reg_is_opcode = true;
     break;
+  case 0xDB:
+    static const char* db_opcodes[] = {"fildl", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db", "unknown-db"};
+    modrm_opcodes = db_opcodes;
+    load = true;
+    has_modrm = true;
+    reg_is_opcode = true;
+    break;
   case 0xDD:
     static const char* dd_opcodes[] = {"fldl", "fisttp", "fstl", "fstpl", "frstor", "unknown-dd", "fnsave", "fnstsw"};
     modrm_opcodes = dd_opcodes;
@@ -679,6 +686,13 @@
     has_modrm = true;
     reg_is_opcode = true;
     break;
+  case 0xDF:
+    static const char* df_opcodes[] = {"fild", "unknown-df", "unknown-df", "unknown-df", "unknown-df", "fildll", "unknown-df", "unknown-df"};
+    modrm_opcodes = df_opcodes;
+    load = true;
+    has_modrm = true;
+    reg_is_opcode = true;
+    break;
   case 0xE8: opcode << "call"; branch_bytes = 4; break;
   case 0xE9: opcode << "jmp"; branch_bytes = 4; break;
   case 0xEB: opcode << "jmp"; branch_bytes = 1; break;
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ea8f89c..d6f3228 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -73,6 +73,41 @@
     return true;
   }
 
+  // Atomically bump the back index by the given number of
+  // slots. Returns false if we overflowed the stack.
+  bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    int32_t index;
+    int32_t new_index;
+    do {
+      index = back_index_;
+      new_index = index + num_slots;
+      if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareAndSwap(index, new_index));
+    *start_address = &begin_[index];
+    *end_address = &begin_[new_index];
+    if (kIsDebugBuild) {
+      // Sanity check that the memory is zero.
+      for (int32_t i = index; i < new_index; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i << " index=" << index << " new_index=" << new_index;
+      }
+    }
+    return true;
+  }
+
+  void AssertAllZero() {
+    if (kIsDebugBuild) {
+      for (size_t i = 0; i < capacity_; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+      }
+    }
+  }
+
   void PushBack(const T& value) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
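
The new AtomicBumpBack is what makes thread-local allocation stacks cheap: a thread reserves a whole run of slots with a single successful compare-and-swap and then fills them without further synchronization. A minimal sketch of the same reservation pattern with std::atomic — hypothetical names, debug checks of the real stack omitted:

    #include <atomic>
    #include <cstddef>

    // Atomically reserve num_slots contiguous entries from a fixed-capacity
    // array. On success, [*out_start, *out_end) is exclusively owned by the
    // caller; on overflow the caller collects garbage and retries.
    bool BumpReserve(std::atomic<size_t>& back_index, size_t capacity,
                     size_t num_slots, size_t* out_start, size_t* out_end) {
      size_t index = back_index.load(std::memory_order_relaxed);
      size_t new_index;
      do {
        new_index = index + num_slots;
        if (new_index >= capacity) {
          return false;  // Stack overflow.
        }
      } while (!back_index.compare_exchange_weak(index, new_index));
      *out_start = index;
      *out_end = new_index;
      return true;
    }
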
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 65d4c441..d02b851 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1560,7 +1560,8 @@
 void RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
+  MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
   for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
     Thread* t = *it;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index de9f59e..dbbc115 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -206,6 +206,10 @@
     // This second sweep makes sure that we don't have any objects in the live stack which point to
     // freed objects. These cause problems since their references may be previously freed objects.
     SweepArray(GetHeap()->allocation_stack_.get(), false);
+    // Since SweepArray() above resets the (active) allocation stack,
+    // we need to revoke the thread-local allocation stacks that
+    // point into it.
+    GetHeap()->RevokeAllThreadLocalAllocationStacks(self);
   }
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -241,12 +245,15 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  heap_->SwapStacks(self);
 
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // If we exclusively hold the mutator lock, all threads must be suspended.
     MarkRoots();
+    if (kUseThreadLocalAllocationStack) {
+      heap_->RevokeAllThreadLocalAllocationStacks(self);
+    }
   } else {
     MarkThreadRoots(self);
     // At this point the live stack should no longer have any mutators which push into it.
@@ -995,6 +1002,9 @@
         << thread->GetState() << " thread " << thread << " self " << self;
     thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
     ATRACE_END();
+    if (kUseThreadLocalAllocationStack) {
+      thread->RevokeThreadLocalAllocationStack();
+    }
     mark_sweep_->GetBarrier().Pass(self);
   }
 
@@ -1062,6 +1072,9 @@
     Object** out = objects;
     for (size_t i = 0; i < count; ++i) {
       Object* obj = objects[i];
+      if (kUseThreadLocalAllocationStack && obj == nullptr) {
+        continue;
+      }
       if (space->HasAddress(obj)) {
         // This object is in the space, remove it from the array and add it to the sweep buffer
         // if needed.
@@ -1100,6 +1113,9 @@
   for (size_t i = 0; i < count; ++i) {
     Object* obj = objects[i];
     // Handle large objects.
+    if (kUseThreadLocalAllocationStack && obj == nullptr) {
+      continue;
+    }
     if (!large_mark_objects->Test(obj)) {
       ++freed_large_objects;
       freed_large_object_bytes += large_object_space->Free(self, obj);
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index b37b9d2..b1122b9 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -150,6 +150,7 @@
   immune_begin_ = nullptr;
   immune_end_ = nullptr;
   is_large_object_space_immune_ = false;
+  saved_bytes_ = 0;
   self_ = Thread::Current();
   // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
@@ -209,7 +210,10 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  if (kUseThreadLocalAllocationStack) {
+    heap_->RevokeAllThreadLocalAllocationStacks(self_);
+  }
+  heap_->SwapStacks(self_);
   WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
   MarkRoots();
   // Mark roots of immune spaces.
@@ -361,6 +365,9 @@
   } else {
     mprotect(from_space_->Begin(), from_space_->Capacity(), PROT_READ);
   }
+  if (saved_bytes_ > 0) {
+    VLOG(heap) << "Avoided dirtying " << PrettySize(saved_bytes_);
+  }
 
   if (generational_) {
     // Record the end (top) of the to space so we can distinguish
@@ -400,6 +407,56 @@
   return false;
 }
 
+static inline size_t CopyAvoidingDirtyingPages(void* dest, const void* src, size_t size) {
+  if (LIKELY(size <= static_cast<size_t>(kPageSize))) {
+    // We will dirty the current page and somewhere in the middle of the next page. This means
+    // that the next object copied will also dirty that page.
+    // TODO: Worth considering the last object copied? We may end up dirtying one page which is
+    // not necessary per GC.
+    memcpy(dest, src, size);
+    return 0;
+  }
+  size_t saved_bytes = 0;
+  byte* byte_dest = reinterpret_cast<byte*>(dest);
+  if (kIsDebugBuild) {
+    for (size_t i = 0; i < size; ++i) {
+      CHECK_EQ(byte_dest[i], 0U);
+    }
+  }
+  // Process the start of the page. The page must already be dirty, so don't bother with checking.
+  const byte* byte_src = reinterpret_cast<const byte*>(src);
+  const byte* limit = byte_src + size;
+  size_t page_remain = AlignUp(byte_dest, kPageSize) - byte_dest;
+  // Copy the bytes until the start of the next page.
+  memcpy(dest, src, page_remain);
+  byte_src += page_remain;
+  byte_dest += page_remain;
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_dest), kPageSize);
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_dest), sizeof(uintptr_t));
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_src), sizeof(uintptr_t));
+  while (byte_src + kPageSize < limit) {
+    bool all_zero = true;
+    uintptr_t* word_dest = reinterpret_cast<uintptr_t*>(byte_dest);
+    const uintptr_t* word_src = reinterpret_cast<const uintptr_t*>(byte_src);
+    for (size_t i = 0; i < kPageSize / sizeof(*word_src); ++i) {
+      // Assumes the destination of the copy is all zeros.
+      if (word_src[i] != 0) {
+        all_zero = false;
+        word_dest[i] = word_src[i];
+      }
+    }
+    if (all_zero) {
+      // Avoided copying into the page since it was all zeros.
+      saved_bytes += kPageSize;
+    }
+    byte_src += kPageSize;
+    byte_dest += kPageSize;
+  }
+  // Handle the part of the page at the end.
+  memcpy(byte_dest, byte_src, limit - byte_src);
+  return saved_bytes;
+}
+
 mirror::Object* SemiSpace::MarkNonForwardedObject(mirror::Object* obj) {
   size_t object_size = obj->SizeOf();
   size_t bytes_allocated;
@@ -458,7 +515,8 @@
   }
   // Copy over the object and add it to the mark stack since we still need to update its
   // references.
-  memcpy(reinterpret_cast<void*>(forward_address), obj, object_size);
+  saved_bytes_ +=
+      CopyAvoidingDirtyingPages(reinterpret_cast<void*>(forward_address), obj, object_size);
   if (to_space_live_bitmap_ != nullptr) {
     to_space_live_bitmap_->Set(forward_address);
   }
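
CopyAvoidingDirtyingPages relies on to-space pages being freshly mapped and therefore zero: for any page of the source that is entirely zero, skipping the stores means the destination page is never written, so the kernel can keep it backed by the shared zero page. A stripped-down sketch of the per-page inner loop (hypothetical helper; destination assumed pre-zeroed and word-aligned):

    #include <cstddef>
    #include <cstdint>

    // Copy one page worth of words, writing only the non-zero ones.
    // Returns true if the destination page was left untouched (still clean).
    bool CopyPageSkippingZeros(uintptr_t* dest, const uintptr_t* src,
                               size_t page_size) {
      bool all_zero = true;
      for (size_t i = 0; i < page_size / sizeof(uintptr_t); ++i) {
        if (src[i] != 0) {
          dest[i] = src[i];  // Destination word was already zero.
          all_zero = false;
        }
      }
      return all_zero;
    }
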
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index f58402f..89e2002 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -295,6 +295,9 @@
   // whole_heap_collection_ once per interval.
   int whole_heap_collection_interval_counter_;
 
+  // How many bytes we avoided dirtying.
+  size_t saved_bytes_;
+
   // Used for the generational mode. The default interval of the whole
   // heap collection. If N, the whole heap collection occurs every N
   // collections.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 5e1136b..9c91b0e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -82,11 +82,7 @@
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
   if (AllocatorHasAllocationStack(allocator)) {
-    // This is safe to do since the GC will never free objects which are neither in the allocation
-    // stack or the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(obj)) {
-      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-    }
+    PushOnAllocationStack(self, obj);
   }
   if (kInstrumented) {
     if (Dbg::IsAllocTrackingEnabled()) {
@@ -111,6 +107,35 @@
   return obj;
 }
 
+// The size of a thread-local allocation stack in the number of references.
+static constexpr size_t kThreadLocalAllocationStackSize = 128;
+
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+  if (kUseThreadLocalAllocationStack) {
+    bool success = self->PushOnThreadLocalAllocationStack(obj);
+    if (UNLIKELY(!success)) {
+      // Slow path. Allocate a new thread-local allocation stack.
+      mirror::Object** start_address;
+      mirror::Object** end_address;
+      while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
+                                                &start_address, &end_address)) {
+        CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+      }
+      self->SetThreadLocalAllocationStack(start_address, end_address);
+      // Retry on the new thread-local allocation stack.
+      success = self->PushOnThreadLocalAllocationStack(obj);
+      // Must succeed.
+      CHECK(success);
+    }
+  } else {
+    // This is safe to do since the GC will never free objects which are neither in the allocation
+    // stack nor the live bitmap.
+    while (!allocation_stack_->AtomicPushBack(obj)) {
+      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    }
+  }
+}
+
 template <bool kInstrumented, typename PreFenceVisitor>
 inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
                                               size_t byte_count,
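
PushOnAllocationStack layers a per-thread fast path over the shared stack: pushes go into the thread's private span until it fills up, and only then does the thread take the AtomicBumpBack slow path, collecting garbage if even that fails. A simplified sketch of the shape of that control flow, with hypothetical names standing in for the Thread and Heap plumbing:

    // Cursor into the span of the shared allocation stack owned by one thread.
    struct LocalSpan {
      void** top;  // Next free slot.
      void** end;  // One past the last slot in the span.
    };

    // RefillSpan stands in for AtomicBumpBack plus the collect-and-retry loop.
    bool PushObject(LocalSpan* span, void* obj, bool (*RefillSpan)(LocalSpan*)) {
      if (span->top < span->end) {  // Fast path: no atomics, no locks.
        *span->top++ = obj;
        return true;
      }
      if (!RefillSpan(span)) {      // Slow path: reserve a fresh span.
        return false;
      }
      *span->top++ = obj;           // Succeeds on a freshly reserved span.
      return true;
    }
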
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a324925..f1126ef 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -381,7 +381,12 @@
   for (mirror::Object** it = allocation_stack_->Begin(), **end = allocation_stack_->End();
       it < end; ++it) {
     mirror::Object* obj = *it;
-    callback(obj, arg);
+    if (obj != nullptr && obj->GetClass() != nullptr) {
+      // Avoid the race condition caused by the object not yet being written into the allocation
+      // stack or the class not yet being written in the object. Or, if kUseThreadLocalAllocationStack,
+      // there can be nulls on the allocation stack.
+      callback(obj, arg);
+    }
   }
   GetLiveBitmap()->Walk(callback, arg);
   self->EndAssertNoThreadSuspension(old_cause);
@@ -1529,13 +1534,14 @@
   mirror::Object** limit = stack->End();
   for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
     const mirror::Object* obj = *it;
-    DCHECK(obj != nullptr);
-    if (bitmap1->HasAddress(obj)) {
-      bitmap1->Set(obj);
-    } else if (bitmap2->HasAddress(obj)) {
-      bitmap2->Set(obj);
-    } else {
-      large_objects->Set(obj);
+    if (!kUseThreadLocalAllocationStack || obj != nullptr) {
+      if (bitmap1->HasAddress(obj)) {
+        bitmap1->Set(obj);
+      } else if (bitmap2->HasAddress(obj)) {
+        bitmap2->Set(obj);
+      } else {
+        large_objects->Set(obj);
+      }
     }
   }
 }
@@ -2000,7 +2006,9 @@
 
   // We can verify objects in the live stack since none of these should reference dead objects.
   for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-    visitor(*it);
+    if (!kUseThreadLocalAllocationStack || *it != nullptr) {
+      visitor(*it);
+    }
   }
 
   if (visitor.Failed()) {
@@ -2010,10 +2018,30 @@
   return true;
 }
 
-void Heap::SwapStacks() {
+void Heap::SwapStacks(Thread* self) {
+  if (kUseThreadLocalAllocationStack) {
+    live_stack_->AssertAllZero();
+  }
   allocation_stack_.swap(live_stack_);
 }
 
+void Heap::RevokeAllThreadLocalAllocationStacks(Thread* self) {
+  if (!Runtime::Current()->IsStarted()) {
+    // There's no thread list if the runtime hasn't started (e.g.
+    // dex2oat or a test). Just revoke for self.
+    self->RevokeThreadLocalAllocationStack();
+    return;
+  }
+  // This must be called only during the pause.
+  CHECK(Locks::mutator_lock_->IsExclusiveHeld(self));
+  MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(self, *Locks::thread_list_lock_);
+  std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* t : thread_list) {
+    t->RevokeThreadLocalAllocationStack();
+  }
+}
+
 accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
   auto it = mod_union_tables_.find(space);
   if (it == mod_union_tables_.end()) {
@@ -2068,12 +2096,12 @@
     thread_list->SuspendAll();
     {
       ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      SwapStacks();
+      SwapStacks(self);
       // Sort the live stack so that we can quickly binary search it later.
       if (!VerifyMissingCardMarks()) {
         LOG(FATAL) << "Pre " << gc->GetName() << " missing card mark verification failed";
       }
-      SwapStacks();
+      SwapStacks(self);
     }
     thread_list->ResumeAll();
   }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e416c0e..80a5a1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -111,6 +111,9 @@
 // If true, use rosalloc/RosAllocSpace instead of dlmalloc/DlMallocSpace
 static constexpr bool kUseRosAlloc = true;
 
+// If true, use thread-local allocation stack.
+static constexpr bool kUseThreadLocalAllocationStack = true;
+
 // The process state passed in from the activity manager, used to determine when to do trimming
 // and compaction.
 enum ProcessState {
@@ -665,11 +668,17 @@
       SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks();
+  void SwapStacks(Thread* self);
+
+  // Revoke all the thread-local allocation stacks.
+  void RevokeAllThreadLocalAllocationStacks(Thread* self);
 
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger& timings);
 
+  // Push an object onto the allocation stack.
+  void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_;
 
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index f7bdc4c..f3f594f 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -137,6 +137,7 @@
 
 void BumpPointerSpace::Walk(ObjectCallback* callback, void* arg) {
   byte* pos = Begin();
+  byte* end = End();
   byte* main_end = pos;
   {
     MutexLock mu(Thread::Current(), block_lock_);
@@ -145,16 +146,29 @@
     if (num_blocks_ == 0) {
       UpdateMainBlock();
     }
-    main_end += main_block_size_;
+    main_end = Begin() + main_block_size_;
+    if (num_blocks_ == 0) {
+      // We don't have any other blocks; this means someone else may be allocating into the main
+      // block. In this case, we don't want to try and visit the other blocks after the main block
+      // since these could actually be part of the main block.
+      end = main_end;
+    }
   }
   // Walk all of the objects in the main block first.
   while (pos < main_end) {
     mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
-    callback(obj, arg);
-    pos = reinterpret_cast<byte*>(GetNextObject(obj));
+    if (obj->GetClass() == nullptr) {
+      // There is a race condition where a thread has just allocated an object but not set the
+      // class. We can't know the size of this object, so we don't visit it and exit the function,
+      // since there are guaranteed to be no other blocks.
+      return;
+    } else {
+      callback(obj, arg);
+      pos = reinterpret_cast<byte*>(GetNextObject(obj));
+    }
   }
   // Walk the other blocks (currently only TLABs).
-  while (pos < End()) {
+  while (pos < end) {
     BlockHeader* header = reinterpret_cast<BlockHeader*>(pos);
     size_t block_size = header->size_;
     pos += sizeof(BlockHeader);  // Skip the header so that we know where the objects
diff --git a/runtime/gc/space/image_space.cc b/runtime/gc/space/image_space.cc
index ebad8dd..1af481d 100644
--- a/runtime/gc/space/image_space.cc
+++ b/runtime/gc/space/image_space.cc
@@ -286,6 +286,8 @@
       down_cast<mirror::String*>(image_header.GetImageRoot(ImageHeader::kOatLocation));
   std::string oat_filename;
   oat_filename += runtime->GetHostPrefix();
+  // Ensure the path ends with a '/'.
+  if (!oat_filename.empty()) oat_filename += "/";
   oat_filename += oat_location->ToModifiedUtf8();
   OatFile* oat_file = OatFile::Open(oat_filename, oat_filename, image_header.GetOatDataBegin(),
                                     !Runtime::Current()->IsCompiler(), error_msg);
diff --git a/runtime/native/dalvik_system_Zygote.cc b/runtime/native/dalvik_system_Zygote.cc
index 22c5430..29c0bc0 100644
--- a/runtime/native/dalvik_system_Zygote.cc
+++ b/runtime/native/dalvik_system_Zygote.cc
@@ -23,8 +23,10 @@
 #include <signal.h>
 #include <stdlib.h>
 #include <sys/types.h>
+#include <sys/stat.h>
 #include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 
 #include "cutils/fs.h"
 #include "cutils/multiuser.h"
@@ -406,13 +408,43 @@
 }
 #endif
 
+// Utility to close down the Zygote socket file descriptors while
+// the child is still running as root with Zygote's privileges.  Each
+// descriptor (if any) is closed via dup2(), replacing it with a valid
+// (open) descriptor to /dev/null.
+
+static void DetachDescriptors(JNIEnv* env, jintArray fdsToClose) {
+  if (!fdsToClose) {
+    return;
+  }
+  jsize count = env->GetArrayLength(fdsToClose);
+  jint *ar = env->GetIntArrayElements(fdsToClose, 0);
+  if (!ar) {
+      PLOG(FATAL) << "Bad fd array";
+  }
+  jsize i;
+  int devnull;
+  for (i = 0; i < count; i++) {
+    devnull = open("/dev/null", O_RDWR);
+    if (devnull < 0) {
+      PLOG(FATAL) << "Failed to open /dev/null";
+      continue;
+    }
+    PLOG(VERBOSE) << "Switching descriptor " << ar[i] << " to /dev/null";
+    if (dup2(devnull, ar[i]) < 0) {
+      PLOG(FATAL) << "Failed dup2() on descriptor " << ar[i];
+    }
+    close(devnull);
+  }
+}
+
 // Utility routine to fork zygote and specialize the child process.
 static pid_t ForkAndSpecializeCommon(JNIEnv* env, uid_t uid, gid_t gid, jintArray javaGids,
                                      jint debug_flags, jobjectArray javaRlimits,
                                      jlong permittedCapabilities, jlong effectiveCapabilities,
                                      jint mount_external,
                                      jstring java_se_info, jstring java_se_name,
-                                     bool is_system_server) {
+                                     bool is_system_server, jintArray fdsToClose) {
   Runtime* runtime = Runtime::Current();
   CHECK(runtime->IsZygote()) << "runtime instance not started with -Xzygote";
   if (!runtime->PreZygoteFork()) {
@@ -431,6 +463,9 @@
     // The child process.
     gMallocLeakZygoteChild = 1;
 
+    // Clean up any descriptors which must be closed immediately
+    DetachDescriptors(env, fdsToClose);
+
     // Keep capabilities across UID change, unless we're staying root.
     if (uid != 0) {
       EnableKeepCapabilities();
@@ -533,9 +568,10 @@
 
 static jint Zygote_nativeForkAndSpecialize(JNIEnv* env, jclass, jint uid, jint gid, jintArray gids,
                                            jint debug_flags, jobjectArray rlimits,
-                                           jint mount_external, jstring se_info, jstring se_name) {
+                                           jint mount_external, jstring se_info, jstring se_name,
+                                           jintArray fdsToClose) {
   return ForkAndSpecializeCommon(env, uid, gid, gids, debug_flags, rlimits, 0, 0, mount_external,
-                                 se_info, se_name, false);
+                                 se_info, se_name, false, fdsToClose);
 }
 
 static jint Zygote_nativeForkSystemServer(JNIEnv* env, jclass, uid_t uid, gid_t gid, jintArray gids,
@@ -545,7 +581,7 @@
   pid_t pid = ForkAndSpecializeCommon(env, uid, gid, gids,
                                       debug_flags, rlimits,
                                       permittedCapabilities, effectiveCapabilities,
-                                      MOUNT_EXTERNAL_NONE, NULL, NULL, true);
+                                      MOUNT_EXTERNAL_NONE, NULL, NULL, true, NULL);
   if (pid > 0) {
       // The zygote process checks whether the child process has died or not.
       LOG(INFO) << "System server process " << pid << " has been created";
@@ -562,7 +598,7 @@
 }
 
 static JNINativeMethod gMethods[] = {
-  NATIVE_METHOD(Zygote, nativeForkAndSpecialize, "(II[II[[IILjava/lang/String;Ljava/lang/String;)I"),
+  NATIVE_METHOD(Zygote, nativeForkAndSpecialize, "(II[II[[IILjava/lang/String;Ljava/lang/String;[I)I"),
   NATIVE_METHOD(Zygote, nativeForkSystemServer, "(II[II[[IJJ)I"),
 };
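
DetachDescriptors keeps the descriptor numbers valid for the child while severing them from the Zygote's sockets: dup2 atomically closes the old target and repoints it at /dev/null, so stray writes go nowhere instead of into the Zygote protocol. The same idiom in standalone form (hypothetical helper):

    #include <fcntl.h>
    #include <unistd.h>

    // Repoint 'fd' at /dev/null. Returns false if /dev/null cannot be opened
    // or the dup2 fails; 'fd' is left untouched in that case.
    bool RedirectToDevNull(int fd) {
      int devnull = open("/dev/null", O_RDWR);
      if (devnull < 0) {
        return false;
      }
      bool ok = dup2(devnull, fd) >= 0;
      close(devnull);
      return ok;
    }
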
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9420e7b..c0bf377 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -170,6 +170,42 @@
   return ret;
 }
 
+inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
+  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
+  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+    // There's room.
+    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
+    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
+    *thread_local_alloc_stack_top_ = obj;
+    ++thread_local_alloc_stack_top_;
+    return true;
+  }
+  return false;
+}
+
+inline void Thread::SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end) {
+  DCHECK(Thread::Current() == this) << "Should be called by self";
+  DCHECK(start != nullptr);
+  DCHECK(end != nullptr);
+  DCHECK_ALIGNED(start, sizeof(mirror::Object*));
+  DCHECK_ALIGNED(end, sizeof(mirror::Object*));
+  DCHECK_LT(start, end);
+  thread_local_alloc_stack_end_ = end;
+  thread_local_alloc_stack_top_ = start;
+}
+
+inline void Thread::RevokeThreadLocalAllocationStack() {
+  if (kIsDebugBuild) {
+    // Note: self is not necessarily equal to this thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
+        << GetState() << " thread " << this << " self " << self;
+  }
+  thread_local_alloc_stack_end_ = nullptr;
+  thread_local_alloc_stack_top_ = nullptr;
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 6c3e7ee..3382811 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -963,7 +963,9 @@
       thread_local_start_(nullptr),
       thread_local_pos_(nullptr),
       thread_local_end_(nullptr),
-      thread_local_objects_(0) {
+      thread_local_objects_(0),
+      thread_local_alloc_stack_top_(nullptr),
+      thread_local_alloc_stack_end_(nullptr) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
   state_and_flags_.as_struct.state = kNative;
@@ -1145,14 +1147,13 @@
   return managed_stack_.ShadowFramesContain(sirt_entry);
 }
 
-void Thread::SirtVisitRoots(RootCallback* visitor, void* arg) {
-  uint32_t tid = GetTid();
+void Thread::SirtVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id) {
   for (StackIndirectReferenceTable* cur = top_sirt_; cur; cur = cur->GetLink()) {
     size_t num_refs = cur->NumberOfReferences();
     for (size_t j = 0; j < num_refs; ++j) {
       mirror::Object* object = cur->GetReference(j);
       if (object != nullptr) {
-        mirror::Object* new_obj = visitor(object, arg, tid, kRootNativeStack);
+        mirror::Object* new_obj = visitor(object, arg, thread_id, kRootNativeStack);
         DCHECK(new_obj != nullptr);
         if (new_obj != object) {
           cur->SetReference(j, new_obj);
@@ -1998,7 +1999,7 @@
   }
   jni_env_->locals.VisitRoots(visitor, arg, thread_id, kRootJNILocal);
   jni_env_->monitors.VisitRoots(visitor, arg, thread_id, kRootJNIMonitor);
-  SirtVisitRoots(visitor, arg);
+  SirtVisitRoots(visitor, arg, thread_id);
   // Visit roots on this thread's stack
   Context* context = GetLongJumpContext();
   RootCallbackVisitor visitorToCallback(visitor, arg, thread_id);
@@ -2025,7 +2026,7 @@
 
 void Thread::VerifyStackImpl() {
   UniquePtr<Context> context(Context::Create());
-  RootCallbackVisitor visitorToCallback(VerifyRoot, Runtime::Current()->GetHeap(), GetTid());
+  RootCallbackVisitor visitorToCallback(VerifyRoot, Runtime::Current()->GetHeap(), GetThreadId());
   ReferenceMapVisitor<RootCallbackVisitor> mapper(this, context.get(), visitorToCallback);
   mapper.WalkStack();
 }
diff --git a/runtime/thread.h b/runtime/thread.h
index daffc92..6c072ba 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -503,7 +503,7 @@
   // Is the given obj in this thread's stack indirect reference table?
   bool SirtContains(jobject obj) const;
 
-  void SirtVisitRoots(RootCallback* visitor, void* arg)
+  void SirtVisitRoots(RootCallback* visitor, void* arg, uint32_t thread_id)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void PushSirt(StackIndirectReferenceTable* sirt) {
@@ -829,6 +829,19 @@
   static const size_t kRosAllocNumOfSizeBrackets = 34;
   void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
 
+  // Thread-local allocation stack data/routines.
+  mirror::Object** thread_local_alloc_stack_top_;
+  mirror::Object** thread_local_alloc_stack_end_;
+
+  // Push an object onto the allocation stack.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
  private:
   friend class Dbg;  // For SetStateUnsafe.
   friend class Monitor;