Merge "MIPS64: Fix art_quick_aput_obj stubs"
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index 4acf3ac..93c6c20 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -21,13 +21,15 @@
 namespace art {
 
 // TODO: Generalize to cycles, as found by induction analysis?
-static bool IsPhiAddSub(HPhi* phi, /*out*/ HInstruction** addsub_out) {
+static bool IsPhiInduction(HPhi* phi, ArenaSet<HInstruction*>* iset) {
+  DCHECK(iset->empty());
   HInputsRef inputs = phi->GetInputs();
   if (inputs.size() == 2 && (inputs[1]->IsAdd() || inputs[1]->IsSub())) {
     HInstruction* addsub = inputs[1];
     if (addsub->InputAt(0) == phi || addsub->InputAt(1) == phi) {
       if (addsub->GetUses().HasExactlyOneElement()) {
-        *addsub_out = addsub;
+        iset->insert(phi);
+        iset->insert(addsub);
         return true;
       }
     }
@@ -35,39 +37,23 @@
   return false;
 }
 
-static bool IsOnlyUsedAfterLoop(const HLoopInformation& loop_info,
-                                HPhi* phi, HInstruction* addsub) {
-  for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
-    if (use.GetUser() != addsub) {
-      HLoopInformation* other_loop_info = use.GetUser()->GetBlock()->GetLoopInformation();
-      if (other_loop_info != nullptr && other_loop_info->IsIn(loop_info)) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
 // Find: phi: Phi(init, addsub)
 //       s:   SuspendCheck
 //       c:   Condition(phi, bound)
 //       i:   If(c)
 // TODO: Find a less pattern matching approach?
-static bool IsEmptyHeader(HBasicBlock* block, /*out*/ HInstruction** addsub) {
+static bool IsEmptyHeader(HBasicBlock* block, ArenaSet<HInstruction*>* iset) {
+  DCHECK(iset->empty());
   HInstruction* phi = block->GetFirstPhi();
-  if (phi != nullptr && phi->GetNext() == nullptr && IsPhiAddSub(phi->AsPhi(), addsub)) {
+  if (phi != nullptr && phi->GetNext() == nullptr && IsPhiInduction(phi->AsPhi(), iset)) {
     HInstruction* s = block->GetFirstInstruction();
     if (s != nullptr && s->IsSuspendCheck()) {
       HInstruction* c = s->GetNext();
       if (c != nullptr && c->IsCondition() && c->GetUses().HasExactlyOneElement()) {
         HInstruction* i = c->GetNext();
         if (i != nullptr && i->IsIf() && i->InputAt(0) == c) {
-          // Check that phi is only used inside loop as expected.
-          for (const HUseListNode<HInstruction*>& use : phi->GetUses()) {
-            if (use.GetUser() != *addsub && use.GetUser() != c) {
-              return false;
-            }
-          }
+          iset->insert(c);
+          iset->insert(s);
           return true;
         }
       }
@@ -76,10 +62,11 @@
   return false;
 }
 
-static bool IsEmptyBody(HBasicBlock* block, HInstruction* addsub) {
+static bool IsEmptyBody(HBasicBlock* block, ArenaSet<HInstruction*>* iset) {
   HInstruction* phi = block->GetFirstPhi();
   HInstruction* i = block->GetFirstInstruction();
-  return phi == nullptr && i == addsub && i->GetNext() != nullptr && i->GetNext()->IsGoto();
+  return phi == nullptr && iset->find(i) != iset->end() &&
+      i->GetNext() != nullptr && i->GetNext()->IsGoto();
 }
 
 static HBasicBlock* TryRemovePreHeader(HBasicBlock* preheader, HBasicBlock* entry_block) {
@@ -127,7 +114,8 @@
       induction_range_(induction_analysis),
       loop_allocator_(nullptr),
       top_loop_(nullptr),
-      last_loop_(nullptr) {
+      last_loop_(nullptr),
+      iset_(nullptr) {
 }
 
 void HLoopOptimization::Run() {
@@ -164,8 +152,14 @@
     }
   }
 
-  // Traverse the loop hierarchy inner-to-outer and optimize.
-  TraverseLoopsInnerToOuter(top_loop_);
+  // Traverse the loop hierarchy inner-to-outer and optimize. Traversal can use
+  // a temporary set that stores instructions using the phase-local allocator.
+  if (top_loop_ != nullptr) {
+    ArenaSet<HInstruction*> iset(loop_allocator_->Adapter(kArenaAllocLoopOptimization));
+    iset_ = &iset;
+    TraverseLoopsInnerToOuter(top_loop_);
+    iset_ = nullptr;  // detach
+  }
 }
 
 void HLoopOptimization::AddLoop(HLoopInformation* loop_info) {
@@ -194,9 +188,25 @@
 
 void HLoopOptimization::RemoveLoop(LoopNode* node) {
   DCHECK(node != nullptr);
-  // TODO: implement when needed (for current set of optimizations, we don't
-  // need to keep recorded loop hierarchy up to date, but as we get different
-  // traversal, we may want to remove the node from the hierarchy here.
+  DCHECK(node->inner == nullptr);
+  if (node->previous != nullptr) {
+    // Within sequence.
+    node->previous->next = node->next;
+    if (node->next != nullptr) {
+      node->next->previous = node->previous;
+    }
+  } else {
+    // First of sequence.
+    if (node->outer != nullptr) {
+      node->outer->inner = node->next;
+    } else {
+      top_loop_ = node->next;
+    }
+    if (node->next != nullptr) {
+      node->next->outer = node->outer;
+      node->next->previous = nullptr;
+    }
+  }
 }
 
 void HLoopOptimization::TraverseLoopsInnerToOuter(LoopNode* node) {
@@ -213,34 +223,20 @@
 void HLoopOptimization::SimplifyInduction(LoopNode* node) {
   HBasicBlock* header = node->loop_info->GetHeader();
   HBasicBlock* preheader = node->loop_info->GetPreHeader();
-  // Scan the phis in the header to find opportunities to optimize induction.
+  // Scan the phis in the header to find opportunities to simplify an induction
+  // cycle that is only used outside the loop. Replace these uses, if any, with
+  // the last value and remove the induction cycle.
+  // Examples: for (int i = 0; x != null;   i++) { .... no i .... }
+  //           for (int i = 0; i < 10; i++, k++) { .... no k .... } return k;
   for (HInstructionIterator it(header->GetPhis()); !it.Done(); it.Advance()) {
     HPhi* phi = it.Current()->AsPhi();
-    HInstruction* addsub = nullptr;
-    // Find phi-add/sub cycle.
-    if (IsPhiAddSub(phi, &addsub)) {
-      // Simple case, the induction is only used by itself. Although redundant,
-      // later phases do not easily detect this property. Thus, eliminate here.
-      // Example: for (int i = 0; x != null; i++) { .... no i .... }
-      if (phi->GetUses().HasExactlyOneElement()) {
-        // Remove the cycle, including all uses. Even environment uses can be removed,
-        // since these computations have no effect at all.
-        RemoveFromCycle(phi);  // removes environment uses too
-        RemoveFromCycle(addsub);
-        continue;
-      }
-      // Closed form case. Only the last value of the induction is needed. Remove all
-      // overhead from the loop, and replace subsequent uses with the last value.
-      // Example: for (int i = 0; i < 10; i++, k++) { .... no k .... } return k;
-      if (IsOnlyUsedAfterLoop(*node->loop_info, phi, addsub) &&
-          induction_range_.CanGenerateLastValue(phi)) {
-        HInstruction* last = induction_range_.GenerateLastValue(phi, graph_, preheader);
-        // Remove the cycle, replacing all uses. Even environment uses can consume the final
-        // value, since any first real use is outside the loop (although this may imply
-        // that deopting may look "ahead" a bit on the phi value).
-        ReplaceAllUses(phi, last, addsub);
-        RemoveFromCycle(phi);  // removes environment uses too
-        RemoveFromCycle(addsub);
+    iset_->clear();
+    int32_t use_count = 0;
+    if (IsPhiInduction(phi, iset_) &&
+        IsOnlyUsedAfterLoop(*node->loop_info, phi, &use_count) &&
+        TryReplaceWithLastValue(phi, use_count, preheader)) {
+      for (HInstruction* i : *iset_) {
+        RemoveFromCycle(i);
       }
     }
   }
@@ -266,14 +262,18 @@
   HBasicBlock* exit = (header->GetSuccessors()[0] == body)
       ? header->GetSuccessors()[1]
       : header->GetSuccessors()[0];
-  // Ensure exit can only be reached by exiting loop (this seems typically the
-  // case anyway, and simplifies code generation below; TODO: perhaps relax?).
+  // Ensure exit can only be reached by exiting loop.
   if (exit->GetPredecessors().size() != 1) {
     return;
   }
-  // Detect an empty loop: no side effects other than plain iteration.
-  HInstruction* addsub = nullptr;
-  if (IsEmptyHeader(header, &addsub) && IsEmptyBody(body, addsub)) {
+  // Detect an empty loop: no side effects other than plain iteration. Replace
+  // subsequent index uses, if any, with the last value and remove the loop.
+  iset_->clear();
+  int32_t use_count = 0;
+  if (IsEmptyHeader(header, iset_) &&
+      IsEmptyBody(body, iset_) &&
+      IsOnlyUsedAfterLoop(*node->loop_info, header->GetFirstPhi(), &use_count) &&
+      TryReplaceWithLastValue(header->GetFirstPhi(), use_count, preheader)) {
     HBasicBlock* entry = TryRemovePreHeader(preheader, graph_->GetEntryBlock());
     body->DisconnectAndDelete();
     exit->RemovePredecessor(header);
@@ -299,15 +299,29 @@
   }
 }
 
-void HLoopOptimization::ReplaceAllUses(HInstruction* instruction,
-                                       HInstruction* replacement,
-                                       HInstruction* exclusion) {
+bool HLoopOptimization::IsOnlyUsedAfterLoop(const HLoopInformation& loop_info,
+                                            HInstruction* instruction,
+                                            /*out*/ int32_t* use_count) {
+  for (const HUseListNode<HInstruction*>& use : instruction->GetUses()) {
+    HInstruction* user = use.GetUser();
+    if (iset_->find(user) == iset_->end()) {  // not excluded?
+      HLoopInformation* other_loop_info = user->GetBlock()->GetLoopInformation();
+      if (other_loop_info != nullptr && other_loop_info->IsIn(loop_info)) {
+        return false;
+      }
+      ++*use_count;
+    }
+  }
+  return true;
+}
+
+void HLoopOptimization::ReplaceAllUses(HInstruction* instruction, HInstruction* replacement) {
   const HUseList<HInstruction*>& uses = instruction->GetUses();
   for (auto it = uses.begin(), end = uses.end(); it != end;) {
     HInstruction* user = it->GetUser();
     size_t index = it->GetIndex();
     ++it;  // increment before replacing
-    if (user != exclusion) {
+    if (iset_->find(user) == iset_->end()) {  // not excluded?
       user->ReplaceInput(replacement, index);
       induction_range_.Replace(user, instruction, replacement);  // update induction
     }
@@ -317,7 +331,7 @@
     HEnvironment* user = it->GetUser();
     size_t index = it->GetIndex();
     ++it;  // increment before replacing
-    if (user->GetHolder() != exclusion) {
+    if (iset_->find(user->GetHolder()) == iset_->end()) {  // not excluded?
       user->RemoveAsUserOfInput(index);
       user->SetRawEnvAt(index, replacement);
       replacement->AddEnvUseAt(user, index);
@@ -325,4 +339,20 @@
   }
 }
 
+bool HLoopOptimization::TryReplaceWithLastValue(HInstruction* instruction,
+                                                int32_t use_count,
+                                                HBasicBlock* block) {
+  // If true uses appear after the loop, replace these uses with the last value. Environment
+  // uses can consume this value too, since any first true use is outside the loop (although
+  // this may imply that de-opting may look "ahead" a bit on the phi value). If there are only
+  // environment uses, the value is dropped altogether, since the computations have no effect.
+  if (use_count > 0) {
+    if (!induction_range_.CanGenerateLastValue(instruction)) {
+      return false;
+    }
+    ReplaceAllUses(instruction, induction_range_.GenerateLastValue(instruction, graph_, block));
+  }
+  return true;
+}
+
 }  // namespace art
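The new RemoveLoop keeps the recorded loop hierarchy up to date by unlinking a node from its sibling chain and, when the node heads a sequence, repairing the parent's inner pointer (or top_loop_). A standalone sketch of the same unlink cases, using a hypothetical Node struct rather than ART's LoopNode:

#include <cassert>

struct Node {
  Node* outer = nullptr;     // enclosing loop, null at top level
  Node* inner = nullptr;     // first nested loop
  Node* previous = nullptr;  // previous sibling
  Node* next = nullptr;      // next sibling
};

Node* top = nullptr;  // stands in for top_loop_

void Remove(Node* node) {
  assert(node != nullptr && node->inner == nullptr);  // innermost nodes only
  if (node->previous != nullptr) {
    // Within sequence: bypass the node.
    node->previous->next = node->next;
    if (node->next != nullptr) {
      node->next->previous = node->previous;
    }
  } else {
    // First of sequence: the parent (or the root pointer) must be repaired.
    if (node->outer != nullptr) {
      node->outer->inner = node->next;
    } else {
      top = node->next;
    }
    if (node->next != nullptr) {
      node->next->outer = node->outer;
      node->next->previous = nullptr;
    }
  }
}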
diff --git a/compiler/optimizing/loop_optimization.h b/compiler/optimizing/loop_optimization.h
index 6092955..b2bf1c8 100644
--- a/compiler/optimizing/loop_optimization.h
+++ b/compiler/optimizing/loop_optimization.h
@@ -63,9 +63,13 @@
   void SimplifyInduction(LoopNode* node);
   void RemoveIfEmptyLoop(LoopNode* node);
 
-  void ReplaceAllUses(HInstruction* instruction,
-                      HInstruction* replacement,
-                      HInstruction* exclusion);
+  bool IsOnlyUsedAfterLoop(const HLoopInformation& loop_info,
+                           HInstruction* instruction,
+                           /*out*/ int32_t* use_count);
+  void ReplaceAllUses(HInstruction* instruction, HInstruction* replacement);
+  bool TryReplaceWithLastValue(HInstruction* instruction,
+                               int32_t use_count,
+                               HBasicBlock* block);
 
   // Range information based on prior induction variable analysis.
   InductionVarRange induction_range_;
@@ -79,6 +83,10 @@
   LoopNode* top_loop_;
   LoopNode* last_loop_;
 
+  // Temporary bookkeeping of a set of instructions.
+  // Contents reside in phase-local heap memory.
+  ArenaSet<HInstruction*>* iset_;
+
   friend class LoopOptimizationTest;
 
   DISALLOW_COPY_AND_ASSIGN(HLoopOptimization);
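Run() pairs with this field: a stack-allocated ArenaSet is attached to iset_ for the duration of the traversal and detached before the phase returns, so the helpers share one scratch set without owning it. The same pattern in miniature, with std::set standing in for the arena-backed container:

#include <set>

class Phase {
 public:
  void Run() {
    std::set<int> iset;  // lifetime bounded by Run(), like the phase-local arena
    iset_ = &iset;
    Traverse();
    iset_ = nullptr;     // detach; the pointer would dangle otherwise
  }

 private:
  void Traverse() {
    iset_->clear();      // each helper clears and reuses the scratch set
    iset_->insert(42);
  }

  std::set<int>* iset_ = nullptr;
};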
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 6d61c64..83789cc 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -28,6 +28,7 @@
 #include "gc/space/large_object_space.h"
 #include "gc/space/region_space-inl.h"
 #include "gc/space/rosalloc_space-inl.h"
+#include "obj_ptr-inl.h"
 #include "runtime.h"
 #include "handle_scope-inl.h"
 #include "thread-inl.h"
@@ -433,6 +434,12 @@
   }
 }
 
+inline void Heap::WriteBarrierField(ObjPtr<mirror::Object> dst,
+                                    MemberOffset offset ATTRIBUTE_UNUSED,
+                                    ObjPtr<mirror::Object> new_value ATTRIBUTE_UNUSED) {
+  card_table_->MarkCard(dst.Ptr());
+}
+
 }  // namespace gc
 }  // namespace art
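MarkCard dirties the card covering the holder object; the offset and new value are irrelevant here because card granularity is coarser than a field. A conceptual sketch of a byte-per-card table, with made-up kCardShift and kCardDirty values (ART's real CardTable differs in detail):

#include <cstddef>
#include <cstdint>

constexpr size_t kCardShift = 10;     // hypothetical: 1 KiB of heap per card
constexpr uint8_t kCardDirty = 0x70;  // hypothetical dirty marker

uint8_t card_table[1 << 20];          // enough cards for a 1 GiB toy heap
uintptr_t heap_begin = 0;             // set to the heap base at startup

// Dirty the card for 'obj'; the GC rescans dirty cards for cross-space refs.
void MarkCard(const void* obj) {
  uintptr_t offset = reinterpret_cast<uintptr_t>(obj) - heap_begin;
  card_table[offset >> kCardShift] = kCardDirty;
}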
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e32f057..678edff 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -438,11 +438,10 @@
 
   // Must be called if a field of an Object in the heap changes, and before any GC safe-point.
   // The call is not needed if null is stored in the field.
-  ALWAYS_INLINE void WriteBarrierField(const mirror::Object* dst,
-                                       MemberOffset offset ATTRIBUTE_UNUSED,
-                                       const mirror::Object* new_value ATTRIBUTE_UNUSED) {
-    card_table_->MarkCard(dst);
-  }
+  ALWAYS_INLINE void WriteBarrierField(ObjPtr<mirror::Object> dst,
+                                       MemberOffset offset,
+                                       ObjPtr<mirror::Object> new_value)
+      REQUIRES_SHARED(Locks::mutator_lock_);
 
   // Write barrier for array operations that update many field positions
   ALWAYS_INLINE void WriteBarrierArray(const mirror::Object* dst,
diff --git a/runtime/globals.h b/runtime/globals.h
index 28534e4..6164225 100644
--- a/runtime/globals.h
+++ b/runtime/globals.h
@@ -172,6 +172,9 @@
 static constexpr bool kIsVdexEnabled = false;
 #endif
 
+// Size of a heap reference.
+static constexpr size_t kHeapReferenceSize = sizeof(uint32_t);
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_GLOBALS_H_
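kHeapReferenceSize records that managed heap references are compressed to 32 bits even on 64-bit targets; object_reference.h below ties it to the actual type with a static_assert. The arithmetic, checked at compile time:

#include <cstddef>
#include <cstdint>

constexpr size_t kBitsPerByte = 8;
constexpr size_t kHeapReferenceSize = sizeof(uint32_t);

static_assert(kHeapReferenceSize == 4, "heap references occupy 4 bytes");
static_assert(kHeapReferenceSize * kBitsPerByte == 32,
              "compressed heap references are 32 bits wide");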
diff --git a/runtime/mirror/object-inl.h b/runtime/mirror/object-inl.h
index ad7558c..3e7bca7 100644
--- a/runtime/mirror/object-inl.h
+++ b/runtime/mirror/object-inl.h
@@ -31,6 +31,7 @@
 #include "lock_word-inl.h"
 #include "monitor.h"
 #include "object_array-inl.h"
+#include "object_reference-inl.h"
 #include "obj_ptr-inl.h"
 #include "read_barrier-inl.h"
 #include "reference.h"
@@ -53,7 +54,7 @@
 }
 
 template<VerifyObjectFlags kVerifyFlags>
-inline void Object::SetClass(Class* new_klass) {
+inline void Object::SetClass(ObjPtr<Class> new_klass) {
   // new_klass may be null prior to class linker initialization.
   // We don't mark the card as this occurs as part of object allocation. Not all objects have
   // backing cards, such as large objects.
@@ -159,7 +160,6 @@
 #endif
 }
 
-
 inline uint32_t Object::GetMarkBit() {
 #ifdef USE_READ_BARRIER
   return GetLockWord(false).MarkBitState();
@@ -895,18 +895,18 @@
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags,
     bool kIsVolatile>
 inline void Object::SetFieldObjectWithoutWriteBarrier(MemberOffset field_offset,
-                                                      Object* new_value) {
+                                                      ObjPtr<Object> new_value) {
   if (kCheckTransaction) {
     DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
   }
   if (kTransactionActive) {
-    mirror::Object* obj;
+    ObjPtr<Object> obj;
     if (kIsVolatile) {
       obj = GetFieldObjectVolatile<Object>(field_offset);
     } else {
       obj = GetFieldObject<Object>(field_offset);
     }
-    Runtime::Current()->RecordWriteFieldReference(this, field_offset, obj, true);
+    Runtime::Current()->RecordWriteFieldReference(this, field_offset, obj.Ptr(), true);
   }
   if (kVerifyFlags & kVerifyThis) {
     VerifyObject(this);
@@ -919,17 +919,17 @@
   if (kIsVolatile) {
     // TODO: Refactor to use a SequentiallyConsistent store instead.
     QuasiAtomic::ThreadFenceRelease();  // Ensure that prior accesses are visible before store.
-    objref_addr->Assign(new_value);
+    objref_addr->Assign(new_value.Ptr());
     QuasiAtomic::ThreadFenceSequentiallyConsistent();
                                 // Ensure this store occurs before any volatile loads.
   } else {
-    objref_addr->Assign(new_value);
+    objref_addr->Assign(new_value.Ptr());
   }
 }
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags,
     bool kIsVolatile>
-inline void Object::SetFieldObject(MemberOffset field_offset, Object* new_value) {
+inline void Object::SetFieldObject(MemberOffset field_offset, ObjPtr<Object> new_value) {
   SetFieldObjectWithoutWriteBarrier<kTransactionActive, kCheckTransaction, kVerifyFlags,
       kIsVolatile>(field_offset, new_value);
   if (new_value != nullptr) {
@@ -940,7 +940,7 @@
 }
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
-inline void Object::SetFieldObjectVolatile(MemberOffset field_offset, Object* new_value) {
+inline void Object::SetFieldObjectVolatile(MemberOffset field_offset, ObjPtr<Object> new_value) {
   SetFieldObject<kTransactionActive, kCheckTransaction, kVerifyFlags, true>(field_offset,
                                                                             new_value);
 }
@@ -956,7 +956,8 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakSequentiallyConsistentObject(MemberOffset field_offset,
-                                                             Object* old_value, Object* new_value) {
+                                                             ObjPtr<Object> old_value,
+                                                             ObjPtr<Object> new_value) {
   bool success = CasFieldWeakSequentiallyConsistentObjectWithoutWriteBarrier<
       kTransactionActive, kCheckTransaction, kVerifyFlags>(field_offset, old_value, new_value);
   if (success) {
@@ -967,7 +968,9 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakSequentiallyConsistentObjectWithoutWriteBarrier(
-    MemberOffset field_offset, Object* old_value, Object* new_value) {
+    MemberOffset field_offset,
+    ObjPtr<Object> old_value,
+    ObjPtr<Object> new_value) {
   if (kCheckTransaction) {
     DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
   }
@@ -983,8 +986,8 @@
   if (kTransactionActive) {
     Runtime::Current()->RecordWriteFieldReference(this, field_offset, old_value, true);
   }
-  HeapReference<Object> old_ref(HeapReference<Object>::FromMirrorPtr(old_value));
-  HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(new_value));
+  HeapReference<Object> old_ref(HeapReference<Object>::FromObjPtr(old_value));
+  HeapReference<Object> new_ref(HeapReference<Object>::FromObjPtr(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
 
@@ -995,7 +998,8 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldStrongSequentiallyConsistentObject(MemberOffset field_offset,
-                                                               Object* old_value, Object* new_value) {
+                                                               ObjPtr<Object> old_value,
+                                                               ObjPtr<Object> new_value) {
   bool success = CasFieldStrongSequentiallyConsistentObjectWithoutWriteBarrier<
       kTransactionActive, kCheckTransaction, kVerifyFlags>(field_offset, old_value, new_value);
   if (success) {
@@ -1006,7 +1010,9 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldStrongSequentiallyConsistentObjectWithoutWriteBarrier(
-    MemberOffset field_offset, Object* old_value, Object* new_value) {
+    MemberOffset field_offset,
+    ObjPtr<Object> old_value,
+    ObjPtr<Object> new_value) {
   if (kCheckTransaction) {
     DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
   }
@@ -1022,8 +1028,8 @@
   if (kTransactionActive) {
     Runtime::Current()->RecordWriteFieldReference(this, field_offset, old_value, true);
   }
-  HeapReference<Object> old_ref(HeapReference<Object>::FromMirrorPtr(old_value));
-  HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(new_value));
+  HeapReference<Object> old_ref(HeapReference<Object>::FromObjPtr(old_value));
+  HeapReference<Object> new_ref(HeapReference<Object>::FromObjPtr(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
 
@@ -1034,7 +1040,9 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldWeakRelaxedObjectWithoutWriteBarrier(
-    MemberOffset field_offset, Object* old_value, Object* new_value) {
+    MemberOffset field_offset,
+    ObjPtr<Object> old_value,
+    ObjPtr<Object> new_value) {
   if (kCheckTransaction) {
     DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
   }
@@ -1050,8 +1058,8 @@
   if (kTransactionActive) {
     Runtime::Current()->RecordWriteFieldReference(this, field_offset, old_value, true);
   }
-  HeapReference<Object> old_ref(HeapReference<Object>::FromMirrorPtr(old_value));
-  HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(new_value));
+  HeapReference<Object> old_ref(HeapReference<Object>::FromObjPtr(old_value));
+  HeapReference<Object> new_ref(HeapReference<Object>::FromObjPtr(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
 
@@ -1062,7 +1070,9 @@
 
 template<bool kTransactionActive, bool kCheckTransaction, VerifyObjectFlags kVerifyFlags>
 inline bool Object::CasFieldStrongRelaxedObjectWithoutWriteBarrier(
-    MemberOffset field_offset, Object* old_value, Object* new_value) {
+    MemberOffset field_offset,
+    ObjPtr<Object> old_value,
+    ObjPtr<Object> new_value) {
   if (kCheckTransaction) {
     DCHECK_EQ(kTransactionActive, Runtime::Current()->IsActiveTransaction());
   }
@@ -1078,8 +1088,8 @@
   if (kTransactionActive) {
     Runtime::Current()->RecordWriteFieldReference(this, field_offset, old_value, true);
   }
-  HeapReference<Object> old_ref(HeapReference<Object>::FromMirrorPtr(old_value));
-  HeapReference<Object> new_ref(HeapReference<Object>::FromMirrorPtr(new_value));
+  HeapReference<Object> old_ref(HeapReference<Object>::FromObjPtr(old_value));
+  HeapReference<Object> new_ref(HeapReference<Object>::FromObjPtr(new_value));
   uint8_t* raw_addr = reinterpret_cast<uint8_t*>(this) + field_offset.Int32Value();
   Atomic<uint32_t>* atomic_addr = reinterpret_cast<Atomic<uint32_t>*>(raw_addr);
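Each CAS variant above packs old_value and new_value into 32-bit HeapReference payloads and runs the compare-exchange on an Atomic<uint32_t> at the field address. A freestanding sketch of that shape with std::atomic and a toy compression function (not ART's HeapReference, which also supports pointer poisoning):

#include <atomic>
#include <cstdint>

// Toy stand-in for a compressed reference: the low 32 bits of a pointer.
uint32_t Compress(void* p) {
  return static_cast<uint32_t>(reinterpret_cast<uintptr_t>(p));
}

// Weak CAS of a compressed-reference field, as in the
// CasFieldWeak*WithoutWriteBarrier helpers; on success the caller
// would follow up with a card-marking write barrier.
bool CasFieldWeak(std::atomic<uint32_t>* field, void* old_obj, void* new_obj) {
  uint32_t expected = Compress(old_obj);
  return field->compare_exchange_weak(expected, Compress(new_obj),
                                      std::memory_order_seq_cst);
}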
 
diff --git a/runtime/mirror/object.cc b/runtime/mirror/object.cc
index 90b97fd..8e49743 100644
--- a/runtime/mirror/object.cc
+++ b/runtime/mirror/object.cc
@@ -72,18 +72,45 @@
   Object* const dest_obj_;
 };
 
-Object* Object::CopyObject(Thread* self, mirror::Object* dest, mirror::Object* src,
+Object* Object::CopyObject(Thread* self,
+                           mirror::Object* dest,
+                           mirror::Object* src,
                            size_t num_bytes) {
-  // Copy instance data.  We assume memcpy copies by words.
-  // TODO: expose and use move32.
-  uint8_t* src_bytes = reinterpret_cast<uint8_t*>(src);
-  uint8_t* dst_bytes = reinterpret_cast<uint8_t*>(dest);
-  size_t offset = sizeof(Object);
-  memcpy(dst_bytes + offset, src_bytes + offset, num_bytes - offset);
+  // Copy instance data.  Don't assume memcpy copies by words (b/32012820).
+  {
+    const size_t offset = sizeof(Object);
+    uint8_t* src_bytes = reinterpret_cast<uint8_t*>(src) + offset;
+    uint8_t* dst_bytes = reinterpret_cast<uint8_t*>(dest) + offset;
+    num_bytes -= offset;
+    DCHECK_ALIGNED(src_bytes, sizeof(uintptr_t));
+    DCHECK_ALIGNED(dst_bytes, sizeof(uintptr_t));
+    // Use word sized copies to begin.
+    while (num_bytes >= sizeof(uintptr_t)) {
+      *reinterpret_cast<uintptr_t*>(dst_bytes) = *reinterpret_cast<uintptr_t*>(src_bytes);
+      src_bytes += sizeof(uintptr_t);
+      dst_bytes += sizeof(uintptr_t);
+      num_bytes -= sizeof(uintptr_t);
+    }
+    // Copy possible 32 bit word.
+    if (sizeof(uintptr_t) != sizeof(uint32_t) && num_bytes >= sizeof(uint32_t)) {
+      *reinterpret_cast<uint32_t*>(dst_bytes) = *reinterpret_cast<uint32_t*>(src_bytes);
+      src_bytes += sizeof(uint32_t);
+      dst_bytes += sizeof(uint32_t);
+      num_bytes -= sizeof(uint32_t);
+    }
+    // Copy remaining bytes, avoid going past the end of num_bytes since there may be a redzone
+    // there.
+    while (num_bytes > 0) {
+      *reinterpret_cast<uint8_t*>(dst_bytes) = *reinterpret_cast<uint8_t*>(src_bytes);
+      src_bytes += sizeof(uint8_t);
+      dst_bytes += sizeof(uint8_t);
+      num_bytes -= sizeof(uint8_t);
+    }
+  }
+
   if (kUseReadBarrier) {
-    // We need a RB here. After the memcpy that covers the whole
-    // object above, copy references fields one by one again with a
-    // RB. TODO: Optimize this later?
+    // We need a RB here. After copying the whole object above, copy reference fields one by one
+    // again with a RB to make sure there are no from-space refs. TODO: Optimize this later?
     CopyReferenceFieldsWithReadBarrierVisitor visitor(dest);
     src->VisitReferences(visitor, visitor);
   }
@@ -199,7 +226,7 @@
   UNREACHABLE();
 }
 
-void Object::CheckFieldAssignmentImpl(MemberOffset field_offset, Object* new_value) {
+void Object::CheckFieldAssignmentImpl(MemberOffset field_offset, ObjPtr<Object> new_value) {
   Class* c = GetClass();
   Runtime* runtime = Runtime::Current();
   if (runtime->GetClassLinker() == nullptr || !runtime->IsStarted() ||
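The rewritten CopyObject steps word by word, then one optional 32-bit word on 64-bit targets, then byte by byte, so it never reads or writes past num_bytes (which may abut a redzone, b/32012820). The same stepping as a standalone routine; memcpy is used per word here to sidestep strict-aliasing concerns that ART avoids via its known-aligned object layout:

#include <cstdint>
#include <cstring>

void CopyNoOverrun(uint8_t* dst, const uint8_t* src, size_t n) {
  // Word-sized copies first.
  while (n >= sizeof(uintptr_t)) {
    uintptr_t word;
    std::memcpy(&word, src, sizeof(word));
    std::memcpy(dst, &word, sizeof(word));
    src += sizeof(word);
    dst += sizeof(word);
    n -= sizeof(word);
  }
  // One possible trailing 32-bit word (64-bit targets only).
  if (sizeof(uintptr_t) != sizeof(uint32_t) && n >= sizeof(uint32_t)) {
    std::memcpy(dst, src, sizeof(uint32_t));
    src += sizeof(uint32_t);
    dst += sizeof(uint32_t);
    n -= sizeof(uint32_t);
  }
  // Remaining bytes, one at a time.
  while (n > 0) {
    *dst++ = *src++;
    --n;
  }
}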
diff --git a/runtime/mirror/object.h b/runtime/mirror/object.h
index 10faf60..9ddf995 100644
--- a/runtime/mirror/object.h
+++ b/runtime/mirror/object.h
@@ -92,7 +92,7 @@
   ALWAYS_INLINE Class* GetClass() REQUIRES_SHARED(Locks::mutator_lock_);
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  void SetClass(Class* new_klass) REQUIRES_SHARED(Locks::mutator_lock_);
+  void SetClass(ObjPtr<Class> new_klass) REQUIRES_SHARED(Locks::mutator_lock_);
 
   // TODO: Clean these up and change to return int32_t
   Object* GetReadBarrierPointer() REQUIRES_SHARED(Locks::mutator_lock_);
@@ -283,54 +283,69 @@
   ALWAYS_INLINE T* GetFieldObjectVolatile(MemberOffset field_offset)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags, bool kIsVolatile = false>
-  ALWAYS_INLINE void SetFieldObjectWithoutWriteBarrier(MemberOffset field_offset, Object* new_value)
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           bool kIsVolatile = false>
+  ALWAYS_INLINE void SetFieldObjectWithoutWriteBarrier(MemberOffset field_offset,
+                                                       ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags, bool kIsVolatile = false>
-  ALWAYS_INLINE void SetFieldObject(MemberOffset field_offset, Object* new_value)
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags,
+           bool kIsVolatile = false>
+  ALWAYS_INLINE void SetFieldObject(MemberOffset field_offset, ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  ALWAYS_INLINE void SetFieldObjectVolatile(MemberOffset field_offset, Object* new_value)
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  ALWAYS_INLINE void SetFieldObjectVolatile(MemberOffset field_offset,
+                                            ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  bool CasFieldWeakSequentiallyConsistentObject(MemberOffset field_offset, Object* old_value,
-                                                Object* new_value)
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool CasFieldWeakSequentiallyConsistentObject(MemberOffset field_offset,
+                                                ObjPtr<Object> old_value,
+                                                ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldWeakSequentiallyConsistentObjectWithoutWriteBarrier(MemberOffset field_offset,
-                                                                   Object* old_value,
-                                                                   Object* new_value)
+                                                                   ObjPtr<Object> old_value,
+                                                                   ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
-  bool CasFieldStrongSequentiallyConsistentObject(MemberOffset field_offset, Object* old_value,
-                                                  Object* new_value)
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  bool CasFieldStrongSequentiallyConsistentObject(MemberOffset field_offset,
+                                                  ObjPtr<Object> old_value,
+                                                  ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldStrongSequentiallyConsistentObjectWithoutWriteBarrier(MemberOffset field_offset,
-                                                                     Object* old_value,
-                                                                     Object* new_value)
+                                                                     ObjPtr<Object> old_value,
+                                                                     ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldWeakRelaxedObjectWithoutWriteBarrier(MemberOffset field_offset,
-                                                    Object* old_value,
-                                                    Object* new_value)
+                                                    ObjPtr<Object> old_value,
+                                                    ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  template<bool kTransactionActive, bool kCheckTransaction = true,
-      VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
+  template<bool kTransactionActive,
+           bool kCheckTransaction = true,
+           VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
   bool CasFieldStrongRelaxedObjectWithoutWriteBarrier(MemberOffset field_offset,
-                                                      Object* old_value,
-                                                      Object* new_value)
+                                                      ObjPtr<Object> old_value,
+                                                      ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   template<VerifyObjectFlags kVerifyFlags = kDefaultVerifyFlags>
@@ -585,9 +600,9 @@
 
   // Verify the type correctness of stores to fields.
   // TODO: This can cause thread suspension and isn't moving GC safe.
-  void CheckFieldAssignmentImpl(MemberOffset field_offset, Object* new_value)
+  void CheckFieldAssignmentImpl(MemberOffset field_offset, ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_);
-  void CheckFieldAssignment(MemberOffset field_offset, Object* new_value)
+  void CheckFieldAssignment(MemberOffset field_offset, ObjPtr<Object> new_value)
       REQUIRES_SHARED(Locks::mutator_lock_) {
     if (kCheckFieldAssignments) {
       CheckFieldAssignmentImpl(field_offset, new_value);
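The setters now accept ObjPtr<Object>, a value type that wraps a raw mirror pointer and, in debug builds, tags it with a validity cookie so references held across a suspend point can be caught; .Ptr() unwraps it at raw-pointer boundaries. A rough model of the wrapper's shape (cookie checking elided; not ART's exact implementation):

#include <cassert>
#include <cstddef>
#include <cstdint>

template <typename T>
class Ptr {  // toy model of ObjPtr<T>
 public:
  Ptr(T* p = nullptr) : ref_(reinterpret_cast<uintptr_t>(p)) {}  // implicit, like ObjPtr
  T* Get() const { return reinterpret_cast<T*>(ref_); }          // cf. ObjPtr::Ptr()
  T* operator->() const {
    assert(Get() != nullptr);  // debug ObjPtr also re-validates its cookie here
    return Get();
  }
  bool operator==(std::nullptr_t) const { return Get() == nullptr; }

 private:
  uintptr_t ref_;  // debug builds pack a cookie into the bits above the pointer
};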
diff --git a/runtime/mirror/object_reference-inl.h b/runtime/mirror/object_reference-inl.h
new file mode 100644
index 0000000..60955d6
--- /dev/null
+++ b/runtime/mirror/object_reference-inl.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_MIRROR_OBJECT_REFERENCE_INL_H_
+#define ART_RUNTIME_MIRROR_OBJECT_REFERENCE_INL_H_
+
+#include "object_reference.h"
+
+#include "obj_ptr-inl.h"
+
+namespace art {
+namespace mirror {
+
+// References between objects within the managed heap.
+template<class MirrorType>
+HeapReference<MirrorType> HeapReference<MirrorType>::FromObjPtr(ObjPtr<MirrorType> ptr) {
+  return HeapReference<MirrorType>(ptr.Ptr());
+}
+
+}  // namespace mirror
+}  // namespace art
+
+#endif  // ART_RUNTIME_MIRROR_OBJECT_REFERENCE_INL_H_
diff --git a/runtime/mirror/object_reference.h b/runtime/mirror/object_reference.h
index f4a3580..573cb30 100644
--- a/runtime/mirror/object_reference.h
+++ b/runtime/mirror/object_reference.h
@@ -19,6 +19,7 @@
 
 #include "base/mutex.h"  // For Locks::mutator_lock_.
 #include "globals.h"
+#include "obj_ptr.h"
 
 namespace art {
 namespace mirror {
@@ -86,11 +87,18 @@
       REQUIRES_SHARED(Locks::mutator_lock_) {
     return HeapReference<MirrorType>(mirror_ptr);
   }
+
+  static HeapReference<MirrorType> FromObjPtr(ObjPtr<MirrorType> ptr)
+      REQUIRES_SHARED(Locks::mutator_lock_);
+
  private:
   explicit HeapReference(MirrorType* mirror_ptr) REQUIRES_SHARED(Locks::mutator_lock_)
       : ObjectReference<kPoisonHeapReferences, MirrorType>(mirror_ptr) {}
 };
 
+static_assert(sizeof(mirror::HeapReference<mirror::Object>) == kHeapReferenceSize,
+              "heap reference size does not match");
+
 // Standard compressed reference used in the runtime. Used for StackReference and GC roots.
 template<class MirrorType>
 class MANAGED CompressedReference : public mirror::ObjectReference<false, MirrorType> {
diff --git a/runtime/obj_ptr.h b/runtime/obj_ptr.h
index 74be44e..75a6f9f 100644
--- a/runtime/obj_ptr.h
+++ b/runtime/obj_ptr.h
@@ -22,7 +22,6 @@
 
 #include "base/mutex.h"  // For Locks::mutator_lock_.
 #include "globals.h"
-#include "mirror/object_reference.h"
 
 namespace art {
 
@@ -32,7 +31,7 @@
 template<class MirrorType, bool kPoison = kIsDebugBuild>
 class ObjPtr {
   static constexpr size_t kCookieShift =
-      sizeof(mirror::HeapReference<mirror::Object>) * kBitsPerByte - kObjectAlignmentShift;
+      kHeapReferenceSize * kBitsPerByte - kObjectAlignmentShift;
   static constexpr size_t kCookieBits = sizeof(uintptr_t) * kBitsPerByte - kCookieShift;
   static constexpr uintptr_t kCookieMask = (static_cast<uintptr_t>(1u) << kCookieBits) - 1;
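With the constant used directly (kHeapReferenceSize, not sizeof(kHeapReferenceSize), which would yield sizeof(size_t)), the layout arithmetic matches the replaced expression: 8-byte-aligned 32-bit references leave 29 significant pointer bits, and the rest of a uintptr_t holds the cookie. Checked at compile time under those assumed values:

#include <cstddef>
#include <cstdint>

constexpr size_t kBitsPerByte = 8;
constexpr size_t kHeapReferenceSize = sizeof(uint32_t);
constexpr size_t kObjectAlignmentShift = 3;  // 8-byte object alignment, as in ART

constexpr size_t kCookieShift =
    kHeapReferenceSize * kBitsPerByte - kObjectAlignmentShift;  // 32 - 3 = 29
constexpr size_t kCookieBits = sizeof(uintptr_t) * kBitsPerByte - kCookieShift;
constexpr uintptr_t kCookieMask = (static_cast<uintptr_t>(1u) << kCookieBits) - 1;

static_assert(kCookieShift == 29, "pointer payload fits in the low 29 bits");
static_assert(sizeof(uintptr_t) != 8 || kCookieBits == 35,
              "on 64-bit targets the cookie gets 35 bits");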
 
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 7d9d506..9c0d2db 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1899,11 +1899,16 @@
   preinitialization_transaction_->RecordWriteField64(obj, field_offset, value, is_volatile);
 }
 
-void Runtime::RecordWriteFieldReference(mirror::Object* obj, MemberOffset field_offset,
-                                        mirror::Object* value, bool is_volatile) const {
+void Runtime::RecordWriteFieldReference(mirror::Object* obj,
+                                        MemberOffset field_offset,
+                                        ObjPtr<mirror::Object> value,
+                                        bool is_volatile) const {
   DCHECK(IsAotCompiler());
   DCHECK(IsActiveTransaction());
-  preinitialization_transaction_->RecordWriteFieldReference(obj, field_offset, value, is_volatile);
+  preinitialization_transaction_->RecordWriteFieldReference(obj,
+                                                            field_offset,
+                                                            value.Ptr(),
+                                                            is_volatile);
 }
 
 void Runtime::RecordWriteArray(mirror::Array* array, size_t index, uint64_t value) const {
diff --git a/runtime/runtime.h b/runtime/runtime.h
index 5a95f78..66fd058 100644
--- a/runtime/runtime.h
+++ b/runtime/runtime.h
@@ -502,8 +502,11 @@
                           bool is_volatile) const;
   void RecordWriteField64(mirror::Object* obj, MemberOffset field_offset, uint64_t value,
                           bool is_volatile) const;
-  void RecordWriteFieldReference(mirror::Object* obj, MemberOffset field_offset,
-                                 mirror::Object* value, bool is_volatile) const;
+  void RecordWriteFieldReference(mirror::Object* obj,
+                                 MemberOffset field_offset,
+                                 ObjPtr<mirror::Object> value,
+                                 bool is_volatile) const
+      REQUIRES_SHARED(Locks::mutator_lock_);
   void RecordWriteArray(mirror::Array* array, size_t index, uint64_t value) const
       REQUIRES_SHARED(Locks::mutator_lock_);
   void RecordStrongStringInsertion(mirror::String* s) const
diff --git a/runtime/verify_object-inl.h b/runtime/verify_object-inl.h
index f7a8249..4892b49 100644
--- a/runtime/verify_object-inl.h
+++ b/runtime/verify_object-inl.h
@@ -21,31 +21,32 @@
 
 #include "gc/heap.h"
 #include "mirror/object-inl.h"
+#include "obj_ptr-inl.h"
 
 namespace art {
 
-inline void VerifyObject(mirror::Object* obj) {
+inline void VerifyObject(ObjPtr<mirror::Object> obj) {
   if (kVerifyObjectSupport > kVerifyObjectModeDisabled && obj != nullptr) {
     if (kVerifyObjectSupport > kVerifyObjectModeFast) {
       // Slow object verification, try the heap right away.
-      Runtime::Current()->GetHeap()->VerifyObjectBody(obj);
+      Runtime::Current()->GetHeap()->VerifyObjectBody(obj.Ptr());
     } else {
       // Fast object verification, only call the heap if our quick sanity tests fail. The heap will
       // print the diagnostic message.
-      bool failed = !IsAligned<kObjectAlignment>(obj);
+      bool failed = !IsAligned<kObjectAlignment>(obj.Ptr());
       if (!failed) {
         mirror::Class* c = obj->GetClass<kVerifyNone>();
         failed = failed || !IsAligned<kObjectAlignment>(c);
         failed = failed || !VerifyClassClass(c);
       }
       if (UNLIKELY(failed)) {
-        Runtime::Current()->GetHeap()->VerifyObjectBody(obj);
+        Runtime::Current()->GetHeap()->VerifyObjectBody(obj.Ptr());
       }
     }
   }
 }
 
-inline bool VerifyClassClass(mirror::Class* c) {
+inline bool VerifyClassClass(ObjPtr<mirror::Class> c) {
   if (UNLIKELY(c == nullptr)) {
     return false;
   }
diff --git a/runtime/verify_object.h b/runtime/verify_object.h
index 8e1653d..384e56f 100644
--- a/runtime/verify_object.h
+++ b/runtime/verify_object.h
@@ -20,6 +20,7 @@
 #include <stdint.h>
 
 #include "base/macros.h"
+#include "obj_ptr.h"
 
 namespace art {
 
@@ -52,10 +53,10 @@
 static constexpr VerifyObjectMode kVerifyObjectSupport =
     kDefaultVerifyFlags != 0 ? kVerifyObjectModeFast : kVerifyObjectModeDisabled;
 
-ALWAYS_INLINE void VerifyObject(mirror::Object* obj) NO_THREAD_SAFETY_ANALYSIS;
+ALWAYS_INLINE void VerifyObject(ObjPtr<mirror::Object> obj) NO_THREAD_SAFETY_ANALYSIS;
 
 // Check that c.getClass() == c.getClass().getClass().
-ALWAYS_INLINE bool VerifyClassClass(mirror::Class* c) NO_THREAD_SAFETY_ANALYSIS;
+ALWAYS_INLINE bool VerifyClassClass(ObjPtr<mirror::Class> c) NO_THREAD_SAFETY_ANALYSIS;
 
 }  // namespace art
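VerifyClassClass relies on a fixed point of the type system: java.lang.Class is an instance of itself, so for any well-formed class object c, c.getClass() must equal c.getClass().getClass(). The shape of the check, sketched over a hypothetical GetClass() accessor:

// Sketch of the invariant behind VerifyClassClass (GetClass() assumed).
template <typename ClassT>
bool LooksLikeValidClass(ClassT* c) {
  if (c == nullptr) {
    return false;
  }
  ClassT* c_class = c->GetClass();  // should be the java.lang.Class object
  return c_class != nullptr && c_class == c_class->GetClass();
}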
 
diff --git a/test/618-checker-induction/src/Main.java b/test/618-checker-induction/src/Main.java
index a68c383..0ea85da 100644
--- a/test/618-checker-induction/src/Main.java
+++ b/test/618-checker-induction/src/Main.java
@@ -155,7 +155,7 @@
   /// CHECK-DAG:               Return [<<Phi1>>] loop:none
   //
   /// CHECK-START: int Main.closedFormInductionUp() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
+  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
   /// CHECK-DAG:               Return loop:none
   static int closedFormInductionUp() {
     int closed = 12345;
@@ -171,7 +171,7 @@
   /// CHECK-DAG:               Return [<<Phi2>>] loop:none
   //
   /// CHECK-START: int Main.closedFormInductionInAndDown(int) loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
+  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
   /// CHECK-DAG:               Return loop:none
   static int closedFormInductionInAndDown(int closed) {
     for (int i = 0; i < 10; i++) {
@@ -180,6 +180,17 @@
     return closed;  // only needs last value
   }
 
+  // TODO: move closed form even further out?
+  static int closedFormNested() {
+    int closed = 0;
+    for (int i = 0; i < 10; i++) {
+      for (int j = 0; j < 10; j++) {
+        closed++;
+      }
+    }
+    return closed;  // only needs last-value
+  }
+
   // TODO: taken test around closed form?
   static int closedFormInductionUpN(int n) {
     int closed = 12345;
@@ -198,7 +209,7 @@
   }
 
   // TODO: move closed form even further out?
-  static int closedFormNested(int n) {
+  static int closedFormNestedN(int n) {
     int closed = 0;
     for (int i = 0; i < n; i++) {
       for (int j = 0; j < 10; j++) {
@@ -208,34 +219,40 @@
     return closed;  // only needs last-value
   }
 
-  // TODO: handle as closed/empty eventually?
-  static int mainIndexReturned(int n) {
+  // TODO: move closed form even further out?
+  static int closedFormNestedNN(int n) {
+    int closed = 0;
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < n; j++) {
+        closed++;
+      }
+    }
+    return closed;  // only needs last-value
+  }
+
+  /// CHECK-START: int Main.mainIndexReturned() loop_optimization (before)
+  /// CHECK-DAG: <<Phi:i\d+>> Phi              loop:{{B\d+}} outer_loop:none
+  /// CHECK-DAG:              Return [<<Phi>>] loop:none
+  //
+  /// CHECK-START: int Main.mainIndexReturned() loop_optimization (after)
+  /// CHECK-NOT:              Phi    loop:{{B\d+}} outer_loop:none
+  /// CHECK-DAG:              Return loop:none
+  static int mainIndexReturned() {
     int i;
-    for (i = 0; i < n; i++);
+    for (i = 0; i < 10; i++);
     return i;
   }
 
   // If ever replaced by closed form, last value should be correct!
-  static int periodicReturned(int n) {
+  static int periodicReturned() {
     int k = 0;
-    for (int i = 0; i < n; i++) {
+    for (int i = 0; i < 9; i++) {
       k = 1 - k;
     }
     return k;
   }
 
-  // Same here.
-  private static int getSum(int n) {
-    int k = 0;
-    int sum = 0;
-    for (int i = 0; i < n; i++) {
-      k++;
-      sum += k;
-    }
-    return sum;
-  }
-
-  // Same here.
+  // If ever replaced by closed form, last value should be correct!
   private static int getSum21() {
     int k = 0;
     int sum = 0;
@@ -246,7 +263,34 @@
     return sum;
   }
 
-  // Same here.
+  // TODO: handle as closed/empty eventually?
+  static int mainIndexReturnedN(int n) {
+    int i;
+    for (i = 0; i < n; i++);
+    return i;
+  }
+
+  // If ever replaced by closed form, last value should be correct!
+  static int periodicReturnedN(int n) {
+    int k = 0;
+    for (int i = 0; i < n; i++) {
+      k = 1 - k;
+    }
+    return k;
+  }
+
+  // If ever replaced by closed form, last value should be correct!
+  private static int getSumN(int n) {
+    int k = 0;
+    int sum = 0;
+    for (int i = 0; i < n; i++) {
+      k++;
+      sum += k;
+    }
+    return sum;
+  }
+
+  // If ever replaced by closed form, last value should be correct!
   private static int closedTwice() {
     int closed = 0;
     for (int i = 0; i < 10; i++) {
@@ -269,7 +313,7 @@
   /// CHECK-EVAL: "<<Loop1>>" != "<<Loop2>>"
   //
   /// CHECK-START: int Main.closedFeed() loop_optimization (after)
-  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none
+  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
   /// CHECK-DAG:               Return loop:none
   private static int closedFeed() {
     int closed = 0;
@@ -316,6 +360,27 @@
     return closed;
   }
 
+  /// CHECK-START: int Main.waterFall() loop_optimization (before)
+  /// CHECK-DAG: <<Phi1:i\d+>> Phi               loop:<<Loop1:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi2:i\d+>> Phi               loop:<<Loop2:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi3:i\d+>> Phi               loop:<<Loop3:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi4:i\d+>> Phi               loop:<<Loop4:B\d+>> outer_loop:none
+  /// CHECK-DAG: <<Phi5:i\d+>> Phi               loop:<<Loop5:B\d+>> outer_loop:none
+  /// CHECK-DAG:               Return [<<Phi5>>] loop:none
+  //
+  /// CHECK-START: int Main.waterFall() loop_optimization (after)
+  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none
+  /// CHECK-DAG:               Return loop:none
+  private static int waterFall() {
+    int i = 0;
+    for (; i < 10; i++);
+    for (; i < 20; i++);
+    for (; i < 30; i++);
+    for (; i < 40; i++);
+    for (; i < 50; i++);
+    return i;  // this should become just 50
+  }
+
   private static int exceptionExitBeforeAdd() {
     int k = 0;
     try {
@@ -376,31 +441,32 @@
       expectEquals(4, a[i]);
     }
 
-    int c = closedFormInductionUp();
-    expectEquals(12395, c);
-    c = closedFormInductionInAndDown(12345);
-    expectEquals(12295, c);
+    expectEquals(12395, closedFormInductionUp());
+    expectEquals(12295, closedFormInductionInAndDown(12345));
+    expectEquals(10 * 10, closedFormNested());
     for (int n = -4; n < 10; n++) {
       int tc = (n <= 0) ? 0 : n;
-      c = closedFormInductionUpN(n);
-      expectEquals(12345 + tc * 5, c);
-      c = closedFormInductionInAndDownN(12345, n);
-      expectEquals(12345 - tc * 5, c);
-      c = closedFormNested(n);
-      expectEquals(tc * 10, c);
+      expectEquals(12345 + tc * 5, closedFormInductionUpN(n));
+      expectEquals(12345 - tc * 5, closedFormInductionInAndDownN(12345, n));
+      expectEquals(tc * 10, closedFormNestedN(n));
+      expectEquals(tc * tc, closedFormNestedNN(n));
     }
 
+    expectEquals(10, mainIndexReturned());
+    expectEquals(1, periodicReturned());
+    expectEquals(21, getSum21());
     for (int n = -4; n < 4; n++) {
       int tc = (n <= 0) ? 0 : n;
-      expectEquals(tc, mainIndexReturned(n));
-      expectEquals(tc & 1, periodicReturned(n));
-      expectEquals((tc * (tc + 1)) / 2, getSum(n));
+      expectEquals(tc, mainIndexReturnedN(n));
+      expectEquals(tc & 1, periodicReturnedN(n));
+      expectEquals((tc * (tc + 1)) / 2, getSumN(n));
     }
-    expectEquals(21, getSum21());
+
     expectEquals(10, closedTwice());
     expectEquals(20, closedFeed());
     expectEquals(-10, closedLargeUp());
     expectEquals(10, closedLargeDown());
+    expectEquals(50, waterFall());
 
     expectEquals(100, exceptionExitBeforeAdd());
     expectEquals(100, exceptionExitAfterAdd());
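
A note on the checker changes in this test: in ART's checker tests, plain text in a CHECK line is matched verbatim and only spans wrapped in {{...}} are treated as regular expressions. A bare loop:B\d+ therefore looks for the literal characters "B\d+", which never occur, so a CHECK-NOT written that way passes vacuously without testing anything. Hence the rewrites of the form:

  /// CHECK-NOT:               Phi    loop:B\d+ outer_loop:none      // literal, never matches
  /// CHECK-NOT:               Phi    loop:{{B\d+}} outer_loop:none  // regex, matches e.g. loop:B7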