diff --git a/src/compiler.h b/src/compiler.h
index 50447db..563bbde 100644
--- a/src/compiler.h
+++ b/src/compiler.h
@@ -8,6 +8,8 @@
 #include "jni_compiler.h"
 #include "object.h"
 
+int oatVRegOffsetFromMethod(art::Method* method, int reg);
+
 namespace art {
 
 class Compiler {
@@ -45,7 +47,6 @@
   void CompileDexFile(const ClassLoader* class_loader, const DexFile& dex_file);
   void CompileClass(Class* klass);
   void CompileMethod(Method* klass);
-  int oatVRegOffsetFromMethod(Method* method, int reg);
 
   // After compiling, walk all the DexCaches and set the code and
   // method pointers of CodeAndDirectMethods entries in the DexCaches.
diff --git a/src/context.h b/src/context.h
index 05cd43b..b4a8b65 100644
--- a/src/context.h
+++ b/src/context.h
@@ -27,6 +27,9 @@
   // Set the program counter value
   virtual void SetPC(uintptr_t new_pc) = 0;
 
+  // Read the value of general purpose register |reg| held in this context
+  virtual uintptr_t GetGPR(uint32_t reg) = 0;
+
   // Switch execution of the executing context to this context
   virtual void DoLongJump() = 0;
 };
diff --git a/src/context_arm.h b/src/context_arm.h
index 31a1a2a..07a4c0b 100644
--- a/src/context_arm.h
+++ b/src/context_arm.h
@@ -24,6 +24,12 @@
     gprs_[PC] = new_pc;
   }
 
+  // Returns the value held for core register |reg|, one of the 16 entries of gprs_.
+  virtual uintptr_t GetGPR(uint32_t reg) {
+    CHECK_LT(reg, 16u);  // |reg| is unsigned, so a lower-bound check would always pass.
+    return gprs_[reg];
+  }
+
   virtual void DoLongJump();
 
  private:
diff --git a/src/context_x86.h b/src/context_x86.h
index 10dcbb4..aca994b 100644
--- a/src/context_x86.h
+++ b/src/context_x86.h
@@ -26,6 +26,12 @@
     eip_ = new_pc;
   }
 
+  // Returns the value held for general purpose register |reg|, one of the 8 entries of gprs_.
+  virtual uintptr_t GetGPR(uint32_t reg) {
+    CHECK_LT(reg, 8u);  // |reg| is unsigned, so a lower-bound check would always pass.
+    return gprs_[reg];
+  }
+
   virtual void DoLongJump();
 
  private:
diff --git a/src/dex_verifier.cc b/src/dex_verifier.cc
index 2268772..8577dcc 100644
--- a/src/dex_verifier.cc
+++ b/src/dex_verifier.cc
@@ -5305,14 +5305,14 @@
   }
 
   /* Update method, and free compressed map if it was sitting on the heap. */
-  ByteArray* header = ByteArray::Alloc(sizeof(RegisterMapHeader));
-  ByteArray* data = ByteArray::Alloc(ComputeRegisterMapSize(map));
+  //ByteArray* header = ByteArray::Alloc(sizeof(RegisterMapHeader));
+  //ByteArray* data = ByteArray::Alloc(ComputeRegisterMapSize(map));
 
-  memcpy(header->GetData(), map->header_, sizeof(RegisterMapHeader));
-  memcpy(data->GetData(), map->data_, ComputeRegisterMapSize(map));
+  //memcpy(header->GetData(), map->header_, sizeof(RegisterMapHeader));
+  //memcpy(data->GetData(), map->data_, ComputeRegisterMapSize(map));
 
-  method->SetRegisterMapHeader(header);
-  method->SetRegisterMapData(data);
+  //method->SetRegisterMapHeader(header);
+  //method->SetRegisterMapData(data);
 
   delete map;
   return new_map;
diff --git a/src/heap.cc b/src/heap.cc
index 1b3daad..015f638 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -348,12 +348,13 @@
     ++Runtime::Current()->GetStats()->gc_for_alloc_count;
     ++Thread::Current()->GetStats()->gc_for_alloc_count;
   }
-  UNIMPLEMENTED(FATAL) << "No implicit GC, use larger -Xms -Xmx";
+  LOG(INFO) << "GC_FOR_ALLOC: TODO: test";
   CollectGarbageInternal();
   ptr = space->AllocWithoutGrowth(size);
   if (ptr != NULL) {
     return ptr;
   }
+  UNIMPLEMENTED(FATAL) << "No AllocWithGrowth, use larger -Xms -Xmx";
 
   // Even that didn't work;  this is an exceptional state.
   // Try harder, growing the heap if necessary.
diff --git a/src/object.h b/src/object.h
index 47aa98d..9b1d3c7 100644
--- a/src/object.h
+++ b/src/object.h
@@ -838,8 +838,7 @@
   }
 
   ShortArray* GetVMapTable() const {
-    return GetFieldObject<ShortArray*>(
-        OFFSET_OF_OBJECT_MEMBER(Method, vmap_table_), false);
+    return GetFieldObject<ShortArray*>(OFFSET_OF_OBJECT_MEMBER(Method, vmap_table_), false);
   }
 
   size_t GetFrameSizeInBytes() const {
diff --git a/src/thread.cc b/src/thread.cc
index d79e1c5..7adddfa 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -27,7 +27,9 @@
 #include <list>
 
 #include "class_linker.h"
+#include "compiler.h"
 #include "context.h"
+#include "dex_verifier.h"
 #include "heap.h"
 #include "jni_internal.h"
 #include "monitor.h"
@@ -532,6 +534,13 @@
   return *reinterpret_cast<uintptr_t*>(pc_addr);
 }
 
+// Loads dex virtual register |vreg| of |method| from this frame; |method| must own the frame.
+uintptr_t Frame::GetVReg(Method* method, int vreg) const {
+  DCHECK(method == GetMethod());
+  const byte* frame_base = reinterpret_cast<const byte*>(sp_);
+  return *reinterpret_cast<const uintptr_t*>(frame_base + oatVRegOffsetFromMethod(method, vreg));
+}
+
 uintptr_t Frame::LoadCalleeSave(int num) const {
   // Callee saves are held at the top of the frame
   Method* method = GetMethod();
@@ -1576,7 +1585,75 @@
   return gThread_daemon->GetBoolean(peer_);
 }
 
-void Thread::VisitRoots(Heap::RootVisitor* visitor, void* arg) const {
+// Stack visitor that hands every reference held in a frame's dex registers to a root
+// visitor, reading each from the callee-save context (if in the vmap table) or from the stack.
+class ReferenceMapVisitor : public Thread::StackVisitor {
+ public:
+  ReferenceMapVisitor(Context* context, Heap::RootVisitor* root_visitor, void* arg) :
+    context_(context), root_visitor_(root_visitor), arg_(arg) {}
+
+  void VisitFrame(const Frame& frame, uintptr_t pc) {
+    Method* m = frame.GetMethod();
+    LOG(INFO) << "Visiting stack roots in " << PrettyMethod(m, false);
+    // Process register map (which native and callee save methods don't have)
+    if (!m->IsNative() && !m->IsPhony()) {
+      UniquePtr<art::DexVerifier::RegisterMap> map(art::DexVerifier::GetExpandedRegisterMap(m));
+      const uint8_t* reg_bitmap = art::DexVerifier::RegisterMapGetLine(map.get(), m->ToDexPC(pc));
+      CHECK(reg_bitmap != NULL);
+      ShortArray* vmap = m->GetVMapTable();
+      for (int reg = 0; reg < m->NumRegisters(); ++reg) {
+        if (TestBitmap(reg, reg_bitmap)) {  // Only registers holding a reference are roots.
+          // Position of |reg| in the vmap table, or -1 if it lives on the stack.
+          // TODO: take advantage of the registers being ordered
+          int vmap_offset = -1;
+          for (int i = 0; i < vmap->GetLength() && vmap_offset < 0; i++) {
+            if (vmap->Get(i) == reg) {
+              vmap_offset = i;
+            }
+          }
+          Object* ref;
+          if (vmap_offset >= 0) {
+            uint32_t gpr = SpilledRegister(m->GetCoreSpillMask(), vmap_offset);
+            ref = reinterpret_cast<Object*>(context_->GetGPR(gpr));
+          } else {
+            ref = reinterpret_cast<Object*>(frame.GetVReg(m, reg));
+          }
+          if (ref != NULL) {  // A reference register may hold null, which is not a root.
+            root_visitor_(ref, arg_);
+          }
+        }
+      }
+    }
+    context_->FillCalleeSaves(frame);
+  }
+
+ private:
+  // Tests bit |reg| of the register map line |reg_vector|.
+  bool TestBitmap(int reg, const uint8_t* reg_vector) {
+    return ((reg_vector[reg / 8] >> (reg % 8)) & 0x01) != 0;
+  }
+
+  // Returns the position of the |ordinal|-th (0-based) set bit of |spill_mask|, used as the
+  // register number to read from the context for vmap entry |ordinal|. Aborts if too few bits set.
+  static uint32_t SpilledRegister(uint32_t spill_mask, int ordinal) {
+    uint32_t reg = 0;
+    for (int seen = 0; ; spill_mask >>= 1, ++reg) {
+      CHECK_NE(spill_mask, 0u);
+      if ((spill_mask & 1) != 0 && seen++ == ordinal) {
+        return reg;
+      }
+    }
+  }
+
+  // Context used to build up picture of callee saves
+  Context* context_;
+  // Call-back when we visit a root
+  Heap::RootVisitor* root_visitor_;
+  // Argument to call-back
+  void* arg_;
+};
+
+void Thread::VisitRoots(Heap::RootVisitor* visitor, void* arg) {
   if (exception_ != NULL) {
     visitor(exception_, arg);
   }
@@ -1585,8 +1662,12 @@
   }
   jni_env_->locals.VisitRoots(visitor, arg);
   jni_env_->monitors.VisitRoots(visitor, arg);
-  // visitThreadStack(visitor, thread, arg);
-  UNIMPLEMENTED(WARNING) << "some per-Thread roots not visited";
+  // Cheat and steal the long jump context. Assume that we are not doing a GC during exception
+  // delivery.
+  Context* context = GetLongJumpContext();
+  // Visit roots on this thread's stack
+  ReferenceMapVisitor mapper(context, visitor, arg);
+  WalkStack(&mapper);
 }
 
 static const char* kStateNames[] = {
diff --git a/src/thread.h b/src/thread.h
index ebab184..db5bbe8 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -121,6 +121,8 @@
 
   uintptr_t LoadCalleeSave(int num) const;
 
+  uintptr_t GetVReg(Method* method, int vreg) const;
+
   Method** GetSP() const {
     return sp_;
   }
@@ -460,7 +462,7 @@
   static jobjectArray InternalStackTraceToStackTraceElementArray(JNIEnv* env, jobject internal,
       jobjectArray output_array = NULL, int* stack_depth = NULL);
 
-  void VisitRoots(Heap::RootVisitor* visitor, void* arg) const;
+  void VisitRoots(Heap::RootVisitor* visitor, void* arg);
 
   //
   // Offsets of various members of native Thread class, used by compiled code.
