Change ProcessReferences to not use RecursiveMarkObject.

Calling ProcessMarkStack in RecursiveMarkObject caused a lot of
overhead due to timing logger splits. Changed the logic to be the
same as prior to the reference queue refactoring which involves
calling process mark stack after preserving soft references and
enqueueing finalizer references.

FinalizingGC longest pause is reduced by around 1/2 down to ~300ms.
Benchmark score ~400000 -> ~600000.

Also changed the timing logger splits in the GC to carry a "(Paused)"
name prefix when the split covers a paused phase of the GC.

Bug: 12129382

Change-Id: I7476d4f23670b19d70738e2fd48e37ec2f57e9f4
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index 006c271..7b9d675 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -179,11 +179,11 @@
   TimingLogger::ScopedSplit split("ProcessReferences", &timings_);
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   GetHeap()->ProcessReferences(timings_, clear_soft_references_, &IsMarkedCallback,
-                               &RecursiveMarkObjectCallback, this);
+                               &MarkObjectCallback, &ProcessMarkStackPausedCallback, this);
 }
 
 bool MarkSweep::HandleDirtyObjectsPhase() {
-  TimingLogger::ScopedSplit split("HandleDirtyObjectsPhase", &timings_);
+  TimingLogger::ScopedSplit split("(Paused)HandleDirtyObjectsPhase", &timings_);
   Thread* self = Thread::Current();
   Locks::mutator_lock_->AssertExclusiveHeld(self);
 
@@ -400,10 +400,9 @@
   }
 }
 
-mirror::Object* MarkSweep::RecursiveMarkObjectCallback(mirror::Object* obj, void* arg) {
+mirror::Object* MarkSweep::MarkObjectCallback(mirror::Object* obj, void* arg) {
   MarkSweep* mark_sweep = reinterpret_cast<MarkSweep*>(arg);
   mark_sweep->MarkObject(obj);
-  mark_sweep->ProcessMarkStack(true);
   return obj;
 }
 
@@ -546,13 +545,6 @@
   reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNull(*root);
 }
 
-mirror::Object* MarkSweep::MarkObjectCallback(mirror::Object* object, void* arg) {
-  DCHECK(object != nullptr);
-  DCHECK(arg != nullptr);
-  reinterpret_cast<MarkSweep*>(arg)->MarkObjectNonNull(object);
-  return object;
-}
-
 void MarkSweep::VerifyRootCallback(const Object* root, void* arg, size_t vreg,
                                    const StackVisitor* visitor) {
   reinterpret_cast<MarkSweep*>(arg)->VerifyRoot(root, vreg, visitor);
@@ -957,7 +949,7 @@
 }
 
 void MarkSweep::ReMarkRoots() {
-  timings_.StartSplit("ReMarkRoots");
+  timings_.StartSplit("(Paused)ReMarkRoots");
   Runtime::Current()->VisitRoots(MarkRootCallback, this, true, true);
   timings_.EndSplit();
 }
@@ -1208,6 +1200,11 @@
   ScanObjectVisit(obj, visitor);
 }
 
+void MarkSweep::ProcessMarkStackPausedCallback(void* arg) {
+  DCHECK(arg != nullptr);
+  reinterpret_cast<MarkSweep*>(arg)->ProcessMarkStack(true);
+}
+
 void MarkSweep::ProcessMarkStackParallel(size_t thread_count) {
   Thread* self = Thread::Current();
   ThreadPool* thread_pool = GetHeap()->GetThreadPool();
@@ -1231,7 +1228,7 @@
 
 // Scan anything that's on the mark stack.
 void MarkSweep::ProcessMarkStack(bool paused) {
-  timings_.StartSplit("ProcessMarkStack");
+  timings_.StartSplit(paused ? "(Paused)ProcessMarkStack" : "ProcessMarkStack");
   size_t thread_count = GetThreadCount(paused);
   if (kParallelProcessMarkStack && thread_count > 1 &&
       mark_stack_->Size() >= kMinimumParallelMarkStackSize) {