Parallel mark stack processing

Enabled parallel mark stack processing by using a thread pool.

Optimized object scanning by removing dependent loads for IsClass.

Performance:
Prime: ~10% speedup of partial GC.
Nakasi: ~50% speedup of partial GC.

Change-Id: I43256a068efc47cb52d93108458ea18d4e02fccc
diff --git a/src/runtime.cc b/src/runtime.cc
index 79d1fb2..3fa3123 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -115,6 +115,8 @@
 }
 
 Runtime::~Runtime() {
+  heap_->DeleteThreadPool();
+
   Thread* self = Thread::Current();
   {
     MutexLock mu(self, *Locks::runtime_shutdown_lock_);
@@ -696,6 +698,9 @@
 void Runtime::DidForkFromZygote() {
   is_zygote_ = false;
 
+  // Create the thread pool.
+  heap_->CreateThreadPool();
+
   StartSignalCatcher();
 
   // Start the JDWP thread. If the command-line debugger flags specified "suspend=y",
@@ -1030,7 +1035,9 @@
 }
 
 void Runtime::DirtyRoots() {
+  CHECK(intern_table_ != NULL);
   intern_table_->Dirty();
+  CHECK(class_linker_ != NULL);
   class_linker_->Dirty();
 }