Introduce more compact ReadBarrierMark slow-paths.

Replace entry point ReadBarrierMark with 32
ReadBarrierMarkRegX entry points, using register
number X as input and output (instead of the standard
runtime calling convention) to save two moves in Baker's
read barrier mark slow-path code.

Test: ART host and target (ARM, ARM64) tests.
Bug: 29506760
Bug: 12687968
Change-Id: I73cfb82831cf040b8b018e984163c865cc44ed87
diff --git a/runtime/thread.h b/runtime/thread.h
index ab24625..a3a4005 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -1352,7 +1352,7 @@
       stacked_shadow_frame_record(nullptr), deoptimization_context_stack(nullptr),
       frame_id_to_shadow_frame(nullptr), name(nullptr), pthread_self(0),
       last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
-      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      thread_local_objects(0), thread_local_pos(nullptr), thread_local_end(nullptr),
       mterp_current_ibase(nullptr), mterp_default_ibase(nullptr), mterp_alt_ibase(nullptr),
       thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr),
       nested_signal_state(nullptr), flip_function(nullptr), method_verifier(nullptr),
@@ -1468,11 +1468,11 @@
 
     // Thread-local allocation pointer.
     uint8_t* thread_local_start;
+    size_t thread_local_objects;
     // thread_local_pos and thread_local_end must be consecutive for ldrd and are 8 byte aligned for
     // potentially better performance.
     uint8_t* thread_local_pos;
     uint8_t* thread_local_end;
-    size_t thread_local_objects;
 
     // Mterp jump table bases.
     void* mterp_current_ibase;