Fast Art interpreter

Add a Dalvik-style fast interpreter to Art.
Three primary deficiencies in the existing Art interpreter
will be addressed:

1.  Structural inefficiencies (primarily the bloated
    fetch/decode/execute overhead of the C++ interpreter
    implementation).
2.  Stack memory wastage.  Each managed-language invoke
    adds a full copy of the interpreter's compiler-generated
    locals on the shared stack.  We're at the mercy of
    the compiler now in how much memory is wasted here.  An
    assembly based interpreter can manage memory usage more
    effectively.
3.  Shadow frame model, which not only spends twice the memory
    to store the Dalvik virtual registers, but causes vreg stores
    to happen twice.

This CL mostly deals with #1 (but does provide some stack memory
savings).  Subsequent CLs will address the other issues.

Current status:
   Passes all run-tests.
   Phone boots interpret-only.
   2.5x faster than Clang-compiled Art goto interpreter on fetch/decode/execute
       microbenchmark, 5x faster than gcc-compiled goto interpreter.
   1.6x faster than Clang goto on Caffeinemark overall
   2.0x faster than Clang switch on Caffeinemark overall
   68% of Dalvik interpreter performance on Caffeinemark (still much slower,
       primarily because of poor invoke performance and lack of execute-inline)
   Still nearly an order of magnitude slower than Dalvik on invokes
       (but slightly better than Art Clang goto interpreter).
   Importantly, saves ~200 bytes of stack memory per invoke (but still
       wastes ~400 relative to Dalvik).

What's needed:
   Remove the (large quantity of) bring-up hackery in place.
   Integrate into the build mechanism.  I'm still using the old Dalvik manual
       build step to generate assembly code from the stub files.
   Remove the suspend check hack.  For bring-up purposes, I'm using an explicit
       suspend check (like the other Art interpreters).  However, we should be
       doing a Dalvik style suspend check via the table base switch mechanism.
       This should be done during the alternative interpreter activation.
   General cleanup.
   Add CFI info.
   Update the new target bring-up README documentation.
   Add other targets.

In later CLs:
   Consolidate mterp handlers for expensive operations (such as new-instance) with
       the code used by the switch interpreter.  No need to duplicate the code for
       heavyweight operations (but will need some refactoring to align).
   Tuning - some fast paths need to be moved down to the assembly handlers,
       rather than being dealt with in the out-of-line code.
   JIT profiling.  Currently, the fast interpreter is used only in the fast
       case - no instrumentation, no transactions and no access checks. We
       will want to implement fast + JIT-profiling as the alternate fast
       interpreter.  All other cases can still fall back to the reference
       interpreter.
   Improve invoke performance.  We're nearly an order of magnitude slower than
       Dalvik here.  Some of that is unavoidable, but I suspect we can do
       better.
   Add support for our other targets.

Change-Id: I43e25dc3d786fb87245705ac74a87274ad34fedc
diff --git a/runtime/interpreter/mterp/arm/footer.S b/runtime/interpreter/mterp/arm/footer.S
new file mode 100644
index 0000000..75e0037
--- /dev/null
+++ b/runtime/interpreter/mterp/arm/footer.S
@@ -0,0 +1,168 @@
+/*
+ * ===========================================================================
+ *  Common subroutines and data
+ * ===========================================================================
+ */
+
+    .text
+    .align  2
+
+/*
+ * Error stubs: a handler found a condition that will raise an exception, but nothing has
+ * been thrown yet.  Each stub runs EXPORT_PC, logs if MTERP_LOGGING is nonzero, then bails
+ * out to the reference interpreter.  TUNING: for consistency, we may want to handle these here.
+ */
+#define MTERP_LOGGING 0 /* nonzero assembles the MterpLog* calls inside the #if blocks below */
+common_errDivideByZero:
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogDivideByZeroException                @ (self, shadow_frame); compiled out while MTERP_LOGGING is 0
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+common_errArrayIndex:                               @ array-index error stub: optional log, then fall back
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogArrayIndexException                  @ (self, shadow_frame); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+common_errNegativeArraySize:                        @ negative-array-size error stub: optional log, then fall back
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogNegativeArraySizeException           @ (self, shadow_frame); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+common_errNoSuchMethod:                             @ no-such-method error stub: optional log, then fall back
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogNoSuchMethodException                @ (self, shadow_frame); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+common_errNullObject:                               @ null-object error stub: optional log, then fall back
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogNullObjectException                  @ (self, shadow_frame); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+common_exceptionThrown:                             @ exception-thrown stub: optional log, then fall back
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogExceptionThrownException             @ (self, shadow_frame); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+MterpSuspendFallback:                               @ same shape as the error stubs; the optional log also gets the flags word
+    EXPORT_PC
+#if MTERP_LOGGING
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    ldr  r2, [rSELF, #THREAD_FLAGS_OFFSET]          @ arg2 = word at self + THREAD_FLAGS_OFFSET
+    bl MterpLogSuspendFallback                      @ (self, shadow_frame, flags); only assembled if MTERP_LOGGING is nonzero
+#endif
+    b MterpCommonFallback                           @ sets r0 = 0 (retry with reference interpreter) and exits
+
+/*
+ * If we're here, something is out of the ordinary.  If the word at
+ * self + THREAD_EXCEPTION_OFFSET is non-zero, fall through to MterpException and
+ * handle it.  Otherwise, roll back and retry with the reference interpreter (MterpFallback).
+ */
+MterpPossibleException:
+    ldr     r0, [rSELF, #THREAD_EXCEPTION_OFFSET]   @ r0 = pending-exception word of this thread
+    cmp     r0, #0                                  @ Exception pending?
+    beq     MterpFallback                           @ If not, fall back to reference interpreter.
+    /* intentional fallthrough - handle pending exception. */
+/*
+ * On return from a runtime helper routine, we've found a pending exception.
+ * MterpHandleException decides: a non-zero result means resume in this method at the
+ * dex pc read back from the frame (the catch block); zero means bail out to the caller.
+ */
+MterpException:
+    mov     r0, rSELF                               @ arg0 = self
+    add     r1, rFP, #OFF_FP_SHADOWFRAME            @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl      MterpHandleException                    @ (self, shadow_frame)
+    cmp     r0, #0                                  @ zero result: nothing to resume at in this method
+    beq     MterpExceptionReturn                    @ no local catch, back to caller.
+    ldr     r0, [rFP, #OFF_FP_CODE_ITEM]            @ r0 = code item pointer from the frame
+    ldr     r1, [rFP, #OFF_FP_DEX_PC]               @ r1 = dex pc from the frame (presumably updated by the helper - confirm)
+    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
+    add     rPC, r0, #CODEITEM_INSNS_OFFSET         @ rPC = code item + CODEITEM_INSNS_OFFSET
+    add     rPC, rPC, r1, lsl #1                    @ generate new dex_pc_ptr
+    str     rPC, [rFP, #OFF_FP_DEX_PC_PTR]          @ write the new dex_pc_ptr back to the frame
+    /* resume execution at catch block */
+    FETCH_INST
+    GET_INST_OPCODE ip                              @ extract opcode from rINST
+    GOTO_OPCODE ip                                  @ jump to the handler for that opcode
+    /* NOTE: no fallthrough */
+
+/*
+ * Suspend check, then dispatch.  On entry: rINST already loaded, rPC advanced, thread flags
+ * in lr.  MterpSuspendCheck(self) is called only if a suspend or checkpoint request bit is set.
+ */
+MterpCheckSuspendAndContinue:
+    ldr     rIBASE, [rSELF, #THREAD_CURRENT_IBASE_OFFSET]  @ refresh rIBASE
+    EXPORT_PC
+    mov     r0, rSELF                   @ arg0 = self, set up before the conditional call
+    ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)  @ Z clear iff a request bit is set
+    blne    MterpSuspendCheck           @ (self)
+    GET_INST_OPCODE ip                  @ extract opcode from rINST
+    GOTO_OPCODE ip                      @ jump to next instruction
+
+/*
+ * Bail out to reference interpreter.  MterpCommonFallback is the non-logging entry used by the stubs.
+ */
+MterpFallback:
+    EXPORT_PC
+    mov  r0, rSELF                                  @ arg0 = self
+    add  r1, rFP, #OFF_FP_SHADOWFRAME               @ arg1 = rFP + OFF_FP_SHADOWFRAME
+    bl MterpLogFallback                             @ NOTE(review): not guarded by MTERP_LOGGING, unlike the other log calls - confirm
+MterpCommonFallback:
+    mov     r0, #0                                  @ signal retry with reference interpreter.
+    b       MterpDone                               @ common exit: pop saved registers and return
+
+/*
+ * Common exits.  r0 carries the status handed back on return: 1 = return to caller,
+ * 0 = retry with the reference interpreter.  MterpDone drops 4 bytes of stack padding,
+ * then pops r4-r10, fp and pc (registers presumably pushed by ExecuteMterpImpl - not visible here).
+ *
+ * On entry:
+ *  uint32_t* rFP  (should still be live, pointer to base of vregs)
+ */
+MterpExceptionReturn:
+    ldr     r2, [rFP, #OFF_FP_RESULT_REGISTER]      @ r2 = result-register pointer from the frame
+    str     r0, [r2]                                @ NOTE(review): when reached from MterpException r0 is 0 and r1 is
+    str     r1, [r2, #4]                            @   whatever the helper left behind; these two stores look spurious - confirm
+    mov     r0, #1                                  @ signal return to caller.
+    b MterpDone
+MterpReturn:
+    ldr     r2, [rFP, #OFF_FP_RESULT_REGISTER]      @ r2 = result-register pointer from the frame
+    ldr     lr, [rSELF, #THREAD_FLAGS_OFFSET]       @ lr = word at self + THREAD_FLAGS_OFFSET
+    str     r0, [r2]                                @ result word 0 = r0
+    str     r1, [r2, #4]                            @ result word 1 = r1
+    mov     r0, rSELF                               @ arg0 = self for the conditional call below
+    ands    lr, #(THREAD_SUSPEND_REQUEST | THREAD_CHECKPOINT_REQUEST)  @ Z clear iff a request bit is set
+    blne    MterpSuspendCheck                       @ (self)
+    mov     r0, #1                                  @ signal return to caller.
+MterpDone:
+    add     sp, sp, #4                              @ un-align 64
+    ldmfd   sp!, {r4-r10,fp,pc}                     @ restore 9 regs and return
+
+
+    .fnend
+    .size   ExecuteMterpImpl, .-ExecuteMterpImpl
+