ARM: Use the hardfp calling convention for Java-to-Java calls.

This patch makes the hardfp calling convention the default. Softfp can still
be enabled by setting kArm32QuickCodeUseSoftFloat to true.
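
For reference, a minimal sketch of the switch as consumed from C++ (the
constant is real, but the helper below and its name are hypothetical, added
only for illustration):

    // Compile-time ABI switch (sketch): hardfp is the default after this
    // patch. In the tree the constexpr lives in a runtime-wide header.
    static constexpr bool kArm32QuickCodeUseSoftFloat = false;

    // Hypothetical helper: places deciding the ARM32 quick ABI key off it.
    inline bool QuickAbiPassesFpArgsInVfpRegs() {
      return !kArm32QuickCodeUseSoftFloat;
    }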

We observe between -1% and +5% performance change across different benchmark
tests. We should be able to gain more by addressing the remaining TODOs, as
some parts of the code still make the original softfp assumptions and are not
optimal.

DONE:
1. Interpreter to quick code
2. Quick code to interpreter
3. Transition assembly and callee-saves
4. Trampolines (generic JNI, resolution, invoke with access check, etc.)
5. Pass FP args in registers following AAPCS (GPRs and stack do not follow AAPCS); see the sketch after this list
6. Quick helper assembly routines to handle ABI differences
7. Quick code method entry
8. Quick code method invocation
9. JNI compiler
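
To make item 5 concrete, here is a simplified model of AAPCS VFP argument
allocation (an illustrative sketch, not the runtime's actual implementation):

    #include <bitset>

    // AAPCS (hardfp) VFP allocation over s0-s15: a float takes the lowest
    // free s-register (back-filling is allowed); a double takes the lowest
    // free even/odd pair (d0-d7). Returns the first s-register index, or -1
    // when the argument must go on the stack instead.
    int AllocateVfpReg(std::bitset<16>& used, bool is_double) {
      if (is_double) {
        for (int i = 0; i < 16; i += 2) {
          if (!used[i] && !used[i + 1]) {
            used[i] = true;
            used[i + 1] = true;
            return i;  // value lives in d(i/2)
          }
        }
      } else {
        for (int i = 0; i < 16; ++i) {
          if (!used[i]) {
            used[i] = true;
            return i;  // value lives in s(i)
          }
        }
      }
      return -1;  // no VFP register free
    }

For example, for arguments (float, double, float) the first float lands in
s0, the double in d1 (s2/s3), and the second float back-fills s1.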

TODO:
10. Rework ArgMap, FlushIn, GenDalvikArgs and affected common code.
11. Rework CallRuntimeHelperXXX().

Change-Id: I9965d8a007f4829f2560b63bcbbde271bdcf6ec2
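
The rewritten invocation stub in the first large hunk below no longer
extracts r1-r3 from the packed argument array itself; its C++ caller passes a
pre-built core-register image and an FP-register image ([sp + 8] and
[sp + 12] on entry). A hedged sketch of that caller-side split follows; every
name here is illustrative, and the FP half is simplified (the real code
follows AAPCS, including back-filling):

    #include <cstdint>
    #include <cstring>

    struct QuickRegImages {
      uint32_t core[4];  // r0-r3; core[0] is the ArtMethod*, set by caller
      uint32_t fp[16];   // s0-s15
    };

    // shorty: return type first, then one character per argument
    // (D = double, F = float, J = long, anything else = one 32-bit slot).
    void BuildRegImages(const uint32_t* args, const char* shorty,
                        QuickRegImages* out) {
      uint32_t gpr = 1;  // r0 is reserved for the ArtMethod*
      uint32_t fpr = 0;
      uint32_t src = 0;
      for (const char* p = shorty + 1; *p != '\0'; ++p) {
        const bool wide = (*p == 'D' || *p == 'J');
        const bool is_fp = (*p == 'D' || *p == 'F');
        const uint32_t slots = wide ? 2u : 1u;
        if (is_fp) {
          if (*p == 'D') fpr = (fpr + 1) & ~1u;  // even-align doubles
          if (fpr + slots <= 16) {
            std::memcpy(&out->fp[fpr], &args[src], slots * 4);
            fpr += slots;
          }                                      // else: on the stack (outs)
        } else if (gpr + slots <= 4) {
          std::memcpy(&out->core[gpr], &args[src], slots * 4);
          gpr += slots;
        }                                        // else: on the stack (outs)
        src += slots;
      }
    }

The stub still memcpys the full packed array into the outs area, so
stack-passed arguments need no special handling on this side.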
diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index aae0c94..632b414 100644
--- a/runtime/arch/arm/quick_entrypoints_arm.S
+++ b/runtime/arch/arm/quick_entrypoints_arm.S
@@ -40,10 +40,10 @@
     .cfi_rel_offset r10, 24
     .cfi_rel_offset r11, 28
     .cfi_rel_offset lr, 32
-    vpush {s0-s31}                                @ 32 words (128 bytes) of floats.
-    .pad #128
-    .cfi_adjust_cfa_offset 128
-    sub sp, #12                                   @ 3 words of space, bottom word will hold Method*.
+    vpush {s16-s31}                               @ 16 words (64 bytes) of floats.
+    .pad #64
+    .cfi_adjust_cfa_offset 64
+    sub sp, #12                                   @ 3 words of space, bottom word will hold Method*
     .pad #12
     .cfi_adjust_cfa_offset 12
     RUNTIME_CURRENT1 \rTemp1, \rTemp2             @ Load Runtime::Current into rTemp1.
@@ -53,7 +53,7 @@
     str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 
      // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 128 + 12)
+#if (FRAME_SIZE_SAVE_ALL_CALLEE_SAVE != 36 + 64 + 12)
 #error "SAVE_ALL_CALLEE_SAVE_FRAME(ARM) size not as expected."
 #endif
 .endm
@@ -101,15 +101,7 @@
 .endm
 
 .macro RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME_AND_RETURN
-    add sp, #4               @ bottom word holds Method*
-    pop {r5-r8, r10-r11, lr} @ 7 words of callee saves
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r8
-    .cfi_restore r10
-    .cfi_restore r11
-    .cfi_adjust_cfa_offset -FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
     bx  lr                   @ return
 .endm
 
@@ -117,9 +109,10 @@
      * Macro that sets up the callee save frame to conform with
      * Runtime::CreateCalleeSaveMethod(kRefsAndArgs).
      */
-.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2
-    push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
+    push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves and args.
     .save {r1-r3, r5-r8, r10-r11, lr}
+    .cfi_adjust_cfa_offset 40
     .cfi_rel_offset r1, 0
     .cfi_rel_offset r2, 4
     .cfi_rel_offset r3, 8
@@ -130,47 +123,39 @@
     .cfi_rel_offset r10, 28
     .cfi_rel_offset r11, 32
     .cfi_rel_offset lr, 36
-    .cfi_adjust_cfa_offset 40
+    vpush {s0-s15}                     @ 16 words of float args.
+    .pad #64
+    .cfi_adjust_cfa_offset 64
     sub sp, #8                         @ 2 words of space, bottom word will hold Method*
     .pad #8
     .cfi_adjust_cfa_offset 8
+    // Ugly compile-time check, but we only have the preprocessor.
+#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 64 + 8)
+#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected."
+#endif
+.endm
+
+.macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME rTemp1, rTemp2
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     RUNTIME_CURRENT3 \rTemp1, \rTemp2  @ Load Runtime::Current into rTemp1.
     THIS_LOAD_REQUIRES_READ_BARRIER
      @ rTemp1 is kRefsAndArgs Method*.
     ldr \rTemp1, [\rTemp1, #RUNTIME_REFS_AND_ARGS_CALLEE_SAVE_FRAME_OFFSET]
     str \rTemp1, [sp, #0]                         @ Place Method* at bottom of stack.
     str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
-
-    // Ugly compile-time check, but we only have the preprocessor.
-#if (FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE != 40 + 8)
-#error "REFS_AND_ARGS_CALLEE_SAVE_FRAME(ARM) size not as expected."
-#endif
 .endm
 
 .macro SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_WITH_METHOD_IN_R0
-    push {r1-r3, r5-r8, r10-r11, lr}   @ 10 words of callee saves
-    .save {r1-r3, r5-r8, r10-r11, lr}
-    .cfi_rel_offset r1, 0
-    .cfi_rel_offset r2, 4
-    .cfi_rel_offset r3, 8
-    .cfi_rel_offset r5, 12
-    .cfi_rel_offset r6, 16
-    .cfi_rel_offset r7, 20
-    .cfi_rel_offset r8, 24
-    .cfi_rel_offset r10, 28
-    .cfi_rel_offset r11, 32
-    .cfi_rel_offset lr, 36
-    .cfi_adjust_cfa_offset 40
-    sub sp, #8                         @ 2 words of space, bottom word will hold Method*
-    .pad #8
-    .cfi_adjust_cfa_offset 8
-
+    SETUP_REFS_AND_ARGS_CALLEE_SAVE_FRAME_REGISTERS_ONLY
     str r0, [sp, #0]                   @ Store ArtMethod* to bottom of stack.
     str sp, [r9, #THREAD_TOP_QUICK_FRAME_OFFSET]  @ Place sp in Thread::Current()->top_quick_frame.
 .endm
 
 .macro RESTORE_REFS_AND_ARGS_CALLEE_SAVE_FRAME
     add  sp, #8                      @ rewind sp
+    .cfi_adjust_cfa_offset -8
+    vpop {s0-s15}
+    .cfi_adjust_cfa_offset -64
     pop {r1-r3, r5-r8, r10-r11, lr}  @ 10 words of callee saves
     .cfi_restore r1
     .cfi_restore r2
@@ -181,7 +166,7 @@
     .cfi_restore r8
     .cfi_restore r10
     .cfi_restore r11
-    .cfi_adjust_cfa_offset -48
+    .cfi_adjust_cfa_offset -40
 .endm
 
 
@@ -373,60 +358,91 @@
 INVOKE_TRAMPOLINE art_quick_invoke_virtual_trampoline_with_access_check, artInvokeVirtualTrampolineWithAccessCheck
 
     /*
-     * Quick invocation stub.
+     * Quick invocation stub internal.
      * On entry:
      *   r0 = method pointer
      *   r1 = argument array or NULL for no argument methods
      *   r2 = size of argument array in bytes
      *   r3 = (managed) thread pointer
      *   [sp] = JValue* result
-     *   [sp + 4] = shorty
+     *   [sp + 4] = result_in_float
+     *   [sp + 8] = core register argument array
+     *   [sp + 12] = fp register argument array
+     *  +-------------------------+
+     *  | uint32_t* fp_reg_args   |
+     *  | uint32_t* core_reg_args |
+     *  |   result_in_float       | <- Caller frame
+     *  |   JValue* result        |
+     *  +-------------------------+
+     *  |          lr             |
+     *  |          r11            |
+     *  |          r9             |
+     *  |          r4             | <- r11
+     *  +-------------------------+
+     *  | uint32_t out[n-1]       |
+     *  |    :      :             |        Outs
+     *  | uint32_t out[0]         |
+     *  | StackRef<ArtMethod>     | <- SP  value=null
+     *  +-------------------------+
      */
-ENTRY art_quick_invoke_stub
-    push   {r0, r4, r5, r9, r11, lr}       @ spill regs
-    .save  {r0, r4, r5, r9, r11, lr}
-    .pad #24
-    .cfi_adjust_cfa_offset 24
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset r4, 4
-    .cfi_rel_offset r5, 8
-    .cfi_rel_offset r9, 12
-    .cfi_rel_offset r11, 16
-    .cfi_rel_offset lr, 20
+ENTRY art_quick_invoke_stub_internal
+    push   {r4, r9, r11, lr}               @ spill regs
+    .save  {r4, r9, r11, lr}
+    .pad #16
+    .cfi_adjust_cfa_offset 16
+    .cfi_rel_offset r4, 0
+    .cfi_rel_offset r9, 4
+    .cfi_rel_offset r11, 8
+    .cfi_rel_offset lr, 12
     mov    r11, sp                         @ save the stack pointer
     .cfi_def_cfa_register r11
+
     mov    r9, r3                          @ move managed thread pointer into r9
+
+    add    r4, r2, #4                      @ create space for method pointer in frame
+    sub    r4, sp, r4                      @ reserve & align *stack* to 16 bytes: native calling
+    and    r4, #0xFFFFFFF0                 @ convention only aligns to 8B, so we have to ensure ART
+    mov    sp, r4                          @ 16B alignment ourselves.
+
+    mov    r4, r0                          @ save method*
+    add    r0, sp, #4                      @ pass stack pointer + method ptr as dest for memcpy
+    bl     memcpy                          @ memcpy (dest, src, bytes)
+    mov    ip, #0                          @ set ip to 0
+    str    ip, [sp]                        @ store NULL for method* at bottom of frame
+
+    ldr    ip, [r11, #28]                  @ load fp register argument array pointer
+    vldm   ip, {s0-s15}                    @ copy s0 - s15
+
+    ldr    ip, [r11, #24]                  @ load core register argument array pointer
+    mov    r0, r4                          @ restore method*
+    add    ip, ip, #4                      @ skip r0
+    ldm    ip, {r1-r3}                     @ copy r1 - r3
+
 #ifdef ARM_R4_SUSPEND_FLAG
     mov    r4, #SUSPEND_CHECK_INTERVAL     @ reset r4 to suspend check interval
 #endif
-    add    r5, r2, #4                      @ create space for method pointer in frame
 
-    sub    r5, sp, r5                      @ reserve & align *stack* to 16 bytes: native calling
-    and    r5, #0xFFFFFFF0                 @ convention only aligns to 8B, so we have to ensure ART
-    mov    sp, r5                          @ 16B alignment ourselves.
-
-    add    r0, sp, #4                      @ pass stack pointer + method ptr as dest for memcpy
-    bl     memcpy                          @ memcpy (dest, src, bytes)
-    ldr    r0, [r11]                       @ restore method*
-    ldr    r1, [sp, #4]                    @ copy arg value for r1
-    ldr    r2, [sp, #8]                    @ copy arg value for r2
-    ldr    r3, [sp, #12]                   @ copy arg value for r3
-    mov    ip, #0                          @ set ip to 0
-    str    ip, [sp]                        @ store NULL for method* at bottom of frame
     ldr    ip, [r0, #MIRROR_ART_METHOD_QUICK_CODE_OFFSET]  @ get pointer to the code
     blx    ip                              @ call the method
+
     mov    sp, r11                         @ restore the stack pointer
-    ldr    ip, [sp, #24]                   @ load the result pointer
-    strd   r0, [ip]                        @ store r0/r1 into result pointer
-    pop    {r0, r4, r5, r9, r11, lr}       @ restore spill regs
-    .cfi_restore r0
+    .cfi_def_cfa_register sp
+
+    ldr    r4, [sp, #20]                   @ load result_in_float
+    ldr    r9, [sp, #16]                   @ load the result pointer
+    cmp    r4, #0
+    ite    eq
+    strdeq r0, [r9]                        @ store r0/r1 into result pointer
+    vstrne d0, [r9]                        @ store s0-s1/d0 into result pointer
+
+    pop    {r4, r9, r11, lr}               @ restore spill regs
     .cfi_restore r4
-    .cfi_restore r5
     .cfi_restore r9
+    .cfi_restore r11
     .cfi_restore lr
-    .cfi_adjust_cfa_offset -24
+    .cfi_adjust_cfa_offset -16
     bx     lr
-END art_quick_invoke_stub
+END art_quick_invoke_stub_internal
 
     /*
      * On entry r0 is uint32_t* gprs_ and r1 is uint32_t* fprs_
@@ -869,13 +885,14 @@
     mov     r3, sp                 @ pass SP
     blx     artQuickProxyInvokeHandler  @ (Method* proxy method, receiver, Thread*, SP)
     ldr     r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
-    add     sp, #16                @ skip r1-r3, 4 bytes padding.
-    .cfi_adjust_cfa_offset -16
-    cbnz    r2, 1f                 @ success if no exception is pending
+    // Tear down the callee-save frame. Skip arg registers.
+    add     sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+    .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
     RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    cbnz    r2, 1f                 @ success if no exception is pending
+    vmov    d0, r0, r1             @ store into fpr, for when it's a fpr return...
     bx      lr                     @ return on success
 1:
-    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
     DELIVER_PENDING_EXCEPTION
 END art_quick_proxy_invoke_handler
 
@@ -977,20 +994,13 @@
     ldr r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
     cbnz r2, .Lexception_in_native
 
-    // Tear down the callee-save frame.
-    add  sp, #12                      @ rewind sp
-    // Do not pop r0 and r1, they contain the return value.
-    pop {r2-r3, r5-r8, r10-r11, lr}  @ 9 words of callee saves
-    .cfi_restore r2
-    .cfi_restore r3
-    .cfi_restore r5
-    .cfi_restore r6
-    .cfi_restore r7
-    .cfi_restore r8
-    .cfi_restore r10
-    .cfi_restore r11
-    .cfi_adjust_cfa_offset -48
+    // Tear down the callee-save frame. Skip arg registers.
+    add     sp, #FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE
+    .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE-FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
 
+    // store into fpr, for when it's a fpr return...
+    vmov d0, r0, r1
     bx lr      // ret
 
 .Lentry_error:
@@ -1010,11 +1020,13 @@
     mov     r2, sp                 @ pass SP
     blx     artQuickToInterpreterBridge    @ (Method* method, Thread*, SP)
     ldr     r2, [r9, #THREAD_EXCEPTION_OFFSET]  @ load Thread::Current()->exception_
-    add     sp, #16                @ skip r1-r3, 4 bytes padding.
-    .cfi_adjust_cfa_offset -16
+    // Tear down the callee-save frame. Skip arg registers.
+    add     sp, #(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
+    .cfi_adjust_cfa_offset -(FRAME_SIZE_REFS_AND_ARGS_CALLEE_SAVE - FRAME_SIZE_REFS_ONLY_CALLEE_SAVE)
     RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
     cbnz    r2, 1f                 @ success if no exception is pending
-    bx    lr                       @ return on success
+    vmov    d0, r0, r1             @ store into fpr, for when it's a fpr return...
+    bx      lr                     @ return on success
 1:
     DELIVER_PENDING_EXCEPTION
 END art_quick_to_interpreter_bridge
@@ -1435,3 +1447,54 @@
 .Ldone:
     pop   {r4, r7-r12, pc}
 END art_quick_string_compareto
+
+    /* Assembly routines used to handle ABI differences. */
+
+    /* double fmod(double a, double b) */
+    .extern fmod
+ENTRY art_quick_fmod
+    push  {lr}
+    .cfi_adjust_cfa_offset 4
+    .cfi_rel_offset lr, 0
+    sub   sp, #4
+    .cfi_adjust_cfa_offset 4
+    vmov  r0, r1, d0
+    vmov  r2, r3, d1
+    bl    fmod
+    vmov  d0, r0, r1
+    add   sp, #4
+    .cfi_adjust_cfa_offset -4
+    pop   {pc}
+    .cfi_adjust_cfa_offset -4
+END art_quick_fmod
+
+    /* float fmodf(float a, float b) */
+    .extern fmodf
+ENTRY art_quick_fmodf
+    push  {lr}
+    .cfi_adjust_cfa_offset 4
+    .cfi_rel_offset lr, 0
+    sub   sp, #4
+    .cfi_adjust_cfa_offset 4
+    vmov  r0, r1, d0
+    bl    fmodf
+    vmov  s0, r0
+    add   sp, #4
+    .cfi_adjust_cfa_offset -4
+    pop   {pc}
+    .cfi_adjust_cfa_offset -4
+END art_quick_fmodf
+
+    /* int64_t art_d2l(double d) */
+    .extern art_d2l
+ENTRY art_quick_d2l
+    vmov  r0, r1, d0
+    b     art_d2l
+END art_quick_d2l
+
+    /* int64_t art_f2l(float f) */
+    .extern art_f2l
+ENTRY art_quick_f2l
+    vmov  r0, s0
+    b     art_f2l
+END art_quick_f2l
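
For readers less familiar with the two ARM conventions: managed code is now
hardfp, so a double argument arrives in d0/d1 and results return in d0, while
the platform libm on 32-bit Android follows the softfp AAPCS, passing those
same doubles in r0-r3 and returning in r0/r1. A hedged C-level model of the
art_quick_fmod shim above (the register file is mocked with plain memory;
this is a sketch, not how the runtime implements it):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    double QuickFmodModel(double d0, double d1) {
      uint32_t core[4];                    // stand-in for r0-r3
      std::memcpy(&core[0], &d0, 8);       // vmov r0, r1, d0
      std::memcpy(&core[2], &d1, 8);       // vmov r2, r3, d1
      double a, b;
      std::memcpy(&a, &core[0], 8);        // softfp callee reads r0-r3 ...
      std::memcpy(&b, &core[2], 8);
      double r0r1 = std::fmod(a, b);       // bl fmod; returns in r0/r1
      return r0r1;                         // vmov d0, r0, r1: back to hardfp
    }

art_quick_d2l and art_quick_f2l are the same idea with a tail call: they only
move the incoming value from d0/s0 into core registers before branching to
the softfp helper, since an int64_t result already comes back in r0/r1 under
both conventions.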