ART: Rework Generic JNI, add ARM version

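Refactors and optimizes Generic JNI. artQuickGenericJniTrampoline now
returns its two results in registers via TwoWordReturn (the native code
pointer and the bottom of the used alloca area), so the assembly stub no
longer has to write to / load from the bottom of the alloca.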

Change-Id: I3287007c976f79c9fd32d3b3a43f2d1371bf4cd3
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index e088751..7907b6e 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1485,33 +1485,34 @@
     mov x1, xFP
     bl artQuickGenericJniTrampoline  // (Thread*, sp)
 
-    // Get the updated pointer. This is the bottom of the frame _with_ handle scope.
-    ldr xFP, [sp]
-    add x9, sp, #8
+    // The C call will have registered the complete save-frame on success.
+    // The result of the call is:
+    // x0: pointer to native code, 0 on error.
+    // x1: pointer to the bottom of the used area of the alloca; sp can be restored up to there.
 
-    cmp x0, #0
-    b.mi .Lentry_error      // Check for error, negative value.
+    // Check for error: a null code pointer (x0 == 0) signals failure.
+    cbz x0, .Lentry_error
 
-    // release part of the alloca.
-    add x9, x9, x0
+    // Release part of the alloca.
+    mov sp, x1
 
-    // Get the code pointer
-    ldr xIP0, [x9, #0]
+    // Save the code pointer
+    mov xIP0, x0
 
     // Load parameters from frame into registers.
     // TODO Check with artQuickGenericJniTrampoline.
     //      Also, check again APPCS64 - the stack arguments are interleaved.
-    ldp x0, x1, [x9, #8]
-    ldp x2, x3, [x9, #24]
-    ldp x4, x5, [x9, #40]
-    ldp x6, x7, [x9, #56]
+    ldp x0, x1, [sp]
+    ldp x2, x3, [sp, #16]
+    ldp x4, x5, [sp, #32]
+    ldp x6, x7, [sp, #48]
 
-    ldp d0, d1, [x9, #72]
-    ldp d2, d3, [x9, #88]
-    ldp d4, d5, [x9, #104]
-    ldp d6, d7, [x9, #120]
+    ldp d0, d1, [sp, #64]
+    ldp d2, d3, [sp, #80]
+    ldp d4, d5, [sp, #96]
+    ldp d6, d7, [sp, #112]
 
-    add sp, x9, #136
+    add sp, sp, #128
 
     blr xIP0           // native call.
 
@@ -1520,13 +1521,11 @@
 
     // result sign extension is handled in C code
     // prepare for artQuickGenericJniEndTrampoline call
-    // (Thread*,  SP, result, result_f)
-    //   x0       x1   x2       x3       <= C calling convention
-    mov x5, x0      // Save return value
+    // (Thread*, result, result_f)
+    //    x0       x1       x2        <= C calling convention
+    mov x1, x0      // Native result; copy before x0 is overwritten below
     mov x0, xSELF   // Thread register
-    mov x1, xFP     // Stack pointer
-    mov x2, x5      // Result (from saved)
-    fmov x3, d0     // d0 will contain floating point result, but needs to go into x3
+    fmov x2, d0     // d0 will contain floating point result, but needs to go into x2
 
     bl artQuickGenericJniEndTrampoline