Fix handling of long argument spanning register/memory.

The comment in arm_lir.h says:
 * If a 64-bit argument would span the register/memory argument
 * boundary, it will instead be fully passed in the frame.

This change implements that logic for all platforms. We still need
to pass the low part in a register as well, because I haven't yet
ported the JNI compilers (x86 and mips) to this convention.

Once the JNI compilers are updated, we can remove the register
assignment of the low part.

Note that this greatly simplifies the optimizing compiler's register
allocator, which no longer has to understand a long that spans a
register and memory.

Change-Id: I59706ca5d47269fc46e5489ac99bd6576e87e7f3
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index c99be64..4139b51 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -410,7 +410,7 @@
     // If the wide input appeared as single, flush it and go
     // as it comes from memory.
     if (t_loc->wide && reg.Valid() && !reg.Is64Bit()) {
-      StoreBaseDisp(TargetPtrReg(kSp), SRegOffset(start_vreg + i), reg, k32, kNotVolatile);
+      // The memory already holds the half. Don't do anything.
       reg = RegStorage::InvalidReg();
     }
 
@@ -881,21 +881,23 @@
       if (rl_arg.wide) {
         // if reg is not 64-bit (it is half of 64-bit) then handle it separately.
         if (!reg.Is64Bit()) {
-          // TODO: REVISIT: This adds a spill of low part while we could just copy it.
           ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
           if (rl_arg.location == kLocPhysReg) {
             int out_offset = StackVisitor::GetOutVROffset(i, cu_->instruction_set);
-            // Dump it to memory and then load only low part
+            // Dump it to memory.
             StoreBaseDisp(TargetPtrReg(kSp), out_offset, rl_arg.reg, k64, kNotVolatile);
             LoadBaseDisp(TargetPtrReg(kSp), out_offset, reg, k32, kNotVolatile);
           } else {
-            int out_offset = StackVisitor::GetOutVROffset(i + 1, cu_->instruction_set);
+            int high_offset = StackVisitor::GetOutVROffset(i + 1, cu_->instruction_set);
             // First, use target reg for high part.
             LoadBaseDisp(TargetPtrReg(kSp), SRegOffset(rl_arg.s_reg_low + 1), reg, k32,
                          kNotVolatile);
-            StoreBaseDisp(TargetPtrReg(kSp), out_offset, reg, k32, kNotVolatile);
-            // Now load target reg with low part.
+            StoreBaseDisp(TargetPtrReg(kSp), high_offset, reg, k32, kNotVolatile);
+            // Now, use target reg for low part.
             LoadBaseDisp(TargetPtrReg(kSp), SRegOffset(rl_arg.s_reg_low), reg, k32, kNotVolatile);
+            int low_offset = StackVisitor::GetOutVROffset(i, cu_->instruction_set);
+            // And store it to the expected memory location.
+            StoreBaseDisp(TargetPtrReg(kSp), low_offset, reg, k32, kNotVolatile);
           }
         } else {
           LoadValueDirectWideFixed(rl_arg, reg);
diff --git a/compiler/jni/quick/arm/calling_convention_arm.cc b/compiler/jni/quick/arm/calling_convention_arm.cc
index a3323e1..fd20715 100644
--- a/compiler/jni/quick/arm/calling_convention_arm.cc
+++ b/compiler/jni/quick/arm/calling_convention_arm.cc
@@ -168,9 +168,13 @@
         } else {
           // FIXME: Pointer this returns as both reference and long.
           if (IsCurrentParamALong() && !IsCurrentParamAReference()) {  // Long.
-            if (gpr_index < arraysize(kHFCoreArgumentRegisters)) {
+            // If it spans register and memory, we must use the value in memory.
+            if (gpr_index < arraysize(kHFCoreArgumentRegisters) - 1) {
               entry_spills_.push_back(
                   ArmManagedRegister::FromCoreRegister(kHFCoreArgumentRegisters[gpr_index++]));
+            } else if (gpr_index == arraysize(kHFCoreArgumentRegisters) - 1) {
+              gpr_index++;
+              entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
             } else {
               entry_spills_.push_back(ManagedRegister::NoRegister(), 4);
             }
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d0a72bb..1cc2dcc 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -593,8 +593,6 @@
         ArmManagedRegister pair = ArmManagedRegister::FromRegisterPair(
             calling_convention.GetRegisterPairAt(index));
         return Location::RegisterPairLocation(pair.AsRegisterPairLow(), pair.AsRegisterPairHigh());
-      } else if (index + 1 == calling_convention.GetNumberOfRegisters()) {
-        return Location::QuickParameter(index, stack_index);
       } else {
         return Location::DoubleStackSlot(calling_convention.GetStackOffsetOf(stack_index));
       }
@@ -711,16 +709,6 @@
           Location::RegisterLocation(destination.AsRegisterPairLow<Register>()));
     } else if (source.IsFpuRegister()) {
       UNIMPLEMENTED(FATAL);
-    } else if (source.IsQuickParameter()) {
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      InvokeDexCallingConvention calling_convention;
-      EmitParallelMoves(
-          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
-          Location::RegisterLocation(destination.AsRegisterPairLow<Register>()),
-          Location::StackSlot(
-              calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize()),
-          Location::RegisterLocation(destination.AsRegisterPairHigh<Register>()));
     } else {
       // No conflict possible, so just do the moves.
       DCHECK(source.IsDoubleStackSlot());
@@ -741,22 +729,6 @@
     } else {
       UNIMPLEMENTED(FATAL);
     }
-  } else if (destination.IsQuickParameter()) {
-    InvokeDexCallingConvention calling_convention;
-    uint16_t register_index = destination.GetQuickParameterRegisterIndex();
-    uint16_t stack_index = destination.GetQuickParameterStackIndex();
-    if (source.IsRegisterPair()) {
-      UNIMPLEMENTED(FATAL);
-    } else if (source.IsFpuRegister()) {
-      UNIMPLEMENTED(FATAL);
-    } else {
-      DCHECK(source.IsDoubleStackSlot());
-      EmitParallelMoves(
-          Location::StackSlot(source.GetStackIndex()),
-          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
-          Location::StackSlot(source.GetHighStackIndex(kArmWordSize)),
-          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
-    }
   } else {
     DCHECK(destination.IsDoubleStackSlot());
     if (source.IsRegisterPair()) {
@@ -769,17 +741,6 @@
         __ StoreToOffset(kStoreWordPair, source.AsRegisterPairLow<Register>(),
                          SP, destination.GetStackIndex());
       }
-    } else if (source.IsQuickParameter()) {
-      InvokeDexCallingConvention calling_convention;
-      uint16_t register_index = source.GetQuickParameterRegisterIndex();
-      uint16_t stack_index = source.GetQuickParameterStackIndex();
-      // Just move the low part. The only time a source is a quick parameter is
-      // when moving the parameter to its stack locations. And the (Java) caller
-      // of this method has already done that.
-      __ StoreToOffset(kStoreWord, calling_convention.GetRegisterAt(register_index),
-                       SP, destination.GetStackIndex());
-      DCHECK_EQ(calling_convention.GetStackOffsetOf(stack_index + 1) + GetFrameSize(),
-                static_cast<size_t>(destination.GetHighStackIndex(kArmWordSize)));
     } else if (source.IsFpuRegisterPair()) {
       __ StoreDToOffset(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
                         SP,
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index d377cb5..04e36cc 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -643,9 +643,10 @@
       DCHECK(source.IsDoubleStackSlot());
       EmitParallelMoves(
           Location::StackSlot(source.GetStackIndex()),
-          Location::RegisterLocation(calling_convention.GetRegisterAt(register_index)),
+          Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index)),
           Location::StackSlot(source.GetHighStackIndex(kX86WordSize)),
           Location::StackSlot(calling_convention.GetStackOffsetOf(stack_index + 1)));
+      __ movl(calling_convention.GetRegisterAt(register_index), Address(ESP, source.GetStackIndex()));
     }
   } else if (destination.IsFpuRegister()) {
     if (source.IsDoubleStackSlot()) {