ARM64: Improve the code generated to spill/restore registers for slow paths.

Aligning the slow-path spill slots to the preferred slots alignment allows the spill/restore accesses to address the stack directly off sp with an encodable immediate offset, which generates better code.

Before:

    add x16, sp, #0x44 (68)
    stp x0, x1, [x16, #-16]

After:

    stp x0, x1, [sp, #56]

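For illustration, a minimal sketch of the rounding introduced below. It assumes kVRegSize is 4 bytes and that GetPreferredSlotsAlignment() returns 8 on ARM64 (the size of a core register), so the resulting offsets fit the scaled immediate form of stp/ldp; the constants and the standalone RoundUpTo/FirstRegisterSlotInSlowPath helpers are illustrative stand-ins, not the ART definitions.

    // Illustrative sketch only; not the ART implementation.
    #include <cstddef>

    constexpr size_t kVRegSize = 4;                 // assumed vreg slot size
    constexpr size_t kPreferredSlotsAlignment = 8;  // assumed ARM64 value

    constexpr size_t RoundUpTo(size_t value, size_t alignment) {
      return (value + alignment - 1) & ~(alignment - 1);
    }

    // Offset from sp of the first slot used to spill registers on a slow path.
    constexpr size_t FirstRegisterSlotInSlowPath(size_t out_slots,
                                                 size_t spill_slots) {
      // Unrounded, 13 slots would give offset 52, which stp cannot encode as a
      // scaled immediate; rounded up it becomes 56, consistent with the
      // [sp, #56] addressing in the "After" snippet above.
      return RoundUpTo((out_slots + spill_slots) * kVRegSize,
                       kPreferredSlotsAlignment);
    }

    static_assert(FirstRegisterSlotInSlowPath(13, 0) == 56, "rounds 52 up to 56");
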
Change-Id: I3e20ad3fa59d00aee4b4d14ea9d59c7cd546509e
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index d40e2b9..9c6dcaa 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -291,7 +291,8 @@
   DCHECK(!block_order.empty());
   DCHECK(block_order[0] == GetGraph()->GetEntryBlock());
   ComputeSpillMask();
-  first_register_slot_in_slow_path_ = (number_of_out_slots + number_of_spill_slots) * kVRegSize;
+  first_register_slot_in_slow_path_ = RoundUp(
+      (number_of_out_slots + number_of_spill_slots) * kVRegSize, GetPreferredSlotsAlignment());
 
   if (number_of_spill_slots == 0
       && !HasAllocatedCalleeSaveRegisters()
@@ -302,8 +303,7 @@
     SetFrameSize(CallPushesPC() ? GetWordSize() : 0);
   } else {
     SetFrameSize(RoundUp(
-        number_of_spill_slots * kVRegSize
-        + number_of_out_slots * kVRegSize
+        first_register_slot_in_slow_path_
         + maximum_number_of_live_core_registers * GetWordSize()
         + maximum_number_of_live_fpu_registers * GetFloatingPointSpillSlotSize()
         + FrameEntrySpillSize(),