MIPS32: Improve stack alignment and use sdc1/ldc1 where possible.

- Ensure that SP is a multiple of 16 at all times.
- Use ldc1/sdc1 to load/store FPU registers from/to 8-byte-aligned
  locations wherever possible.

Use `export ART_MIPS32_CHECK_ALIGNMENT=true` when building Android
to enable the new runtime alignment checks.

Test: Boot & run tests on the 32-bit version of QEMU and on CI-20.
Test: test/testrunner/testrunner.py --target --optimizing --32
Test: test-art-host-gtest
Test: test-art-target-gtest

Change-Id: Ia667004573f419fd006098fcfadf5834239cb485
diff --git a/compiler/utils/mips/assembler_mips.cc b/compiler/utils/mips/assembler_mips.cc
index cbb2c0e..9545ca6 100644
--- a/compiler/utils/mips/assembler_mips.cc
+++ b/compiler/utils/mips/assembler_mips.cc
@@ -1863,20 +1863,20 @@
 }
 
 void MipsAssembler::Push(Register rs) {
-  IncreaseFrameSize(kMipsWordSize);
+  IncreaseFrameSize(kStackAlignment);
   Sw(rs, SP, 0);
 }
 
 void MipsAssembler::Pop(Register rd) {
   Lw(rd, SP, 0);
-  DecreaseFrameSize(kMipsWordSize);
+  DecreaseFrameSize(kStackAlignment);
 }
 
 void MipsAssembler::PopAndReturn(Register rd, Register rt) {
   bool reordering = SetReorder(false);
   Lw(rd, SP, 0);
   Jr(rt);
-  DecreaseFrameSize(kMipsWordSize);  // Single instruction in delay slot.
+  DecreaseFrameSize(kStackAlignment);  // Single instruction in delay slot.
   SetReorder(reordering);
 }
 
@@ -4588,7 +4588,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCondBranch:
       // The comment on case 'Branch::kLongUncondBranch' applies here as well.
@@ -4608,7 +4608,7 @@
       Addu(AT, AT, RA);
       Lw(RA, SP, 0);
       Jr(AT);
-      DecreaseFrameSize(kMipsWordSize);
+      DecreaseFrameSize(kStackAlignment);
       break;
     case Branch::kLongCall:
       DCHECK_NE(delayed_instruction, Branch::kUnfillableDelaySlot);