diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index 1cc2dcc..af385eb 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -49,6 +49,9 @@
 static constexpr size_t kRuntimeParameterFpuRegistersLength =
     arraysize(kRuntimeParameterFpuRegisters);
 
+static constexpr DRegister DTMP = D7;
+static constexpr SRegister STMP = S14;
+
 class InvokeRuntimeCallingConvention : public CallingConvention<Register, SRegister> {
  public:
   InvokeRuntimeCallingConvention()
@@ -472,6 +475,11 @@
   blocked_core_registers_[R10] = true;
   blocked_core_registers_[R11] = true;
 
+  // Don't allocate our temporary double register.
+  blocked_fpu_registers_[STMP] = true;
+  blocked_fpu_registers_[STMP + 1] = true;
+  DCHECK_EQ(FromLowSToD(STMP), DTMP);
+
   blocked_fpu_registers_[S16] = true;
   blocked_fpu_registers_[S17] = true;
   blocked_fpu_registers_[S18] = true;
@@ -3364,9 +3372,9 @@
   } else if (source.IsStackSlot() && destination.IsStackSlot()) {
     Exchange(source.GetStackIndex(), destination.GetStackIndex());
   } else if (source.IsFpuRegister() && destination.IsFpuRegister()) {
-    __ vmovrs(IP, source.AsFpuRegister<SRegister>());
+    __ vmovs(STMP, source.AsFpuRegister<SRegister>());
     __ vmovs(source.AsFpuRegister<SRegister>(), destination.AsFpuRegister<SRegister>());
-    __ vmovsr(destination.AsFpuRegister<SRegister>(), IP);
+    __ vmovs(destination.AsFpuRegister<SRegister>(), STMP);
   } else if (source.IsFpuRegister() || destination.IsFpuRegister()) {
     SRegister reg = source.IsFpuRegister() ? source.AsFpuRegister<SRegister>()
                                            : destination.AsFpuRegister<SRegister>();
@@ -3374,11 +3382,33 @@
         ? destination.GetStackIndex()
         : source.GetStackIndex();
 
-    __ vmovrs(IP, reg);
+    __ vmovs(STMP, reg);
     __ LoadSFromOffset(reg, SP, mem);
-    __ StoreToOffset(kStoreWord, IP, SP, mem);
+    __ StoreSToOffset(STMP, SP, mem);
+  } else if (source.IsFpuRegisterPair() && destination.IsFpuRegisterPair()) {
+    __ vmovd(DTMP, FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()));
+    __ vmovd(FromLowSToD(source.AsFpuRegisterPairLow<SRegister>()),
+             FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()));
+    __ vmovd(FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>()), DTMP);
+  } else if (source.IsFpuRegisterPair() || destination.IsFpuRegisterPair()) {
+    DRegister reg = source.IsFpuRegisterPair()
+        ? FromLowSToD(source.AsFpuRegisterPairLow<SRegister>())
+        : FromLowSToD(destination.AsFpuRegisterPairLow<SRegister>());
+    int mem = source.IsFpuRegisterPair()
+        ? destination.GetStackIndex()
+        : source.GetStackIndex();
+
+    __ vmovd(DTMP, reg);
+    __ LoadDFromOffset(reg, SP, mem);
+    __ StoreDToOffset(DTMP, SP, mem);
+  } else if (source.IsDoubleStackSlot() && destination.IsDoubleStackSlot()) {
+    // TODO: We could use DTMP and ask for a pair scratch register (float or core).
+    // This would save four instructions if two scratch registers are available, and
+    // two instructions if not.
+    Exchange(source.GetStackIndex(), destination.GetStackIndex());
+    Exchange(source.GetHighStackIndex(kArmWordSize), destination.GetHighStackIndex(kArmWordSize));
   } else {
-    LOG(FATAL) << "Unimplemented";
+    LOG(FATAL) << "Unimplemented: " << source << " <-> " << destination;
   }
 }
 
diff --git a/test/439-swap-double/expected.txt b/test/439-swap-double/expected.txt
new file mode 100644
index 0000000..019c901
--- /dev/null
+++ b/test/439-swap-double/expected.txt
@@ -0,0 +1,4 @@
+-26.0
+-24.0
+-22.0
+-20.0
diff --git a/test/439-swap-double/info.txt b/test/439-swap-double/info.txt
new file mode 100644
index 0000000..23447d2
--- /dev/null
+++ b/test/439-swap-double/info.txt
@@ -0,0 +1,2 @@
+Test for the optimizing compiler's parallel swap support in
+the presence of register pairs (in this case, doubles on ARM).
diff --git a/test/439-swap-double/src/Main.java b/test/439-swap-double/src/Main.java
new file mode 100644
index 0000000..da11577
--- /dev/null
+++ b/test/439-swap-double/src/Main.java
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Test for the optimizing compiler's parallel swap support in
+// the presence of register pairs (in this case, doubles on ARM).
+public class Main {
+  public static void main(String[] args) {
+    new Main().foo();
+  }
+
+  public void foo() {
+    // Call the same method four times, rotating the argument order each time, to
+    // force swapping of registers between calls. Note that this depends on the
+    // calling convention, as a stack-only convention may not need the swapping.
+    callWithDoubles(a, b, c, d, e, f, g);  // Prints -26.0
+    callWithDoubles(b, c, d, e, f, g, a);  // Prints -24.0
+    callWithDoubles(c, d, e, f, g, a, b);  // Prints -22.0
+    callWithDoubles(d, e, f, g, a, b, c);  // Prints -20.0
+  }
+
+  public static void callWithDoubles(
+      double a, double b, double c, double d, double e, double f, double g) {
+    System.out.println(a - b - c - d - e - f - g);
+  }
+
+  double a = 1.0;
+  double b = 2.0;
+  double c = 3.0;
+  double d = 4.0;
+  double e = 5.0;
+  double f = 6.0;
+  double g = 7.0;
+}
