Fix art's System.arraycopy now that bionic doesn't have _memmove_words.
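
This change adds a local MemmoveWords() to java_lang_System.cc and points
move16/move32 at it. A rough sketch of the intended behaviour (illustrative
only, not part of this change; SketchCheck is a hypothetical name):
MemmoveWords should give the same result as memmove() for 16-bit-aligned
buffers, including overlapping ones, while using 32-bit accesses whenever
all arguments are 32-bit aligned.

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    void MemmoveWords(void* dst, const void* src, size_t n);  // added below

    void SketchCheck() {
      uint32_t actual[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      uint32_t expected[8] = {0, 1, 2, 3, 4, 5, 6, 7};
      // Overlapping, 32-bit-aligned move of four words.
      memmove(expected + 1, expected, 4 * sizeof(uint32_t));
      MemmoveWords(actual + 1, actual, 4 * sizeof(uint32_t));
      assert(memcmp(actual, expected, sizeof(actual)) == 0);
    }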

Change-Id: Ib4fc502eda7f5902e0cbe50ed9df892314a49017
diff --git a/src/native/java_lang_System.cc b/src/native/java_lang_System.cc
index 79614ae..5572623 100644
--- a/src/native/java_lang_System.cc
+++ b/src/native/java_lang_System.cc
@@ -48,61 +48,123 @@
  * appropriately for the element type, and that n is a multiple of the
  * element size.
  */
-#ifdef __BIONIC__
-#define HAVE_MEMMOVE_WORDS
-#endif
 
-#ifdef HAVE_MEMMOVE_WORDS
-extern "C" void _memmove_words(void* dst, const void* src, size_t n);
-#define move16 _memmove_words
-#define move32 _memmove_words
-#else
-static void move16(void* dst, const void* src, size_t n) {
+/*
+ * Works like memmove(), except:
+ * - if all arguments are at least 32-bit aligned, we guarantee that we
+ *   will use operations that preserve atomicity of 32-bit values
+ * - if not, we guarantee atomicity of 16-bit values
+ *
+ * If all three arguments are not at least 16-bit aligned, the behavior
+ * of this function is undefined.  (We could remove this restriction by
+ * testing for unaligned values and punting to memmove(), but that's
+ * not currently useful.)
+ *
+ * TODO: add loop for 64-bit alignment
+ * TODO: use __builtin_prefetch
+ * TODO: write ARM/MIPS/x86 optimized versions
+ */
+void MemmoveWords(void* dst, const void* src, size_t n) {
   DCHECK_EQ((((uintptr_t) dst | (uintptr_t) src | n) & 0x01), 0U);
 
-  uint16_t* d = reinterpret_cast<uint16_t*>(dst);
-  const uint16_t* s = reinterpret_cast<const uint16_t*>(src);
+  char* d = reinterpret_cast<char*>(dst);
+  const char* s = reinterpret_cast<const char*>(src);
+  size_t copyCount;
 
-  n /= sizeof(uint16_t);
+  // If the source and destination pointers are the same, this is
+  // an expensive no-op.  Testing for an empty move now allows us
+  // to skip a check later.
+  if (n == 0 || d == s) {
+    return;
+  }
 
-  if (d < s) {
-    // Copy forwards.
-    while (n--) {
-      *d++ = *s++;
+  // Determine if the source and destination buffers will overlap if
+  // we copy data forward (i.e. *dst++ = *src++).
+  //
+  // It's okay if the destination buffer starts before the source and
+  // there is some overlap, because the reader is always ahead of the
+  // writer.
+  if (LIKELY((d < s) || (static_cast<size_t>(d - s) >= n))) {
+    // Copy forward.  We prefer 32-bit loads and stores even for 16-bit
+    // data, so sort that out.
+    if (((reinterpret_cast<uintptr_t>(d) | reinterpret_cast<uintptr_t>(s)) & 0x03) != 0) {
+      // Not 32-bit aligned.  Two possibilities:
+      // (1) Congruent, we can align to 32-bit by copying one 16-bit val
+      // (2) Non-congruent, we can do one of:
+      //   a. copy whole buffer as a series of 16-bit values
+      //   b. load/store 32 bits, using shifts to ensure alignment
+      //   c. just copy it as 32-bit values and assume the CPU
+      //      will do a reasonable job
+      //
+      // We're currently using (a), which is suboptimal.
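+      //
+      // Worked example: dst = 0x1002 and src = 0x2006 are congruent
+      // (((dst ^ src) & 0x03) == 0), so one 16-bit copy brings both to
+      // 32-bit alignment; dst = 0x1002 and src = 0x2004 are not congruent,
+      // so the whole buffer is copied as 16-bit values.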
+      if (((reinterpret_cast<uintptr_t>(d) ^ reinterpret_cast<uintptr_t>(s)) & 0x03) != 0) {
+        copyCount = n;
+      } else {
+        copyCount = 2;
+      }
+      n -= copyCount;
+      copyCount /= sizeof(uint16_t);
+
+      while (copyCount--) {
+        *reinterpret_cast<uint16_t*>(d) = *reinterpret_cast<const uint16_t*>(s);
+        d += sizeof(uint16_t);
+        s += sizeof(uint16_t);
+      }
+    }
+
+    // Copy 32-bit aligned words.
+    copyCount = n / sizeof(uint32_t);
+    while (copyCount--) {
+      *reinterpret_cast<uint32_t*>(d) = *reinterpret_cast<const uint32_t*>(s);
+      d += sizeof(uint32_t);
+      s += sizeof(uint32_t);
+    }
+
+    // Check for leftovers.  Either we finished exactly, or we have one remaining 16-bit chunk.
+    if ((n & 0x02) != 0) {
+      *reinterpret_cast<uint16_t*>(d) = *reinterpret_cast<const uint16_t*>(s);
     }
   } else {
-    // Copy backwards.
+    // Copy backward, starting at the end.
     d += n;
     s += n;
-    while (n--) {
-      *--d = *--s;
+
+    if (((reinterpret_cast<uintptr_t>(d) | reinterpret_cast<uintptr_t>(s)) & 0x03) != 0) {
+      // Try for 32-bit alignment.
+      if (((reinterpret_cast<uintptr_t>(d) ^ reinterpret_cast<uintptr_t>(s)) & 0x03) != 0) {
+        copyCount = n;
+      } else {
+        copyCount = 2;
+      }
+      n -= copyCount;
+      copyCount /= sizeof(uint16_t);
+
+      while (copyCount--) {
+        d -= sizeof(uint16_t);
+        s -= sizeof(uint16_t);
+        *reinterpret_cast<uint16_t*>(d) = *reinterpret_cast<const uint16_t*>(s);
+      }
+    }
+
+    // Copy 32-bit aligned words.
+    copyCount = n / sizeof(uint32_t);
+    while (copyCount--) {
+      d -= sizeof(uint32_t);
+      s -= sizeof(uint32_t);
+      *reinterpret_cast<uint32_t*>(d) = *reinterpret_cast<const uint32_t*>(s);
+    }
+
+    // Copy leftovers.
+    if ((n & 0x02) != 0) {
+      d -= sizeof(uint16_t);
+      s -= sizeof(uint16_t);
+      *reinterpret_cast<uint16_t*>(d) = *reinterpret_cast<const uint16_t*>(s);
     }
   }
 }
 
-static void move32(void* dst, const void* src, size_t n) {
-  DCHECK_EQ((((uintptr_t) dst | (uintptr_t) src | n) & 0x03), 0U);
-
-  uint32_t* d = reinterpret_cast<uint32_t*>(dst);
-  const uint32_t* s = reinterpret_cast<const uint32_t*>(src);
-
-  n /= sizeof(uint32_t);
-
-  if (d < s) {
-    // Copy forwards.
-    while (n--) {
-      *d++ = *s++;
-    }
-  } else {
-    // Copy backwards.
-    d += n;
-    s += n;
-    while (n--) {
-      *--d = *--s;
-    }
-  }
-}
-#endif // HAVE_MEMMOVE_WORDS
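+// Both 16-bit and 32-bit element moves now go through MemmoveWords.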
+#define move16 MemmoveWords
+#define move32 MemmoveWords
 
 namespace art {