Tune the memcpy for Krait.

Streamline the memcpy a bit, removing some unnecessary instructions.

The biggest speed improvement comes from changing the preload
distance. On Krait, the sweet spot for the preload in the main
loop is twice the L1 cache line size.
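
A minimal C sketch of the idea, assuming a 64-byte L1 line (the
CACHE_LINE_SIZE value and the helper below are illustrative only;
the real loop is the NEON code in the diff):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define CACHE_LINE_SIZE 64  /* assumed L1 line size */

    /* Copy len bytes, 64 at a time, prefetching two cache lines
     * ahead of the read pointer, mirroring the main-loop
     * "pld [r1, #(CACHE_LINE_SIZE*2)]" below. */
    static void copy_with_preload(uint8_t *dst, const uint8_t *src,
                                  size_t len) {
        while (len >= 64) {
            __builtin_prefetch(src + CACHE_LINE_SIZE * 2);
            memcpy(dst, src, 64);  /* stands in for vld1.8/vst1.8 */
            dst += 64;
            src += 64;
            len -= 64;
        }
    }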

In most cases, these small tweaks yield speedups of more than
1000MB/s. As the size of the memcpy approaches about 1MB, the
speed improvement disappears.
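
A sketch of how such numbers can be measured (hypothetical
harness, not part of this change; error checking omitted):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    /* Copy `size` bytes `iters` times and return MB/s. */
    static double throughput_mbs(size_t size, int iters) {
        uint8_t *src = malloc(size);
        uint8_t *dst = malloc(size);
        memset(src, 0xa5, size);
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < iters; i++)
            memcpy(dst, src, size);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double s = (t1.tv_sec - t0.tv_sec)
                 + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        free(src);
        free(dst);
        return (double)size * iters / s / 1e6;
    }

Sweeping size from a few KB up past 1MB shows the gain fading
once copies no longer fit in the caches.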

Change-Id: Ief79694d65324e2db41bee4707dae19b8c24be62
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 0cd4d44..4a21709 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -45,9 +45,8 @@
 ENTRY(memcpy)
         .save       {r0, lr}
         /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*4)]
         stmfd       sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
 
         /* do we have at least 16-bytes to copy (needed for alignment below) */
         cmp         r2, #16
@@ -56,7 +55,7 @@
         /* align destination to cache-line for the write-buffer */
         rsb         r3, r0, #0
         ands        r3, r3, #0xF
-        beq         0f
+        beq         2f
 
         /* copy up to 15-bytes (count in r3) */
         sub         r2, r2, r3
@@ -76,47 +75,29 @@
         // copies 8 bytes, destination 64-bits aligned
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
-2:
 
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-
-        /* make sure we have at least 64 bytes to copy */
+2:      /* make sure we have at least 64 bytes to copy */
         subs        r2, r2, #64
         blo         2f
 
-        /* Preload all the cache lines we need.
-         * NOTE: The number of pld below depends on CACHE_LINE_SIZE,
-         * ideally we would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         * NOTE: The value CACHE_LINE_SIZE * 8 was chosen through
-         * experimentation.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*4)]
-        pld         [r1, #(CACHE_LINE_SIZE*6)]
-        pld         [r1, #(CACHE_LINE_SIZE*8)]
-
 1:      /* The main loop copies 64 bytes at a time */
         vld1.8      {d0  - d3},   [r1]!
         vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(CACHE_LINE_SIZE*8)]
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
         subs        r2, r2, #64
         vst1.8      {d0  - d3},   [r0, :128]!
         vst1.8      {d4  - d7},   [r0, :128]!
         bhs         1b
 
 2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #64
-        subs        r2, r2, #32
+        adds        r2, r2, #32
         blo         4f
 
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        /* Copy 32 bytes. These cache lines were already preloaded */
         vld1.8      {d0 - d3},  [r1]!
-        subs        r2, r2, #32
+        sub         r2, r2, #32
         vst1.8      {d0 - d3},  [r0, :128]!
-        bhs         3b
+
 4:      /* less than 32 left */
         add         r2, r2, #32
         tst         r2, #0x10