Tune the memcpy for Krait.

Streamline the memcpy a bit, removing some unnecessary instructions.

The biggest speed improvement comes from changing the preload
distance. On Krait, the sweet spot for the preload in the main
loop is twice the L1 cache line size.
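
A minimal C sketch of the idea, assuming a 64-byte L1 line (the
CACHE_LINE_SIZE value and the helper below are illustrative only;
the real loop is the NEON code in the diff):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define CACHE_LINE_SIZE 64  /* assumed L1 line size */

    /* Copy len bytes, 64 at a time, prefetching two cache lines
     * ahead of the read pointer, mirroring the main-loop
     * "pld [r1, #(CACHE_LINE_SIZE*2)]" below. */
    static void copy_with_preload(uint8_t *dst, const uint8_t *src,
                                  size_t len) {
        while (len >= 64) {
            __builtin_prefetch(src + CACHE_LINE_SIZE * 2);
            memcpy(dst, src, 64);  /* stands in for vld1.8/vst1.8 */
            dst += 64;
            src += 64;
            len -= 64;
        }
    }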

In most cases, these small tweaks yield speedups of more than
1000MB/s. As the size of the memcpy approaches about 1MB, the
speed improvement disappears.
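
A sketch of how such numbers can be measured (hypothetical
harness, not part of this change; error checking omitted):

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    /* Copy `size` bytes `iters` times and return MB/s. */
    static double throughput_mbs(size_t size, int iters) {
        uint8_t *src = malloc(size);
        uint8_t *dst = malloc(size);
        memset(src, 0xa5, size);
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < iters; i++)
            memcpy(dst, src, size);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double s = (t1.tv_sec - t0.tv_sec)
                 + (t1.tv_nsec - t0.tv_nsec) / 1e9;
        free(src);
        free(dst);
        return (double)size * iters / s / 1e6;
    }

Sweeping size from a few KB up past 1MB shows the gain fading
once copies no longer fit in the caches.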

Change-Id: Ief79694d65324e2db41bee4707dae19b8c24be62
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 0cd4d44..4a21709 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2008 The Android Open Source Project
+ * Copyright (C) 2013 The Android Open Source Project
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -45,9 +45,8 @@
 ENTRY(memcpy)
         .save       {r0, lr}
         /* start preloading as early as possible */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
+        pld         [r1, #(CACHE_LINE_SIZE*4)]
         stmfd       sp!, {r0, lr}
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
 
         /* do we have at least 16-bytes to copy (needed for alignment below) */
         cmp         r2, #16
@@ -56,7 +55,7 @@
         /* align destination to cache-line for the write-buffer */
         rsb         r3, r0, #0
         ands        r3, r3, #0xF
-        beq         0f
+        beq         2f
 
         /* copy up to 15-bytes (count in r3) */
         sub         r2, r2, r3
@@ -76,47 +75,29 @@
         // copies 8 bytes, destination 64-bits aligned
         vld1.8      {d0}, [r1]!
         vst1.8      {d0}, [r0, :64]!
-2:
 
-0:      /* preload immediately the next cache line, which we may need */
-        pld         [r1, #(CACHE_LINE_SIZE*0)]
-        pld         [r1, #(CACHE_LINE_SIZE*2)]
-
-        /* make sure we have at least 64 bytes to copy */
+2:      /* make sure we have at least 64 bytes to copy */
         subs        r2, r2, #64
         blo         2f
 
-        /* Preload all the cache lines we need.
-         * NOTE: The number of pld below depends on CACHE_LINE_SIZE,
-         * ideally we would increase the distance in the main loop to
-         * avoid the goofy code below. In practice this doesn't seem to make
-         * a big difference.
-         * NOTE: The value CACHE_LINE_SIZE * 8 was chosen through
-         * experimentation.
-         */
-        pld         [r1, #(CACHE_LINE_SIZE*4)]
-        pld         [r1, #(CACHE_LINE_SIZE*6)]
-        pld         [r1, #(CACHE_LINE_SIZE*8)]
-
 1:      /* The main loop copies 64 bytes at a time */
         vld1.8      {d0  - d3},   [r1]!
         vld1.8      {d4  - d7},   [r1]!
-        pld         [r1, #(CACHE_LINE_SIZE*8)]
+        pld         [r1, #(CACHE_LINE_SIZE*2)]
         subs        r2, r2, #64
         vst1.8      {d0  - d3},   [r0, :128]!
         vst1.8      {d4  - d7},   [r0, :128]!
         bhs         1b
 
 2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
-        add         r2, r2, #64
-        subs        r2, r2, #32
+        adds        r2, r2, #32
         blo         4f
 
-3:      /* 32 bytes at a time. These cache lines were already preloaded */
+        /* Copy 32 bytes. These cache lines were already preloaded */
         vld1.8      {d0 - d3},  [r1]!
-        subs        r2, r2, #32
+        sub         r2, r2, #32
         vst1.8      {d0 - d3},  [r0, :128]!
-        bhs         3b
+
 4:      /* less than 32 left */
         add         r2, r2, #32
         tst         r2, #0x10