Optimize strcat/strcpy, small tweaks to strlen. DO NOT MERGE
Create one version of strcat/strcpy/strlen for cortex-a15/krait and another
version for cortex-a9.
Tested with the libc_test strcat/strcpy/strlen tests.
Including new tests that verify that the src for strcat/strcpy do not
overread across page boundaries.
NOTE: The handling of unaligned strcpy (same code in strcat) could probably
be optimized further such that the src is read 64 bits at a time instead of
the partial reads occurring now.
strlen improves slightly since it was recently optimized.
Performance improvements for strcpy and strcat (using an empty dest string):
cortex-a9
- Small copies vary from about 5% to 20% as the size gets above 10 bytes.
- Copies >= 1024, about a 60% improvement.
- Unaligned copies, from about 40% improvement.
cortex-a15
- Most small copies exhibit a 100% improvement, a few copies only
improve by 20%.
- Copies >= 1024, about 150% improvement.
- Unaligned copies, about 100% improvement.
krait
- Most small copies vary widely, but on average 20% improvement, then
the performance gets better, hitting about a 100% improvement when
copies 64 bytes of data.
- Copies >= 1024, about 100% improvement.
- When coping MBs of data, about 50% improvement.
- Unaligned copies, about 90% improvement.
As strcat destination strings get larger in size:
cortex-a9
- about 40% improvement for small dst strings (>= 32).
- about 250% improvement for dst strings >= 1024.
cortex-a15
- about 200% improvement for small dst strings (>=32).
- about 250% improvement for dst strings >= 1024.
krait
- about 25% improvement for small dst strings (>=32).
- about 100% improvement for dst strings >=1024.
Merge from internal master.
(cherry-picked from d119b7b6f48fe507088cfb98bcafa99b320fd884)
Change-Id: I296463b251ef9fab004ee4dded2793feca5b547a
diff --git a/libc/arch-arm/generic/bionic/strcpy.S b/libc/arch-arm/generic/bionic/strcpy.S
new file mode 100644
index 0000000..21dafda
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/strcpy.S
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * Copyright (c) 2008 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Android adaptation and tweak by Jim Huang <jserv@0xlab.org>.
+ */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+ENTRY(strcpy)
+ PLD(r1, #0)
+ eor r2, r0, r1
+ mov ip, r0
+ tst r2, #3
+ bne 4f
+ tst r1, #3
+ bne 3f
+5:
+ str r5, [sp, #-4]!
+ mov r5, #0x01
+ orr r5, r5, r5, lsl #8
+ orr r5, r5, r5, lsl #16
+
+ str r4, [sp, #-4]!
+ tst r1, #4
+ ldr r3, [r1], #4
+ beq 2f
+ sub r2, r3, r5
+ bics r2, r2, r3
+ tst r2, r5, lsl #7
+ itt eq
+ streq r3, [ip], #4
+ ldreq r3, [r1], #4
+ bne 1f
+ /* Inner loop. We now know that r1 is 64-bit aligned, so we
+ can safely fetch up to two words. This allows us to avoid
+ load stalls. */
+ .p2align 2
+2:
+ PLD(r1, #8)
+ ldr r4, [r1], #4
+ sub r2, r3, r5
+ bics r2, r2, r3
+ tst r2, r5, lsl #7
+ sub r2, r4, r5
+ bne 1f
+ str r3, [ip], #4
+ bics r2, r2, r4
+ tst r2, r5, lsl #7
+ itt eq
+ ldreq r3, [r1], #4
+ streq r4, [ip], #4
+ beq 2b
+ mov r3, r4
+1:
+#ifdef __ARMEB__
+ rors r3, r3, #24
+#endif
+ strb r3, [ip], #1
+ tst r3, #0xff
+#ifdef __ARMEL__
+ ror r3, r3, #8
+#endif
+ bne 1b
+ ldr r4, [sp], #4
+ ldr r5, [sp], #4
+ bx lr
+
+ /* Strings have the same offset from word alignment, but it's
+ not zero. */
+3:
+ tst r1, #1
+ beq 1f
+ ldrb r2, [r1], #1
+ strb r2, [ip], #1
+ cmp r2, #0
+ it eq
+ bxeq lr
+1:
+ tst r1, #2
+ beq 5b
+ ldrh r2, [r1], #2
+#ifdef __ARMEB__
+ tst r2, #0xff00
+ iteet ne
+ strneh r2, [ip], #2
+ lsreq r2, r2, #8
+ streqb r2, [ip]
+ tstne r2, #0xff
+#else
+ tst r2, #0xff
+ itet ne
+ strneh r2, [ip], #2
+ streqb r2, [ip]
+ tstne r2, #0xff00
+#endif
+ bne 5b
+ bx lr
+
+ /* src and dst do not have a common word-alignement. Fall back to
+ byte copying. */
+4:
+ ldrb r2, [r1], #1
+ strb r2, [ip], #1
+ cmp r2, #0
+ bne 4b
+ bx lr
+END(strcpy)
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 18cad9d..0b3f644 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,6 @@
$(call libc-add-cpu-variant-src,MEMCPY,arch-arm/generic/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S)
+$(call libc-add-cpu-variant-src,STRCAT,string/strcat.c)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.c)
$(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c)