Optimize strcat/strcpy, small tweaks to strlen. DO NOT MERGE

Create one version of strcat/strcpy/strlen for cortex-a15/krait and another
version for cortex-a9.

Tested with the libc_test strcat/strcpy/strlen tests, including new tests
that verify that the src for strcat/strcpy is not overread across a page
boundary.

NOTE: The handling of unaligned strcpy (strcat shares the same code) could
probably be optimized further so that the src is read 64 bits at a time
instead of the partial reads that occur now.
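
As a rough C analogue (illustrative only, not the shipped code), the aligned
fast path boils down to the word-at-a-time loop below; word_strcpy is a
made-up name, and the zero-byte test (w - 0x01010101) & ~w & 0x80808080 is
what the assembly's sub/bics/tst sequence computes with r5 = 0x01010101:

#include <stdint.h>
#include <string.h>

static char *word_strcpy(char *dst, const char *src) {
    char *d = dst;

    /* Copy bytes until src is word aligned, stopping early on NUL. */
    while ((uintptr_t)src & 3) {
        if ((*d++ = *src++) == '\0')
            return dst;
    }

    /* Word-at-a-time loop, only taken when dst is word aligned too. */
    if (((uintptr_t)d & 3) == 0) {
        uint32_t w;
        for (;;) {
            /* src is word aligned, so this 4-byte read stays inside one
               aligned word and never crosses a page boundary. */
            memcpy(&w, src, sizeof(w));
            if ((w - 0x01010101u) & ~w & 0x80808080u)
                break;              /* some byte in w is zero */
            memcpy(d, &w, sizeof(w));
            src += 4;
            d += 4;
        }
    }

    /* Copy the tail (and the mismatched-alignment case) byte by byte. */
    while ((*d++ = *src++) != '\0')
        ;
    return dst;
}

An unaligned-src path would additionally have to keep the previous word and
shift-and-merge it with each new full-width load, which is the 64-bit-at-a-time
change suggested above.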

strlen improves only slightly, since it was already optimized recently.

Performance improvements for strcpy and strcat (using an empty dest string):

cortex-a9
- Small copies improve by about 5% to 20% as the size gets above 10 bytes.
- Copies >= 1024, about a 60% improvement.
- Unaligned copies, about a 40% improvement.

cortex-a15
- Most small copies exhibit a 100% improvement; a few copies only
  improve by 20%.
- Copies >= 1024, about 150% improvement.
- Unaligned copies, about 100% improvement.

krait
- Most small copies vary widely, but show about a 20% improvement on
  average; performance then gets better, hitting about a 100% improvement
  when copying 64 bytes of data.
- Copies >= 1024, about 100% improvement.
- When copying MBs of data, about a 50% improvement.
- Unaligned copies, about 90% improvement.

As strcat destination strings get larger:

cortex-a9
- about 40% improvement for small dst strings (>= 32).
- about 250% improvement for dst strings >= 1024.

cortex-a15
- about 200% improvement for small dst strings (>= 32).
- about 250% improvement for dst strings >= 1024.

krait
- about 25% improvement for small dst strings (>= 32).
- about 100% improvement for dst strings >= 1024.

Merge from internal master.

(cherry-picked from d119b7b6f48fe507088cfb98bcafa99b320fd884)

Change-Id: I296463b251ef9fab004ee4dded2793feca5b547a
diff --git a/libc/arch-arm/generic/bionic/strcpy.S b/libc/arch-arm/generic/bionic/strcpy.S
new file mode 100644
index 0000000..21dafda
--- /dev/null
+++ b/libc/arch-arm/generic/bionic/strcpy.S
@@ -0,0 +1,133 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * Copyright (c) 2008 ARM Ltd
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Android adaptation and tweak by Jim Huang <jserv@0xlab.org>.
+ */
+
+#include <machine/cpu-features.h>
+#include <machine/asm.h>
+
+ENTRY(strcpy)
+	PLD(r1, #0)
+	eor	r2, r0, r1
+	mov	ip, r0
+	tst	r2, #3
+	bne	4f
+	tst	r1, #3
+	bne	3f
+5:
+	str	r5, [sp, #-4]!
+	mov	r5, #0x01
+	orr	r5, r5, r5, lsl #8
+	orr	r5, r5, r5, lsl #16
+
+	str	r4, [sp, #-4]!
+	tst	r1, #4
+	ldr	r3, [r1], #4
+	beq	2f
+	sub	r2, r3, r5
+	bics	r2, r2, r3
+	tst	r2, r5, lsl #7
+	itt	eq
+	streq	r3, [ip], #4
+	ldreq	r3, [r1], #4
+	bne	1f
+       /* Inner loop.  We now know that r1 is 64-bit aligned, so we
+	  can safely fetch up to two words.  This allows us to avoid
+	  load stalls.  */
+	.p2align 2
+2:
+	PLD(r1, #8)
+	ldr	r4, [r1], #4
+	sub	r2, r3, r5
+	bics	r2, r2, r3
+	tst	r2, r5, lsl #7
+	sub	r2, r4, r5
+	bne	1f
+	str	r3, [ip], #4
+	bics	r2, r2, r4
+	tst	r2, r5, lsl #7
+	itt	eq
+	ldreq	r3, [r1], #4
+	streq	r4, [ip], #4
+	beq	2b
+	mov	r3, r4
+1:
+#ifdef __ARMEB__
+	rors	r3, r3, #24
+#endif
+	strb	r3, [ip], #1
+	tst	r3, #0xff
+#ifdef __ARMEL__
+	ror	r3, r3, #8
+#endif
+	bne	1b
+	ldr	r4, [sp], #4
+	ldr	r5, [sp], #4
+	bx	lr
+
+       /* Strings have the same offset from word alignment, but it's
+	  not zero.  */
+3:
+	tst	r1, #1
+	beq	1f
+	ldrb	r2, [r1], #1
+	strb	r2, [ip], #1
+	cmp	r2, #0
+	it	eq
+	bxeq	lr
+1:
+	tst	r1, #2
+	beq	5b
+	ldrh	r2, [r1], #2
+#ifdef __ARMEB__
+	tst	r2, #0xff00
+	iteet	ne
+	strneh	r2, [ip], #2
+	lsreq	r2, r2, #8
+	streqb	r2, [ip]
+	tstne	r2, #0xff
+#else
+	tst	r2, #0xff
+	itet	ne
+	strneh	r2, [ip], #2
+	streqb	r2, [ip]
+	tstne	r2, #0xff00
+#endif
+	bne	5b
+	bx	lr
+
+       /* src and dst do not have a common word-alignment.  Fall back to
+	  byte copying.  */
+4:
+	ldrb	r2, [r1], #1
+	strb	r2, [ip], #1
+	cmp	r2, #0
+	bne	4b
+	bx	lr
+END(strcpy)
diff --git a/libc/arch-arm/generic/generic.mk b/libc/arch-arm/generic/generic.mk
index 18cad9d..0b3f644 100644
--- a/libc/arch-arm/generic/generic.mk
+++ b/libc/arch-arm/generic/generic.mk
@@ -1,4 +1,6 @@
 $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/generic/bionic/memcpy.S)
 $(call libc-add-cpu-variant-src,MEMSET,arch-arm/generic/bionic/memset.S)
+$(call libc-add-cpu-variant-src,STRCAT,string/strcat.c)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/generic/bionic/strcmp.S)
+$(call libc-add-cpu-variant-src,STRCPY,arch-arm/generic/bionic/strcpy.S)
 $(call libc-add-cpu-variant-src,STRLEN,arch-arm/generic/bionic/strlen.c)