libc: ARM: Add 32-bit Kryo memcpy

* Memcpy is based on the Scorpion memcpy because of Qualcomm's
  128-byte cache line size optimizations.

* PLDOFFS and PLDSIZE are from the ARM64 Kryo memcpy routine.

Below are the results of the benchmark, tested on a OnePlus 3 with MSM8996.

Before:
BM_string_memcpy/8                          1000k          8    0.934 GiB/s
BM_string_memcpy/64                         1000k         11    5.785 GiB/s
BM_string_memcpy/512                        1000k         25   19.918 GiB/s
BM_string_memcpy/1024                         50M         42   23.938 GiB/s
BM_string_memcpy/8Ki                          10M        473   17.291 GiB/s
BM_string_memcpy/16Ki                          5M        565   28.976 GiB/s
BM_string_memcpy/32Ki                       1000k       1105   29.631 GiB/s
BM_string_memcpy/64Ki                       1000k       2194   29.864 GiB/s

After:
BM_string_memcpy/8                          1000k          6    1.145 GiB/s
BM_string_memcpy/64                         1000k          7    8.560 GiB/s
BM_string_memcpy/512                        1000k         18   27.370 GiB/s
BM_string_memcpy/1024                         50M         33   30.340 GiB/s
BM_string_memcpy/8Ki                          10M        266   30.770 GiB/s
BM_string_memcpy/16Ki                          5M        553   29.599 GiB/s
BM_string_memcpy/32Ki                       1000k       1121   29.219 GiB/s
BM_string_memcpy/64Ki                       1000k       2208   29.678 GiB/s

Test: make otapackage
Test: Ran bionic unit tests on a Pixel device.
Test: Verified memcpy wins on a Pixel device.

Change-Id: Id7a9c37ef75a306dd5cf8d374d79d0fe83f8a3ba
diff --git a/libc/Android.bp b/libc/Android.bp
index eccd2fc..62d4a9c 100644
--- a/libc/Android.bp
+++ b/libc/Android.bp
@@ -1071,7 +1071,7 @@
             },
             kryo: {
                 srcs: [
-                    "arch-arm/krait/bionic/memcpy.S",
+                    "arch-arm/kryo/bionic/memcpy.S",
                     "arch-arm/cortex-a7/bionic/memset.S",
                     "arch-arm/krait/bionic/strcmp.S",
                     "arch-arm/krait/bionic/__strcat_chk.S",
diff --git a/libc/NOTICE b/libc/NOTICE
index 2121246..708ceef 100644
--- a/libc/NOTICE
+++ b/libc/NOTICE
@@ -743,6 +743,36 @@
 
 -------------------------------------------------------------------
 
+Copyright (C) 2017 The Android Open Source Project
+All rights reserved.
+
+Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in
+   the documentation and/or other materials provided with the
+   distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+-------------------------------------------------------------------
+
 Copyright (c) 1980, 1983, 1988, 1993
    The Regents of the University of California.  All rights reserved.
 
@@ -5130,4 +5160,3 @@
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 -------------------------------------------------------------------
-
diff --git a/libc/arch-arm/kryo/bionic/memcpy.S b/libc/arch-arm/kryo/bionic/memcpy.S
new file mode 100644
index 0000000..a1243d4
--- /dev/null
+++ b/libc/arch-arm/kryo/bionic/memcpy.S
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2008 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <private/bionic_asm.h>
+#include <private/libc_events.h>
+
+#define PLDOFFS	(16)	/* prefetch distance, counted in PLDSIZE-byte blocks */
+#define PLDSIZE	(128)	/* L2 cache line size, used as a byte offset for pld */
+
+        .code 32                        /* assemble as ARM (A32); the 1b+8 pc fixup in __memcpy_chk_fail relies on it */
+ENTRY(__memcpy_chk)                     /* r2 = byte count; r3 = limit it is checked against (presumably the dst buffer size - confirm with callers) */
+        cmp         r2, r3
+        bhi         __memcpy_chk_fail   /* unsigned r2 > r3: report the overflow instead of copying */
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
+
+ENTRY(memcpy)	/* In: r0 = dst, r1 = src, r2 = n (unsigned byte count). Out: r0 = dst. */
+	push            {r0}                    /* r0 is advanced while copying; keep dst for the return value */
+	cmp             r2, #4
+	blo             .Lneon_lt4              /* n is unsigned: LO, not LT, so n >= 2^31 is not mistaken for a tiny copy */
+	cmp             r2, #16
+	blo             .Lneon_lt16
+	cmp             r2, #32
+	blo             .Lneon_16
+	cmp             r2, #128
+	blo             .Lneon_copy_32_a
+	/* n >= 128: copy 128-byte blocks; r12 = number of blocks (n / 128). */
+	/* Code below is optimized for PLDSIZE=128 only */
+	mov             r12, r2, lsr #7
+	cmp             r12, #PLDOFFS
+	bls             .Lneon_copy_128_loop_nopld      /* at most PLDOFFS blocks: skip the prefetching loop */
+	sub             r12, #PLDOFFS           /* prefetching loop handles all but the last PLDOFFS blocks */
+	pld             [r1, #(PLDOFFS-1)*PLDSIZE]      /* prime one block short of the loop's prefetch distance */
+.Lneon_copy_128_loop_outer:
+	pld             [r1, #(PLDOFFS*PLDSIZE)]        /* prefetch PLDOFFS blocks ahead of the current read */
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1            /* the stores below leave flags alone; bne tests this result */
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_outer
+	mov             r12, #PLDOFFS           /* the final PLDOFFS blocks are copied without prefetch */
+.Lneon_copy_128_loop_nopld:
+	vld1.32         {q0, q1}, [r1]!
+	vld1.32         {q2, q3}, [r1]!
+	vld1.32         {q8, q9}, [r1]!
+	vld1.32         {q10, q11}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	vst1.32         {q2, q3}, [r0]!
+	vst1.32         {q8, q9}, [r0]!
+	vst1.32         {q10, q11}, [r0]!
+	bne             .Lneon_copy_128_loop_nopld
+	ands            r2, r2, #0x7f           /* r2 = n % 128, the tail still to copy */
+	beq             .Lneon_exit
+	cmp             r2, #32
+	blo             .Lneon_16
+	nop
+	/* 32 <= r2 < 128 here: copy 32-byte blocks; r12 = r2 / 32. */
+.Lneon_copy_32_a:
+	mov             r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+	vld1.32         {q0, q1}, [r1]!
+	subs            r12, r12, #1
+	vst1.32         {q0, q1}, [r0]!
+	bne             .Lneon_copy_32_loop_a
+	ands            r2, r2, #0x1f           /* r2 = bytes left after the 32-byte blocks */
+	beq             .Lneon_exit
+.Lneon_16:
+	subs            r2, r2, #16             /* r2 < 32 here; a borrow means fewer than 16 bytes are left */
+	blo             .Lneon_lt16             /* low 4 bits of r2 still hold the remaining byte count */
+	vld1.32         {q8}, [r1]!
+	vst1.32         {q8}, [r0]!
+	beq             .Lneon_exit             /* flags still from the subs: exactly 16 bytes were left */
+.Lneon_lt16:
+	movs            r12, r2, lsl #29        /* C = bit 3 of r2 (copy 8 bytes), N = bit 2 (copy 4 bytes) */
+	bcc             .Lneon_skip8
+	ldr             r3, [r1], #4
+	ldr             r12, [r1], #4
+	str             r3, [r0], #4
+	str             r12, [r0], #4
+.Lneon_skip8:
+	bpl             .Lneon_lt4              /* N is still from the movs above; ldr/str do not set flags */
+	ldr             r3, [r1], #4
+	str             r3, [r0], #4
+.Lneon_lt4:
+	movs            r2, r2, lsl #31         /* C = bit 1 of r2 (copy 2 bytes), N = bit 0 (copy 1 byte) */
+	bcc             .Lneon_lt2
+	ldrh            r3, [r1], #2
+	strh            r3, [r0], #2
+.Lneon_lt2:
+	bpl             .Lneon_exit
+	ldrb            r12, [r1]
+	strb            r12, [r0]
+.Lneon_exit:
+	pop             {r0}                    /* restore the original dst as the return value */
+	bx              lr
+	/* Registers written above: r1-r3, r12, q0-q3, q8-q11 and the condition flags. */
+END(memcpy)
+
+        // Only reached when the __memcpy_chk check fails.
+ENTRY_PRIVATE(__memcpy_chk_fail)
+        // Preserve lr for backtrace.
+        push    {lr}
+        .cfi_def_cfa_offset 4           /* CFA is now sp + 4 */
+        .cfi_rel_offset lr, 0           /* the saved lr lives at [sp] */
+
+        ldr     r0, error_message       /* r0 = error_string - (1b + 8), a pc-relative offset */
+        ldr     r1, error_code          /* r1 = BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW */
+1:
+        add     r0, pc                  /* in ARM state pc reads as 1b + 8, so r0 = address of error_string */
+        bl      __fortify_chk_fail      /* r0 = message, r1 = event code; presumably never returns, */
+error_code:                             /* because the literal words below follow the call directly */
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
+END(__memcpy_chk_fail)
+
+        .section .rodata                /* the message is never written: keep it out of writable .data */
+error_string:
+        .string     "memcpy: prevented write past end of buffer"
diff --git a/libc/arch-arm/kryo/kryo.mk b/libc/arch-arm/kryo/kryo.mk
index 21bcbf3..e19a88f 100644
--- a/libc/arch-arm/kryo/kryo.mk
+++ b/libc/arch-arm/kryo/kryo.mk
@@ -23,13 +23,16 @@
     arch-arm/cortex-a15/bionic/strcpy.S \
     arch-arm/cortex-a15/bionic/strlen.S \
 
-# Use krait versions of memcpy/strcmp
+# Use krait versions of strcmp/__strcat_chk/__strcpy_chk
 libc_bionic_src_files_arm += \
-    arch-arm/krait/bionic/memcpy.S \
     arch-arm/krait/bionic/strcmp.S \
     arch-arm/krait/bionic/__strcat_chk.S \
     arch-arm/krait/bionic/__strcpy_chk.S \
 
+# Use the kryo version of memcpy
+libc_bionic_src_files_arm += \
+    arch-arm/kryo/bionic/memcpy.S \
+
 # Use Denver version of memmove
 libc_bionic_src_files_arm += \
     arch-arm/denver/bionic/memmove.S