Optimize __memset_chk, __memcpy_chk.

This change creates assembler versions of __memcpy_chk/__memset_chk
that is implemented in the memcpy/memset assembler code. This change
avoids an extra call to memcpy/memset, instead allowing a simple fall
through to occur from the chk code into the body of the real
implementation.

Testing:

- Ran the libc_test on __memcpy_chk/__memset_chk on all nexus devices.
- Wrote a small test executable that has three calls to __memcpy_chk and
  three calls to __memset_chk. First call dest_len is length + 1. Second
  call dest_len is length. Third call dest_len is length - 1.
  Verified that the first two calls pass, and the third fails. Examined
  the logcat output on all nexus devices to verify that the fortify
  error message was sent properly.
- I benchmarked the new __memcpy_chk and __memset_chk on all systems. For
  __memcpy_chk and large copies, the savings is relatively small (about 1%).
  For small copies, the savings is large on cortex-a15/krait devices
  (between 5% to 30%).
  For cortex-a9 and small copies, the speed up is present, but relatively
  small (about 3% to 5%).
  For __memset_chk and large copies, the savings is also small (about 1%).
  However, all processors show larger speed-ups on small copies (about 30% to
  100%).

Bug: 9293744

Change-Id: I8926d59fe2673e36e8a27629e02a7b7059ebbc98
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 7fb15a1..1d9863c 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -26,6 +26,15 @@
 _LIBC_ARCH_DYNAMIC_SRC_FILES := \
     arch-arm/bionic/exidx_dynamic.c
 
+# Remove the C++ fortify function implementations for which there is an
+# arm assembler version.
+_LIBC_FORTIFY_FILES_TO_REMOVE := \
+    bionic/__memcpy_chk.cpp \
+    bionic/__memset_chk.cpp \
+
+libc_common_src_files := \
+    $(filter-out $(_LIBC_FORTIFY_FILES_TO_REMOVE),$(libc_common_src_files))
+
 ifeq ($(strip $(wildcard bionic/libc/arch-arm/$(TARGET_CPU_VARIANT)/$(TARGET_CPU_VARIANT).mk)),)
 $(error "TARGET_CPU_VARIANT not set or set to an unknown value. Possible values are cortex-a7, cortex-a8, cortex-a9, cortex-a15, krait. Use generic for devices that do not have a CPU similar to any of the supported cpu variants.")
 endif
diff --git a/libc/arch-arm/cortex-a15/bionic/memcpy.S b/libc/arch-arm/cortex-a15/bionic/memcpy.S
index d297064..2394024 100644
--- a/libc/arch-arm/cortex-a15/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a15/bionic/memcpy.S
@@ -59,6 +59,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
         .text
         .syntax unified
@@ -66,6 +67,13 @@
 
 #define CACHE_LINE_SIZE 64
 
+ENTRY(__memcpy_chk)
+        cmp     r2, r3
+        bgt     fortify_check_failed
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
+
 ENTRY(memcpy)
         // Assumes that n >= 0, and dst, src are valid pointers.
         // For any sizes less than 832 use the neon code that doesn't
@@ -321,4 +329,21 @@
 
         // Src is guaranteed to be at least word aligned by this point.
         b       word_aligned
+
+
+        // Only reached when the __memcpy_chk check fails.
+fortify_check_failed:
+        ldr     r0, error_message
+        ldr     r1, error_code
+1:
+        add     r0, pc
+        bl      __fortify_chk_fail
+error_code:
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
 END(memcpy)
+
+        .data
+error_string:
+        .string "memcpy buffer overflow"
diff --git a/libc/arch-arm/cortex-a15/bionic/memset.S b/libc/arch-arm/cortex-a15/bionic/memset.S
index 2e1ad54..6c143ad 100644
--- a/libc/arch-arm/cortex-a15/bionic/memset.S
+++ b/libc/arch-arm/cortex-a15/bionic/memset.S
@@ -28,19 +28,38 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
-		/*
-		 * Optimized memset() for ARM.
+        /*
+         * Optimized memset() for ARM.
          *
          * memset() returns its first argument.
-		 */
+         */
 
         .fpu        neon
         .syntax     unified
 
+ENTRY(__memset_chk)
+        cmp         r2, r3
+        bls         done
+
+        ldr         r0, error_message
+        ldr         r1, error_code
+1:
+        add         r0, pc
+        bl          __fortify_chk_fail
+error_code:
+        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+        .word       error_string-(1b+8)
+
+END(__memset_chk)
+
 ENTRY(bzero)
         mov         r2, r1
         mov         r1, #0
+
+done:
         // Fall through to memset...
 END(bzero)
 
@@ -162,3 +181,7 @@
         ldmfd       sp!, {r0}
         bx          lr
 END(memset)
+
+        .data
+error_string:
+        .string     "memset buffer overflow"
diff --git a/libc/arch-arm/cortex-a9/bionic/memcpy.S b/libc/arch-arm/cortex-a9/bionic/memcpy.S
index 70e27b0..4e624d4 100644
--- a/libc/arch-arm/cortex-a9/bionic/memcpy.S
+++ b/libc/arch-arm/cortex-a9/bionic/memcpy.S
@@ -28,6 +28,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
 /*
  * This code assumes it is running on a processor that supports all arm v7
@@ -40,6 +41,13 @@
 
 #define CACHE_LINE_SIZE     32
 
+ENTRY(__memcpy_chk)
+        cmp         r2, r3
+        bgt         fortify_check_failed
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
+
 ENTRY(memcpy)
         .save       {r0, lr}
         /* start preloading as early as possible */
@@ -208,4 +216,21 @@
 6:
         ldmfd       sp!, {r4, r5, r6, r7, r8}
         ldmfd       sp!, {r0, pc}
+
+
+        // Only reached when the __memcpy_chk check fails.
+fortify_check_failed:
+        ldr     r0, error_message
+        ldr     r1, error_code
+1:
+        add     r0, pc
+        bl      __fortify_chk_fail
+error_code:
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
 END(memcpy)
+
+        .data
+error_string:
+        .string     "memcpy buffer overflow"
diff --git a/libc/arch-arm/cortex-a9/bionic/memset.S b/libc/arch-arm/cortex-a9/bionic/memset.S
index b58aa45..d011430 100644
--- a/libc/arch-arm/cortex-a9/bionic/memset.S
+++ b/libc/arch-arm/cortex-a9/bionic/memset.S
@@ -28,6 +28,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
 /*
  * This code assumes it is running on a processor that supports all arm v7
@@ -36,9 +37,28 @@
 
     .fpu    neon
 
+ENTRY(__memset_chk)
+        cmp         r2, r3
+        bls         done
+
+        ldr         r0, error_message
+        ldr         r1, error_code
+1:
+        add         r0, pc
+        bl          __fortify_chk_fail
+error_code:
+        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+        .word       error_string-(1b+8)
+
+END(__memset_chk)
+
 ENTRY(bzero)
         mov     r2, r1
         mov     r1, #0
+
+done:
+        // Fall through to memset...
 END(bzero)
 
 /* memset() returns its first argument.  */
@@ -150,3 +170,7 @@
         ldmfd       sp!, {r0, r4-r7, lr}
         bx          lr
 END(memset)
+
+        .data
+error_string:
+        .string     "memset buffer overflow"
diff --git a/libc/arch-arm/generic/bionic/memcpy.S b/libc/arch-arm/generic/bionic/memcpy.S
index 6890a55..24373d8 100644
--- a/libc/arch-arm/generic/bionic/memcpy.S
+++ b/libc/arch-arm/generic/bionic/memcpy.S
@@ -28,6 +28,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
         /*
          * Optimized memcpy() for ARM.
@@ -36,6 +37,13 @@
          * so we have to preserve R0.
          */
 
+ENTRY(__memcpy_chk)
+        cmp         r2, r3
+        bgt         fortify_check_failed
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
+
 ENTRY(memcpy)
         /* The stack must always be 64-bits aligned to be compliant with the
          * ARM ABI. Since we have to save R0, we might as well save R4
@@ -377,4 +385,20 @@
         add         sp,  sp, #28
         ldmfd       sp!, {r0, r4, lr}
         bx          lr
+
+        // Only reached when the __memcpy_chk check fails.
+fortify_check_failed:
+        ldr     r0, error_message
+        ldr     r1, error_code
+1:
+        add     r0, pc
+        bl      __fortify_chk_fail
+error_code:
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
 END(memcpy)
+
+        .data
+error_string:
+        .string     "memcpy buffer overflow"
diff --git a/libc/arch-arm/generic/bionic/memset.S b/libc/arch-arm/generic/bionic/memset.S
index 3c034e0..399bae9 100644
--- a/libc/arch-arm/generic/bionic/memset.S
+++ b/libc/arch-arm/generic/bionic/memset.S
@@ -27,6 +27,7 @@
  */
 
 #include <machine/asm.h>
+#include "libc_events.h"
 
         /*
          * Optimized memset() for ARM.
@@ -34,9 +35,28 @@
          * memset() returns its first argument.
          */
 
+ENTRY(__memset_chk)
+        cmp         r2, r3
+        bls         done
+
+        ldr         r0, error_message
+        ldr         r1, error_code
+1:
+        add         r0, pc
+        bl          __fortify_chk_fail
+error_code:
+        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+        .word       error_string-(1b+8)
+
+END(__memset_chk)
+
 ENTRY(bzero)
         mov     r2, r1
         mov     r1, #0
+
+done:
+        // Fall through to memset...
 END(bzero)
 
 ENTRY(memset)
@@ -107,3 +127,7 @@
         ldmfd       sp!, {r0, r4-r7, lr}
         bx          lr
 END(memset)
+
+        .data
+error_string:
+        .string     "memset buffer overflow"
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 4a21709..3afe18c 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -30,6 +30,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
 /*
  * This code assumes it is running on a processor that supports all arm v7
@@ -37,10 +38,17 @@
  * cache line.
  */
 
+#define CACHE_LINE_SIZE     32
+
         .text
         .fpu    neon
 
-#define CACHE_LINE_SIZE     32
+ENTRY(__memcpy_chk)
+        cmp         r2, r3
+        bgt         fortify_check_failed
+
+        // Fall through to memcpy...
+END(__memcpy_chk)
 
 ENTRY(memcpy)
         .save       {r0, lr}
@@ -124,4 +132,20 @@
 
         ldmfd       sp!, {r0, lr}
         bx          lr
+
+        // Only reached when the __memcpy_chk check fails.
+fortify_check_failed:
+        ldr     r0, error_message
+        ldr     r1, error_code
+1:
+        add     r0, pc
+        bl      __fortify_chk_fail
+error_code:
+        .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
+error_message:
+        .word   error_string-(1b+8)
 END(memcpy)
+
+        .data
+error_string:
+        .string     "memcpy buffer overflow"
diff --git a/libc/arch-arm/krait/bionic/memset.S b/libc/arch-arm/krait/bionic/memset.S
index a2e2d80..4e4788b 100644
--- a/libc/arch-arm/krait/bionic/memset.S
+++ b/libc/arch-arm/krait/bionic/memset.S
@@ -28,6 +28,7 @@
 
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
+#include "libc_events.h"
 
 /*
  * This code assumes it is running on a processor that supports all arm v7
@@ -37,9 +38,28 @@
 
     .fpu    neon
 
+ENTRY(__memset_chk)
+        cmp         r2, r3
+        bls         done
+
+        ldr         r0, error_message
+        ldr         r1, error_code
+1:
+        add         r0, pc
+        bl          __fortify_chk_fail
+error_code:
+        .word       BIONIC_EVENT_MEMSET_BUFFER_OVERFLOW
+error_message:
+        .word       error_string-(1b+8)
+
+END(__memset_chk)
+
 ENTRY(bzero)
         mov     r2, r1
         mov     r1, #0
+
+done:
+        // Fall through to memset...
 END(bzero)
 
 /* memset() returns its first argument.  */
@@ -79,3 +99,7 @@
         ldmfd       sp!, {r0}
         bx          lr
 END(memset)
+
+        .data
+error_string:
+        .string     "memset buffer overflow"