libc: ARM: add Apple strcmp

* Tested on Nextbit Robin (MSM8992)

$ bench_strcmp -N "strcmp_1k" -s 1k -I 200

Before:
              prc thr   usecs/call      samples   errors cnt/samp     size
strcmp_1k      1   1      3.79192           75        0      500     1024

After:
              prc thr   usecs/call      samples   errors cnt/samp     size
strcmp_1k      1   1      1.49527          101        0      500     1024

Change-Id: I60599cbc903de777a12045081f1687613864cf39
diff --git a/libc/arch-arm/cortex-a15/bionic/strcmp.S b/libc/arch-arm/cortex-a15/bionic/strcmp.S
index acedf0e..d7b05c7 100644
--- a/libc/arch-arm/cortex-a15/bionic/strcmp.S
+++ b/libc/arch-arm/cortex-a15/bionic/strcmp.S
@@ -1,376 +1,127 @@
 /*
- * Copyright (c) 2013 ARM Ltd
- * All rights reserved.
+ * Copyright (c) 2011 Apple, Inc. All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. The name of the company may not be used to endorse or promote
- *    products derived from this software without specific prior written
- *    permission.
+ * @APPLE_LICENSE_HEADER_START@
  *
- * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
  */
 
 #include <machine/cpu-features.h>
 #include <private/bionic_asm.h>
 
-#ifdef __ARMEB__
-#define S2LOMEM lsl
-#define S2LOMEMEQ lsleq
-#define S2HIMEM lsr
-#define MSB 0x000000ff
-#define LSB 0xff000000
-#define BYTE0_OFFSET 24
-#define BYTE1_OFFSET 16
-#define BYTE2_OFFSET 8
-#define BYTE3_OFFSET 0
-#else /* not  __ARMEB__ */
-#define S2LOMEM lsr
-#define S2LOMEMEQ lsreq
-#define S2HIMEM lsl
-#define BYTE0_OFFSET 0
-#define BYTE1_OFFSET 8
-#define BYTE2_OFFSET 16
-#define BYTE3_OFFSET 24
-#define MSB 0xff000000
-#define LSB 0x000000ff
-#endif /* not  __ARMEB__ */
+.text
+.syntax unified
+.code 32
 
-.syntax         unified
+#define ESTABLISH_FRAME        \
+    push   {r4-r7,lr}         ;\
+    add     r7,     sp, #12   ;\
+    push   {r8}
+#define CLEAR_FRAME            \
+    pop    {r8}               ;\
+    pop    {r4-r7,lr}
 
-#if defined (__thumb__)
-        .thumb
-        .thumb_func
-#endif
+.align 3
+.long 0, 0x01010101
 
 ENTRY(strcmp)
-      /* Use LDRD whenever possible.  */
+//  Load a character from each string and advance the pointers.  If the loaded
+//  characters are unequal or NUL, return their difference.
+0:	ldrb    r2,    [r0],#1
+    ldrb    ip,    [r1],#1
+    cmp     r2,         #1
+    cmphs   r2,         ip
+    bne     L_earlyReturn
+//  If the address of the next character from s1 does not have word alignment,
+//  continue with the character-by-character comparison.  Otherwise, fall
+//  through into the word-by-word comparison path.
+    tst     r0,         #3
+    bne     0b
 
-/* The main thing to look out for when comparing large blocks is that
-   the loads do not cross a page boundary when loading past the index
-   of the byte with the first difference or the first string-terminator.
+//  We have not encountered a NUL or a mismatch, and s1 has word alignment.
+//  Establish a frame, since we're going to need additional registers anyway.
+    ESTABLISH_FRAME
+    ldr     lr,    (strcmp-4)
 
-   For example, if the strings are identical and the string-terminator
-   is at index k, byte by byte comparison will not load beyond address
-   s1+k and s2+k; word by word comparison may load up to 3 bytes beyond
-   k; double word - up to 7 bytes.  If the load of these bytes crosses
-   a page boundary, it might cause a memory fault (if the page is not mapped)
-   that would not have happened in byte by byte comparison.
+//  Word align s2, and place the remainder in r8.  Compute the right- and
+//  left-shifts to extract each word that we will compare to the other source
+//  from the aligned words that we load:
+//
+//      aligned s2        to be loaded on next iteration
+//      |   "true" s2     |
+//      v   v             v
+//      +---+---+---+---+ +---+---+---+---+
+//      | 0 | 1 | 2 | 3 | | 4 | 5 | 6 | 7 |
+//      +---+---+---+---+ +---+---+---+---+
+//          ^-----------------^
+//          to be compared on next iteration
+    and     r8,     r1, #3
+    bic     r1,     r1, #3
+    mov     r8,         r8, lsl #3
+    rsb     r5,     r8, #32
 
-   If an address is (double) word aligned, then a load of a (double) word
-   from that address will not cross a page boundary.
-   Therefore, the algorithm below considers word and double-word alignment
-   of strings separately.  */
+//  Load the first aligned word of s2.  OR 0x01 into any bytes that preceed the
+//  "true s2", to prevent our check for NUL from generating a false positive.
+//  Then check for NUL, and jump to the byte-by-byte comparison loop after
+//  unwinding the pointers if we enounter one.
+    ldr     r6,    [r1],#4
+    orr     r6,     r6, lr, lsr r5
+    sub     r2,     r6, lr
+    bic     r2,     r2, r6
+    tst     r2,         lr, lsl #7
+    mov     r4,         r6, lsr r8
+    bne     L_unwindLoopPreload
 
-/* High-level description of the algorithm.
+.align 3
+L_wordCompareLoop:
+    //  Load the next aligned word of s2 and check if it contains any NUL bytes.
+    //  Load the next aligned word of s1, and extract the corresponding bytes from
+    //  the two words of s2 loaded in this and the previous iteration of the loop.
+    //  Compare these two words.
+    //  If no NUL or mismatched words have been encountered, continue the loop.
+    ldr     r6,    [r1],#4
+    uqsub8  r2,     lr, r6
+    tst     r2,         r2
+    ldr     ip,    [r0],#4
+    orr     r3,     r4, r6, lsl r5
+    cmpeq   ip,         r3
+    mov     r4,         r6, lsr r8
+    beq     L_wordCompareLoop
 
-   * The fast path: if both strings are double-word aligned,
-     use LDRD to load two words from each string in every loop iteration.
-   * If the strings have the same offset from a word boundary,
-     use LDRB to load and compare byte by byte until
-     the first string is aligned to a word boundary (at most 3 bytes).
-     This is optimized for quick return on short unaligned strings.
-   * If the strings have the same offset from a double-word boundary,
-     use LDRD to load two words from each string in every loop iteration, as in the fast path.
-   * If the strings do not have the same offset from a double-word boundary,
-     load a word from the second string before the loop to initialize the queue.
-     Use LDRD to load two words from every string in every loop iteration.
-     Inside the loop, load the second word from the second string only after comparing
-     the first word, using the queued value, to guarantee safety across page boundaries.
-   * If the strings do not have the same offset from a word boundary,
-     use LDR and a shift queue. Order of loads and comparisons matters,
-     similarly to the previous case.
+//  Either we have encountered a NUL, or we have found a mismatch between s1
+//  and s2.  Unwind the pointers and use a byte-by-byte comparison loop.
+    sub     r0,     r0, #4
+    sub     r1,     r1, #4
+L_unwindLoopPreload:
+    sub     r1,     r1, r5, lsr #3
+    CLEAR_FRAME
 
-   * Use UADD8 and SEL to compare words, and use REV and CLZ to compute the return value.
-   * The only difference between ARM and Thumb modes is the use of CBZ instruction.
-   * The only difference between big and little endian is the use of REV in little endian
-     to compute the return value, instead of MOV.
-*/
+L_byteCompareLoop:
+//  Load a character from each string and advance the pointers.  If the loaded
+//  characters are unequal or NUL, return their difference.
+    ldrb    r2,    [r0],#1
+    ldrb    ip,    [r1],#1
+    cmp     r2,         #1
+    cmpcs   r2,         ip
+    beq     L_byteCompareLoop
 
-        .macro m_cbz reg label
-#ifdef __thumb2__
-        cbz     \reg, \label
-#else   /* not defined __thumb2__ */
-        cmp     \reg, #0
-        beq     \label
-#endif /* not defined __thumb2__ */
-        .endm /* m_cbz */
-
-        .macro m_cbnz reg label
-#ifdef __thumb2__
-        cbnz    \reg, \label
-#else   /* not defined __thumb2__ */
-        cmp     \reg, #0
-        bne     \label
-#endif /* not defined __thumb2__ */
-        .endm /* m_cbnz */
-
-        .macro  init
-        /* Macro to save temporary registers and prepare magic values.  */
-        subs    sp, sp, #16
-        .cfi_def_cfa_offset 16
-        strd    r4, r5, [sp, #8]
-        .cfi_rel_offset r4, 0
-        .cfi_rel_offset r5, 4
-        strd    r6, r7, [sp]
-        .cfi_rel_offset r6, 8
-        .cfi_rel_offset r7, 12
-        mvn     r6, #0  /* all F */
-        mov     r7, #0  /* all 0 */
-        .endm   /* init */
-
-        .macro  magic_compare_and_branch w1 w2 label
-        /* Macro to compare registers w1 and w2 and conditionally branch to label.  */
-        cmp     \w1, \w2        /* Are w1 and w2 the same?  */
-        magic_find_zero_bytes \w1
-        it      eq
-        cmpeq   ip, #0          /* Is there a zero byte in w1?  */
-        bne     \label
-        .endm /* magic_compare_and_branch */
-
-        .macro  magic_find_zero_bytes w1
-        /* Macro to find all-zero bytes in w1, result is in ip.  */
-        uadd8   ip, \w1, r6
-        sel     ip, r7, r6
-        .endm /* magic_find_zero_bytes */
-
-        .macro  setup_return w1 w2
-#ifdef __ARMEB__
-        mov     r1, \w1
-        mov     r2, \w2
-#else /* not  __ARMEB__ */
-        rev     r1, \w1
-        rev     r2, \w2
-#endif /* not  __ARMEB__ */
-        .endm /* setup_return */
-
-        pld [r0, #0]
-        pld [r1, #0]
-
-        /* Are both strings double-word aligned?  */
-        orr     ip, r0, r1
-        tst     ip, #7
-        bne     .L_do_align
-
-        /* Fast path.  */
-        init
-
-.L_doubleword_aligned:
-
-        /* Get here when the strings to compare are double-word aligned.  */
-        /* Compare two words in every iteration.  */
-        .p2align        2
-2:
-        pld [r0, #16]
-        pld [r1, #16]
-
-        /* Load the next double-word from each string.  */
-        ldrd    r2, r3, [r0], #8
-        ldrd    r4, r5, [r1], #8
-
-        magic_compare_and_branch w1=r2, w2=r4, label=.L_return_24
-        magic_compare_and_branch w1=r3, w2=r5, label=.L_return_35
-        b       2b
-
-.L_do_align:
-        /* Is the first string word-aligned?  */
-        ands    ip, r0, #3
-        beq     .L_word_aligned_r0
-
-        /* Fast compare byte by byte until the first string is word-aligned.  */
-        /* The offset of r0 from a word boundary is in ip. Thus, the number of bytes
-        to read until the next word boundary is 4-ip.  */
-        bic     r0, r0, #3
-        ldr     r2, [r0], #4
-        lsls    ip, ip, #31
-        beq     .L_byte2
-        bcs     .L_byte3
-
-.L_byte1:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE1_OFFSET
-        subs    ip, r3, ip
-        bne     .L_fast_return
-        m_cbz   reg=r3, label=.L_fast_return
-
-.L_byte2:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE2_OFFSET
-        subs    ip, r3, ip
-        bne     .L_fast_return
-        m_cbz   reg=r3, label=.L_fast_return
-
-.L_byte3:
-        ldrb    ip, [r1], #1
-        uxtb    r3, r2, ror #BYTE3_OFFSET
-        subs    ip, r3, ip
-        bne     .L_fast_return
-        m_cbnz  reg=r3, label=.L_word_aligned_r0
-
-.L_fast_return:
-        mov     r0, ip
-        bx      lr
-
-.L_word_aligned_r0:
-        init
-        /* The first string is word-aligned.  */
-        /* Is the second string word-aligned?  */
-        ands    ip, r1, #3
-        bne     .L_strcmp_unaligned
-
-.L_word_aligned:
-        /* The strings are word-aligned. */
-        /* Is the first string double-word aligned?  */
-        tst     r0, #4
-        beq     .L_doubleword_aligned_r0
-
-        /* If r0 is not double-word aligned yet, align it by loading
-        and comparing the next word from each string.  */
-        ldr     r2, [r0], #4
-        ldr     r4, [r1], #4
-        magic_compare_and_branch w1=r2 w2=r4 label=.L_return_24
-
-.L_doubleword_aligned_r0:
-        /* Get here when r0 is double-word aligned.  */
-        /* Is r1 doubleword_aligned?  */
-        tst     r1, #4
-        beq     .L_doubleword_aligned
-
-        /* Get here when the strings to compare are word-aligned,
-        r0 is double-word aligned, but r1 is not double-word aligned.  */
-
-        /* Initialize the queue.  */
-        ldr     r5, [r1], #4
-
-        /* Compare two words in every iteration.  */
-        .p2align        2
-3:
-        pld [r0, #16]
-        pld [r1, #16]
-
-        /* Load the next double-word from each string and compare.  */
-        ldrd    r2, r3, [r0], #8
-        magic_compare_and_branch w1=r2 w2=r5 label=.L_return_25
-        ldrd    r4, r5, [r1], #8
-        magic_compare_and_branch w1=r3 w2=r4 label=.L_return_34
-        b       3b
-
-        .macro miscmp_word offsetlo offsethi
-        /* Macro to compare misaligned strings.  */
-        /* r0, r1 are word-aligned, and at least one of the strings
-        is not double-word aligned.  */
-        /* Compare one word in every loop iteration.  */
-        /* OFFSETLO is the original bit-offset of r1 from a word-boundary,
-        OFFSETHI is 32 - OFFSETLO (i.e., offset from the next word).  */
-
-        /* Initialize the shift queue.  */
-        ldr     r5, [r1], #4
-
-        /* Compare one word from each string in every loop iteration.  */
-        .p2align        2
-7:
-        ldr     r3, [r0], #4
-        S2LOMEM r5, r5, #\offsetlo
-        magic_find_zero_bytes w1=r3
-        cmp     r7, ip, S2HIMEM #\offsetlo
-        and     r2, r3, r6, S2LOMEM #\offsetlo
-        it      eq
-        cmpeq   r2, r5
-        bne     .L_return_25
-        ldr     r5, [r1], #4
-        cmp     ip, #0
-        eor r3, r2, r3
-        S2HIMEM r2, r5, #\offsethi
-        it      eq
-        cmpeq   r3, r2
-        bne     .L_return_32
-        b       7b
-        .endm /* miscmp_word */
-
-.L_strcmp_unaligned:
-        /* r0 is word-aligned, r1 is at offset ip from a word.  */
-        /* Align r1 to the (previous) word-boundary.  */
-        bic     r1, r1, #3
-
-        /* Unaligned comparison word by word using LDRs. */
-        cmp     ip, #2
-        beq     .L_miscmp_word_16                 /* If ip == 2.  */
-        bge     .L_miscmp_word_24                 /* If ip == 3.  */
-        miscmp_word offsetlo=8 offsethi=24        /* If ip == 1.  */
-.L_miscmp_word_16:  miscmp_word offsetlo=16 offsethi=16
-.L_miscmp_word_24:  miscmp_word offsetlo=24 offsethi=8
-
-
-.L_return_32:
-        setup_return w1=r3, w2=r2
-        b       .L_do_return
-.L_return_34:
-        setup_return w1=r3, w2=r4
-        b       .L_do_return
-.L_return_25:
-        setup_return w1=r2, w2=r5
-        b       .L_do_return
-.L_return_35:
-        setup_return w1=r3, w2=r5
-        b       .L_do_return
-.L_return_24:
-        setup_return w1=r2, w2=r4
-
-.L_do_return:
-
-#ifdef __ARMEB__
-        mov     r0, ip
-#else /* not  __ARMEB__ */
-        rev     r0, ip
-#endif /* not  __ARMEB__ */
-
-        /* Restore temporaries early, before computing the return value.  */
-        ldrd    r6, r7, [sp]
-        ldrd    r4, r5, [sp, #8]
-        adds    sp, sp, #16
-        .cfi_def_cfa_offset 0
-        .cfi_restore r4
-        .cfi_restore r5
-        .cfi_restore r6
-        .cfi_restore r7
-
-        /* There is a zero or a different byte between r1 and r2.  */
-        /* r0 contains a mask of all-zero bytes in r1.  */
-        /* Using r0 and not ip here because cbz requires low register.  */
-        m_cbz   reg=r0, label=.L_compute_return_value
-        clz     r0, r0
-        /* r0 contains the number of bits on the left of the first all-zero byte in r1.  */
-        rsb     r0, r0, #24
-        /* Here, r0 contains the number of bits on the right of the first all-zero byte in r1.  */
-        lsr     r1, r1, r0
-        lsr     r2, r2, r0
-
-.L_compute_return_value:
-        movs    r0, #1
-        cmp     r1, r2
-        /* The return value is computed as follows.
-        If r1>r2 then (C==1 and Z==0) and LS doesn't hold and r0 is #1 at return.
-        If r1<r2 then (C==0 and Z==0) and we execute SBC with carry_in=0,
-        which means r0:=r0-r0-1 and r0 is #-1 at return.
-        If r1=r2 then (C==1 and Z==1) and we execute SBC with carry_in=1,
-        which means r0:=r0-r0 and r0 is #0 at return.
-        (C==0 and Z==1) cannot happen because the carry bit is "not borrow".  */
-        it      ls
-        sbcls   r0, r0, r0
-        bx      lr
+L_earlyReturn:
+//  Return the difference between the last two characters loaded.
+    sub     r0,     r2, ip
+    bx      lr
 END(strcmp)