/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
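
/*
 * Overall strategy:
 *  - identical pointers return 0 immediately;
 *  - blocks of fewer than 12 bytes are compared byte by byte (label 10);
 *  - with NEON and unaligned access support, 32 bytes are compared per
 *    iteration and a mismatch is narrowed to a 16-byte window;
 *  - otherwise the first pointer is word-aligned and words are compared
 *    eight at a time when both pointers have the same alignment, with
 *    shift-and-merge variants for the misaligned cases;
 *  - the differing word is finally re-scanned byte by byte to produce
 *    the signed result.
 */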

ENTRY(memcmp)
        pld     [r0, #(CACHE_LINE_SIZE * 0)]
        pld     [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same;
         * a length of 0 is handled by the small-block path at 10 below
         */
        cmp     r0, r1
        moveq   r0, #0
        bxeq    lr

        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     10f
        /*
         * NEON optimization:
         * compare 32 bytes at a time
         */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs    r2, r2, #32
        blo     3f
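        /* r2 now holds count - 32; the loop below runs while it stays
         * non-negative */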

        /* preload all the cache lines we need */
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* the main loop compares 32 bytes at a time */
        vld1.8  {d0 - d3}, [r0]!
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* subtract the values and merge the results */
        vsub.i8 q0, q2
        vsub.i8 q1, q3
        vorr    q2, q0, q1
        vorr    d4, d5
        vmov    r3, ip, d4
        /* check if there are any differences among the 32 bytes */
        orrs    r3, ip
        bne     2f
        subs    r2, r2, #32
        bhs     1b
        b       3f
2:
        /* check if the difference was in the first or last 16 bytes */
        sub     r0, #32
        vorr    d0, d1
        sub     r1, #32
        vmov    r3, ip, d0
        orrs    r3, ip
        /* if the first 16 bytes are equal, the difference is in the last
         * 16, so we only have to rewind 16 bytes */
        ittt    eq
        subeq   r2, #16
        addeq   r0, #16
        addeq   r1, #16
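
        /* fall through with the pointers rewound to the block that
         * contains the difference; the scalar code below locates the
         * exact differing byte */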

3:      /* fix up the remaining count */
        add     r2, r2, #32

        cmp     r2, #(8+4)
        bmi     10f
#endif

        /* save registers */
        stmfd   sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* align the first pointer to a word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align the first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process
         */

        /* see if the pointers are congruent (same offset modulo 4) */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f
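
        /* the loop below is software-pipelined: one word of the second
         * buffer is always in flight, alternating between ip and lr, so
         * each load overlaps the compare of the previous word */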

0:      pld     [r4, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last word differed: back up both pointers and redo it
         * byte by byte */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr
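
        /* small-block path: reached with fewer than 12 bytes to compare;
         * r0 must hold the return value, so the first source pointer is
         * copied to r3 before the loop */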

10:     /* process less than 12 bytes */
        cmp     r2, #0
        moveq   r0, #0
        bxeq    lr
        mov     r3, r0
11:     ldrb    r0, [r3], #1
        ldrb    ip, [r1], #1
        subs    r0, ip
        bxne    lr
        subs    r2, r2, #1
        bne     11b
        bx      lr
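
        /* non-congruent case: the pointers have different offsets
         * modulo 4; r4 is word-aligned, r1 is not, so words are loaded
         * from r1's aligned addresses and re-assembled with shifts
         * before comparing */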

5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, the offset is 2 (16-bit aligned, special-cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4
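
        /* each word of the second buffer is rebuilt from two aligned
         * loads: the top halfword of the previous word (lsr #16) merged
         * with the bottom halfword of the next (lsl #16) */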

6:      pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r4, #(CACHE_LINE_SIZE * 2)]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and branch back to the byte loop */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount (8 * offset)
        // r6 = left-shift amount (32 - r5)
        // r7 = scratch (carries the previous word of the second buffer)

        mov     r5, r0, lsl #3      /* r5 = right shift */
        rsb     r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8
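
        /* general shift-and-merge: each unaligned word of the second
         * buffer is rebuilt as (previous >> r5) | (next << r6) before
         * being compared with an aligned word of the first */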

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3
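        /* r1 runs (4 - offset) bytes ahead of the logical position
         * because of the read-ahead word; r6, lsr #3 equals 4 - offset
         * and rewinds it */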
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and branch back to the byte loop */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b
END(memcmp)