/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>


#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif

/*
 * Optimized memcmp() for Cortex-A9.
 */
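
/*
 * Overall strategy:
 *  - identical pointers return 0 immediately;
 *  - blocks of fewer than 12 bytes are compared byte by byte (label 10);
 *  - with NEON and unaligned access support, 32 bytes are compared per
 *    iteration and a mismatch is narrowed to a 16-byte window;
 *  - otherwise the first pointer is word-aligned and words are compared
 *    eight at a time when both pointers have the same alignment, with
 *    shift-and-merge variants for the misaligned cases;
 *  - the differing word is finally re-scanned byte by byte to produce
 *    the signed result.
 */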

ENTRY(memcmp)
        pld     [r0, #(CACHE_LINE_SIZE * 0)]
        pld     [r0, #(CACHE_LINE_SIZE * 1)]

        /* take care of the case where the buffers are the same;
         * a length of 0 is handled by the small-block path at 10 below
         */
        cmp     r0, r1
        moveq   r0, #0
        bxeq    lr

        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes; this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     10f
        /*
         * NEON optimization:
         * compare 32 bytes at a time
         */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        subs    r2, r2, #32
        blo     3f
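        /* r2 now holds count - 32; the loop below runs while it stays
         * non-negative */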

        /* preload all the cache lines we need */
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* the main loop compares 32 bytes at a time */
        vld1.8  {d0 - d3}, [r0]!
        pld     [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8  {d4 - d7}, [r1]!
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* subtract the values and merge the results */
        vsub.i8 q0, q2
        vsub.i8 q1, q3
        vorr    q2, q0, q1
        vorr    d4, d5
        vmov    r3, ip, d4
        /* check if there are any differences among the 32 bytes */
        orrs    r3, ip
        bne     2f
        subs    r2, r2, #32
        bhs     1b
        b       3f
2:
        /* check if the difference was in the first or last 16 bytes */
        sub     r0, #32
        vorr    d0, d1
        sub     r1, #32
        vmov    r3, ip, d0
        orrs    r3, ip
        /* if the first 16 bytes are equal, the difference is in the last
         * 16, so we only have to rewind 16 bytes */
        ittt    eq
        subeq   r2, #16
        addeq   r0, #16
        addeq   r1, #16
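
        /* fall through with the pointers rewound to the block that
         * contains the difference; the scalar code below locates the
         * exact differing byte */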

3:      /* fix up the remaining count */
        add     r2, r2, #32

        cmp     r2, #(8+4)
        bmi     10f
#endif

        /* save registers */
        stmfd   sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov     r4, r0

        /* align the first pointer to a word boundary
         * offset = -src & 3
         */
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align the first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process
         */

        /* see if the pointers are congruent (same offset modulo 4) */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration.
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f
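
        /* the loop below is software-pipelined: one word of the second
         * buffer is always in flight, alternating between ip and lr, so
         * each load overlaps the compare of the previous word */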

0:      pld     [r4, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last word differed: back up both pointers and redo it
         * byte by byte */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr
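
        /* small-block path: reached with fewer than 12 bytes to compare;
         * r0 must hold the return value, so the first source pointer is
         * copied to r3 before the loop */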

10:     /* process less than 12 bytes */
        cmp     r2, #0
        moveq   r0, #0
        bxeq    lr
        mov     r3, r0
11:     ldrb    r0, [r3], #1
        ldrb    ip, [r1], #1
        subs    r0, ip
        bxne    lr
        subs    r2, r2, #1
        bne     11b
        bx      lr
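
        /* non-congruent case: the pointers have different offsets
         * modulo 4; r4 is word-aligned, r1 is not, so words are loaded
         * from r1's aligned addresses and re-assembled with shifts
         * before comparing */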

5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, the offset is 2 (16-bit aligned, special-cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4
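
        /* each word of the second buffer is rebuilt from two aligned
         * loads: the top halfword of the previous word (lsr #16) merged
         * with the bottom halfword of the next (lsl #16) */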

6:      pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r4, #(CACHE_LINE_SIZE * 2)]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and branch back to the byte loop */
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount (8 * offset)
        // r6 = left-shift amount (32 - r5)
        // r7 = scratch (carries the previous word of the second buffer)

        mov     r5, r0, lsl #3      /* r5 = right shift */
        rsb     r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8
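
        /* general shift-and-merge: each unaligned word of the second
         * buffer is rebuilt as (previous >> r5) | (next << r6) before
         * being compared with an aligned word of the first */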

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

        sub     r1, r1, r6, lsr #3
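        /* r1 runs (4 - offset) bytes ahead of the logical position
         * because of the read-ahead word; r6, lsr #3 equals 4 - offset
         * and rewinds it */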
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the two pointers and branch back to the byte loop */
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b
END(memcmp)