blob: d6d3ca132263910e527ebc6f7dd7e3b4a6911e13 [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29#include <machine/cpu-features.h>
Kenny Root420878c2011-02-16 11:55:58 -080030#include <machine/asm.h>
The Android Open Source Project1dc9e472009-03-03 19:28:35 -080031
Henrik Smiding3ebd31c2010-11-05 15:09:37 +010032
/*
 * CACHE_LINE_SIZE: stride, in bytes, used to build the pld (preload)
 * offsets in memcmp. It is 32 when HAVE_32_BYTE_CACHE_LINE is defined
 * and 64 otherwise.
 */
#ifdef HAVE_32_BYTE_CACHE_LINE
#define CACHE_LINE_SIZE     32
#else
#define CACHE_LINE_SIZE     64
#endif
38
/*
 * Optimized memcmp() for Cortex-A9.
 *
 * C equivalent: int memcmp(const void *lhs, const void *rhs, size_t n)
 *
 * In:    r0 = lhs, r1 = rhs, r2 = n (number of bytes to compare)
 * Out:   r0 = 0 when no difference is found, otherwise
 *        (lhs byte - rhs byte) for the first differing pair, both bytes
 *        loaded zero-extended with ldrb
 * Uses:  r1, r2, r3, ip and the flags are modified.
 *        The NEON loop (when compiled in) also writes d0-d7.
 *        The word-based paths push/pop r4 and lr; the "offset 1 or 3"
 *        path additionally pushes/pops r5-r7.
 *
 * Strategy:
 *   - identical pointers                 -> return 0 at once
 *   - n < 12                             -> byte loop (label 10)
 *   - NEON + unaligned access available  -> 32 bytes per iteration
 *   - otherwise: align lhs to a word boundary, then
 *       rhs congruent (same low 2 bits)  -> 32 bytes per iteration
 *       rhs off by 2                     -> 16 bytes per iteration
 *       rhs off by 1 or 3                -> 8 bytes per iteration
 *     In the last two cases rhs is read with aligned loads and each
 *     word is rebuilt from two neighbours with lsr/lsl
 *     (NOTE(review): this byte ordering assumes little-endian data).
 *   - whenever a word differs, both pointers are rewound by that word
 *     and the byte loop (label 8) computes the return value.
 *
 * Numeric local labels are reused; "Nb"/"Nf" always refer to the
 * nearest definition of N backward/forward.
 */

ENTRY(memcmp)
        pld         [r0, #(CACHE_LINE_SIZE * 0)]
        pld         [r0, #(CACHE_LINE_SIZE * 1)]

        /* identical pointers compare equal whatever the length is
         * (a length of 0 is handled at label 10 below)
         */
        cmp         r0, r1
        moveq       r0, #0
        bxeq        lr

        pld         [r1, #(CACHE_LINE_SIZE * 0)]
        pld         [r1, #(CACHE_LINE_SIZE * 1)]

        /* make sure we have at least 8+4 bytes, this simplifies things
         * below and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         10f
/*
 * Neon optimization
 * Comparing 32 bytes at a time
 */
#if defined(__ARM_NEON__) && defined(NEON_UNALIGNED_ACCESS)
        /* r2 is kept biased by -32 while in the loop */
        subs        r2, r2, #32
        blo         3f

        /* preload all the cache lines we need. */
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

1:      /* The main loop compares 32 bytes at a time */
        vld1.8      {d0 - d3}, [r0]!
        pld         [r0, #(CACHE_LINE_SIZE * 2)]
        vld1.8      {d4 - d7}, [r1]!
        pld         [r1, #(CACHE_LINE_SIZE * 2)]

        /* Start subtracting the values and merge results */
        vsub.i8     q0, q2
        vsub.i8     q1, q3
        vorr        q2, q0, q1
        vorr        d4, d5
        vmov        r3, ip, d4
        /* Check if there are any differences among the 32 bytes */
        orrs        r3, ip
        bne         2f
        subs        r2, r2, #32
        bhs         1b
        b           3f
2:
        /* Check if the difference was in the first or last 16 bytes.
         * q0 (d0/d1) still holds the byte differences of the first 16.
         */
        sub         r0, #32
        vorr        d0, d1
        sub         r1, #32
        vmov        r3, ip, d0
        orrs        r3, ip
        /* if the first 16 bytes are equal, we only have to rewind 16 bytes */
        ittt        eq
        subeq       r2, #16
        addeq       r0, #16
        addeq       r1, #16

3:      /* fix-up the remaining count (undo the -32 bias) */
        add         r2, r2, #32

        cmp         r2, #(8+4)
        bmi         10f
#endif

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 (1..3) leading bytes */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we are not careful.
         *
         * ip and lr alternate as the read-ahead rhs word; r1 is
         * pre-incremented, so after each compare r4 and r1 both point
         * one word past the word that was just compared.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

0:      pld         [r4, #(CACHE_LINE_SIZE * 2)]
        pld         [r1, #(CACHE_LINE_SIZE * 2)]
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?
         * (r2 goes from a -(32+4) bias to a -4 bias)
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? (undo the -4 bias) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; r2 must be non-zero on entry.
         * r0 is left at 0 when all r2 bytes match.
         */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

10:     /* process less than 12 bytes; nothing was pushed on this path,
         * so it returns with bx lr directly
         */
        cmp         r2, #0
        moveq       r0, #0
        bxeq        lr
        mov         r3, r0
11:
        ldrb        r0, [r3], #1
        ldrb        ip, [r1], #1
        subs        r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         11b
        bx          lr



5:      /*************** non-congruent case ***************/
        /* r4 is word aligned here; r0 = misalignment of r1 (1, 2 or 3) */
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process;
         * addmi does not touch the flags, so bmi still tests the subs
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer; lr = read-ahead rhs word */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

6:      pld         [r1, #(CACHE_LINE_SIZE * 2)]
        pld         [r4, #(CACHE_LINE_SIZE * 2)]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 is one aligned word ahead: step back to the real
         * (offset 2) position
         */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough...
         * r1: back over the read-ahead word (4) and the word that
         * differed (4), then forward to the offset (2) => -(4+2)
         */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right shift amount in bits (8 * offset)
         * r6 = left shift amount in bits  (32 - r5)
         * r7 = read-ahead rhs word
         */

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer; r2 is biased by -8 in the loop */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r6 >> 3 = 4 - offset: step r1 back from the read-ahead
         * position to the real (unaligned) position
         */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
        /* END comes after the last instruction so that the
         * non-congruent paths above are inside the function.
         */
END(memcmp)