blob: c872a51bd0a30d26621ce4824d9cd33dceb6bbf2 [file] [log] [blame]
The Android Open Source Project1dc9e472009-03-03 19:28:35 -08001/*
2 * Copyright (C) 2008 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
13 * distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <machine/cpu-features.h>
Kenny Root420878c2011-02-16 11:55:58 -080030#include <machine/asm.h>
The Android Open Source Project1dc9e472009-03-03 19:28:35 -080031
/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The compare loops are unrolled: 32 bytes per iteration when both
 *     pointers share the same word alignment, 16 or 8 bytes otherwise
 * (2) The loads are scheduled in a way they won't stall
 */
40
/*
 * int memcmp(const void *lhs, const void *rhs, size_t n)
 *
 * In:   r0 = lhs, r1 = rhs, r2 = n (byte count)
 * Out:  r0 = 0 if the first n bytes are equal, otherwise
 *            (first differing lhs byte) - (matching rhs byte), both
 *            loaded zero-extended with ldrb
 * Uses: r3 and ip as scratch; r4 and lr are pushed and restored;
 *       r5-r7 are pushed and restored on the offset 1/3 path only.
 *
 * Register roles after the prologue:
 *   r4 = lhs cursor, r1 = rhs cursor, r2 = bytes left (biased inside the
 *   unrolled loops), r0 = scratch / result.
 *
 * Strategy: word loops detect a difference with eors; when one is found
 * the cursors are rewound by one word and the byte loop at label 8
 * recomputes the exact byte difference.
 *
 * END(memcmp) is placed after the very last instruction so that whatever
 * ENTRY/END bracket (presumably the symbol size and unwind region -- see
 * <machine/asm.h>) also covers the out-of-line non-congruent paths.
 */
ENTRY(memcmp)
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */
        mov         r4, r0

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 (1..3) leading bytes one by one */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent (same low two address bits) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]                /* read-ahead: first rhs word */
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* rhs words alternate between ip and lr so each compare uses a
         * word loaded one step earlier; r1 is pre-incremented, r4 is
         * post-incremented.
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left? */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? (undo the -4 bias; zero means nothing is left) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; entered with r2 >= 1 */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process;
         * addmi does not touch the flags, so bmi still tests the subs
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* each rhs word is assembled from the upper half of the previous
         * aligned word (lsr #16) and the lower half of the next (lsl #16)
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* r1 ran one aligned word ahead; step back to the true rhs position */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        /* NOTE(review): no unwind annotation accompanies this push */
        stmfd       sp!, {r5, r6, r7}

        /* r5 = right-shift amount in bits (8 * (rhs & 3))
         * r6 = left-shift amount in bits (32 - r5)
         * r7 = most recently loaded aligned rhs word
         */

        mov         r5, r0, lsl #3          /* r5 = right shift */
        rsb         r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* r1 ran ahead by (r6 / 8) bytes; step back to the true rhs position */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and fallthrough... */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
END(memcmp)