blob: 67dcddc1b46e84fc51e9a1fb30427eaa84f22db9 [file] [log] [blame]
/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
#include <machine/cpu-features.h>

        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * int memcmp(const void *lhs, const void *rhs, size_t n)
 *
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The word loops compare 32 bytes per iteration when both pointers
 *     share the same word alignment, 16 bytes when they differ by 2,
 *     and 8 bytes when they differ by 1 or 3
 * (2) The loads are scheduled in a way they won't stall
 *
 * In:    r0 = lhs, r1 = rhs, r2 = n (byte count)
 * Out:   r0 = 0 when lhs == rhs (same pointer), n == 0, or all n bytes
 *             match; otherwise lhs[i] - rhs[i] for the first differing
 *             byte pair (bytes loaded zero-extended with ldrb)
 * Clobb: r1, r2, r3, ip, flags
 * Stack: r4 and lr are pushed for the whole body; r5-r7 are pushed
 *        additionally while the "offset is 1 or 3" path runs.
 *
 * Unwinding: the .fnstart/.fnend range covers the whole routine. The
 * single .save below describes only {r4, lr}; the temporary push of
 * r5-r7 in the "offset is 1 or 3" path is not described by it.
 */

memcmp:
        .fnstart
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where length is 0 or the buffers are the same */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        .save {r4, lr}
        /* save registers */
        stmfd       sp!, {r4, lr}

        PLD         (r0, #32)
        PLD         (r1, #32)

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r4, r0

        /* make sure we have at least 8+4 bytes, this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp         r2, #(8+4)
        bmi         8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
        rsb         r3, r4, #0
        ands        r3, r3, #3
        beq         0f

        /* align first pointer: compare r3 (1..3) leading bytes one by one */
        sub         r2, r2, r3
1:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        subs        r0, r0, ip
        bne         9f
        subs        r3, r3, #1
        bne         1b


0:      /* here the first pointer is aligned, and we have at least
         * 12 - 3 = 9 bytes left to process.
         */

        /* see if the pointers are congruent (same offset within a word) */
        eor         r0, r4, r1
        ands        r0, r0, #3
        bne         5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(32 + 4)
        bmi         1f

        /* ip/lr alternate as "current" and "read-ahead" word of the
         * second buffer; every compare after the first is conditional
         * on all previous words having matched (Z set).
         */
0:      PLD         (r4, #64)
        PLD         (r1, #64)
        ldr         r0, [r4], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r4], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r4], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #32
        bhs         0b

        /* do we have at least 4 bytes left?
         * (undo the 32+4 bias, then bias by -4: r2 = remaining - 4)
         */
1:      adds        r2, r2, #(32 - 4 + 4)
        bmi         4f

        /* finish off 4 bytes at a time */
3:      ldr         r0, [r4], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #4
        bhs         3b

        /* are we done? (remove the -4 bias: r2 = remaining bytes) */
4:      adds        r2, r2, #4
        moveq       r0, #0
        beq         9f

        /* finish off the remaining bytes */
        b           8f

2:      /* the last 4 bytes are different, restart them byte by byte */
        sub         r4, r4, #4
        sub         r1, r1, #4
        mov         r2, #4

        /* process the last few bytes; r2 is non-zero on entry */
8:      ldrb        r0, [r4], #1
        ldrb        ip, [r1], #1
        /* stall */
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr





5:      /*************** non-congruent case ***************/
        and         r0, r1, #3
        cmp         r0, #2
        bne         4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process;
         * otherwise restore r2 and use the byte loop
         */
        subs        r2, r2, #16
        addmi       r2, r2, #16
        bmi         8b

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         lr, [r1], #4

        /* each compared word is built from the top half of the
         * previous aligned word (lr) and the bottom half of the next
         */
6:      PLD         (r1, #64)
        PLD         (r4, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #16
        bhs         6b
        /* step r1 back over the 2 not yet compared bytes held in lr */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #16
        moveq       r0, #0
        beq         9b
        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word byte by byte */
        sub         r1, r1, #(4+2)
        sub         r4, r4, #4
        mov         r2, #4
        b           8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd       sp!, {r5, r6, r7}

        /* r5 = right shift count in bits (8 * (r1 & 3))
         * r6 = left shift count in bits (32 - r5)
         * r7 = most recently loaded aligned word of the second buffer
         */

        mov         r5, r0, lsl #3      /* r5 = right shift */
        rsb         r6, r5, #32         /* r6 = left shift */

        /* align the unaligned pointer */
        bic         r1, r1, #3
        ldr         r7, [r1], #4
        /* at least 9 bytes remain here, so r2 stays positive */
        sub         r2, r2, #8

6:      mov         ip, r7, lsr r5
        ldr         r7, [r1], #4
        ldr         r0, [r4], #4
        orr         ip, ip, r7, lsl r6
        eors        r0, r0, ip
        moveq       ip, r7, lsr r5
        ldreq       r7, [r1], #4
        ldreq       r0, [r4], #4
        orreq       ip, ip, r7, lsl r6
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b

        /* step r1 back over the r6/8 not yet compared bytes held in r7 */
        sub         r1, r1, r6, lsr #3
        ldmfd       sp!, {r5, r6, r7}

        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b

        /* finish off the remaining bytes */
        b           8b

7:      /* fix up the 2 pointers and redo the last word byte by byte */
        sub         r1, r1, #4
        sub         r1, r1, r6, lsr #3
        sub         r4, r4, #4
        mov         r2, #4
        ldmfd       sp!, {r5, r6, r7}
        b           8b
        .fnend
        .size memcmp, .-memcmp