/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29#include <machine/cpu-features.h>
30
31 .text
32
33 .global memcmp
34 .type memcmp, %function
35 .align 4
36
/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 */
45
46memcmp:
47 PLD (r0, #0)
48 PLD (r1, #0)
49
50 /* take of the case where length is 0 or the buffers are the same */
51 cmp r0, r1
52 cmpne r2, #0
53 moveq r0, #0
54 bxeq lr
55
56 /* save registers */
57 stmfd sp!, {r4, lr}
58
59 PLD (r0, #32)
60 PLD (r1, #32)
61
62 /* since r0 hold the result, move the first source
63 * pointer somewhere else
64 */
65
66 mov r4, r0
67
68 /* make sure we have at least 8+4 bytes, this simplify things below
69 * and avoid some overhead for small blocks
70 */
71 cmp r2, #(8+4)
72 bmi 8f
73
74 /* align first pointer to word boundary
75 * offset = -src & 3
76 */
77 rsb r3, r4, #0
78 ands r3, r3, #3
79 beq 0f
80
81 /* align first pointer */
82 sub r2, r2, r3
831: ldrb r0, [r4], #1
84 ldrb ip, [r1], #1
85 subs r0, r0, ip
86 bne 9f
87 subs r3, r3, #1
88 bne 1b
89
90
910: /* here the first pointer is aligned, and we have at least 4 bytes
92 * to process.
93 */
94
95 /* see if the pointers are congruent */
96 eor r0, r4, r1
97 ands r0, r0, #3
98 bne 5f
99
100 /* congruent case, 32 bytes per iteration
101 * We need to make sure there are at least 32+4 bytes left
102 * because we effectively read ahead one word, and we could
103 * read past the buffer (and segfault) if we're not careful.
104 */
105
106 ldr ip, [r1]
107 subs r2, r2, #(32 + 4)
108 bmi 1f
109
1100: PLD (r4, #64)
111 PLD (r1, #64)
112 ldr r0, [r4], #4
113 ldr lr, [r1, #4]!
114 eors r0, r0, ip
115 ldreq r0, [r4], #4
116 ldreq ip, [r1, #4]!
117 eoreqs r0, r0, lr
118 ldreq r0, [r4], #4
119 ldreq lr, [r1, #4]!
120 eoreqs r0, r0, ip
121 ldreq r0, [r4], #4
122 ldreq ip, [r1, #4]!
123 eoreqs r0, r0, lr
124 ldreq r0, [r4], #4
125 ldreq lr, [r1, #4]!
126 eoreqs r0, r0, ip
127 ldreq r0, [r4], #4
128 ldreq ip, [r1, #4]!
129 eoreqs r0, r0, lr
130 ldreq r0, [r4], #4
131 ldreq lr, [r1, #4]!
132 eoreqs r0, r0, ip
133 ldreq r0, [r4], #4
134 ldreq ip, [r1, #4]!
135 eoreqs r0, r0, lr
136 bne 2f
137 subs r2, r2, #32
138 bhs 0b
139
140 /* do we have at least 4 bytes left? */
1411: adds r2, r2, #(32 - 4 + 4)
142 bmi 4f
143
144 /* finish off 4 bytes at a time */
1453: ldr r0, [r4], #4
146 ldr ip, [r1], #4
147 eors r0, r0, ip
148 bne 2f
149 subs r2, r2, #4
150 bhs 3b
151
152 /* are we done? */
1534: adds r2, r2, #4
154 moveq r0, #0
155 beq 9f
156
157 /* finish off the remaining bytes */
158 b 8f
159
1602: /* the last 4 bytes are different, restart them */
161 sub r4, r4, #4
162 sub r1, r1, #4
163 mov r2, #4
164
165 /* process the last few bytes */
1668: ldrb r0, [r4], #1
167 ldrb ip, [r1], #1
168 // stall
169 subs r0, r0, ip
170 bne 9f
171 subs r2, r2, #1
172 bne 8b
173
1749: /* restore registers and return */
175 ldmfd sp!, {r4, lr}
176 bx lr
177
178
179
180
181
1825: /*************** non-congruent case ***************/
183 and r0, r1, #3
184 cmp r0, #2
185 bne 4f
186
187 /* here, offset is 2 (16-bits aligned, special cased) */
188
189 /* make sure we have at least 16 bytes to process */
190 subs r2, r2, #16
191 addmi r2, r2, #16
192 bmi 8b
193
194 /* align the unaligned pointer */
195 bic r1, r1, #3
196 ldr lr, [r1], #4
197
1986: PLD (r1, #64)
199 PLD (r4, #64)
200 mov ip, lr, lsr #16
201 ldr lr, [r1], #4
202 ldr r0, [r4], #4
203 orr ip, ip, lr, lsl #16
204 eors r0, r0, ip
205 moveq ip, lr, lsr #16
206 ldreq lr, [r1], #4
207 ldreq r0, [r4], #4
208 orreq ip, ip, lr, lsl #16
209 eoreqs r0, r0, ip
210 moveq ip, lr, lsr #16
211 ldreq lr, [r1], #4
212 ldreq r0, [r4], #4
213 orreq ip, ip, lr, lsl #16
214 eoreqs r0, r0, ip
215 moveq ip, lr, lsr #16
216 ldreq lr, [r1], #4
217 ldreq r0, [r4], #4
218 orreq ip, ip, lr, lsl #16
219 eoreqs r0, r0, ip
220 bne 7f
221 subs r2, r2, #16
222 bhs 6b
223 sub r1, r1, #2
224 /* are we done? */
225 adds r2, r2, #16
226 moveq r0, #0
227 beq 9b
228 /* finish off the remaining bytes */
229 b 8b
230
2317: /* fix up the 2 pointers and fallthrough... */
232 sub r1, r1, #(4+2)
233 sub r4, r4, #4
234 mov r2, #4
235 b 8b
236
237
2384: /*************** offset is 1 or 3 (less optimized) ***************/
239
240 stmfd sp!, {r5, r6, r7}
241
242 // r5 = rhs
243 // r6 = lhs
244 // r7 = scratch
245
246 mov r5, r0, lsl #3 /* r5 = right shift */
247 rsb r6, r5, #32 /* r6 = left shift */
248
249 /* align the unaligned pointer */
250 bic r1, r1, #3
251 ldr r7, [r1], #4
252 sub r2, r2, #8
253
2546: mov ip, r7, lsr r5
255 ldr r7, [r1], #4
256 ldr r0, [r4], #4
257 orr ip, ip, r7, lsl r6
258 eors r0, r0, ip
259 moveq ip, r7, lsr r5
260 ldreq r7, [r1], #4
261 ldreq r0, [r4], #4
262 orreq ip, ip, r7, lsl r6
263 eoreqs r0, r0, ip
264 bne 7f
265 subs r2, r2, #8
266 bhs 6b
267
268 sub r1, r1, r6, lsr #3
269 ldmfd sp!, {r5, r6, r7}
270
271 /* are we done? */
272 adds r2, r2, #8
273 moveq r0, #0
274 beq 9b
275
276 /* finish off the remaining bytes */
277 b 8b
278
2797: /* fix up the 2 pointers and fallthrough... */
280 sub r1, r1, #4
281 sub r1, r1, r6, lsr #3
282 sub r4, r4, #4
283 mov r2, #4
284 ldmfd sp!, {r5, r6, r7}
285 b 8b