/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD would be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled so that they don't stall
 */
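
/*
 * For reference, the routine below implements the usual byte-wise
 * memcmp() semantics; a minimal (unoptimized) C sketch of the same
 * behaviour would be:
 *
 *   #include <stddef.h>
 *
 *   int memcmp(const void *a, const void *b, size_t n)
 *   {
 *       const unsigned char *p = a, *q = b;
 *       while (n--) {
 *           int d = (int)*p++ - (int)*q++;
 *           if (d != 0)
 *               return d;   // difference of the first mismatching bytes
 *       }
 *       return 0;
 *   }
 */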

memcmp:
        pld     [r0, #0]
        pld     [r1, #0]

        /* take care of the case where the length is 0 or the buffers are the same */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        /* save registers */
        stmfd   sp!, {r4, lr}

        pld     [r0, #32]
        pld     [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov     r4, r0

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
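        /* e.g. if r4 = 0x1003, offset = (-0x1003) & 3 = 1, so a single
         * leading byte is compared below before entering the word loop
         * (the address value is only illustrative)
         */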
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
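        /* the loop below is software-pipelined: the second buffer's word is
         * always loaded one step ahead (into ip before the loop, then
         * alternating between lr and ip), so no ldr result is consumed by
         * the instruction that immediately follows the load
         */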

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f

0:      pld     [r4, #64]
        pld     [r1, #64]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
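        /* (r2 was biased by -(32+4) before the loop and by -32 per pass,
         * so adding 32 back here leaves r2 = bytes left - 4: bmi means
         * fewer than 4 bytes remain)
         */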
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4
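        /* redoing the mismatching word a byte at a time keeps the return
         * value equal to the difference of the first mismatching bytes
         */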

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr


5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

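        /* each word of the second buffer is rebuilt from two aligned loads:
         * the high half of the previous word OR'd with the low half of the
         * next one shifted up; e.g. (little-endian) if r1 was 0x1002, the
         * bytes at 0x1002-0x1005 come from
         * (word@0x1000 >> 16) | (word@0x1004 << 16)
         */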
6:      pld     [r1, #64]
        pld     [r4, #64]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
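        /* r1 has run 2 bytes ahead of the first byte not yet compared
         * (because of the aligned read-ahead); step it back before the
         * byte loop
         */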
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
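        /* the mismatching 4-byte group starts 4 bytes behind r4 and
         * 4+2 bytes behind the (aligned, read-ahead) second pointer
         */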
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount = offset * 8
        // r6 = left-shift amount  = 32 - r5
        // r7 = scratch (read-ahead word from the unaligned pointer)

        mov     r5, r0, lsl #3          /* r5 = right shift */
        rsb     r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8
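
        /* as in the offset-2 case, each word of the second buffer is
         * rebuilt from two aligned loads: ip = (prev >> r5) | (next << r6);
         * e.g. offset 1 gives r5 = 8 and r6 = 24
         */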

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

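        /* r6, lsr #3 is 4 - offset: step the second pointer back over the
         * bytes it has read ahead of the data actually compared
         */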
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
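        /* as above: back 4 bytes for the read-ahead word, plus r6, lsr #3
         * (i.e. 4 - offset) for the skew between the two pointers
         */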
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b