/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>

        /* code goes into the text section */
        .text

        /* export __memcmp16 as a function symbol; with GNU as on ARM,
         * ".align 4" requests 2^4 = 16-byte alignment of what follows
         */
        .global __memcmp16
        .type __memcmp16, %function
        .align 4

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 *
 * In:
 *   r0 = pointer to the first buffer
 *   r1 = pointer to the second buffer
 *   r2 = number of 16-bit half-words to compare
 * Out:
 *   r0 = 0 if the buffers are equal, otherwise the difference
 *        (first - second) of the first pair of half-words that differ,
 *        each loaded zero-extended with ldrh
 *
 * Scratch: r1, r2, r3, ip and the flags; lr is also used as scratch in
 * the word loops, after being saved on the stack. r4 is pushed and popped
 * together with lr but never otherwise referenced (presumably only there
 * to keep sp 8-byte aligned -- confirm).
 *
 * Assumptions visible in the code:
 *  - only bit 1 of the pointers is ever tested, so both pointers are
 *    expected to be half-word aligned;
 *  - the lsr #16 / lsl #16 merging in the non-congruent loop and the
 *    "low address first" re-check at label 2 match little-endian data.
 *
 * NOTE(review): presumably called from C as
 *   int __memcmp16(const unsigned short*, const unsigned short*, size_t)
 * -- confirm against the declaring header.
 */

__memcmp16:
        PLD         (r0, #0)
        PLD         (r1, #0)

        /* take care of the case where the length is zero or the buffers
         * are the same: return 0 immediately
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words; this simplifies things
         * below and avoids some overhead for small blocks
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words): plain half-word loop.
         * r0 still equals r3 here, so it can be used for the prefetch.
         */
        PLD         (r0, #32)
        PLD         (r1, #32)

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip          /* r0 = difference, also the result */
        bxne        lr                  /* mismatch: return it */
        subs        r2, r2, #1
        bne         1b
        bx          lr                  /* all equal: r0 is 0 from the subs */


        /* save registers (lr is used as a scratch register below) */
0:      stmfd       sp!, {r4, lr}

        /* align first pointer to word boundary */
        tst         r3, #2
        beq         0f

        /* compare one half-word so that r3 becomes word aligned */
        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* on mismatch: restore registers and return */
        ldmnefd     sp!, {r4, lr}
        bxne        lr



0:      /* here the first pointer is word aligned, and we have at least
         * 11 half-words left to process.
         */

        /* see if the pointers are congruent (same alignment modulo 4) */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration.
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */

        ldr         ip, [r1]                /* read-ahead of the first word */
        subs        r2, r2, #(16 + 2)       /* r2 = remaining - 18 */
        bmi         1f

0:
        /* ip and lr alternate as the read-ahead register for the second
         * buffer; r1 is pre-incremented, so after each load it points at
         * the word that has just been read ahead.
         */
        PLD         (r3, #64)
        PLD         (r1, #64)
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f                      /* some word differed */
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * r2 was biased by -(16 + 2); adding 16 leaves r2 = remaining - 2
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done? undo the bias: r2 = remaining half-words */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return 0 */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last word compared (2 half-words) differs: redo it as
         * half-words, lower address first, to compute the return value.
         * Both r3 and r1 are 4 bytes past that word here.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words one at a time (r2 > 0 here) */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return (r0 already holds the result) */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* align the unaligned pointer: round r1 down to a word boundary and
         * load that word; its upper half is the first half-word to compare
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8              /* r2 = remaining - 8 */

6:
        /* 8 half-words per iteration: each word of the second buffer is
         * rebuilt in ip from the upper half of the previous aligned word
         * (lr, lsr #16) and the lower half of the next one (lr, lsl #16)
         */
        PLD         (r3, #64)
        PLD         (r1, #64)
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 runs 2 bytes ahead of the real position in the second
         * buffer; put it back before switching to half-word accesses
         */
        sub         r1, r1, #2
        /* are we done? undo the bias: r2 = remaining half-words */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* mismatch: fix up r1 (2 bytes ahead, as above) so that both
         * pointers are 4 bytes past the differing word, then share the
         * half-word re-check at label 2
         */
        sub         r1, r1, #2
        b           2b