/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_
#define ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_

#include "asm_support_arm.S"

/*
 * Optimized memcmp16() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of pld will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 16 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 */

/*
 * __memcmp16: compare two buffers made of 16-bit units ("half-words").
 *
 * Syntax: GNU as, ARM (A32) state, pre-UAL conditional mnemonics.
 *
 * In:     r0 = first buffer
 *         r1 = second buffer
 *         r2 = number of half-words to compare (decremented by one per
 *              half-word in the scalar loops below)
 * Out:    r0 = 0 if the pointers are equal, the count is zero, or every
 *              half-word matches; otherwise (half-word from the first
 *              buffer) - (half-word from the second buffer) for the first
 *              mismatching pair, both zero-extended by ldrh.
 * Clobb:  r1, r2, r3, ip, flags.  r4 and lr are pushed on the large-block
 *         path and restored before every return from that path.
 * Stack:  8 bytes, large-block path only.
 *
 * NOTE(review): presumably declared in C as something like
 *   int32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count)
 * -- confirm against the C-side declaration.
 * NOTE(review): the code only ever tests bit 1 of the pointers, so it
 * assumes both are at least 2-byte aligned; the shift/merge in the
 * non-congruent loop assumes little-endian memory -- confirm for the target.
 * NOTE(review): r4 is saved/restored but never written here; presumably it
 * is pushed only to keep the stack adjustment at 8 bytes -- confirm.
 */

ARM_ENTRY __memcmp16
        pld         [r0, #0]
        pld         [r1, #0]

        /* take care of the case where the length is zero or the buffers
         * are the same: if r0 == r1 the second compare is skipped and Z
         * stays set; otherwise Z reflects r2 == 0.
         */
        cmp         r0, r1
        cmpne       r2, #0
        moveq       r0, #0
        bxeq        lr

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov         r3, r0

        /* make sure we have at least 12 half-words, this simplifies things
         * below and avoids some overhead for small blocks
         * (bpl is a signed test on r2 - 12)
         */

        cmp         r2, #12
        bpl         0f

        /* small blocks (fewer than 12 half-words): one half-word per
         * iteration.  On exit r0 is either the non-zero difference or the
         * zero left by the last subs.
         */
        pld         [r0, #32]
        pld         [r1, #32]

1:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bxne        lr
        subs        r2, r2, #1
        bne         1b
        bx          lr


        /* save registers */
0:      stmfd       sp!, {r4, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r4, 0
        .cfi_rel_offset lr, 4

        /* align first pointer to word boundary: if bit 1 of r3 is set,
         * compare a single half-word so that r3 becomes 4-byte aligned
         */
        tst         r3, #2
        beq         0f

        ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        sub         r2, r2, #1
        subs        r0, r0, ip
        /* restore registers and return (only if that half-word differed) */
        ldmnefd     sp!, {r4, lr}
        bxne        lr


0:      /* here the first pointer is word aligned, and we have at least
         * 11 half-words left to process (>= 12 on entry to this path,
         * minus at most one consumed by the alignment step above).
         */

        /* see if the pointers are congruent (same value in bit 1) */
        eor         r0, r3, r1
        ands        r0, r0, #2
        bne         5f

        /* congruent case, 16 half-words per iteration
         * We need to make sure there are at least 16+2 half-words left
         * because we effectively read ahead one long word, and we could
         * read past the buffer (and segfault) if we're not careful.
         *
         * Loop invariant: the word at [r1] has already been loaded into
         * ip on entry to each iteration.  ip and lr alternate as the
         * read-ahead register; r1 is pre-incremented so it always points
         * at the most recently loaded word, r3 is post-incremented.
         * After the first mismatch the remaining "eq" instructions are
         * skipped, leaving both r3 and r1 exactly 4 bytes past the
         * mismatching word.
         */

        ldr         ip, [r1]
        subs        r2, r2, #(16 + 2)
        bmi         1f

0:
        pld         [r3, #64]
        pld         [r1, #64]
        ldr         r0, [r3], #4
        ldr         lr, [r1, #4]!
        eors        r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        ldreq       r0, [r3], #4
        ldreq       lr, [r1, #4]!
        eoreqs      r0, r0, ip
        ldreq       r0, [r3], #4
        ldreq       ip, [r1, #4]!
        eoreqs      r0, r0, lr
        bne         2f
        subs        r2, r2, #16
        bhs         0b

        /* do we have at least 2 half-words left?
         * r2 is (remaining - 18) here, so after the add it is
         * (remaining - 2).
         */
1:      adds        r2, r2, #(16 - 2 + 2)
        bmi         4f

        /* finish off 2 half-words (one word) at a time; the word at [r1]
         * is simply reloaded, so both pointers post-increment here
         */
3:      ldr         r0, [r3], #4
        ldr         ip, [r1], #4
        eors        r0, r0, ip
        bne         2f
        subs        r2, r2, #2
        bhs         3b

        /* are we done?  r2 is -2 or -1 here, i.e. 0 or 1 half-word left */
4:      adds        r2, r2, #2
        bne         8f
        /* restore registers and return */
        mov         r0, #0
        ldmfd       sp!, {r4, lr}
        bx          lr

2:      /* the last 2 half-words (the word just compared) are different,
         * restart them: both pointers are 4 bytes past that word, so
         * re-compare its half-words in address order to produce the
         * signed difference of the first one that differs.
         */
        ldrh        r0, [r3, #-4]
        ldrh        ip, [r1, #-4]
        subs        r0, r0, ip
        ldreqh      r0, [r3, #-2]
        ldreqh      ip, [r1, #-2]
        subeqs      r0, r0, ip
        /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr

        /* process the last few half-words one at a time; r2 is the
         * non-zero number of half-words left.  If the loop runs out, r0
         * is the zero produced by the last subs.
         */
8:      ldrh        r0, [r3], #2
        ldrh        ip, [r1], #2
        subs        r0, r0, ip
        bne         9f
        subs        r2, r2, #1
        bne         8b

9:      /* restore registers and return */
        ldmfd       sp!, {r4, lr}
        bx          lr


5:      /*************** non-congruent case ***************/

        /* r3 is word aligned but r1 is only half-word aligned.
         * Align the unaligned pointer down and preload the word that
         * contains its first half-word (in the upper 16 bits of lr).
         * Each step then builds the word to compare from the upper half
         * of the previous r1 word and the lower half of the next one.
         * 8 half-words per iteration; at least 11 are left on entry, so
         * the first iteration needs no check.
         * NOTE(review): the aligned loads can touch up to 2 bytes outside
         * the second buffer (before its start / after its end), but always
         * within an aligned word that also holds in-range data.
         */
        bic         r1, r1, #3
        ldr         lr, [r1], #4
        sub         r2, r2, #8

6:
        pld         [r3, #64]
        pld         [r1, #64]
        mov         ip, lr, lsr #16
        ldr         lr, [r1], #4
        ldr         r0, [r3], #4
        orr         ip, ip, lr, lsl #16
        eors        r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        moveq       ip, lr, lsr #16
        ldreq       lr, [r1], #4
        ldreq       r0, [r3], #4
        orreq       ip, ip, lr, lsl #16
        eoreqs      r0, r0, ip
        bne         7f
        subs        r2, r2, #8
        bhs         6b
        /* r1 is one word past the last load; the next unconsumed
         * half-word is the upper half of that word, i.e. at r1 - 2
         */
        sub         r1, r1, #2
        /* are we done? */
        adds        r2, r2, #8
        moveq       r0, #0
        beq         9b
        /* finish off the remaining half-words */
        b           8b

7:      /* fix up the second pointer so that, like r3, it is 4 bytes past
         * the start of the mismatching pair, then reuse the common
         * mismatch handler
         */
        sub         r1, r1, #2
        b           2b
END __memcmp16


#endif  // ART_RUNTIME_ARCH_ARM_MEMCMP16_ARM_S_