/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/* Select the NEON implementation when the compiler targets NEON,
 * otherwise fall back to the integer-register implementation below. */
#if defined(__ARM_NEON__)
/*
 * memcpy -- NEON implementation.
 *
 * In:    r0 = destination, r1 = source, r2 = number of bytes to copy
 * Out:   r0 = the destination pointer as passed in (it is pushed on entry
 *             and reloaded on exit because r0 is advanced while copying)
 * Uses:  r3, ip and lr as scratch (lr is pushed/popped with r0),
 *        d0-d7 as the copy buffer, condition flags
 * Stack: 8 bytes ({r0, lr})
 *
 * Strategy: copy 0..15 bytes so the destination becomes 16-byte aligned,
 * then move 64 bytes per iteration with prefetching, then 32, 16 and
 * finally the last 0..15 bytes.
 */

        .text
        .fpu    neon

        .global memcpy
        .type   memcpy, %function
        .align  4

/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*4)

memcpy:
        .fnstart
        .save       {r0, lr}
        stmfd       sp!, {r0, lr}

        /* start preloading as early as possible */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* do we have at least 16-bytes to copy (needed for alignment below) */
        cmp         r2, #16
        blo         5f

        /* align destination to half cache-line for the write-buffer:
         * r3 = (-dst) & 15 = bytes needed to reach a 16-byte boundary */
        rsb         r3, r0, #0
        ands        r3, r3, #0xF
        beq         0f

        /* copy up to 15-bytes (count in r3).
         * "movs ip, r3, lsl #31" puts bit 0 of r3 in N and bit 1 in C,
         * so MI copies one byte and CS copies two bytes. */
        sub         r2, r2, r3
        movs        ip, r3, lsl #31
        ldrmib      lr, [r1], #1
        strmib      lr, [r0], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1
        /* now bit 2 of r3 goes to N (4 bytes) and bit 3 to C (8 bytes).
         * NOTE(review): bge tests N == V and movs with a shifted operand
         * leaves V untouched, so this relies on V being clear from the
         * earlier cmp -- confirm for counts >= 2GB. */
        movs        ip, r3, lsl #29
        bge         1f
        // copies 4 bytes, destination 32-bits aligned
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc         2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
2:

0:      /* preload immediately the next cache line, which we may need */
        pld         [r1, #(CACHE_LINE_SIZE*0)]
        pld         [r1, #(CACHE_LINE_SIZE*1)]

        /* make sure we have at least 64 bytes to copy.
         * From here r2 is kept biased by -64 while the main loop runs. */
        subs        r2, r2, #64
        blo         2f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld         [r1, #(CACHE_LINE_SIZE*2)]
        pld         [r1, #(CACHE_LINE_SIZE*3)]
        pld         [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8      {d0  - d3},   [r1]!
        vld1.8      {d4  - d7},   [r1]!
        pld         [r1, #(PREFETCH_DISTANCE)]
        subs        r2, r2, #64             /* flags consumed by bhs below */
        vst1.8      {d0  - d3},   [r0, :128]!
        vst1.8      {d4  - d7},   [r0, :128]!
        bhs         1b

2:      /* fix-up the remaining count and make sure we have >= 32 bytes left */
        add         r2, r2, #64
        subs        r2, r2, #32
        blo         4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8      {d0 - d3},  [r1]!
        subs        r2, r2, #32
        vst1.8      {d0 - d3},  [r0, :128]!
        bhs         3b

4:      /* less than 32 left: undo the -32 bias, then test bit 4 (16 bytes) */
        add         r2, r2, #32
        tst         r2, #0x10
        beq         5f
        // copies 16 bytes, 128-bits aligned
        vld1.8      {d0, d1}, [r1]!
        vst1.8      {d0, d1}, [r0, :128]!

5:      /* copy up to 15-bytes (count in r2).
         * This label is also reached directly for counts below 16, where the
         * destination alignment is unknown, hence no alignment qualifiers.
         * lsl #29: bit 3 of r2 -> C (8 bytes), bit 2 -> N (4 bytes). */
        movs        ip, r2, lsl #29
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      bge         2f
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!
        /* lsl #31: bit 0 of r2 -> N (1 byte), bit 1 -> C (2 bytes) */
2:      movs        ip, r2, lsl #31
        ldrmib      r3, [r1], #1
        ldrcsb      ip, [r1], #1
        ldrcsb      lr, [r1], #1
        strmib      r3, [r0], #1
        strcsb      ip, [r0], #1
        strcsb      lr, [r0], #1

        /* reload the original destination as the return value and return */
        ldmfd       sp!, {r0, lr}
        bx          lr
END(memcpy)

#else   /* !defined(__ARM_NEON__) */


        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         *
         * In:    r0 = destination, r1 = source, r2 = number of bytes
         * Out:   r0 = the destination pointer as passed in
         * Uses:  r3, r12 as scratch; r4 and lr are saved on entry;
         *        r5-r11 are spilled to a reserved 28-byte stack area only
         *        on the paths that actually use them.
         * Stack: 12 bytes ({r0, r4, lr}) + 28 bytes (r5-r11) = 40 bytes
         */

ENTRY(memcpy)
        /* The stack must always be 64-bits aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4
         * which we can use for better pipelining of the reads below
         */
        .save       {r0, r4, lr}
        stmfd       sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad        #28
        sub         sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        PLD         (r0, #0)
        PLD         (r1, #0)
        PLD         (r1, #32)

        /* it simplifies things to take care of len<4 early */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4-(src&3))&3 = -src & 3
         */
        rsb         r3, r1, #0
        ands        r3, r3, #3
        beq         src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         * lsl #31: bit 0 of r3 -> N (1 byte), bit 1 -> C (2 bytes).
         */
        movs        r12, r3, lsl #31
        sub         r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib      r3, [r1], #1
        ldrcsb      r4, [r1], #1
        ldrcsb      r12,[r1], #1
        strmib      r3, [r0], #1
        strcsb      r4, [r0], #1
        strcsb      r12,[r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor         r12, r0, r1
        tst         r12, #3
        bne         non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* align the destination to a cache-line:
         * r3 = (-dst) & 0x1C = bytes (a multiple of 4) up to the next
         * 32-byte boundary, clamped to whole words of r2 if r2 is smaller */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         congruent_aligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3).
         * lsl #28: bit 4 of r3 -> C (16 bytes), bit 3 -> N (8 bytes). */
        movs        r12, r3, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        tst         r3, #0x4
        ldrne       r10,[r1], #4            /*  4 bytes */
        strne       r10,[r0], #4
        sub         r2, r2, r3

congruent_aligned32:
        /*
         * here the destination is aligned to 32 bytes (or fewer bytes
         * than needed to reach that boundary were left to copy).
         */

cached_aligned32:
        subs        r2, r2, #32
        blo         less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic         r12, r1, #0x1F
        add         r12, r12, #64

1:      ldmia       r1!, { r4-r11 }
        PLD         (r12, #64)
        subs        r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for ARM9 preload will not be safely guarded by the preceding subs.
        // When it is safely guarded the only possibility to have SIGSEGV here
        // is because the caller overstates the length.
        ldrhi       r3, [r12], #32      /* cheap ARM9 preload */
        stmia       r0!, { r4-r11 }
        bhs         1b

        add         r2, r2, #32




less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2).
         * When reached through the blo above, r2 is still biased by -32;
         * only its low 5 bits are examined below, so that is harmless.
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst         r2, #0x1F
        beq         1f

        /* conditionally copies 0 to 31 bytes.
         * lsl #28: bit 4 of r2 -> C (16 bytes), bit 3 -> N (8 bytes). */
        movs        r12, r2, lsl #28
        ldmcsia     r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia     r1!, {r8, r9}           /*  8 bytes */
        stmcsia     r0!, {r4, r5, r6, r7}
        stmmiia     r0!, {r8, r9}
        /* lsl #30: bit 2 of r2 -> C (4 bytes), bit 1 -> N (2 bytes). */
        movs        r12, r2, lsl #30
        ldrcs       r3, [r1], #4            /*  4 bytes */
        ldrmih      r4, [r1], #2            /*  2 bytes */
        strcs       r3, [r0], #4
        strmih      r4, [r0], #2
        tst         r2, #0x1
        ldrneb      r3, [r1]                /*  last byte  */
        strneb      r3, [r0]

        /* we're done! restore everything and return.
         * The first pop reloads r5-r11 and releases the 28-byte area. */
1:      ldmfd       sp!, {r5-r11}
        ldmfd       sp!, {r0, r4, lr}
        bx          lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp         r2, #4
        blo         copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to reserved stack
         * frame. Don't update sp.
         */
        stmea       sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb         r5, r0, #0
        and         r5, r5, #3          /* r5 = # bytes in partial words */
        mov         r12, r5, lsl #3     /* r12 = right */
        rsb         lr, r12, #32        /* lr = left  */

        /* read the first word */
        ldr         r3, [r1], #4
        sub         r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = nb of bytes to copy for alignment).
         * lsl #31: bit 0 of r5 -> N (1 byte), bit 1 -> C (2 bytes);
         * r3 is shifted right after each stored byte so it keeps only
         * the bytes not yet written.
         */
        movs        r5, r5, lsl #31
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8

        cmp         r2, #4
        blo         partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst         r0, #0x1c
        beq         2f
        ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b
        blo         partial_word_tail

        /* copy 32 bytes at a time */
2:      subs        r2, r2, #32
        blo         less_than_thirtytwo

        /* Use immediate mode for the shifts, because there is an extra cycle
         * for register shifts, which could account for up to 50% of
         * performance hit.
         * The loops are named after the right-shift amount held in r12;
         * each one overwrites r12, which is recomputed from lr afterwards.
         */

        cmp         r12, #24
        beq         loop24
        cmp         r12, #8
        beq         loop8

loop16:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4       /* first word of the next round */
        orr         r3, r3, r4,     lsl #16
        mov         r4, r4,         lsr #16
        orr         r4, r4, r5,     lsl #16
        mov         r5, r5,         lsr #16
        orr         r5, r5, r6,     lsl #16
        mov         r6, r6,         lsr #16
        orr         r6, r6, r7,     lsl #16
        mov         r7, r7,         lsr #16
        orr         r7, r7, r8,     lsl #16
        mov         r8, r8,         lsr #16
        orr         r8, r8, r9,     lsl #16
        mov         r9, r9,         lsr #16
        orr         r9, r9, r10,    lsl #16
        mov         r10, r10,       lsr #16
        orr         r10, r10, r11,  lsl #16
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #16
        bhs         1b
        b           less_than_thirtytwo

loop8:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4       /* first word of the next round */
        orr         r3, r3, r4,     lsl #24
        mov         r4, r4,         lsr #8
        orr         r4, r4, r5,     lsl #24
        mov         r5, r5,         lsr #8
        orr         r5, r5, r6,     lsl #24
        mov         r6, r6,         lsr #8
        orr         r6, r6, r7,     lsl #24
        mov         r7, r7,         lsr #8
        orr         r7, r7, r8,     lsl #24
        mov         r8, r8,         lsr #8
        orr         r8, r8, r9,     lsl #24
        mov         r9, r9,         lsr #8
        orr         r9, r9, r10,    lsl #24
        mov         r10, r10,       lsr #8
        orr         r10, r10, r11,  lsl #24
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #8
        bhs         1b
        b           less_than_thirtytwo

loop24:
        ldr         r12, [r1], #4
1:      mov         r4, r12
        ldmia       r1!, {   r5,r6,r7,  r8,r9,r10,r11}
        PLD         (r1, #64)
        subs        r2, r2, #32
        ldrhs       r12, [r1], #4       /* first word of the next round */
        orr         r3, r3, r4,     lsl #8
        mov         r4, r4,         lsr #24
        orr         r4, r4, r5,     lsl #8
        mov         r5, r5,         lsr #24
        orr         r5, r5, r6,     lsl #8
        mov         r6, r6,         lsr #24
        orr         r6, r6, r7,     lsl #8
        mov         r7, r7,         lsr #24
        orr         r7, r7, r8,     lsl #8
        mov         r8, r8,         lsr #24
        orr         r8, r8, r9,     lsl #8
        mov         r9, r9,         lsr #24
        orr         r9, r9, r10,    lsl #8
        mov         r10, r10,       lsr #24
        orr         r10, r10, r11,  lsl #8
        stmia       r0!, {r3,r4,r5,r6, r7,r8,r9,r10}
        mov         r3, r11,        lsr #24
        bhs         1b


less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb         r12, lr, #32        /* we corrupted r12, recompute it  */
        add         r2, r2, #32
        cmp         r2, #4
        blo         partial_word_tail

1:      ldr         r5, [r1], #4
        sub         r2, r2, #4
        orr         r4, r3, r5,     lsl lr
        mov         r3, r5,         lsr r12
        str         r4, [r0], #4
        cmp         r2, #4
        bhs         1b

partial_word_tail:
        /* we have a partial word in the input buffer.
         * lr (the left shift, a multiple of 8) is shifted so that
         * bit 3 -> N (1 byte) and bit 4 -> C (2 bytes). */
        movs        r5, lr, lsl #(31-3)
        strmib      r3, [r0], #1
        movmi       r3, r3, lsr #8
        strcsb      r3, [r0], #1
        movcs       r3, r3, lsr #8
        strcsb      r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd       sp, {r5-r11}

copy_last_3_and_return:
        /* lsl #31: bit 0 of r2 -> N (1 byte), bit 1 -> C (2 bytes) */
        movs        r2, r2, lsl #31     /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib      r2, [r1], #1
        ldrcsb      r3, [r1], #1
        ldrcsb      r12,[r1]
        strmib      r2, [r0], #1
        strcsb      r3, [r0], #1
        strcsb      r12,[r0]

        /* we're done! restore sp and spilled registers and return */
        add         sp,  sp, #28
        ldmfd       sp!, {r0, r4, lr}
        bx          lr
END(memcpy)


#endif  /* __ARM_NEON__ */