/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using NEON)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
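
/* ALIGN(dst, 64) expands to the address-alignment qualifier "dst:64"
   (",:64" on the broken assemblers): a promise to VLD1/VST1 that the
   address is 64-bit aligned, permitting a faster access and faulting
   if the promise is broken.  */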

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4
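
/* In ARM state, reading PC yields the address of the current instruction
   plus 8 (PC_OFFSET).  The computed branches below have the form
   "add pc, pc, offset" with their jump table starting one instruction
   (INSN_SIZE) after the add, so table offsets are biased by
   INSN_SIZE - PC_OFFSET.  */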

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5

#ifdef USE_VFP
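        /* Each macro copies one 64-byte line through \vreg and d0-d2 with
           interleaved vstr/vldr pairs.  cpy_line_vfp additionally reloads
           \vreg from prefetch_lines * 64 - 32 bytes ahead of the current
           position, acting as a software prefetch; cpy_tail_vfp omits that
           look-ahead load for the final lines, when there is nothing
           further to fetch.  */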
        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1
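        /* src and dst were pre-advanced past the words to copy, so the
           entries below use negative offsets.  Each word needs an
           8-byte ldr/str pair of code, hence the lsl #1; PC_OFFSET and
           INSN_SIZE appear halved because the shift doubles them along
           with the word count.  */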

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

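        /* Shifting count left by 31 moves bit 1 into the carry flag and
           bit 0 into the sign flag, so the conditional halfword and byte
           copies below mop up the last 0-3 bytes without any branches.  */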
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment.  */
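        /* dst << 29 moves the low three address bits to the top of the
           word; Z set means dst is already 64-bit aligned.  After the
           rsbs negation the flags describe the byte deficit to
           alignment: N carries bit 2 (copy one word), and two further
           shifts put bits 1 and 0 into C and N (halfword, then byte).  */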
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56]         /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48]         /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40]         /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32]         /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24]         /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16]         /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]          /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
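        /* d3-d7 hold the leading doubleword of each of the five lines
           currently in flight; d0-d2 carry the rest of the line being
           written out.  */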

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
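        /* The loop is software-pipelined ("SMS" presumably being
           software modulo scheduling): each iteration stores the
           doublewords loaded by the previous one, keeping loads and
           stores to different cache lines in flight together.  B, C
           and D live in callee-saved registers (r4-r9), so they are
           spilled into the 32-byte frame at [sp, #8]..[sp, #24]
           instead of being pushed and popped.  */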
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
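        /* Same flag trick as in the mutually-aligned path above:
           dst << 29 exposes the low three address bits, and the negated
           result drives the conditional word/halfword/byte pre-copies.  */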
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
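        /* VLD1.8 makes no alignment assumptions, so it can read the
           misaligned src directly; the stores use the 64-bit alignment
           hint (via ALIGN) since dst is now doubleword aligned.  */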
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
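        /* As in the aligned long copy, but src may sit at any byte
           offset, so the loads are plain (unaligned-capable) LDRs
           while the stores can still be STRDs to the doubleword-aligned
           dst.  src is pre-biased by 4 and dst by 8 to keep the load
           and store offsets in step.  */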
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)