/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */
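/* Note: with GCC-style toolchains, building with -mfpu=neon (and a non-soft
   float ABI) defines __ARM_NEON__ and selects the NEON path below; a VFP
   build without NEON takes the VFP path; -mfloat-abi=soft defines __SOFTFP__
   and falls back to the integer-register-only path.  The exact flags depend
   on the toolchain in use. */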

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif
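/* FRAME_SIZE is just scratch stack space: every path spills tmp2 at [sp, #0],
   and the non-NEON paths additionally spill the B/C/D register pairs at
   [sp, #8]..[sp, #31] in the bulk-copy loops below, hence the larger frame. */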

/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
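/* For example, "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" expands to
   "vst1.8 {d0-d3}, [dst:64]!" (or "[dst,:64]!" for old GAS), telling the
   assembler that dst is at least 64-bit aligned at that point. */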

#define PC_OFFSET       8       /* PC pipeline compensation. */
#define INSN_SIZE       4

/* Call parameters. */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals. */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l     r2      /* Call-clobbered. */
#define A_h     r3      /* Call-clobbered. */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate. */

#define prefetch_lines  5
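/* With prefetch_lines = 5 the VFP bulk-copy loop below keeps one doubleword
   from each of the next five 64-byte lines live in d3-d7, so loads run about
   320 bytes ahead of the stores. */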

#ifdef USE_VFP
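/* cpy_line_vfp copies one 64-byte line, interleaving stores of data already
   held in d0-d2/\vreg with loads of the next line, and refills \vreg from
   (prefetch_lines * 64 - 32) bytes ahead so that it acts as a software
   prefetch.  cpy_tail_vfp is identical except that it does not reload \vreg,
   and is used to drain the in-flight lines. */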
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it. */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

.Ltail63unaligned:
#ifdef USE_NEON
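        /* Computed jump: each vld1/vst1 pair below is two 4-byte instructions
           that copy 8 bytes.  The ADD to PC jumps (56 - (count & 0x38)) bytes
           into the 56-byte sequence, skipping the pairs that are not needed;
           the RSB constant folds in the 8-byte PC read-ahead (PC_OFFSET) and
           the width of the ADD itself (INSN_SIZE). */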
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go. */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned. */
        /* Cannot use VFP for unaligned data. */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset. */
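        /* Here each word of data needs two instructions (LDR + STR, 8 bytes
           of code), so the computed byte count is doubled with LSL #1 to turn
           it into a code offset; PC_OFFSET and INSN_SIZE are pre-halved in
           the RSB above to compensate. */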
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go. */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go. */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go. */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go. */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go. */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go. */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go. */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go. */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

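        /* LSLS #31 moves bit 1 of count into the carry flag and bit 0 into
           bit 31 of the result, so the CS-conditional halfword copy and the
           NE-conditional byte copy below mop up the last 0-3 bytes. */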
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch. */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop. */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 32-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment. */
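        /* LSLS #29 moves the low three bits of DST into the top of tmp2, so
           Z is set if DST is already 64-bit aligned.  After negation (RSBS)
           the byte count needed to reach alignment sits in the top bits: the
           MI test copies 4 bytes, and after a further LSLS #2 the CS and NE
           tests copy 2 and 1 bytes respectively. */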
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2. */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2. */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go. */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go. */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go. */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go. */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go. */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go. */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go. */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2. */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy. */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go. */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go. */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go. */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go. */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go. */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go. */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go. */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:                        /* Count in tmp2. */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer. */
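        /* d3-d7 are primed below with one doubleword from each of the next
           five 64-byte lines (offsets 0 to 256); cpy_line_vfp then keeps
           rotating them forward so the loads stay prefetch_lines lines ahead
           of the stores. */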

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations. */
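        /* "SMS style" here means a modulo-scheduled (software-pipelined)
           loop: each pass stores the 32 bytes loaded on the previous pass
           while loading the next 32, so the A/B/C/D pairs are always in
           flight and loads overlap stores. */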
        /* Pre-bias src and dst. */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth. */
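        /* src may be at any byte alignment here, so the loads use plain
           (unaligned-capable) LDR pairs rather than LDRD; dst has already
           been brought to 64-bit alignment, so the stores can still use
           STRD. */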
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)