/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using NEON)
    ARM state
    Unaligned accesses are permitted
    LDRD/STRD support unaligned word accesses

 */
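
/* C prototype, for reference:

     void *memcpy(void *dst, const void *src, size_t count);

   The return value is the original destination pointer, which is why
   dstin (r0) is preserved throughout and the working destination lives
   in ip (dst).  */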

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
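
/* For example, "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" expands to
   "vst1.8 {d0-d3}, [dst:64]!" (or "[dst,:64]!" under the broken GAS
   syntax), encoding a 64-bit alignment hint on the access.  */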

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4
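
/* The computed branches below exploit ARM-state PC semantics: reading PC
   yields the address of the current instruction plus 8 (PC_OFFSET), so
   "add pc, pc, tmp1" resumes execution at that address plus tmp1.  With
   tmp1 = (56 - PC_OFFSET + INSN_SIZE) - (count & 0x38), the jump lands
   INSN_SIZE + 56 - (count & 0x38) bytes past the add, skipping exactly
   the load/store pairs that are not needed: e.g. count & 0x38 == 16
   enters at the last two 8-byte copy pairs, while count & 0x38 == 0
   falls through past the whole ladder.  */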

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif
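
/* r4-r9 (B_l..D_h) are callee-saved: the non-NEON bulk-copy loops spill
   them into the stack frame at [sp, #8..#24] and restore them before
   returning.  tmp2 (r10) is saved in the first frame slot at
   .Lcpy_not_short, which is presumably why FRAME_SIZE shrinks to 4 when
   NEON is used and the GP bulk registers are not needed.  */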

/* Number of 64-byte lines ahead of the copy position at which data is
   pre-fetched.  If you change this, the code below will need adjustment
   to compensate.  */

#define prefetch_lines  5
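
/* prefetch_lines is baked into the code in three places: the d3-d7
   preloads at .Lcpy_body_long cover offsets 0..(prefetch_lines - 1) * 64,
   cpy_line_vfp reloads its line register from prefetch_lines * 64 - 32
   bytes ahead, and the loop count is biased by prefetch_lines * 64 * 2
   so that at least prefetch_lines * 64 bytes remain for the drain
   sequence after the main loop exits.  */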

#ifdef USE_VFP
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
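
/* cpy_line_vfp copies one 64-byte line: each vstr writes out data loaded
   on the previous pass while the paired vldr fetches its replacement, so
   loads and stores stay interleaved.  The \vreg reload in the middle
   comes from prefetch_lines * 64 - 32 bytes ahead of the line base,
   which is what pulls future source data into the cache in place of a
   PLD.  */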

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
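
/* cpy_tail_vfp is identical except that \vreg is not reloaded from far
   ahead: it is used on the final passes, when reading
   prefetch_lines * 64 - 32 bytes beyond the line could run past the end
   of the source buffer.  */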
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1
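        /* Same computed-branch idea as the NEON path above, but each
           word here takes two instructions (LDR + STR = 8 bytes), so
           the offset is kept halved and rescaled by the LSL #1.  src
           and dst were pre-advanced by the copy length, so every entry
           in the ladder uses a fixed negative offset: e.g.
           count & 0x3c == 60 enters at the first LDR below, while
           count & 0x3c == 4 enters at the final pair.  */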

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment (their low
           three address bits match), but we may still need to pre-copy
           some bytes to get to natural alignment.  We bring DST into
           full 64-bit alignment.  */
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1
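        /* Worked example of the flag games above: if dst % 8 == 5, three
           bytes are needed to align.  RSBS turns dst << 29 (0xa0000000)
           into 3 << 29 with N clear, so the MI word copy is skipped;
           tmp2 LSR #29 = 3 adjusts count; LSLS #2 then shifts out C = 1
           (one halfword copied) and leaves a non-zero result (NE, one
           byte copied): 2 + 1 = 3 bytes in total.  */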

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31 /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
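        /* d3-d7 hold the first 8 bytes of each of the next five
           (prefetch_lines) 64-byte lines; d0-d2 hold the rest of the
           current line.  The loop below stores each of them just before
           refilling it.  */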

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use a software-pipelined (SMS-style) loop to
           maximize the I/O bandwidth of the core.  We don't have enough
           spare registers to synthesise prefetching, so use PLD
           operations.  */
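        /* Pipeline structure: A-D are pre-loaded below, then each
           STRD/LDRD pair in the loop writes out the previously loaded
           double-word while fetching its replacement, 32 bytes per
           half-iteration (labels 1 and 2), 64 bytes per pass.  Stores
           therefore always trail the corresponding loads by one stage,
           hiding load latency.  */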
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
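        /* Note that only the stores carry an alignment hint (the :64
           from ALIGN): DST was brought to 64-bit alignment above, but
           SRC has no mutual alignment, so the loads must stay
           unhinted.  */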
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use a software-pipelined (SMS-style) loop to maximize the I/O
           bandwidth.  */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)