/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

#if defined(__ARM_NEON__) && !defined(ARCH_ARM_USE_NON_NEON_MEMCPY)

        .text
        .fpu    neon

#ifdef HAVE_32_BYTE_CACHE_LINE
/* a prefetch distance of 2 cache-lines */
#define CACHE_LINE_SIZE     32
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE * 2)
#else
/* a prefetch distance of 4 cache-lines works best experimentally */
#define CACHE_LINE_SIZE     64
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE * 4)
#endif

ENTRY(memcpy)
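        /* Note: .save (and .pad in the non-NEON path below) are ARM EH
         * unwind annotations; they describe what the prologue saves so an
         * unwinder can step through this frame. */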
        .save   {r0, lr}
        /* start preloading as early as possible */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        stmfd   sp!, {r0, lr}
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

/* If NEON supports unaligned access, the alignment code below is removed,
 * unless a size limit has been specified.
 */
#ifndef NEON_UNALIGNED_ACCESS
        /* do we have at least 16 bytes to copy (needed for alignment below) */
        cmp     r2, #16
        blo     5f

        /* check if buffers are aligned. If so, run the arm-only version */
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        beq     11f

        /* align destination to cache-line for the write-buffer */
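        /* (-r0 & 0xF) is the number of bytes to the next 16-byte boundary;
         * the rsb/ands pair below computes it (the same "-src & mask" idiom
         * is spelled out in the non-NEON path at the bottom of this file). */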
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     2f

        /* copy up to 15 bytes (count in r3) */
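        /* Flag trick used throughout this file: "movs rX, rN, lsl #31"
         * moves bit 0 of rN into N and bit 1 into C, so the mi/cs
         * conditional load/store pairs copy 1 and 2 bytes; "lsl #29"
         * likewise exposes bit 2 in N and bit 3 in C for the 4- and
         * 8-byte copies. */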
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
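        // The ":32"/":64"/":128" address qualifiers used below assert the
        // alignment just established, allowing faster aligned NEON accesses.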
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]

#ifdef HAVE_32_BYTE_CACHE_LINE
        /* make sure we have at least 32 bytes to copy */
        subs    r2, r2, #32
        blo     4f

        /* preload all the cache lines we need.
         * NOTE: the number of pld below depends on PREFETCH_DISTANCE,
         * ideally we would increase the distance in the main loop to
         * avoid the goofy code below. In practice this doesn't seem to make
         * a big difference.
         */
        pld     [r1, #(PREFETCH_DISTANCE)]

1:      /* The main loop copies 32 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        pld     [r1, #(PREFETCH_DISTANCE)]
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     1b
#else
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need. */
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0, :128]!
        vst1.8  {d4 - d7}, [r0, :128]!
        bhs     1b

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0, :128]!
        bhs     3b
#endif
4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes, 128-bits aligned
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0, :128]!
5:      /* copy up to 15 bytes (count in r2) */
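        /* Same flag trick as above, largest first: lsl #29 exposes bit 3
         * of the count (8-byte copy) in C and bit 2 (4-byte copy) in N;
         * lsl #31 then handles the final 2 and 1 bytes. */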
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr

#else /* NEON_UNALIGNED_ACCESS */

        // Check that the count is at least 16 bytes; the alignment code needs it.
        cmp     r2, #16
        blo     5f

#ifdef NEON_MEMCPY_ALIGNMENT_DIVIDER
        /* Check the upper size limit for Neon unaligned memory access in memcpy */
#if NEON_MEMCPY_ALIGNMENT_DIVIDER >= 16
        cmp     r2, #NEON_MEMCPY_ALIGNMENT_DIVIDER
        blo     3f
#endif
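        /* Copies below the divider threshold skip the alignment prologue
         * entirely and rely on unaligned NEON accesses; larger copies
         * align the destination first. */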
        /* check if buffers are aligned. If so, run the arm-only version */
        eor     r3, r0, r1
        ands    r3, r3, #0x3
        beq     11f

        /* align destination to 16 bytes for the write-buffer */
        rsb     r3, r0, #0
        ands    r3, r3, #0xF
        beq     3f

        /* copy up to 15 bytes (count in r3) */
        sub     r2, r2, r3
        movs    ip, r3, lsl #31
        ldrmib  lr, [r1], #1
        strmib  lr, [r0], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1
        movs    ip, r3, lsl #29
        bge     1f
        // copies 4 bytes, destination 32-bits aligned
        vld1.32 {d0[0]}, [r1]!
        vst1.32 {d0[0]}, [r0, :32]!
1:      bcc     2f
        // copies 8 bytes, destination 64-bits aligned
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0, :64]!
2:
        /* preload immediately the next cache line, which we may need */
        pld     [r1, #(CACHE_LINE_SIZE * 0)]
        pld     [r1, #(CACHE_LINE_SIZE * 1)]
3:
#endif
        /* make sure we have at least 64 bytes to copy */
        subs    r2, r2, #64
        blo     2f

        /* preload all the cache lines we need */
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]

1:      /* The main loop copies 64 bytes at a time */
        vld1.8  {d0 - d3}, [r1]!
        vld1.8  {d4 - d7}, [r1]!
#ifdef HAVE_32_BYTE_CACHE_LINE
        pld     [r1, #(CACHE_LINE_SIZE * 2)]
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#else
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
#endif
        subs    r2, r2, #64
        vst1.8  {d0 - d3}, [r0]!
        vst1.8  {d4 - d7}, [r0]!
        bhs     1b

2:      /* fix up the remaining count and make sure we have >= 32 bytes left */
        add     r2, r2, #64
        subs    r2, r2, #32
        blo     4f

3:      /* 32 bytes at a time. These cache lines were already preloaded */
        vld1.8  {d0 - d3}, [r1]!
        subs    r2, r2, #32
        vst1.8  {d0 - d3}, [r0]!
        bhs     3b

4:      /* less than 32 left */
        add     r2, r2, #32
        tst     r2, #0x10
        beq     5f
        // copies 16 bytes
        vld1.8  {d0, d1}, [r1]!
        vst1.8  {d0, d1}, [r0]!
5:      /* copy up to 15 bytes (count in r2) */
        movs    ip, r2, lsl #29
        bcc     1f
        vld1.8  {d0}, [r1]!
        vst1.8  {d0}, [r0]!
1:      bge     2f
        vld1.32 {d0[0]}, [r1]!
        vst1.32 {d0[0]}, [r0]!
2:      movs    ip, r2, lsl #31
        ldrmib  r3, [r1], #1
        ldrcsb  ip, [r1], #1
        ldrcsb  lr, [r1], #1
        strmib  r3, [r0], #1
        strcsb  ip, [r0], #1
        strcsb  lr, [r0], #1

        ldmfd   sp!, {r0, lr}
        bx      lr
#endif /* NEON_UNALIGNED_ACCESS */
11:
        /* Simple arm-only copy loop to handle aligned copy operations */
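        /* Taken when src and dst have the same word alignment: after the
         * byte fix-up below, plain ldm/stm moves 32 bytes per iteration
         * with no NEON alignment handling needed. */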
        stmfd   sp!, {r4, r5, r6, r7, r8}
        pld     [r1, #(CACHE_LINE_SIZE * 2)]

        /* Check alignment */
        rsb     r3, r1, #0
        ands    r3, #3
        beq     2f

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r5, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r5, [r0], #1
2:
        subs    r2, #32
        blt     5f
        pld     [r1, #(CACHE_LINE_SIZE * 3)]
3:      /* Main copy loop, copying 32 bytes at a time */
        pld     [r1, #(CACHE_LINE_SIZE * 4)]
        ldmia   r1!, {r3, r4, r5, r6, r7, r8, r12, lr}
        subs    r2, r2, #32
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r12, lr}
        bge     3b
5:      /* Handle any remaining bytes */
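        /* lsl #28 puts bit 4 of the count in C and bit 3 in N, selecting
         * the 16- and 8-byte block copies; lsl #30 exposes bit 2 in C and
         * bit 1 in N for the 4- and 2-byte copies; the final tst covers
         * the last byte. */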
        adds    r2, #32
        beq     6f

        movs    r12, r2, lsl #28
        ldmcsia r1!, {r3, r4, r5, r6}   /* 16 bytes */
        ldmmiia r1!, {r7, r8}           /*  8 bytes */
        stmcsia r0!, {r3, r4, r5, r6}
        stmmiia r0!, {r7, r8}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]
6:
        ldmfd   sp!, {r4, r5, r6, r7, r8}
        ldmfd   sp!, {r0, pc}
END(memcpy)


#else   /* __ARM_ARCH__ < 7 */


        /*
         * Optimized memcpy() for ARM.
         *
         * note that memcpy() always returns the destination pointer,
         * so we have to preserve R0.
         */

ENTRY(memcpy)
        /* The stack must always be 64-bit aligned to be compliant with the
         * ARM ABI. Since we have to save R0, we might as well save R4,
         * which we can use for better pipelining of the reads below.
         */
        .save   {r0, r4, lr}
        stmfd   sp!, {r0, r4, lr}
        /* Making room for r5-r11 which will be spilled later */
        .pad    #28
        sub     sp, sp, #28

        // preload the destination because we'll align it to a cache line
        // with small writes. Also start the source "pump".
        pld     [r0, #0]
        pld     [r1, #0]
        pld     [r1, #32]

        /* it simplifies things to take care of len < 4 early */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* compute the offset to align the source
         * offset = (4 - (src & 3)) & 3 = -src & 3
         */
        rsb     r3, r1, #0
        ands    r3, r3, #3
        beq     src_aligned

        /* align source to 32 bits. We need to insert 2 instructions between
         * a ldr[b|h] and str[b|h] because byte and half-word instructions
         * stall 2 cycles.
         */
        movs    r12, r3, lsl #31
        sub     r2, r2, r3              /* we know that r3 <= r2 because r2 >= 4 */
        ldrmib  r3, [r1], #1
        ldrcsb  r4, [r1], #1
        ldrcsb  r12, [r1], #1
        strmib  r3, [r0], #1
        strcsb  r4, [r0], #1
        strcsb  r12, [r0], #1

src_aligned:

        /* see if src and dst are aligned together (congruent) */
        eor     r12, r0, r1
        tst     r12, #3
        bne     non_congruent

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* align the destination to a cache-line */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     congruent_aligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C

        /* conditionally copies 0 to 7 words (length in r3) */
        movs    r12, r3, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        tst     r3, #0x4
        ldrne   r10, [r1], #4           /*  4 bytes */
        strne   r10, [r0], #4
        sub     r2, r2, r3

congruent_aligned32:
        /*
         * here destination is aligned to a cache line (32 bytes).
         */

cached_aligned32:
        subs    r2, r2, #32
        blo     less_than_32_left

        /*
         * We preload a cache-line up to 64 bytes ahead. On the 926, this will
         * stall only until the requested word is fetched, but the linefill
         * continues in the background.
         * While the linefill is going, we write our previous cache-line
         * into the write-buffer (which should have some free space).
         * When the linefill is done, the writebuffer will
         * start dumping its content into memory
         *
         * While all this is going, we then load a full cache line into
         * 8 registers, this cache line should be in the cache by now
         * (or partly in the cache).
         *
         * This code should work well regardless of the source/dest alignment.
         *
         */

        // Align the preload register to a cache-line because the cpu does
        // "critical word first" (the first word requested is loaded first).
        bic     r12, r1, #0x1F
        add     r12, r12, #64

1:      ldmia   r1!, {r4-r11}
        pld     [r12, #64]
        subs    r2, r2, #32

        // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
        // for the ARM9 preload will not be safely guarded by the preceding
        // subs. When it is safely guarded, the only way to get a SIGSEGV
        // here is for the caller to overstate the length.
        ldrhi   r3, [r12], #32          /* cheap ARM9 preload */
        stmia   r0!, {r4-r11}
        bhs     1b

        add     r2, r2, #32


less_than_32_left:
        /*
         * less than 32 bytes left at this point (length in r2)
         */

        /* skip all this if there is nothing to do, which should
         * be a common case (if not executed the code below takes
         * about 16 cycles)
         */
        tst     r2, #0x1F
        beq     1f

        /* conditionally copies 0 to 31 bytes */
        movs    r12, r2, lsl #28
        ldmcsia r1!, {r4, r5, r6, r7}   /* 16 bytes */
        ldmmiia r1!, {r8, r9}           /*  8 bytes */
        stmcsia r0!, {r4, r5, r6, r7}
        stmmiia r0!, {r8, r9}
        movs    r12, r2, lsl #30
        ldrcs   r3, [r1], #4            /*  4 bytes */
        ldrmih  r4, [r1], #2            /*  2 bytes */
        strcs   r3, [r0], #4
        strmih  r4, [r0], #2
        tst     r2, #0x1
        ldrneb  r3, [r1]                /* last byte */
        strneb  r3, [r0]

        /* we're done! restore everything and return */
1:      ldmfd   sp!, {r5-r11}
        ldmfd   sp!, {r0, r4, lr}
        bx      lr

        /********************************************************************/

non_congruent:
        /*
         * here source is aligned to 4 bytes
         * but destination is not.
         *
         * in the code below r2 is the number of bytes read
         * (the number of bytes written is always smaller, because we have
         * partial words in the shift queue)
         */
        cmp     r2, #4
        blo     copy_last_3_and_return

        /* Use post-increment mode for stm to spill r5-r11 to the reserved
         * stack frame. Don't update sp.
         */
        stmea   sp, {r5-r11}

        /* compute shifts needed to align src to dest */
        rsb     r5, r0, #0
        and     r5, r5, #3              /* r5 = # bytes in partial words */
        mov     r12, r5, lsl #3         /* r12 = right shift */
        rsb     lr, r12, #32            /* lr = left shift */
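        /* Shift-queue example (little-endian): if dst is 1 byte past a
         * word boundary, r5 = 3, r12 = 24, lr = 8. Each output word is
         * then (carry | next << 8) and the new carry is (next >> 24),
         * which is exactly what loop24 below does with immediate shifts. */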

        /* read the first word */
        ldr     r3, [r1], #4
        sub     r2, r2, #4

        /* write a partial word (0 to 3 bytes), such that destination
         * becomes aligned to 32 bits (r5 = number of bytes to copy
         * for alignment)
         */
        movs    r5, r5, lsl #31
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8

        cmp     r2, #4
        blo     partial_word_tail

        /* Align destination to 32 bytes (cache line boundary) */
1:      tst     r0, #0x1c
        beq     2f
        ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b
        blo     partial_word_tail

        /* copy 32 bytes at a time */
2:      subs    r2, r2, #32
        blo     less_than_thirtytwo

        /* Use immediate mode for the shifts, because register-specified
         * shifts cost an extra cycle, which could account for up to a 50%
         * performance hit.
         */

        cmp     r12, #24
        beq     loop24
        cmp     r12, #8
        beq     loop8
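        /* r12 can only be 8, 16 or 24 here (misalignment of 1 to 3 bytes),
         * so the two branches above plus the loop16 fall-through cover all
         * cases. */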

loop16:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r8, lsl #16
        mov     r8, r8, lsr #16
        orr     r8, r8, r9, lsl #16
        mov     r9, r9, lsr #16
        orr     r9, r9, r10, lsl #16
        mov     r10, r10, lsr #16
        orr     r10, r10, r11, lsl #16
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #16
        bhs     1b
        b       less_than_thirtytwo

loop8:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #24
        mov     r4, r4, lsr #8
        orr     r4, r4, r5, lsl #24
        mov     r5, r5, lsr #8
        orr     r5, r5, r6, lsl #24
        mov     r6, r6, lsr #8
        orr     r6, r6, r7, lsl #24
        mov     r7, r7, lsr #8
        orr     r7, r7, r8, lsl #24
        mov     r8, r8, lsr #8
        orr     r8, r8, r9, lsl #24
        mov     r9, r9, lsr #8
        orr     r9, r9, r10, lsl #24
        mov     r10, r10, lsr #8
        orr     r10, r10, r11, lsl #24
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #8
        bhs     1b
        b       less_than_thirtytwo

loop24:
        ldr     r12, [r1], #4
1:      mov     r4, r12
        ldmia   r1!, {r5, r6, r7, r8, r9, r10, r11}
        pld     [r1, #64]
        subs    r2, r2, #32
        ldrhs   r12, [r1], #4
        orr     r3, r3, r4, lsl #8
        mov     r4, r4, lsr #24
        orr     r4, r4, r5, lsl #8
        mov     r5, r5, lsr #24
        orr     r5, r5, r6, lsl #8
        mov     r6, r6, lsr #24
        orr     r6, r6, r7, lsl #8
        mov     r7, r7, lsr #24
        orr     r7, r7, r8, lsl #8
        mov     r8, r8, lsr #24
        orr     r8, r8, r9, lsl #8
        mov     r9, r9, lsr #24
        orr     r9, r9, r10, lsl #8
        mov     r10, r10, lsr #24
        orr     r10, r10, r11, lsl #8
        stmia   r0!, {r3, r4, r5, r6, r7, r8, r9, r10}
        mov     r3, r11, lsr #24
        bhs     1b

less_than_thirtytwo:
        /* copy the last 0 to 31 bytes of the source */
        rsb     r12, lr, #32            /* we corrupted r12, recompute it */
        add     r2, r2, #32
        cmp     r2, #4
        blo     partial_word_tail

1:      ldr     r5, [r1], #4
        sub     r2, r2, #4
        orr     r4, r3, r5, lsl lr
        mov     r3, r5, lsr r12
        str     r4, [r0], #4
        cmp     r2, #4
        bhs     1b

partial_word_tail:
        /* we have a partial word in the input buffer */
        movs    r5, lr, lsl #(31-3)
        strmib  r3, [r0], #1
        movmi   r3, r3, lsr #8
        strcsb  r3, [r0], #1
        movcs   r3, r3, lsr #8
        strcsb  r3, [r0], #1

        /* Refill spilled registers from the stack. Don't update sp. */
        ldmfd   sp, {r5-r11}

copy_last_3_and_return:
        movs    r2, r2, lsl #31         /* copy remaining 0, 1, 2 or 3 bytes */
        ldrmib  r2, [r1], #1
        ldrcsb  r3, [r1], #1
        ldrcsb  r12, [r1]
        strmib  r2, [r0], #1
        strcsb  r3, [r0], #1
        strcsb  r12, [r0]

        /* we're done! restore sp and spilled registers and return */
        add     sp, sp, #28
        ldmfd   sp!, {r0, r4, lr}
        bx      lr
END(memcpy)


#endif  /* __ARM_ARCH__ < 7 */