/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */

/* Use the version of memcpy implemented using LDRD and STRD.
   This version is tuned for Cortex-A15.
   It might not be the best for other ARMv7-A CPUs,
   but there is no predefine to distinguish between
   different CPUs in the same architecture,
   and this version is better than the plain memcpy provided in newlib.

   Therefore, we use this version for all ARMv7-A CPUs. */

/* To make the same code compile for both the ARM and Thumb instruction
   sets, switch to unified syntax at the beginning of this function.
   However, by using the same code we may be missing optimization
   opportunities. For instance, in LDRD/STRD instructions the first
   register must be even-numbered and the second the next consecutive
   register in ARM state, but there is no such restriction in Thumb
   state. */

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified

ENTRY(memcpy)

        /* Assumes that n >= 0 and that dst and src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When fewer than 8 bytes are left, copy a word and then byte by byte. */

        /* Save registers (r0 holds the return value):
           optimized push {r0, r4, r5, r6, r7, lr}.
           To try to improve performance, the stack layout is changed,
           i.e., the stack does not look the way users expect
           (highest-numbered register at the highest address). */
        .save {r0, lr}
        push {r0, lr}
        .save {r4, r5}
        strd r4, r5, [sp, #-8]!
        .save {r6, r7}
        strd r6, r7, [sp, #-8]!
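        /* Resulting stack layout, from low to high addresses:
           r6, r7, r4, r5, r0, lr. */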

        /* TODO: Add debug frame directives.
           We don't need exception unwind directives, because the code below
           does not throw any exceptions and does not call any other functions.
           Generally, newlib functions like this lack debug information for
           assembler source. */

        /* Get copying of tiny blocks out of the way first. */
        /* Are there at least 4 bytes to copy? */
        subs r2, r2, #4
        blt copy_less_than_4 /* If n < 4. */
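        /* Throughout this routine r2 is kept biased: the true number of
           bytes left is r2 plus the constant most recently subtracted, so
           each subs/adds both updates the count and sets the flags for the
           branch that follows. */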

        /* Check word alignment. */
        ands ip, r0, #3 /* ip = last 2 bits of dst. */
        bne dst_not_word_aligned /* If dst is not word-aligned. */

        /* Get here if dst is word-aligned. */
        ands ip, r1, #3 /* ip = last 2 bits of src. */
        bne src_not_word_aligned /* If src is not word-aligned. */
word_aligned:
        /* Get here if both src and dst are word-aligned.
           The number of bytes remaining to copy is r2 + 4. */

        /* Are there at least 64 bytes to copy? */
        subs r2, r2, #60
        blt copy_less_than_64 /* If r2 + 4 < 64. */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores do not cross a cache-line
           boundary, since such accesses are more expensive even when the
           data is in the cache (they require two load/store issue cycles
           instead of one).
           If only one of the buffers is not 8-byte aligned,
           it is more important to align dst than src,
           because stores that cross a cache-line boundary are penalized
           more heavily than loads.
           This check and realignment are only worth doing
           if there is a lot to copy. */

        /* Get here if dst is word-aligned,
           i.e., its 2 least significant bits are 0.
           If dst is not two-word (8-byte) aligned, i.e., bit 2 of dst is set,
           copy one word (4 bytes) to make it so. */
        ands r3, r0, #4
        beq 11f /* If dst already two-word aligned. */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        blt copy_less_than_64

11:

        /* TODO: Align to cacheline (useful for PLD optimization). */
        /* Every loop iteration copies 64 bytes. */
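        /* .irp expands the two-instruction body once for each listed offset,
           unrolling the loop into eight LDRD/STRD pairs with a single
           pointer update afterwards. */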
1:
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd r4, r5, [r1, \offset]
        strd r4, r5, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 1b /* If there is more to copy. */

copy_less_than_64:

        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy. */
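        /* Adding 56 rebiases r2 so that the bytes left equal r2 + 8; the
           result is negative exactly when fewer than 8 bytes remain. */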
        adds r2, r2, #56
        blt copy_less_than_8

        /* Copy 8 bytes at a time. */
2:
        ldrd r4, r5, [r1], #8
        strd r4, r5, [r0], #8
        subs r2, r2, #8
        bge 2b /* If there is more to copy. */

copy_less_than_8:

        /* Get here if there are fewer than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy. */
        cmn r2, #8
        beq return /* If r2 + 8 == 0. */

        /* Restore the count if there are more than 3 bytes to copy. */
        adds r2, r2, #4
        blt copy_less_than_4

        /* Copy 4 bytes. */
        ldr r3, [r1], #4
        str r3, [r0], #4

copy_less_than_4:
        /* Get here if there are fewer than 4 bytes to copy, -4 <= r2 < 0. */

        /* Restore the count, check if there is more to copy. */
        adds r2, r2, #4
        beq return /* If r2 == 0. */

        /* Get here with r2 in {1,2,3} = {01,10,11}. */
        /* Logical shift left r2 by 31, inserting 0s and updating the flags. */
        lsls r2, r2, #31

        /* Copy byte by byte.
           After the lsls above, condition ne (Z clear) means that bit 0 of
           the original r2 was set, i.e., r2 was 1 or 3, so copy one byte.
           Condition cs (carry set) means that bit 1 of the original r2 was
           set, i.e., r2 was 2 or 3, so copy two more bytes. */
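        /* For example, when r2 == 3 both conditional blocks execute and all
           three remaining bytes are copied. */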
        itt ne
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1

        itttt cs
        ldrbcs r4, [r1], #1
        ldrbcs r5, [r1]
        strbcs r4, [r0], #1
        strbcs r5, [r0]

return:
        /* Restore registers: optimized pop {r0, r4, r5, r6, r7, pc}. */
        /* This is the only return point of memcpy. */
        ldrd r6, r7, [sp], #8
        ldrd r4, r5, [sp], #8
        pop {r0, pc}

#ifndef __ARM_FEATURE_UNALIGNED

        /* The following assembly macro implements a misaligned copy in
           software. It assumes that dst is word-aligned, that src is at an
           offset of "pull" bits from a word boundary, that push = 32 - pull,
           and that the number of bytes that remain to copy is r2 + 4,
           r2 >= 0. */

        /* In the code below, r2 is the number of bytes that remain to be
           written. The number of bytes read is always larger, because we
           keep partial words in the shift queue. */

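        /* For example, on a little-endian target with pull = 8 (src one byte
           past a word boundary): the first aligned word holds bytes
           B0 B1 B2 B3 with B0 in bits 7:0. B0 is not part of what remains to
           be copied, so the queue is shifted right by 8 bits to drop it, and
           the lowest byte of the next source word (B4) is shifted left by 24
           and ORed in, yielding the destination word B1 B2 B3 B4. */
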
        .macro miscopy pull push shiftleft shiftright

        /* Align src to the previous word boundary. */
        bic r1, r1, #3

        /* Initialize the shift queue. */
        ldr r5, [r1], #4 /* Load a word from source. */

        subs r2, r2, #4
        blt 6f /* Go to misaligned copy of fewer than 8 bytes. */

        /* Get here if there are at least 8 bytes to copy.
           The number of bytes to copy is r2 + 8, r2 >= 0. */

        subs r2, r2, #56
        blt 4f /* Go to misaligned copy of fewer than 64 bytes. */

3:
        /* Get here if there are at least 64 bytes to copy.
           The number of bytes to copy is r2 + 64, r2 >= 0. */

        /* Copy 64 bytes in every iteration.
           Use a partial word from the shift queue. */
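        /* r5 always holds the most recently loaded source word that has not
           yet been fully written to dst; each ldrd refills r4/r5 and the
           leftover bytes carry over into the next iteration. */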
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1, \offset]
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0, \offset]
        .endr

        add r1, r1, #64
        add r0, r0, #64
        subs r2, r2, #64
        bge 3b

4:
        /* Get here if there are fewer than 64 bytes to copy (-64 <= r2 < 0)
           and they are misaligned. */

        /* Restore the count if there are more than 7 bytes to copy. */
        adds r2, r2, #56

        blt 6f /* Go to misaligned copy of fewer than 8 bytes. */

5:
        /* Copy 8 bytes at a time.
           Use a partial word from the shift queue. */
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1], #8
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0], #8

        subs r2, r2, #8
        bge 5b /* If there is more to copy. */

6:
        /* Get here if there are fewer than 8 bytes to copy (-8 <= r2 < 0)
           and they are misaligned. */

        /* Check if there is more to copy. */
        cmn r2, #8
        beq return

        /* Check if there are fewer than 4 bytes to copy. */
        cmn r2, #4

        itt lt
        /* Restore src offset from word-align. */
        sublt r1, r1, #(\push / 8)
        blt copy_less_than_4
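        /* The next unwritten source byte sits pull/8 bytes into the word
           loaded from [r1 - 4], i.e., at r1 - push/8, which the sublt above
           restores before falling back to byte-by-byte copying. */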

        /* Use a partial word from the shift queue. */
        mov r3, r5, \shiftleft #\pull
        /* Load a word from src, but without writeback
           (this word is not fully written to dst). */
        ldr r5, [r1]

        /* Restore src offset from word-align. */
        add r1, r1, #(\pull / 8)
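        /* Only the first pull/8 bytes of the word just loaded are consumed
           by the store below, so advance r1 by that amount; the remaining
           1 to 3 bytes are then copied byte by byte. */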

        /* Shift bytes to create one dst word and store it. */
        orr r3, r3, r5, \shiftright #\push
        str r3, [r0], #4

        /* Use single byte copying of the remaining bytes. */
        b copy_less_than_4

        .endm

#endif /* not __ARM_FEATURE_UNALIGNED */

dst_not_word_aligned:

        /* Get here when dst is not word-aligned and ip holds the last 2 bits
           of dst, i.e., ip is the offset of dst from a word boundary.
           The number of bytes that remain to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes) so that dst becomes
           word-aligned. */

        /* If dst is at an offset of ip bytes from a word boundary
           (with 0 < ip < 4), then there are (4 - ip) bytes to copy to align
           dst to the next word. */
        rsb ip, ip, #4 /* ip = #4 - ip. */
        cmp ip, #2

        /* Copy byte by byte with conditionals. */
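        /* ip is now 1, 2, or 3: the gt pair copies a byte only when ip == 3,
           the ge pair when ip >= 2, and the final pair is unconditional, so
           exactly ip bytes are copied. */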
        itt gt
        ldrbgt r3, [r1], #1
        strbgt r3, [r0], #1

        itt ge
        ldrbge r4, [r1], #1
        strbge r4, [r0], #1

        ldrb lr, [r1], #1
        strb lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied. */
        subs r2, r2, ip /* r2 = r2 - ip. */
        blt copy_less_than_4 /* If r2 < ip. */

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned. If beforehand src and dst were not word
           aligned but congruent (same offset), then now they are both
           word-aligned, and we can copy the rest efficiently (without
           shifting). */
        ands ip, r1, #3 /* ip = last 2 bits of src. */
        beq word_aligned /* If r1 is word-aligned. */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remain to copy is r2 + 4. */

#ifdef __ARM_FEATURE_UNALIGNED
        /* Copy word by word using LDR when unaligned access is handled in
           hardware, i.e., SCTLR.A is clear, so LDR and STR support
           unaligned addresses. */
        subs r2, r2, #60
        blt 8f

7:
        /* Copy 64 bytes in every loop iteration. */
        .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr r3, [r1, \offset]
        str r3, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 7b

8:
        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy. */
        adds r2, r2, #60
        blt copy_less_than_4

9:
        /* Get here if there are fewer than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2 + 4. */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        bge 9b

        b copy_less_than_4

#else /* not __ARM_FEATURE_UNALIGNED */

        /* ip holds the last 2 bits of src,
           i.e., ip is the offset of src from a word boundary, and ip > 0.
           Compute the shifts needed to copy from src to dst. */
        cmp ip, #2
        beq miscopy_16_16 /* If ip == 2. */
        bge miscopy_24_8 /* If ip == 3. */

        /* Get here if ip == 1. */
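        /* Fall through to the miscopy_8_24 instantiation below. */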

        /* Endian independent macros for shifting bytes within registers. */
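        /* On a little-endian target the first byte of a word occupies its
           least significant bits, so the shift queue is drained with LSR and
           refilled with LSL; a big-endian target swaps the two shifts. */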

#ifndef __ARMEB__
miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
#else /* not __ARMEB__ */
miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
#endif /* not __ARMEB__ */

#endif /* not __ARM_FEATURE_UNALIGNED */

END(memcpy)