/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result. */
#ifdef BCOPY
#define origdstin	x1
#define origsrc		x0
#endif
#define dstin	x0
#define src	x1
#define count	x2
#define tmp1	x3
#define tmp1w	w3
#define tmp2	x4
#define tmp2w	w4
#define tmp3	x5
#define tmp3w	w5
#define dst	x6

#define A_l	x7
#define A_h	x8
#define B_l	x9
#define B_h	x10
#define C_l	x11
#define C_h	x12
#define D_l	x13
#define D_h	x14

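/* One body serves three entry points: bcopy, wmemmove, or plain memmove,
 * selected by the BCOPY and WMEMMOVE defines.  The bcopy and wmemmove
 * prologues only adjust their incoming arguments before falling into the
 * common memmove code below. */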
#ifdef BCOPY
ENTRY(bcopy)
	/* bcopy takes (src, dst, n) while memmove and memcpy take (dst, src, n),
	 * so swap x0 and x1 before falling into the shared code, which may
	 * branch to memcpy. */
	mov	tmp1, origsrc
	mov	origsrc, origdstin
	mov	origdstin, tmp1
#elif defined(WMEMMOVE)
ENTRY(wmemmove)
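	/* wmemmove counts wchar_t elements; wchar_t is 4 bytes here (see the
	 * assumptions above), so convert the count to bytes. */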
	lsl	count, count, #2
#else
ENTRY(memmove)
#endif
	cmp	dstin, src
	b.lo	.Ldownwards
	add	tmp1, src, count
	cmp	dstin, tmp1
	b.hs	memcpy		/* No overlap. */

	/* Upwards move with potential overlap.
	 * Need to move from the tail backwards.  SRC and DST point one
	 * byte beyond the remaining data to move. */
	add	dst, dstin, count
	add	src, src, count
	cmp	count, #64
	b.ge	.Lmov_not_short_up

	/* Deal with small moves quickly by dropping straight into the
	 * exit block. */
.Ltail63up:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate. */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15up
	sub	dst, dst, tmp1
	sub	src, src, tmp1
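	/* tmp1 is 16, 32 or 48; dst and src have been backed up by that much,
	 * and the branches below fall through so exactly tmp1 bytes are copied. */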
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #32]
	stp	A_l, A_h, [dst, #32]
1:
	ldp	A_l, A_h, [src, #16]
	stp	A_l, A_h, [dst, #16]
2:
	ldp	A_l, A_h, [src]
	stp	A_l, A_h, [dst]
.Ltail15up:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved. */
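	/* Bits 3..0 of count select an 8-, 4-, 2- and 1-byte copy in turn;
	 * pre-decrement addressing keeps walking down through the buffer. */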
	tbz	count, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src, #-1]
	strb	tmp1w, [dst, #-1]
1:
	ret

.Lmov_not_short_up:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores. */
	ands	tmp2, src, #15		/* Bytes to reach alignment. */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice. */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src, #-8]!
	str	tmp1, [dst, #-8]!
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src, #-4]!
	str	tmp1w, [dst, #-4]!
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src, #-2]!
	strh	tmp1w, [dst, #-2]!
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src, #-1]!
	strb	tmp1w, [dst, #-1]!
1:

	/* There may be less than 63 bytes to go now. */
	cmp	count, #63
	b.le	.Ltail63up
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_up
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail. */
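	/* The write-back on the first ldp (and first stp) steps the pointer
	 * down by 64; the other three pairs then use positive offsets within
	 * that block. */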
	ldp	A_l, A_h, [src, #-64]!
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst, #-64]!
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret

	/* Critical loop.  Start at a new Icache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line. */
	.p2align 6
.Lmov_body_large_up:
	/* There are at least 128 bytes to move. */
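	/* The loop is software pipelined: each iteration stores the 64 bytes
	 * loaded on the previous pass while fetching the next 64. */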
	ldp	A_l, A_h, [src, #-16]
	ldp	B_l, B_h, [src, #-32]
	ldp	C_l, C_h, [src, #-48]
	ldp	D_l, D_h, [src, #-64]!
1:
	stp	A_l, A_h, [dst, #-16]
	ldp	A_l, A_h, [src, #-16]
	stp	B_l, B_h, [dst, #-32]
	ldp	B_l, B_h, [src, #-32]
	stp	C_l, C_h, [dst, #-48]
	ldp	C_l, C_h, [src, #-48]
	stp	D_l, D_h, [dst, #-64]!
	ldp	D_l, D_h, [src, #-64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #-16]
	stp	B_l, B_h, [dst, #-32]
	stp	C_l, C_h, [dst, #-48]
	stp	D_l, D_h, [dst, #-64]!
	tst	count, #0x3f
	b.ne	.Ltail63up
	ret


.Ldownwards:
	/* For a downwards move we can safely use memcpy provided that
	 * DST is more than 16 bytes away from SRC. */
	sub	tmp1, src, #16
	cmp	dstin, tmp1
	b.ls	memcpy		/* May overlap, but not critically. */
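	/* The 16-byte margin matches memcpy's widest single access (a 16-byte
	 * ldp/stp), so any source bytes a forward copy could overwrite have
	 * already been loaded by the time the store happens. */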

	mov	dst, dstin	/* Preserve DSTIN for return value. */
	cmp	count, #64
	b.ge	.Lmov_not_short_down

	/* Deal with small moves quickly by dropping straight into the
	 * exit block. */
.Ltail63down:
	/* Move up to 48 bytes of data.  At this point we only need the
	 * bottom 6 bits of count to be accurate. */
	ands	tmp1, count, #0x30
	b.eq	.Ltail15down
	add	dst, dst, tmp1
	add	src, src, tmp1
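	/* As in .Ltail63up: tmp1 is 16, 32 or 48, and the negative offsets
	 * below reach back over the region the pointers just advanced past. */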
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	ldp	A_l, A_h, [src, #-48]
	stp	A_l, A_h, [dst, #-48]
1:
	ldp	A_l, A_h, [src, #-32]
	stp	A_l, A_h, [dst, #-32]
2:
	ldp	A_l, A_h, [src, #-16]
	stp	A_l, A_h, [dst, #-16]
.Ltail15down:
	/* Move up to 15 bytes of data.  Does not assume additional data
	 * being moved. */
	tbz	count, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	count, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	count, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	count, #0, 1f
	ldrb	tmp1w, [src]
	strb	tmp1w, [dst]
1:
	ret

.Lmov_not_short_down:
	/* We don't much care about the alignment of DST, but we want SRC
	 * to be 128-bit (16 byte) aligned so that we don't cross cache line
	 * boundaries on both loads and stores. */
	neg	tmp2, src
	ands	tmp2, tmp2, #15		/* Bytes to reach alignment. */
	b.eq	2f
	sub	count, count, tmp2
	/* Move enough data to reach alignment; unlike memcpy, we have to
	 * be aware of the overlap, which means we can't move data twice. */
	tbz	tmp2, #3, 1f
	ldr	tmp1, [src], #8
	str	tmp1, [dst], #8
1:
	tbz	tmp2, #2, 1f
	ldr	tmp1w, [src], #4
	str	tmp1w, [dst], #4
1:
	tbz	tmp2, #1, 1f
	ldrh	tmp1w, [src], #2
	strh	tmp1w, [dst], #2
1:
	tbz	tmp2, #0, 1f
	ldrb	tmp1w, [src], #1
	strb	tmp1w, [dst], #1
1:

	/* There may be less than 63 bytes to go now. */
	cmp	count, #63
	b.le	.Ltail63down
2:
	subs	count, count, #128
	b.ge	.Lmov_body_large_down
	/* Less than 128 bytes to move, so handle 64 here and then jump
	 * to the tail. */
	ldp	A_l, A_h, [src]
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]
	stp	A_l, A_h, [dst]
	stp	B_l, B_h, [dst, #16]
	stp	C_l, C_h, [dst, #32]
	stp	D_l, D_h, [dst, #48]
	tst	count, #0x3f
	add	src, src, #64
	add	dst, dst, #64
	b.ne	.Ltail63down
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line this ensures the entire loop is in one line. */
	.p2align 6
.Lmov_body_large_down:
	/* There are at least 128 bytes to move. */
	ldp	A_l, A_h, [src, #0]
	sub	dst, dst, #16		/* Pre-bias. */
	ldp	B_l, B_h, [src, #16]
	ldp	C_l, C_h, [src, #32]
	ldp	D_l, D_h, [src, #48]!	/* src += 64 - Pre-bias. */
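	/* Both pointers are now biased so that each loop iteration can advance
	 * them with a single pre-indexed access at offset #64 while every
	 * other offset stays positive. */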
1:
	stp	A_l, A_h, [dst, #16]
	ldp	A_l, A_h, [src, #16]
	stp	B_l, B_h, [dst, #32]
	ldp	B_l, B_h, [src, #32]
	stp	C_l, C_h, [dst, #48]
	ldp	C_l, C_h, [src, #48]
	stp	D_l, D_h, [dst, #64]!
	ldp	D_l, D_h, [src, #64]!
	subs	count, count, #64
	b.ge	1b
	stp	A_l, A_h, [dst, #16]
	stp	B_l, B_h, [dst, #32]
	stp	C_l, C_h, [dst, #48]
	stp	D_l, D_h, [dst, #64]
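	/* Undo the pre-bias and step past the block just stored, so that SRC
	 * and DST again point at the first byte still to be moved, as
	 * .Ltail63down expects. */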
	add	src, src, #16
	add	dst, dst, #64 + 16
	tst	count, #0x3f
	b.ne	.Ltail63down
	ret
#ifdef BCOPY
END(bcopy)
#elif defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif