/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result. */
#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

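/* Register pairs A..D hold up to 64 bytes of in-flight data in the bulk copy loops. */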
#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

#if defined(WMEMMOVE)
ENTRY(wmemmove)
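    /* Scale the wchar_t count to a byte count (wchar_t is 4 bytes here). */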
    lsl     count, count, #2
#else
ENTRY(memmove)
#endif
    cmp     dstin, src
    b.lo    .Ldownwards
    add     tmp1, src, count
    cmp     dstin, tmp1
    b.hs    memcpy              /* No overlap. */

    /* Upwards move with potential overlap.
     * Need to move from the tail backwards. SRC and DST point one
     * byte beyond the remaining data to move. */
    add     dst, dstin, count
    add     src, src, count
    cmp     count, #64
    b.ge    .Lmov_not_short_up

    /* Deal with small moves quickly by dropping straight into the
     * exit block. */
.Ltail63up:
    /* Move up to 48 bytes of data. At this point we only need the
     * bottom 6 bits of count to be accurate. */
    ands    tmp1, count, #0x30
    b.eq    .Ltail15up
    sub     dst, dst, tmp1
    sub     src, src, tmp1
    cmp     tmp1w, #0x20
    b.eq    1f
    b.lt    2f
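    /* tmp1 is 48, 32 or 16: fall through to copy 48 bytes, or enter at
     * 1f for 32 bytes or 2f for 16. */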
    ldp     A_l, A_h, [src, #32]
    stp     A_l, A_h, [dst, #32]
1:
    ldp     A_l, A_h, [src, #16]
    stp     A_l, A_h, [dst, #16]
2:
    ldp     A_l, A_h, [src]
    stp     A_l, A_h, [dst]
.Ltail15up:
    /* Move up to 15 bytes of data. Does not assume additional data
     * being moved. */
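    /* Each set bit of count selects one final transfer: 8, 4, 2, then 1
     * byte(s), working backwards from the current position. */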
    tbz     count, #3, 1f
    ldr     tmp1, [src, #-8]!
    str     tmp1, [dst, #-8]!
1:
    tbz     count, #2, 1f
    ldr     tmp1w, [src, #-4]!
    str     tmp1w, [dst, #-4]!
1:
    tbz     count, #1, 1f
    ldrh    tmp1w, [src, #-2]!
    strh    tmp1w, [dst, #-2]!
1:
    tbz     count, #0, 1f
    ldrb    tmp1w, [src, #-1]
    strb    tmp1w, [dst, #-1]
1:
    ret

.Lmov_not_short_up:
    /* We don't much care about the alignment of DST, but we want SRC
     * to be 128-bit (16 byte) aligned so that we don't cross cache line
     * boundaries on both loads and stores. */
    ands    tmp2, src, #15      /* Bytes to reach alignment. */
    b.eq    2f
    sub     count, count, tmp2
    /* Move enough data to reach alignment; unlike memcpy, we have to
     * be aware of the overlap, which means we can't move data twice. */
    tbz     tmp2, #3, 1f
    ldr     tmp1, [src, #-8]!
    str     tmp1, [dst, #-8]!
1:
    tbz     tmp2, #2, 1f
    ldr     tmp1w, [src, #-4]!
    str     tmp1w, [dst, #-4]!
1:
    tbz     tmp2, #1, 1f
    ldrh    tmp1w, [src, #-2]!
    strh    tmp1w, [dst, #-2]!
1:
    tbz     tmp2, #0, 1f
    ldrb    tmp1w, [src, #-1]!
    strb    tmp1w, [dst, #-1]!
1:

    /* There may be less than 63 bytes to go now. */
    cmp     count, #63
    b.le    .Ltail63up
2:
    subs    count, count, #128
    b.ge    .Lmov_body_large_up
    /* Less than 128 bytes to move, so handle 64 here and then jump
     * to the tail. */
    ldp     A_l, A_h, [src, #-64]!
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]
    stp     A_l, A_h, [dst, #-64]!
    stp     B_l, B_h, [dst, #16]
    stp     C_l, C_h, [dst, #32]
    stp     D_l, D_h, [dst, #48]
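    /* Any remaining 1-63 bytes are handled by the tail code. */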
    tst     count, #0x3f
    b.ne    .Ltail63up
    ret

    /* Critical loop. Start at a new Icache line boundary. Assuming
     * 64 bytes per line this ensures the entire loop is in one line. */
    .p2align 6
.Lmov_body_large_up:
    /* There are at least 128 bytes to move. */
    ldp     A_l, A_h, [src, #-16]
    ldp     B_l, B_h, [src, #-32]
    ldp     C_l, C_h, [src, #-48]
    ldp     D_l, D_h, [src, #-64]!
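    /* Software-pipelined loop: each iteration stores the 64 bytes loaded
     * on the previous round while loading the next 64. */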
1:
    stp     A_l, A_h, [dst, #-16]
    ldp     A_l, A_h, [src, #-16]
    stp     B_l, B_h, [dst, #-32]
    ldp     B_l, B_h, [src, #-32]
    stp     C_l, C_h, [dst, #-48]
    ldp     C_l, C_h, [src, #-48]
    stp     D_l, D_h, [dst, #-64]!
    ldp     D_l, D_h, [src, #-64]!
    subs    count, count, #64
    b.ge    1b
    stp     A_l, A_h, [dst, #-16]
    stp     B_l, B_h, [dst, #-32]
    stp     C_l, C_h, [dst, #-48]
    stp     D_l, D_h, [dst, #-64]!
    tst     count, #0x3f
    b.ne    .Ltail63up
    ret


.Ldownwards:
    /* For a downwards move we can safely use memcpy provided that
     * DST is more than 16 bytes away from SRC. */
    sub     tmp1, src, #16
    cmp     dstin, tmp1
    b.ls    memcpy              /* May overlap, but not critically. */

    mov     dst, dstin          /* Preserve DSTIN for return value. */
    cmp     count, #64
    b.ge    .Lmov_not_short_down

    /* Deal with small moves quickly by dropping straight into the
     * exit block. */
.Ltail63down:
    /* Move up to 48 bytes of data. At this point we only need the
     * bottom 6 bits of count to be accurate. */
    ands    tmp1, count, #0x30
    b.eq    .Ltail15down
    add     dst, dst, tmp1
    add     src, src, tmp1
    cmp     tmp1w, #0x20
    b.eq    1f
    b.lt    2f
    ldp     A_l, A_h, [src, #-48]
    stp     A_l, A_h, [dst, #-48]
1:
    ldp     A_l, A_h, [src, #-32]
    stp     A_l, A_h, [dst, #-32]
2:
    ldp     A_l, A_h, [src, #-16]
    stp     A_l, A_h, [dst, #-16]
.Ltail15down:
    /* Move up to 15 bytes of data. Does not assume additional data
     * being moved. */
    tbz     count, #3, 1f
    ldr     tmp1, [src], #8
    str     tmp1, [dst], #8
1:
    tbz     count, #2, 1f
    ldr     tmp1w, [src], #4
    str     tmp1w, [dst], #4
1:
    tbz     count, #1, 1f
    ldrh    tmp1w, [src], #2
    strh    tmp1w, [dst], #2
1:
    tbz     count, #0, 1f
    ldrb    tmp1w, [src]
    strb    tmp1w, [dst]
1:
    ret

.Lmov_not_short_down:
    /* We don't much care about the alignment of DST, but we want SRC
     * to be 128-bit (16 byte) aligned so that we don't cross cache line
     * boundaries on both loads and stores. */
    neg     tmp2, src
    ands    tmp2, tmp2, #15     /* Bytes to reach alignment. */
    b.eq    2f
    sub     count, count, tmp2
    /* Move enough data to reach alignment; unlike memcpy, we have to
     * be aware of the overlap, which means we can't move data twice. */
    tbz     tmp2, #3, 1f
    ldr     tmp1, [src], #8
    str     tmp1, [dst], #8
1:
    tbz     tmp2, #2, 1f
    ldr     tmp1w, [src], #4
    str     tmp1w, [dst], #4
1:
    tbz     tmp2, #1, 1f
    ldrh    tmp1w, [src], #2
    strh    tmp1w, [dst], #2
1:
    tbz     tmp2, #0, 1f
    ldrb    tmp1w, [src], #1
    strb    tmp1w, [dst], #1
1:

    /* There may be less than 63 bytes to go now. */
    cmp     count, #63
    b.le    .Ltail63down
2:
    subs    count, count, #128
    b.ge    .Lmov_body_large_down
    /* Less than 128 bytes to move, so handle 64 here and then jump
     * to the tail. */
    ldp     A_l, A_h, [src]
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]
    stp     A_l, A_h, [dst]
    stp     B_l, B_h, [dst, #16]
    stp     C_l, C_h, [dst, #32]
    stp     D_l, D_h, [dst, #48]
    tst     count, #0x3f
    add     src, src, #64
    add     dst, dst, #64
    b.ne    .Ltail63down
    ret

    /* Critical loop. Start at a new cache line boundary. Assuming
     * 64 bytes per line this ensures the entire loop is in one line. */
    .p2align 6
.Lmov_body_large_down:
    /* There are at least 128 bytes to move. */
    ldp     A_l, A_h, [src, #0]
    sub     dst, dst, #16       /* Pre-bias. */
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias. */
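    /* As in the upwards loop, stores of the previously loaded 64 bytes
     * are overlapped with the next 64-byte load. */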
1:
    stp     A_l, A_h, [dst, #16]
    ldp     A_l, A_h, [src, #16]
    stp     B_l, B_h, [dst, #32]
    ldp     B_l, B_h, [src, #32]
    stp     C_l, C_h, [dst, #48]
    ldp     C_l, C_h, [src, #48]
    stp     D_l, D_h, [dst, #64]!
    ldp     D_l, D_h, [src, #64]!
    subs    count, count, #64
    b.ge    1b
    stp     A_l, A_h, [dst, #16]
    stp     B_l, B_h, [dst, #32]
    stp     C_l, C_h, [dst, #48]
    stp     D_l, D_h, [dst, #64]
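    /* Step past the data just stored and undo the pre-bias so src and dst
     * again point at the next bytes for the tail code. */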
    add     src, src, #16
    add     dst, dst, #64 + 16
    tst     count, #0x3f
    b.ne    .Ltail63down
    ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif