/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */
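/* "Unaligned accesses" means the routine freely issues loads and stores that
 * are not naturally aligned.  That is valid for normal (cacheable) memory
 * with strict alignment checking disabled, the usual ARMv8-A user-space
 * configuration, but not for device memory. */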

#include <private/bionic_asm.h>

/* Parameters and result.  */
#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

ENTRY(memmove)
        cmp     dstin, src
        b.lo    .Ldownwards
        add     tmp1, src, count
        cmp     dstin, tmp1
        b.hs    memcpy          /* No overlap.  */
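        /* Three-way dispatch: DST below SRC is always safe to copy forwards
         * and is handled by .Ldownwards; DST at or beyond SRC + count cannot
         * overlap, so plain memcpy is used; anything in between (e.g.
         * memmove(p + 8, p, 64)) overlaps the tail of SRC and must be moved
         * from the end backwards, which the code below does. */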

        /* Upwards move with potential overlap.
         * Need to move from the tail backwards.  SRC and DST point one
         * byte beyond the remaining data to move.  */
        add     dst, dstin, count
        add     src, src, count
        cmp     count, #64
        b.ge    .Lmov_not_short_up

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63up:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
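        /* count & 0x30 selects how much to move in 16-byte blocks: 0x30
         * falls through all three ldp/stp pairs (48 bytes), 0x20 enters at
         * 1: (32 bytes), 0x10 enters at 2: (16 bytes) and 0 skips straight
         * to .Ltail15up. */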
        ands    tmp1, count, #0x30
        b.eq    .Ltail15up
        sub     dst, dst, tmp1
        sub     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #32]
        stp     A_l, A_h, [dst, #32]
1:
        ldp     A_l, A_h, [src, #16]
        stp     A_l, A_h, [dst, #16]
2:
        ldp     A_l, A_h, [src]
        stp     A_l, A_h, [dst]
.Ltail15up:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
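        /* Bits 3..0 of count are tested one by one, so any residue of 0-15
         * bytes is finished in at most four moves; e.g. 13 bytes left
         * (0b1101) moves 8, then 4, then 1. */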
        tbz     count, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src, #-1]
        strb    tmp1w, [dst, #-1]
1:
        ret

.Lmov_not_short_up:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that the loads never cross a
         * cache-line boundary; we cannot avoid that for both the loads and
         * the stores at once.  */
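        /* Because this path copies downwards and SRC points just past the
         * data, src & 15 is exactly the number of bytes to move before SRC
         * lands on a 16-byte boundary. */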
        ands    tmp2, src, #15  /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src, #-1]!
        strb    tmp1w, [dst, #-1]!
1:

        /* The alignment fix-up may have left no more than 63 bytes to go.  */
        cmp     count, #63
        b.le    .Ltail63up
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_up
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
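        /* count is now (bytes remaining - 128), i.e. negative, but its low
         * six bits still equal whatever will be left over after the 64-byte
         * block below; .Ltail63up finishes that remainder. */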
        ldp     A_l, A_h, [src, #-64]!
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst, #-64]!
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret

        /* Critical loop.  Start at a new Icache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_up:
        /* There are at least 128 bytes to move.  */
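        /* The loop is software-pipelined: 64 bytes are loaded here, then each
         * iteration stores the 64 bytes loaded on the previous pass while
         * loading the next 64, the writeback on the D-pair accesses stepping
         * src and dst down by 64 per iteration. */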
        ldp     A_l, A_h, [src, #-16]
        ldp     B_l, B_h, [src, #-32]
        ldp     C_l, C_h, [src, #-48]
        ldp     D_l, D_h, [src, #-64]!
1:
        stp     A_l, A_h, [dst, #-16]
        ldp     A_l, A_h, [src, #-16]
        stp     B_l, B_h, [dst, #-32]
        ldp     B_l, B_h, [src, #-32]
        stp     C_l, C_h, [dst, #-48]
        ldp     C_l, C_h, [src, #-48]
        stp     D_l, D_h, [dst, #-64]!
        ldp     D_l, D_h, [src, #-64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #-16]
        stp     B_l, B_h, [dst, #-32]
        stp     C_l, C_h, [dst, #-48]
        stp     D_l, D_h, [dst, #-64]!
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret

.Ldownwards:
        /* For a downwards move we can safely use memcpy provided that
         * DST is at least 16 bytes below SRC.  */
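        /* The 16-byte margin is an assumption about the paired memcpy
         * implementation (not something the C interface guarantees): a
         * forward copy that never stores more than 16 bytes past the point
         * it has already read from cannot corrupt a source that starts 16 or
         * more bytes above the destination. */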
        sub     tmp1, src, #16
        cmp     dstin, tmp1
        b.ls    memcpy          /* May overlap, but not critically.  */

        mov     dst, dstin      /* Preserve DSTIN for return value.  */
        cmp     count, #64
        b.ge    .Lmov_not_short_down

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63down:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15down
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
.Ltail15down:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
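        /* Mirror of .Ltail15up, but with post-increment addressing since
         * this path walks the buffers from low addresses upwards. */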
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lmov_not_short_down:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that the loads never cross a
         * cache-line boundary; we cannot avoid that for both the loads and
         * the stores at once.  */
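        /* The neg/ands pair below computes (-src) & 15, i.e. the number of
         * bytes to copy forwards before SRC reaches the next 16-byte
         * boundary. */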
        neg     tmp2, src
        ands    tmp2, tmp2, #15 /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src], #1
        strb    tmp1w, [dst], #1
1:

        /* The alignment fix-up may have left no more than 63 bytes to go.  */
        cmp     count, #63
        b.le    .Ltail63down
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_down
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63down
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_down:
        /* There are at least 128 bytes to move.  */
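        /* dst is pre-biased down by 16 and the writeback on the last load
         * leaves src biased up by 48, so the loop body can use the same
         * #16..#64 offsets for both the stores of the previous 64-byte block
         * and the loads of the next one, with a single writeback per pointer
         * per iteration. */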
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16   /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63down
        ret
END(memmove)