/* Copyright (c) 2012-2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/*
 * Copyright (c) 2015 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses.
 *
 */

#include <private/bionic_asm.h>

/* Register aliases used by the copy code.

   dstin, src and count are read as x0, x1 and x2 before anything is
   written to them.  dst, srcend and dstend are derived pointers, and
   A..D are 16-byte data pairs (the *w names are the 32-bit views of
   the same registers).

   Several names deliberately share a register, so their live ranges
   must not overlap:
     - E_l/E_h reuse src/count and F_l/F_h reuse srcend/dst; loading E
       or F therefore destroys those pointers/counters.
     - tmp1 shares x9 with B_h; tmp1 must be dead before B_h is loaded.  */
#define dstin	x0
#define src	x1
#define count	x2
#define dst	x3
#define srcend	x4
#define dstend	x5
#define A_l	x6
#define A_lw	w6
#define A_h	x7
#define A_hw	w7
#define B_l	x8
#define B_lw	w8
#define B_h	x9
#define C_l	x10
#define C_h	x11
#define D_l	x12
#define D_h	x13
#define E_l	src
#define E_h	count
#define F_l	srcend
#define F_h	dst
#define tmp1	x9

/* L(name) expands to .Lname, the assembler-local label form.  */
#define L(l) .L ## l

/* Copies are split into 3 main cases: small copies of up to 16 bytes,
   medium copies of 17..96 bytes which are fully unrolled.  Large copies
   of more than 96 bytes align the destination and use an unrolled loop
   processing 64 bytes per iteration.
   Small and medium copies read all data before writing, allowing any
   kind of overlap, and memmove tailcalls memcpy for these cases as
   well as non-overlapping copies.
*/

	prfm	PLDL1KEEP, [src]
	add	srcend, src, count	/* One past the last source byte.  */
	add	dstend, dstin, count	/* One past the last destination byte.  */
	cmp	count, 16
	b.ls	L(copy16)		/* count <= 16 (unsigned).  */
	cmp	count, 96
	b.hi	L(copy_long)		/* count > 96 (unsigned).  */

	/* Medium copies: 17..96 bytes.  count - 1 is in 16..95, so its
	   bits 6 and 5 pick the size class:
	     bit 6 set		65..96 bytes -> L(copy96)
	     bit 5 set		33..64 bytes -> 32 from the start, 32 from the end
	     neither		17..32 bytes -> 16 from the start, 16 from the end
	   The start and end accesses may overlap in the middle.  */
	sub	tmp1, count, 1
	ldp	A_l, A_h, [src]		/* First 16 bytes; also used by copy96.  */
	tbnz	tmp1, 6, L(copy96)
	ldp	D_l, D_h, [srcend, -16]	/* Last 16 bytes.  */
	tbz	tmp1, 5, 1f
	/* tmp1 shares a register with B_h, so it must not be tested
	   after this load.  */
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [srcend, -32]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstend, -32]
1:
	stp	A_l, A_h, [dstin]
	stp	D_l, D_h, [dstend, -16]
	ret

/* Small copies: 0..16 bytes.  Uses srcend/dstend (one past the last
   source/destination byte) in addition to src, dstin and count.  Every
   size class performs all of its loads before its first store.  */
	.p2align 4
L(copy16):
	cmp	count, 8
	b.lo	1f
	/* 8..16 bytes: one 8-byte word from the start and one from the
	   end; the two overlap when count < 16.  */
	ldr	A_l, [src]
	ldr	A_h, [srcend, -8]
	str	A_l, [dstin]
	str	A_h, [dstend, -8]
	ret
	.p2align 4
1:
	/* count < 8 here, so bit 2 set means 4..7 bytes: one 4-byte word
	   from the start and one from the end.  */
	tbz	count, 2, 1f
	ldr	A_lw, [src]
	ldr	A_hw, [srcend, -4]
	str	A_lw, [dstin]
	str	A_hw, [dstend, -4]
	ret

	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
1:
	cbz	count, 2f
	lsr	tmp1, count, 1		/* Middle index: 0 for count==1, else 1.  */
	ldrb	A_lw, [src]		/* First byte.  */
	ldrb	A_hw, [srcend, -1]	/* Last byte.  */
	ldrb	B_lw, [src, tmp1]	/* Middle byte.  */
	strb	A_lw, [dstin]
	strb	B_lw, [dstin, tmp1]
	strb	A_hw, [dstend, -1]
2:	ret

/* Copy 65..96 bytes.  Copy 64 bytes from the start and
   32 bytes from the end; the two ranges overlap when count < 96.
   (Exactly 64 bytes never gets here: the dispatcher tests bit 6 of
   count - 1.)
   A_l/A_h must already hold the first 16 source bytes; the branch to
   this label is taken right after that load.  */
	.p2align 4
L(copy96):
	ldp	B_l, B_h, [src, 16]
	ldp	C_l, C_h, [src, 32]
	ldp	D_l, D_h, [src, 48]
	/* E and F reuse src/count and srcend/dst, so these two loads
	   destroy those registers.  They come last among the loads, and
	   the stores below address only dstin and dstend.  */
	ldp	E_l, E_h, [srcend, -32]
	ldp	F_l, F_h, [srcend, -16]
	stp	A_l, A_h, [dstin]
	stp	B_l, B_h, [dstin, 16]
	stp	C_l, C_h, [dstin, 32]
	stp	D_l, D_h, [dstin, 48]
	stp	E_l, E_h, [dstend, -32]
	stp	F_l, F_h, [dstend, -16]
	ret

/* Align DST to 16 byte alignment so that we don't cross cache line
   boundaries on both loads and stores.  There are more than 96 bytes
   to copy, so copy 16 bytes unaligned and then align.  The loop
   copies 64 bytes per iteration and loads the data for the next
   iteration while storing the current one.  */

	.p2align 4
L(copy_long):
	and	tmp1, dstin, 15		/* Misalignment of dstin: 0..15.  */
	bic	dst, dstin, 15		/* dst = dstin rounded down to 16.  */
	ldp	D_l, D_h, [src]		/* First 16 bytes, stored unaligned below.  */
	sub	src, src, tmp1		/* Move src back by the same amount as dst.  */
	add	count, count, tmp1	/* Count is now 16 too large.  */
	ldp	A_l, A_h, [src, 16]
	stp	D_l, D_h, [dstin]
	ldp	B_l, B_h, [src, 32]	/* Overwrites tmp1 (same register as B_h).  */
	ldp	C_l, C_h, [src, 48]
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src += 64.  */
	/* A..D now hold the 64 bytes destined for dst + 16 .. dst + 79.
	   Subtract the 16 excess bytes plus 128: the 64 bytes held in
	   registers and the final 64 bytes that the tail always writes.  */
	subs	count, count, 128 + 16	/* Test and readjust count.  */
	b.ls	2f			/* Nothing left for the loop.  */
1:
	/* Store the 64 bytes loaded previously while loading the next 64.  */
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [src, 16]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [src, 32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [src, 48]
	stp	D_l, D_h, [dst, 64]!	/* Pre-index: dst += 64.  */
	ldp	D_l, D_h, [src, 64]!	/* Pre-index: src += 64.  */
	subs	count, count, 64
	b.hi	1b

	/* Write the last full set of 64 bytes.  The remainder is at most 64
	   bytes, so it is safe to always copy 64 bytes from the end even if
	   there is just 1 byte left.  */
2:
	/* E reuses src/count; neither is needed any more.  Each end-relative
	   load refills a pair only after its previous contents are stored.  */
	ldp	E_l, E_h, [srcend, -64]
	stp	A_l, A_h, [dst, 16]
	ldp	A_l, A_h, [srcend, -48]
	stp	B_l, B_h, [dst, 32]
	ldp	B_l, B_h, [srcend, -32]
	stp	C_l, C_h, [dst, 48]
	ldp	C_l, C_h, [srcend, -16]
	stp	D_l, D_h, [dst, 64]
	stp	E_l, E_h, [dstend, -64]
	stp	A_l, A_h, [dstend, -48]
	stp	B_l, B_h, [dstend, -32]
	stp	C_l, C_h, [dstend, -16]
	ret
Christopher Ferris8cf61da2014-09-24 17:05:20 -0700217 ret