/* Copyright (c) 2012, Linaro Limited
   All rights reserved.
   Copyright (c) 2014, NVIDIA Corporation.  All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * denver, ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).  The feature can be disabled by defining DONT_USE_DC.

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
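
/* A note on the DCZID_EL0 probing done below: the BS field (bits [3:0])
   encodes log2 of the DC ZVA block size in 4-byte words, and the DZP bit
   (bit 4) reads as 1 when DC ZVA is prohibited.  The block size in bytes is
   therefore computed as 4 << BS; for the common case BS = 4 this gives
   4 << 4 = 64 bytes.  */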

#define dstin           x0
#define val             w1
#define count           x2
#define tmp1            x3
#define tmp1w           w3
#define tmp2            x4
#define tmp2w           w4
#define zva_len_x       x5
#define zva_len         w5
#define zva_bits_x      x6

#define A_l             x7
#define A_lw            w7
#define dst             x8
#define tmp3w           w9

#define QA_l            q0

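/* A_l / A_lw hold the fill byte replicated across a 64-bit register; QA_l is
   the 128-bit vector register (q0) that carries the same pattern for the
   wide store-pair loops below.  */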
ENTRY(memset)

        mov     dst, dstin              /* Preserve return value.  */
        ands    A_lw, val, #255
#ifndef DONT_USE_DC
        b.eq    .Lzero_mem
#endif
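        /* Replicate the fill byte across all eight byte lanes of A_l,
           e.g. 0x2a becomes 0x2a2a2a2a2a2a2a2a.  */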
        orr     A_lw, A_lw, A_lw, lsl #8
        orr     A_lw, A_lw, A_lw, lsl #16
        orr     A_l, A_l, A_l, lsl #32
.Ltail_maybe_long:
        cmp     count, #256
        b.ge    .Lnot_short
.Ltail_maybe_tiny:
        cmp     count, #15
        b.le    .Ltail15tiny
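        /* At most 255 bytes remain (only the low eight bits of count are
           meaningful here).  Bits [7:6] select how many 64-byte blocks to
           store: 0xc0 falls through all three pairs of stores below, 0x80
           enters at 1:, and 0x40 enters at 2:.  */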
.Ltail255:
        ands    tmp1, count, #0xC0
        b.eq    .Ltail63
        dup     v0.4s, A_lw
        cmp     tmp1w, #0x80
        b.eq    1f
        b.lt    2f
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
1:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
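        /* Tail of up to 63 bytes: bits [5:4] of count select how many
           16-byte stores are needed, again by dispatching into a
           fall-through store chain.  dst is advanced up front and the
           stores count back from the new position with negative offsets.  */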
.Ltail63:
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        stp     A_l, A_l, [dst, #-48]
1:
        stp     A_l, A_l, [dst, #-32]
2:
        stp     A_l, A_l, [dst, #-16]

.Ltail15:
        and     count, count, #15
        add     dst, dst, count
        stp     A_l, A_l, [dst, #-16]   /* Repeat some/all of last store.  */
        ret

.Ltail15tiny:
        /* Set up to 15 bytes.  Does not assume earlier memory
           being set.  */
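        /* One conditional store per set bit of count: 8, 4, 2, then 1
           bytes.  */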
        tbz     count, #3, 1f
        str     A_l, [dst], #8
1:
        tbz     count, #2, 1f
        str     A_lw, [dst], #4
1:
        tbz     count, #1, 1f
        strh    A_lw, [dst], #2
1:
        tbz     count, #0, 1f
        strb    A_lw, [dst]
1:
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line, this ensures the entire loop is in one line.  */
        .p2align 6
.Lnot_short:
        dup     v0.4s, A_lw
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    2f
        /* Bring DST to 128-bit (16-byte) alignment.  We know that there's
         * more than that to set, so we simply store 16 bytes and advance by
         * the amount required to reach alignment.  */
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        /* There may be fewer than 256 bytes to go now.  */
        cmp     count, #255
        b.le    .Ltail255
2:
        cmp     count, #2097152
        b.gt    3f
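        /* 2097152 = 2 MiB.  Fills larger than that take the non-temporal
           store path at 3: below; the threshold is presumably chosen so that
           very large fills do not displace useful cache contents.  */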
1:
        sub     count, count, #256
2:
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        stp     QA_l, QA_l, [dst], #32
        subs    count, count, #256
        b.ge    2b
        tst     count, #0xff
        b.ne    .Ltail255
        ret
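        /* Fills above the 2 MiB threshold: stream 64 bytes per iteration
           with non-temporal stnp stores, then let .Ltail63 mop up whatever
           remains in the low six bits of count.  */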
3:
        sub     count, count, #64
4:
        subs    count, count, #64
        stnp    QA_l, QA_l, [dst]
        stnp    QA_l, QA_l, [dst, #32]
        add     dst, dst, #64
        b.ge    4b
        tst     count, #0x3f
        b.ne    .Ltail63
        ret

#ifndef DONT_USE_DC
        /* For zeroing memory, check to see if we can use the ZVA feature to
         * zero entire 'cache' lines.  */
.Lzero_mem:
        mov     A_l, #0
        cmp     count, #63
        b.le    .Ltail_maybe_tiny
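        /* Align dst to 16 bytes first; the unconditional 16-byte store may
           write bytes that later stores write again, which is harmless for
           memset.  */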
        neg     tmp2, dst
        ands    tmp2, tmp2, #15
        b.eq    1f
        sub     count, count, tmp2
        stp     A_l, A_l, [dst]
        add     dst, dst, tmp2
        cmp     count, #63
        b.le    .Ltail63
1:
        /* For zeroing small amounts of memory, it's not worth setting up
         * the line-clear code.  */
        cmp     count, #128
        b.lt    .Lnot_short
#ifdef MAYBE_VIRT
        /* For efficiency when virtualized, we cache the ZVA capability.  */
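        /* .Lcache_clear holds 0 until the first probe, ~0 (bit 31 set) if
           DC ZVA turned out to be prohibited, and otherwise the ZVA block
           length in bytes.  */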
        adrp    tmp2, .Lcache_clear
        ldr     zva_len, [tmp2, #:lo12:.Lcache_clear]
        tbnz    zva_len, #31, .Lnot_short
        cbnz    zva_len, .Lzero_by_line
        mrs     tmp1, dczid_el0
        tbz     tmp1, #4, 1f
        /* ZVA not available.  Remember this for next time.  */
        mov     zva_len, #~0
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
        b       .Lnot_short
1:
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
        str     zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
        mrs     tmp1, dczid_el0
        tbnz    tmp1, #4, .Lnot_short
        mov     tmp3w, #4
        and     zva_len, tmp1w, #15     /* Safety: other bits reserved.  */
        lsl     zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
        /* Compute how far we need to go to become suitably aligned.  We're
         * already at quad-word alignment.  */
        cmp     count, zva_len_x
        b.lt    .Lnot_short             /* Not enough to reach alignment.  */
        sub     zva_bits_x, zva_len_x, #1
        neg     tmp2, dst
        ands    tmp2, tmp2, zva_bits_x
        b.eq    1f                      /* Already aligned.  */
        /* Not aligned, check that there's enough to copy after alignment.  */
        sub     tmp1, count, tmp2
        cmp     tmp1, #64
        ccmp    tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
        b.lt    .Lnot_short
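        /* The ccmp above compares tmp1 with zva_len_x only when tmp1 >= 64;
           otherwise it forces NZCV to 0b1000 (N set), so the b.lt bails out
           to .Lnot_short in either failing case.  */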
        /* We know that there's at least 64 bytes to zero and that it's safe
         * to overrun by 64 bytes.  */
        mov     count, tmp1
2:
        stp     A_l, A_l, [dst]
        stp     A_l, A_l, [dst, #16]
        stp     A_l, A_l, [dst, #32]
        subs    tmp2, tmp2, #64
        stp     A_l, A_l, [dst, #48]
        add     dst, dst, #64
        b.ge    2b
        /* We've overrun a bit, so adjust dst downwards.  */
        add     dst, dst, tmp2
1:
        sub     count, count, zva_len_x
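        /* The preceding sub pre-biases count by one block, so the loop below
           zeroes whole ZVA blocks while count stays non-negative; the
           remainder (count modulo the block size) is left to the tail
           code.  */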
3:
        dc      zva, dst
        add     dst, dst, zva_len_x
        subs    count, count, zva_len_x
        b.ge    3b
        ands    count, count, zva_bits_x
        b.ne    .Ltail_maybe_long
        ret
END(memset)

#ifdef MAYBE_VIRT
        .bss
        .p2align 2
.Lcache_clear:
        .space 4
#endif
#endif /* DONT_USE_DC */