/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

/* By default we assume that the DC instruction can be used to zero
   data blocks more efficiently.  In some circumstances this might be
   unsafe, for example in an asymmetric multiprocessor environment with
   different DC clear lengths (neither the upper nor lower lengths are
   safe to use).

   If code may be run in a virtualized environment, then define
   MAYBE_VIRT.  This will cause the code to cache the system register
   values rather than re-reading them each call.  */
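
/* With MAYBE_VIRT defined, the probe result is cached in the word at
   .Lcache_clear (defined at the end of this file): zero means the
   system register has not been probed yet, a negative value means
   DC ZVA is unavailable, and any other value is the ZVA block length
   in bytes.  */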

#define dstin		x0
#ifdef BZERO
#define count		x1
#else
#define count		x2
#endif
#define val		w1
#define tmp1		x3
#define tmp1w		w3
#define tmp2		x4
#define tmp2w		w4
#define zva_len_x	x5
#define zva_len		w5
#define zva_bits_x	x6

#define A_l		x7
#define A_lw		w7
#define dst		x8
#define tmp3w		w9

#ifdef BZERO
ENTRY(bzero)
#else
ENTRY(memset)
#endif

	mov	dst, dstin		/* Preserve return value.  */
#ifdef BZERO
	b	.Lzero_mem
#endif
	ands	A_lw, val, #255
	b.eq	.Lzero_mem
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
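	/* The first two orrs replicate the fill byte across the low 32 bits
	   (writes to the w register zero the upper half), and the final orr
	   copies that word into the upper half, so A_l now holds the byte in
	   all eight lanes, e.g. 0x2a becomes 0x2a2a2a2a2a2a2a2a.  */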
.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail_maybe_tiny:
	cmp	count, #15
	b.le	.Ltail15tiny
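	/* Store the last up-to-63 bytes: bits 5:4 of count (0, 16, 32 or 48)
	   pick the entry point into the three stp instructions below, and
	   .Ltail15 then covers the final 0-15 bytes by repeating part of the
	   last 16-byte store.  */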
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	.Ltail15
	add	dst, dst, tmp1
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst, #-48]
1:
	stp	A_l, A_l, [dst, #-32]
2:
	stp	A_l, A_l, [dst, #-16]

.Ltail15:
	and	count, count, #15
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store.  */
	ret

.Ltail15tiny:
	/* Set up to 15 bytes.  Does not assume earlier memory
	   being set.  */
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 1f
	str	A_lw, [dst], #4
1:
	tbz	count, #1, 1f
	strh	A_lw, [dst], #2
1:
	tbz	count, #0, 1f
	strb	A_lw, [dst]
1:
	ret

	/* Critical loop.  Start at a new cache line boundary.  Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.  */
	.p2align 6
.Lnot_short:
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	2f
	/* Bring DST to 128-bit (16-byte) alignment.  We know that there's
	 * more than that to set, so we simply store 16 bytes and advance by
	 * the amount required to reach alignment.  */
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	/* There may be less than 63 bytes to go now.  */
	cmp	count, #63
	b.le	.Ltail63
2:
	sub	dst, dst, #16		/* Pre-bias.  */
	sub	count, count, #64
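	/* dst is biased down by 16 and count down by 64 so the loop below can
	   store 64 bytes per iteration using immediate offsets 16..64, with
	   write-back on the last stp advancing dst, and keep iterating while
	   the biased count stays non-negative.  */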
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
	ret

	/* For zeroing memory, check to see if we can use the ZVA feature to
	 * zero entire 'cache' lines.  */
.Lzero_mem:
	mov	A_l, #0
	cmp	count, #63
	b.le	.Ltail_maybe_tiny
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	1f
	sub	count, count, tmp2
	stp	A_l, A_l, [dst]
	add	dst, dst, tmp2
	cmp	count, #63
	b.le	.Ltail63
1:
	/* For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.  */
	cmp	count, #128
	b.lt	.Lnot_short
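	/* DCZID_EL0 bit 4 (DZP), when set, means DC ZVA must not be used;
	   bits 3:0 hold log2 of the block size in words, so the length in
	   bytes is 4 << DCZID_EL0[3:0] (64 bytes for the common value 4).  */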
#ifdef MAYBE_VIRT
	/* For efficiency when virtualized, we cache the ZVA capability.  */
	adrp	tmp2, .Lcache_clear
	ldr	zva_len, [tmp2, #:lo12:.Lcache_clear]
	tbnz	zva_len, #31, .Lnot_short
	cbnz	zva_len, .Lzero_by_line
	mrs	tmp1, dczid_el0
	tbz	tmp1, #4, 1f
	/* ZVA not available.  Remember this for next time.  */
	mov	zva_len, #~0
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
	b	.Lnot_short
1:
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
	str	zva_len, [tmp2, #:lo12:.Lcache_clear]
#else
	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved.  */
	lsl	zva_len, tmp3w, zva_len
#endif

.Lzero_by_line:
	/* Compute how far we need to go to become suitably aligned.  We're
	 * already at quad-word alignment.  */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment.  */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	1f			/* Already aligned.  */
	/* Not aligned, check that there's enough to copy after alignment.  */
	sub	tmp1, count, tmp2
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
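	/* The ccmp above compares tmp1 against the ZVA length only when
	   tmp1 >= 64; otherwise it forces NZCV to 0b1000 (N set) so the b.lt
	   is taken.  We fall through only when at least 64 bytes and at least
	   one full ZVA block remain after alignment.  */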
	/* We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.  */
	mov	count, tmp1
2:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	2b
	/* We've overrun a bit, so adjust dst downwards.  */
	add	dst, dst, tmp2
1:
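	/* Main DC ZVA loop: count is pre-decremented by one block so the
	   subs/b.ge pair zeroes whole blocks while any remain.  Afterwards
	   count is negative, but its low bits still hold the residue, which
	   the ands below recovers before handing off to the tail code.  */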
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
#ifdef BZERO
END(bzero)
#else
END(memset)
#endif

#ifdef MAYBE_VIRT
	.bss
	.p2align 2
.Lcache_clear:
	.space 4
#endif