blob: 6e540fc330389c765d107f7e4a33f4b224c9d353 [file] [log] [blame]
Jake Weinstein372f19e2016-11-17 16:01:25 -05001/* Copyright (c) 2013-2015, Linaro Limited
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +01002 All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
Jake Weinstein372f19e2016-11-17 16:01:25 -05007 notice, this list of conditions and the following disclaimer.
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +01008 * Redistributions in binary form must reproduce the above copyright
Jake Weinstein372f19e2016-11-17 16:01:25 -05009 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010011 * Neither the name of the Linaro nor the
Jake Weinstein372f19e2016-11-17 16:01:25 -050012 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010014
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
Jake Weinstein372f19e2016-11-17 16:01:25 -050025 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010026
27/* Assumptions:
28 *
Jake Weinstein372f19e2016-11-17 16:01:25 -050029 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010030 */
31
32#include <private/bionic_asm.h>
33
Jake Weinstein372f19e2016-11-17 16:01:25 -050034/* To test the page crossing code path more thoroughly, compile with
35 -DTEST_PAGE_CROSS - this will force all calls through the slower
36 entry path. This option is not intended for production use. */
37
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010038/* Arguments and results. */
39#define srcin x0
40#define len x0
41
42/* Locals and temporaries. */
43#define src x1
44#define data1 x2
45#define data2 x3
Jake Weinstein372f19e2016-11-17 16:01:25 -050046#define has_nul1 x4
47#define has_nul2 x5
48#define tmp1 x4
49#define tmp2 x5
50#define tmp3 x6
51#define tmp4 x7
52#define zeroones x8
53
54#define L(l) .L ## l
55
56 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
57 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
58 can be done in parallel across the entire word. A faster check
59 (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
60 false hits for characters 129..255. */
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010061
62#define REP8_01 0x0101010101010101
63#define REP8_7f 0x7f7f7f7f7f7f7f7f
64#define REP8_80 0x8080808080808080
65
Jake Weinstein372f19e2016-11-17 16:01:25 -050066#ifdef TEST_PAGE_CROSS
67# define MIN_PAGE_SIZE 15
68#else
69# define MIN_PAGE_SIZE 4096
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +010070#endif
Jake Weinstein372f19e2016-11-17 16:01:25 -050071
72 /* Since strings are short on average, we check the first 16 bytes
73 of the string for a NUL character. In order to do an unaligned ldp
74 safely we have to do a page cross check first. If there is a NUL
75 byte we calculate the length from the 2 8-byte words using
76 conditional select to reduce branch mispredictions (it is unlikely
77 strlen will be repeatedly called on strings with the same length).
78
79 If the string is longer than 16 bytes, we align src so don't need
80 further page cross checks, and process 32 bytes per iteration
81 using the fast NUL check. If we encounter non-ASCII characters,
82 fallback to a second loop using the full NUL check.
83
84 If the page cross check fails, we read 16 bytes from an aligned
85 address, remove any characters before the string, and continue
86 in the main loop using aligned loads. Since strings crossing a
87 page in the first 16 bytes are rare (probability of
88 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
89
90 AArch64 systems have a minimum page size of 4k. We don't bother
91 checking for larger page sizes - the cost of setting up the correct
92 page size is just not worth the extra gain from a small reduction in
93 the cases taking the slow path. Note that we only care about
94 whether the first fetch, which may be misaligned, crosses a page
95 boundary. */
96
97ENTRY(strlen)
98 and tmp1, srcin, MIN_PAGE_SIZE - 1
99 mov zeroones, REP8_01
100 cmp tmp1, MIN_PAGE_SIZE - 16
101 b.gt L(page_cross)
102 ldp data1, data2, [srcin]
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100103#ifdef __AARCH64EB__
104 /* For big-endian, carry propagation (if the final byte in the
Jake Weinstein372f19e2016-11-17 16:01:25 -0500105 string is 0x01) means we cannot use has_nul1/2 directly.
106 Since we expect strings to be small and early-exit,
107 byte-swap the data now so has_null1/2 will be correct. */
108 rev data1, data1
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100109 rev data2, data2
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100110#endif
Jake Weinstein372f19e2016-11-17 16:01:25 -0500111 sub tmp1, data1, zeroones
112 orr tmp2, data1, REP8_7f
113 sub tmp3, data2, zeroones
114 orr tmp4, data2, REP8_7f
115 bics has_nul1, tmp1, tmp2
116 bic has_nul2, tmp3, tmp4
117 ccmp has_nul2, 0, 0, eq
118 beq L(main_loop_entry)
119
120 /* Enter with C = has_nul1 == 0. */
121 csel has_nul1, has_nul1, has_nul2, cc
122 mov len, 8
123 rev has_nul1, has_nul1
124 clz tmp1, has_nul1
125 csel len, xzr, len, cc
126 add len, len, tmp1, lsr 3
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100127 ret
128
Jake Weinstein372f19e2016-11-17 16:01:25 -0500129 /* The inner loop processes 32 bytes per iteration and uses the fast
130 NUL check. If we encounter non-ASCII characters, use a second
131 loop with the accurate NUL check. */
132 .p2align 4
133L(main_loop_entry):
134 bic src, srcin, 15
135 sub src, src, 16
136L(main_loop):
137 ldp data1, data2, [src, 32]!
138.Lpage_cross_entry:
139 sub tmp1, data1, zeroones
140 sub tmp3, data2, zeroones
141 orr tmp2, tmp1, tmp3
142 tst tmp2, zeroones, lsl 7
143 bne 1f
144 ldp data1, data2, [src, 16]
145 sub tmp1, data1, zeroones
146 sub tmp3, data2, zeroones
147 orr tmp2, tmp1, tmp3
148 tst tmp2, zeroones, lsl 7
149 beq L(main_loop)
150 add src, src, 16
1511:
152 /* The fast check failed, so do the slower, accurate NUL check. */
153 orr tmp2, data1, REP8_7f
154 orr tmp4, data2, REP8_7f
155 bics has_nul1, tmp1, tmp2
156 bic has_nul2, tmp3, tmp4
157 ccmp has_nul2, 0, 0, eq
158 beq L(nonascii_loop)
159
160 /* Enter with C = has_nul1 == 0. */
161L(tail):
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100162#ifdef __AARCH64EB__
Jake Weinstein372f19e2016-11-17 16:01:25 -0500163 /* For big-endian, carry propagation (if the final byte in the
164 string is 0x01) means we cannot use has_nul1/2 directly. The
165 easiest way to get the correct byte is to byte-swap the data
166 and calculate the syndrome a second time. */
167 csel data1, data1, data2, cc
168 rev data1, data1
169 sub tmp1, data1, zeroones
170 orr tmp2, data1, REP8_7f
171 bic has_nul1, tmp1, tmp2
172#else
173 csel has_nul1, has_nul1, has_nul2, cc
174#endif
175 sub len, src, srcin
176 rev has_nul1, has_nul1
177 add tmp2, len, 8
178 clz tmp1, has_nul1
179 csel len, len, tmp2, cc
180 add len, len, tmp1, lsr 3
181 ret
182
183L(nonascii_loop):
184 ldp data1, data2, [src, 16]!
185 sub tmp1, data1, zeroones
186 orr tmp2, data1, REP8_7f
187 sub tmp3, data2, zeroones
188 orr tmp4, data2, REP8_7f
189 bics has_nul1, tmp1, tmp2
190 bic has_nul2, tmp3, tmp4
191 ccmp has_nul2, 0, 0, eq
192 bne L(tail)
193 ldp data1, data2, [src, 16]!
194 sub tmp1, data1, zeroones
195 orr tmp2, data1, REP8_7f
196 sub tmp3, data2, zeroones
197 orr tmp4, data2, REP8_7f
198 bics has_nul1, tmp1, tmp2
199 bic has_nul2, tmp3, tmp4
200 ccmp has_nul2, 0, 0, eq
201 beq L(nonascii_loop)
202 b L(tail)
203
204 /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
205 srcin to 0x7f, so we ignore any NUL bytes before the string.
206 Then continue in the aligned loop. */
207L(page_cross):
208 bic src, srcin, 15
209 ldp data1, data2, [src]
210 lsl tmp1, srcin, 3
211 mov tmp4, -1
212#ifdef __AARCH64EB__
213 /* Big-endian. Early bytes are at MSB. */
214 lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100215#else
216 /* Little-endian. Early bytes are at LSB. */
Jake Weinstein372f19e2016-11-17 16:01:25 -0500217 lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100218#endif
Jake Weinstein372f19e2016-11-17 16:01:25 -0500219 orr tmp1, tmp1, REP8_80
220 orn data1, data1, tmp1
221 orn tmp2, data2, tmp1
222 tst srcin, 8
223 csel data1, data1, tmp4, eq
224 csel data2, data2, tmp2, eq
225 b L(page_cross_entry)
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +0100226
227END(strlen)