blob: 3bd98093555e66cd5e61942c353e8fa8179aae54 [file] [log] [blame]
Bernhard Rosenkraenzer7e4fa562014-03-05 11:40:57 +01001/* Copyright (c) 2014, Linaro Limited
2 All rights reserved.
3
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
6 * Redistributions of source code must retain the above copyright
7 notice, this list of conditions and the following disclaimer.
8 * Redistributions in binary form must reproduce the above copyright
9 notice, this list of conditions and the following disclaimer in the
10 documentation and/or other materials provided with the distribution.
11 * Neither the name of the Linaro nor the
12 names of its contributors may be used to endorse or promote products
13 derived from this software without specific prior written permission.
14
15 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
19 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
20 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
21 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27
28/* Assumptions:
29 *
30 * ARMv8-a, AArch64
31 */
32
33#include <private/bionic_asm.h>
34
35/* Arguments and results. */
/* Both names alias x0: the result is built in the argument register, so
   srcin can no longer be read once len has been written. */
36#define srcin x0 /* In: address of the first byte of the string. */
37#define len x0 /* Out: byte count handed back to the caller. */
38
39/* Locals and temporaries. */
40#define src x1 /* Read cursor; always a multiple of 16. */
41#define data1 x2 /* First 8-byte word of the current 16-byte chunk. */
42#define data2 x3 /* Second 8-byte word of the current 16-byte chunk. */
43#define data2a x4 /* data2 with the bytes before srcin forced non-zero. */
44#define has_nul1 x5 /* Syndrome for data1: non-zero iff data1 holds a zero byte. */
45#define has_nul2 x6 /* Syndrome for data2; also the input to the final clz. */
46#define tmp1 x7 /* Scratch (also the misalignment offset / shift count). */
47#define tmp2 x8 /* Scratch (also the misaligned-entry byte mask). */
48#define tmp3 x9 /* Scratch. */
49#define tmp4 x10 /* Scratch. */
50#define zeroones x11 /* Holds REP8_01 for the whole function. */
51#define pos x12 /* Leading-zero bit count of the byte-reversed syndrome. */
52
/* One byte value replicated across all eight lanes of a 64-bit word. */
53#define REP8_01 0x0101010101010101
54#define REP8_7f 0x7f7f7f7f7f7f7f7f
55#define REP8_80 0x8080808080808080 /* Not referenced by the strlen code below. */
56
57 /* Start of critical section -- keep to one 64Byte cache line. */
/*
 * strlen -- count the bytes that precede the first zero byte of a string.
 *
 * In:   x0 (srcin) = address of the first byte of the string.
 * Out:  x0 (len)   = number of non-zero bytes before the first zero byte.
 * Uses: x1-x12 as scratch through the register aliases defined earlier in
 *       this file, plus the NZCV flags.  Nothing is stored to memory and
 *       the stack pointer is not touched.
 *
 * Memory is only ever read 16 bytes at a time from addresses that are
 * multiples of 16: the first load starts at srcin rounded down, so bytes
 * lying before srcin, and bytes after the terminator inside the same
 * 16-byte chunk, are loaded as well.  Bytes before srcin are forced to a
 * non-zero value in .Lmisaligned so they cannot be mistaken for the
 * terminator.
 */
58ENTRY(strlen)
59 mov zeroones, #REP8_01 /* 0x01 in every byte lane: subtrahend of the NUL test. */
60 bic src, srcin, #15 /* src = srcin rounded down to a multiple of 16. */
61 ands tmp1, srcin, #15 /* tmp1 = srcin's offset in that chunk; Z set when aligned. */
62 b.ne .Lmisaligned /* Offset 1..15: mask the bytes before srcin first. */
63 /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
64 (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
65 can be done in parallel across the entire word. */
66 /* The inner loop deals with two Dwords at a time. This has a
67 slightly higher start-up cost, but we should win quite quickly,
68 especially on cores with a high number of issue slots per
69 cycle, as we get much better parallelism out of the operations. */
70.Lloop:
71 ldp data1, data2, [src], #16 /* Load 16 bytes, then advance src by 16. */
72.Lrealigned:
 /* .Lmisaligned joins here with data1/data2 already loaded and masked
    and with src already advanced past the first chunk. */
73 sub tmp1, data1, zeroones
74 orr tmp2, data1, #REP8_7f
75 sub tmp3, data2, zeroones
76 orr tmp4, data2, #REP8_7f
77 bic has_nul1, tmp1, tmp2 /* (data1 - 0x01..) & ~(data1 | 0x7f..) */
78 bics has_nul2, tmp3, tmp4 /* Same for data2; Z set iff has_nul2 == 0. */
79 ccmp has_nul1, #0, #0, eq /* has_nul2 == 0: compare has_nul1 with 0; otherwise force NZCV = 0000 (Z clear). */
80 b.eq .Lloop /* Z is set only when both syndromes are zero: keep scanning. */
81 /* End of critical section -- keep to one 64Byte cache line. */
82
 /* A zero byte lies in the 16 bytes just tested; src points just past them. */
83 sub len, src, srcin /* len = (end of this chunk) - srcin.  Overwrites srcin (both are x0). */
84 cbz has_nul1, .Lnul_in_data2 /* data1 is clean, so the zero byte is in data2. */
85#ifdef __AARCH64EB__
86 mov data2, data1 /* The big-endian recomputation below operates on data2. */
87#endif
88 sub len, len, #8 /* Zero byte is in data1: step back over data2's 8 bytes. */
89 mov has_nul2, has_nul1 /* The shared tail reads the syndrome from has_nul2. */
90.Lnul_in_data2:
91#ifdef __AARCH64EB__
92 /* For big-endian, carry propagation (if the final byte in the
93 string is 0x01) means we cannot use has_nul directly. The
94 easiest way to get the correct byte is to byte-swap the data
95 and calculate the syndrome a second time. */
96 rev data2, data2
97 sub tmp1, data2, zeroones
98 orr tmp2, data2, #REP8_7f
99 bic has_nul2, tmp1, tmp2
100#endif
101 sub len, len, #8 /* len = (start of the word holding the zero byte) - srcin. */
102 rev has_nul2, has_nul2 /* Move the earliest string byte's lane to the most significant end. */
103 clz pos, has_nul2 /* pos = 8 * (index of the first flagged byte in the word). */
104 add len, len, pos, lsr #3 /* Bits to bytes. */
105 ret
106
 /* Entry for srcin not 16-byte aligned; tmp1 = srcin & 15 (1..15).
    Loads the aligned chunk containing srcin and forces every byte that
    lies before srcin to a non-zero value. */
107.Lmisaligned:
108 cmp tmp1, #8 /* le <=> offset <= 8.  Consumed by csinv/csel below; nothing in between writes the flags. */
109 neg tmp1, tmp1
110 ldp data1, data2, [src], #16 /* First (aligned) chunk; src now points at the next one. */
111 lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
112 mov tmp2, #~0 /* All-ones, shifted below into a mask for the bytes before srcin. */
113#ifdef __AARCH64EB__
114 /* Big-endian. Early bytes are at MSB. */
115 lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
116#else
117 /* Little-endian. Early bytes are at LSB. */
118 lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
119#endif
 /* tmp2 now covers the (offset mod 8) earliest bytes of a word, except
    that offset == 8 gives a shift of 0 and leaves tmp2 all-ones. */
120 orr data1, data1, tmp2 /* offset <= 8: bytes of data1 before srcin become non-zero. */
121 orr data2a, data2, tmp2 /* offset > 8: bytes of data2 before srcin become non-zero. */
122 csinv data1, data1, xzr, le /* offset > 8: all of data1 precedes srcin, so make it all-ones. */
123 csel data2, data2, data2a, le /* offset <= 8: data2 is entirely string, keep it as loaded. */
124 b .Lrealigned
125
126END(strlen)