/* Copyright (c) 2013-2015, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */

/* Assumptions:
 *
 * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
 */

#include <private/bionic_asm.h>

/* To test the page crossing code path more thoroughly, compile with
   -DTEST_PAGE_CROSS - this will force all calls through the slower
   entry path. This option is not intended for production use. */

/* Arguments and results. */
#define srcin		x0
#define len		x0

/* Locals and temporaries. */
#define src		x1
#define data1		x2
#define data2		x3
#define has_nul1	x4
#define has_nul2	x5
#define tmp1		x4
#define tmp2		x5
#define tmp3		x6
#define tmp4		x7
#define zeroones	x8

#define L(l) .L ## l

/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
   can be done in parallel across the entire word. A faster check
   (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
   false hits for characters 129..255. */
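
/* A worked per-byte example of both checks (the real code applies them to
   all 8 bytes of a word at once via REP8_01/REP8_7f/REP8_80 below):
     X = 0x00: (0x00 - 1) & ~0x00 & 0x80 = 0xff & 0xff & 0x80 = 0x80  -> NUL
     X = 0x41: (0x41 - 1) & ~0x41 & 0x80 = 0x40 & 0xbe & 0x80 = 0x00  -> not NUL
     X = 0x80: (0x80 - 1) & ~0x80 & 0x80 = 0x7f & 0x7f & 0x80 = 0x00  -> not NUL
   The faster check also flags X = 0x81..0xff, since (X - 1) & 0x80 is then
   non-zero, which is why non-ASCII data forces the accurate check. */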

#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 15
#else
# define MIN_PAGE_SIZE 4096
#endif

/* Since strings are short on average, we check the first 16 bytes
   of the string for a NUL character. In order to do an unaligned ldp
   safely we have to do a page cross check first. If there is a NUL
   byte we calculate the length from the 2 8-byte words using
   conditional select to reduce branch mispredictions (it is unlikely
   strlen will be repeatedly called on strings with the same length).

   If the string is longer than 16 bytes, we align src so we don't need
   further page cross checks, and process 32 bytes per iteration
   using the fast NUL check. If we encounter non-ASCII characters,
   we fall back to a second loop using the full NUL check.

   If the page cross check fails, we read 16 bytes from an aligned
   address, remove any characters before the string, and continue
   in the main loop using aligned loads. Since strings crossing a
   page in the first 16 bytes are rare (probability of
   16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.

   AArch64 systems have a minimum page size of 4k. We don't bother
   checking for larger page sizes - the cost of setting up the correct
   page size is just not worth the extra gain from a small reduction in
   the cases taking the slow path. Note that we only care about
   whether the first fetch, which may be misaligned, crosses a page
   boundary. */
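
/* For reference, a simplified C model of the scan (an illustrative sketch
   only, not this routine: it assumes an 8-byte-aligned start on a
   little-endian machine, needs <stdint.h> and <string.h>, and omits the
   unaligned first-16-bytes path, the fast check and the page-cross handling):

     static size_t strlen_model(const char *s) {
         const char *p = s;                   // assumed 8-byte aligned
         for (;;) {
             uint64_t x;
             memcpy(&x, p, 8);                // one aligned 8-byte load
             uint64_t has_nul = (x - REP8_01) & ~x & REP8_80;
             if (has_nul)                     // lowest 0x80 bit marks the first NUL
                 return (p - s) + (__builtin_ctzll(has_nul) >> 3);
             p += 8;
         }
     }  */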

ENTRY(strlen)
	and	tmp1, srcin, MIN_PAGE_SIZE - 1
	mov	zeroones, REP8_01
	cmp	tmp1, MIN_PAGE_SIZE - 16
	b.gt	L(page_cross)
	ldp	data1, data2, [srcin]
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly.
	   Since we expect strings to be small and to exit early,
	   byte-swap the data now so has_nul1/2 will be correct. */
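	/* Worked example (16-bit analogue for brevity; the code uses 64-bit
	   words): the string "\x01" loaded big-endian gives X = 0x0100, and
	   (X - 0x0101) & ~X & 0x8080 = 0xffff & 0xfeff & 0x8080 = 0x8080.
	   The borrow out of the zero byte also flags the preceding 0x01
	   byte, so a leading-zero count would report length 0 instead of 1. */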
	rev	data1, data1
	rev	data2, data2
#endif
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(main_loop_entry)

	/* Enter with C = has_nul1 == 0. */
	csel	has_nul1, has_nul1, has_nul2, cc
	mov	len, 8
	rev	has_nul1, has_nul1
	clz	tmp1, has_nul1
	csel	len, xzr, len, cc
	add	len, len, tmp1, lsr 3
	ret

	/* The inner loop processes 32 bytes per iteration and uses the fast
	   NUL check. If we encounter non-ASCII characters, use a second
	   loop with the accurate NUL check. */
	.p2align 4
L(main_loop_entry):
	bic	src, srcin, 15
	sub	src, src, 16
L(main_loop):
	ldp	data1, data2, [src, 32]!
.Lpage_cross_entry:
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	bne	1f
	ldp	data1, data2, [src, 16]
	sub	tmp1, data1, zeroones
	sub	tmp3, data2, zeroones
	orr	tmp2, tmp1, tmp3
	tst	tmp2, zeroones, lsl 7
	beq	L(main_loop)
	add	src, src, 16
1:
	/* The fast check failed, so do the slower, accurate NUL check. */
	orr	tmp2, data1, REP8_7f
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)

	/* Enter with C = has_nul1 == 0. */
L(tail):
#ifdef __AARCH64EB__
	/* For big-endian, carry propagation (if the final byte in the
	   string is 0x01) means we cannot use has_nul1/2 directly. The
	   easiest way to get the correct byte is to byte-swap the data
	   and calculate the syndrome a second time. */
	csel	data1, data1, data2, cc
	rev	data1, data1
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	bic	has_nul1, tmp1, tmp2
#else
	csel	has_nul1, has_nul1, has_nul2, cc
#endif
	sub	len, src, srcin
	rev	has_nul1, has_nul1
	add	tmp2, len, 8
	clz	tmp1, has_nul1
	csel	len, len, tmp2, cc
	add	len, len, tmp1, lsr 3
	ret

L(nonascii_loop):
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	bne	L(tail)
	ldp	data1, data2, [src, 16]!
	sub	tmp1, data1, zeroones
	orr	tmp2, data1, REP8_7f
	sub	tmp3, data2, zeroones
	orr	tmp4, data2, REP8_7f
	bics	has_nul1, tmp1, tmp2
	bic	has_nul2, tmp3, tmp4
	ccmp	has_nul2, 0, 0, eq
	beq	L(nonascii_loop)
	b	L(tail)

	/* Load 16 bytes from [srcin & ~15] and force the bytes that precede
	   srcin to 0x7f, so we ignore any NUL bytes before the string.
	   Then continue in the aligned loop. */
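	/* For example, with srcin & 15 == 3 on the little-endian path, tmp1
	   becomes (-1 << 24) | REP8_80, so data1 is ORed with
	   ~tmp1 == 0x00000000007f7f7f: the three bytes before the string then
	   read as 0x7f or 0xff, which the accurate NUL check ignores.
	   (Illustrative values only.) */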
L(page_cross):
	bic	src, srcin, 15
	ldp	data1, data2, [src]
	lsl	tmp1, srcin, 3
	mov	tmp4, -1
#ifdef __AARCH64EB__
	/* Big-endian. Early bytes are at MSB. */
	lsr	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63). */
#else
	/* Little-endian. Early bytes are at LSB. */
	lsl	tmp1, tmp4, tmp1	/* Shift (tmp1 & 63). */
#endif
	orr	tmp1, tmp1, REP8_80
	orn	data1, data1, tmp1
	orn	tmp2, data2, tmp1
	tst	srcin, 8
	csel	data1, data1, tmp4, eq
	csel	data2, data2, tmp2, eq
	b	L(page_cross_entry)

END(strlen)