/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 * wchar_t is 4 bytes
 */

#include <private/bionic_asm.h>

/* Parameters and result. */
#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

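/* Register pairs A..D hold up to 64 bytes of in-flight data in the bulk copy loops. */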
#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

#if defined(WMEMMOVE)
ENTRY(wmemmove)
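    /* Scale the wchar_t count to a byte count (wchar_t is 4 bytes here). */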
    lsl     count, count, #2
#else
ENTRY(memmove)
#endif
    cmp     dstin, src
    b.lo    .Ldownwards
    add     tmp1, src, count
    cmp     dstin, tmp1
    b.hs    memcpy              /* No overlap. */

    /* Upwards move with potential overlap.
     * Need to move from the tail backwards. SRC and DST point one
     * byte beyond the remaining data to move. */
    add     dst, dstin, count
    add     src, src, count
    cmp     count, #64
    b.ge    .Lmov_not_short_up

    /* Deal with small moves quickly by dropping straight into the
     * exit block. */
.Ltail63up:
    /* Move up to 48 bytes of data. At this point we only need the
     * bottom 6 bits of count to be accurate. */
    ands    tmp1, count, #0x30
    b.eq    .Ltail15up
    sub     dst, dst, tmp1
    sub     src, src, tmp1
    cmp     tmp1w, #0x20
    b.eq    1f
    b.lt    2f
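    /* tmp1 is 48, 32 or 16: fall through to copy 48 bytes, or enter at
     * 1f for 32 bytes or 2f for 16. */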
    ldp     A_l, A_h, [src, #32]
    stp     A_l, A_h, [dst, #32]
1:
    ldp     A_l, A_h, [src, #16]
    stp     A_l, A_h, [dst, #16]
2:
    ldp     A_l, A_h, [src]
    stp     A_l, A_h, [dst]
.Ltail15up:
    /* Move up to 15 bytes of data. Does not assume additional data
     * being moved. */
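    /* Each set bit of count selects one final transfer: 8, 4, 2, then 1
     * byte(s), working backwards from the current position. */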
    tbz     count, #3, 1f
    ldr     tmp1, [src, #-8]!
    str     tmp1, [dst, #-8]!
1:
    tbz     count, #2, 1f
    ldr     tmp1w, [src, #-4]!
    str     tmp1w, [dst, #-4]!
1:
    tbz     count, #1, 1f
    ldrh    tmp1w, [src, #-2]!
    strh    tmp1w, [dst, #-2]!
1:
    tbz     count, #0, 1f
    ldrb    tmp1w, [src, #-1]
    strb    tmp1w, [dst, #-1]
1:
    ret

.Lmov_not_short_up:
    /* We don't much care about the alignment of DST, but we want SRC
     * to be 128-bit (16 byte) aligned so that we don't cross cache line
     * boundaries on both loads and stores. */
    ands    tmp2, src, #15      /* Bytes to reach alignment. */
    b.eq    2f
    sub     count, count, tmp2
    /* Move enough data to reach alignment; unlike memcpy, we have to
     * be aware of the overlap, which means we can't move data twice. */
    tbz     tmp2, #3, 1f
    ldr     tmp1, [src, #-8]!
    str     tmp1, [dst, #-8]!
1:
    tbz     tmp2, #2, 1f
    ldr     tmp1w, [src, #-4]!
    str     tmp1w, [dst, #-4]!
1:
    tbz     tmp2, #1, 1f
    ldrh    tmp1w, [src, #-2]!
    strh    tmp1w, [dst, #-2]!
1:
    tbz     tmp2, #0, 1f
    ldrb    tmp1w, [src, #-1]!
    strb    tmp1w, [dst, #-1]!
1:

    /* There may be less than 63 bytes to go now. */
    cmp     count, #63
    b.le    .Ltail63up
2:
    subs    count, count, #128
    b.ge    .Lmov_body_large_up
    /* Less than 128 bytes to move, so handle 64 here and then jump
     * to the tail. */
    ldp     A_l, A_h, [src, #-64]!
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]
    stp     A_l, A_h, [dst, #-64]!
    stp     B_l, B_h, [dst, #16]
    stp     C_l, C_h, [dst, #32]
    stp     D_l, D_h, [dst, #48]
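    /* Any remaining 1-63 bytes are handled by the tail code. */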
    tst     count, #0x3f
    b.ne    .Ltail63up
    ret

    /* Critical loop. Start at a new Icache line boundary. Assuming
     * 64 bytes per line this ensures the entire loop is in one line. */
    .p2align 6
.Lmov_body_large_up:
    /* There are at least 128 bytes to move. */
    ldp     A_l, A_h, [src, #-16]
    ldp     B_l, B_h, [src, #-32]
    ldp     C_l, C_h, [src, #-48]
    ldp     D_l, D_h, [src, #-64]!
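    /* Software-pipelined loop: each iteration stores the 64 bytes loaded
     * on the previous round while loading the next 64. */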
1:
    stp     A_l, A_h, [dst, #-16]
    ldp     A_l, A_h, [src, #-16]
    stp     B_l, B_h, [dst, #-32]
    ldp     B_l, B_h, [src, #-32]
    stp     C_l, C_h, [dst, #-48]
    ldp     C_l, C_h, [src, #-48]
    stp     D_l, D_h, [dst, #-64]!
    ldp     D_l, D_h, [src, #-64]!
    subs    count, count, #64
    b.ge    1b
    stp     A_l, A_h, [dst, #-16]
    stp     B_l, B_h, [dst, #-32]
    stp     C_l, C_h, [dst, #-48]
    stp     D_l, D_h, [dst, #-64]!
    tst     count, #0x3f
    b.ne    .Ltail63up
    ret


.Ldownwards:
    /* For a downwards move we can safely use memcpy provided that
     * DST is more than 16 bytes away from SRC. */
    sub     tmp1, src, #16
    cmp     dstin, tmp1
    b.ls    memcpy              /* May overlap, but not critically. */

    mov     dst, dstin          /* Preserve DSTIN for return value. */
    cmp     count, #64
    b.ge    .Lmov_not_short_down

    /* Deal with small moves quickly by dropping straight into the
     * exit block. */
.Ltail63down:
    /* Move up to 48 bytes of data. At this point we only need the
     * bottom 6 bits of count to be accurate. */
    ands    tmp1, count, #0x30
    b.eq    .Ltail15down
    add     dst, dst, tmp1
    add     src, src, tmp1
    cmp     tmp1w, #0x20
    b.eq    1f
    b.lt    2f
    ldp     A_l, A_h, [src, #-48]
    stp     A_l, A_h, [dst, #-48]
1:
    ldp     A_l, A_h, [src, #-32]
    stp     A_l, A_h, [dst, #-32]
2:
    ldp     A_l, A_h, [src, #-16]
    stp     A_l, A_h, [dst, #-16]
.Ltail15down:
    /* Move up to 15 bytes of data. Does not assume additional data
     * being moved. */
    tbz     count, #3, 1f
    ldr     tmp1, [src], #8
    str     tmp1, [dst], #8
1:
    tbz     count, #2, 1f
    ldr     tmp1w, [src], #4
    str     tmp1w, [dst], #4
1:
    tbz     count, #1, 1f
    ldrh    tmp1w, [src], #2
    strh    tmp1w, [dst], #2
1:
    tbz     count, #0, 1f
    ldrb    tmp1w, [src]
    strb    tmp1w, [dst]
1:
    ret

.Lmov_not_short_down:
    /* We don't much care about the alignment of DST, but we want SRC
     * to be 128-bit (16 byte) aligned so that we don't cross cache line
     * boundaries on both loads and stores. */
    neg     tmp2, src
    ands    tmp2, tmp2, #15     /* Bytes to reach alignment. */
    b.eq    2f
    sub     count, count, tmp2
    /* Move enough data to reach alignment; unlike memcpy, we have to
     * be aware of the overlap, which means we can't move data twice. */
    tbz     tmp2, #3, 1f
    ldr     tmp1, [src], #8
    str     tmp1, [dst], #8
1:
    tbz     tmp2, #2, 1f
    ldr     tmp1w, [src], #4
    str     tmp1w, [dst], #4
1:
    tbz     tmp2, #1, 1f
    ldrh    tmp1w, [src], #2
    strh    tmp1w, [dst], #2
1:
    tbz     tmp2, #0, 1f
    ldrb    tmp1w, [src], #1
    strb    tmp1w, [dst], #1
1:

    /* There may be less than 63 bytes to go now. */
    cmp     count, #63
    b.le    .Ltail63down
2:
    subs    count, count, #128
    b.ge    .Lmov_body_large_down
    /* Less than 128 bytes to move, so handle 64 here and then jump
     * to the tail. */
    ldp     A_l, A_h, [src]
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]
    stp     A_l, A_h, [dst]
    stp     B_l, B_h, [dst, #16]
    stp     C_l, C_h, [dst, #32]
    stp     D_l, D_h, [dst, #48]
    tst     count, #0x3f
    add     src, src, #64
    add     dst, dst, #64
    b.ne    .Ltail63down
    ret

    /* Critical loop. Start at a new cache line boundary. Assuming
     * 64 bytes per line this ensures the entire loop is in one line. */
    .p2align 6
.Lmov_body_large_down:
    /* There are at least 128 bytes to move. */
    ldp     A_l, A_h, [src, #0]
    sub     dst, dst, #16       /* Pre-bias. */
    ldp     B_l, B_h, [src, #16]
    ldp     C_l, C_h, [src, #32]
    ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias. */
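    /* As in the upwards loop, stores of the previously loaded 64 bytes
     * are overlapped with the next 64-byte load. */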
1:
    stp     A_l, A_h, [dst, #16]
    ldp     A_l, A_h, [src, #16]
    stp     B_l, B_h, [dst, #32]
    ldp     B_l, B_h, [src, #32]
    stp     C_l, C_h, [dst, #48]
    ldp     C_l, C_h, [src, #48]
    stp     D_l, D_h, [dst, #64]!
    ldp     D_l, D_h, [src, #64]!
    subs    count, count, #64
    b.ge    1b
    stp     A_l, A_h, [dst, #16]
    stp     B_l, B_h, [dst, #32]
    stp     C_l, C_h, [dst, #48]
    stp     D_l, D_h, [dst, #64]
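    /* Step past the data just stored and undo the pre-bias so src and dst
     * again point at the next bytes for the tail code. */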
    add     src, src, #16
    add     dst, dst, #64 + 16
    tst     count, #0x3f
    b.ne    .Ltail63down
    ret
#if defined(WMEMMOVE)
END(wmemmove)
#else
END(memmove)
#endif