/* Copyright (c) 2014, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 */
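/* "Unaligned accesses" means the routine freely issues loads and stores that
 * are not naturally aligned.  That is valid for normal (cacheable) memory
 * with strict alignment checking disabled, the usual ARMv8-A user-space
 * configuration, but not for device memory. */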

#include <private/bionic_asm.h>

/* Parameters and result.  */
#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

ENTRY(memmove)
        cmp     dstin, src
        b.lo    .Ldownwards
        add     tmp1, src, count
        cmp     dstin, tmp1
        b.hs    memcpy          /* No overlap.  */
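        /* Three-way dispatch: DST below SRC is always safe to copy forwards
         * and is handled by .Ldownwards; DST at or beyond SRC + count cannot
         * overlap, so plain memcpy is used; anything in between (e.g.
         * memmove(p + 8, p, 64)) overlaps the tail of SRC and must be moved
         * from the end backwards, which the code below does. */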

        /* Upwards move with potential overlap.
         * Need to move from the tail backwards.  SRC and DST point one
         * byte beyond the remaining data to move.  */
        add     dst, dstin, count
        add     src, src, count
        cmp     count, #64
        b.ge    .Lmov_not_short_up

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63up:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
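        /* count & 0x30 selects how much to move in 16-byte blocks: 0x30
         * falls through all three ldp/stp pairs (48 bytes), 0x20 enters at
         * 1: (32 bytes), 0x10 enters at 2: (16 bytes) and 0 skips straight
         * to .Ltail15up. */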
        ands    tmp1, count, #0x30
        b.eq    .Ltail15up
        sub     dst, dst, tmp1
        sub     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #32]
        stp     A_l, A_h, [dst, #32]
1:
        ldp     A_l, A_h, [src, #16]
        stp     A_l, A_h, [dst, #16]
2:
        ldp     A_l, A_h, [src]
        stp     A_l, A_h, [dst]
.Ltail15up:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
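        /* Bits 3..0 of count are tested one by one, so any residue of 0-15
         * bytes is finished in at most four moves; e.g. 13 bytes left
         * (0b1101) moves 8, then 4, then 1. */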
        tbz     count, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src, #-1]
        strb    tmp1w, [dst, #-1]
1:
        ret

.Lmov_not_short_up:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that the loads never cross a
         * cache-line boundary; we cannot avoid that for both the loads and
         * the stores at once.  */
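        /* Because this path copies downwards and SRC points just past the
         * data, src & 15 is exactly the number of bytes to move before SRC
         * lands on a 16-byte boundary. */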
        ands    tmp2, src, #15  /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src, #-8]!
        str     tmp1, [dst, #-8]!
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src, #-4]!
        str     tmp1w, [dst, #-4]!
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src, #-2]!
        strh    tmp1w, [dst, #-2]!
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src, #-1]!
        strb    tmp1w, [dst, #-1]!
1:

        /* The alignment fix-up may have left no more than 63 bytes to go.  */
        cmp     count, #63
        b.le    .Ltail63up
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_up
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
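        /* count is now (bytes remaining - 128), i.e. negative, but its low
         * six bits still equal whatever will be left over after the 64-byte
         * block below; .Ltail63up finishes that remainder. */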
        ldp     A_l, A_h, [src, #-64]!
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst, #-64]!
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret

        /* Critical loop.  Start at a new Icache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_up:
        /* There are at least 128 bytes to move.  */
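        /* The loop is software-pipelined: 64 bytes are loaded here, then each
         * iteration stores the 64 bytes loaded on the previous pass while
         * loading the next 64, the writeback on the D-pair accesses stepping
         * src and dst down by 64 per iteration. */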
        ldp     A_l, A_h, [src, #-16]
        ldp     B_l, B_h, [src, #-32]
        ldp     C_l, C_h, [src, #-48]
        ldp     D_l, D_h, [src, #-64]!
1:
        stp     A_l, A_h, [dst, #-16]
        ldp     A_l, A_h, [src, #-16]
        stp     B_l, B_h, [dst, #-32]
        ldp     B_l, B_h, [src, #-32]
        stp     C_l, C_h, [dst, #-48]
        ldp     C_l, C_h, [src, #-48]
        stp     D_l, D_h, [dst, #-64]!
        ldp     D_l, D_h, [src, #-64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #-16]
        stp     B_l, B_h, [dst, #-32]
        stp     C_l, C_h, [dst, #-48]
        stp     D_l, D_h, [dst, #-64]!
        tst     count, #0x3f
        b.ne    .Ltail63up
        ret

.Ldownwards:
        /* For a downwards move we can safely use memcpy provided that
         * DST is at least 16 bytes below SRC.  */
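        /* The 16-byte margin is an assumption about the paired memcpy
         * implementation (not something the C interface guarantees): a
         * forward copy that never stores more than 16 bytes past the point
         * it has already read from cannot corrupt a source that starts 16 or
         * more bytes above the destination. */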
        sub     tmp1, src, #16
        cmp     dstin, tmp1
        b.ls    memcpy          /* May overlap, but not critically.  */

        mov     dst, dstin      /* Preserve DSTIN for return value.  */
        cmp     count, #64
        b.ge    .Lmov_not_short_down

        /* Deal with small moves quickly by dropping straight into the
         * exit block.  */
.Ltail63down:
        /* Move up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15down
        add     dst, dst, tmp1
        add     src, src, tmp1
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]
.Ltail15down:
        /* Move up to 15 bytes of data.  Does not assume additional data
         * being moved.  */
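        /* Mirror of .Ltail15up, but with post-increment addressing since
         * this path walks the buffers from low addresses upwards. */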
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lmov_not_short_down:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that the loads never cross a
         * cache-line boundary; we cannot avoid that for both the loads and
         * the stores at once.  */
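        /* The neg/ands pair below computes (-src) & 15, i.e. the number of
         * bytes to copy forwards before SRC reaches the next 16-byte
         * boundary. */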
        neg     tmp2, src
        ands    tmp2, tmp2, #15 /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Move enough data to reach alignment; unlike memcpy, we have to
         * be aware of the overlap, which means we can't move data twice.  */
        tbz     tmp2, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     tmp2, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     tmp2, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     tmp2, #0, 1f
        ldrb    tmp1w, [src], #1
        strb    tmp1w, [dst], #1
1:

        /* The alignment fix-up may have left no more than 63 bytes to go.  */
        cmp     count, #63
        b.le    .Ltail63down
2:
        subs    count, count, #128
        b.ge    .Lmov_body_large_down
        /* Less than 128 bytes to move, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63down
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lmov_body_large_down:
        /* There are at least 128 bytes to move.  */
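        /* dst is pre-biased down by 16 and the writeback on the last load
         * leaves src biased up by 48, so the loop body can use the same
         * #16..#64 offsets for both the stores of the previous 64-byte block
         * and the loads of the next one, with a single writeback per pointer
         * per iteration. */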
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16   /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63down
        ret
END(memmove)