/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

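/* void *memcpy(void *dst, const void *src, size_t count).
 *
 * Copies of fewer than 64 bytes drop straight into the tail code below;
 * anything larger first aligns src to 16 bytes and then runs the
 * unrolled 64-bytes-per-iteration loop.  */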
ENTRY(memcpy)

        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny

        /* Deal with small copies quickly by dropping straight into the
         * exit block.  */
.Ltail63:
        /* Copy up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
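        /* dst and src now point just past the bytes handled here, so the
         * copies below use negative offsets and the larger cases fall
         * through into the smaller ones.  */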
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]

.Ltail15:
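        /* Copy the final 1-15 bytes as a single 16-byte copy ending at
         * the end of the buffer.  This rewrites a few bytes that were
         * already stored, which is harmless because at least 16 bytes
         * have been copied by the time we get here.  */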
        ands    count, count, #15
        b.eq    1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret

.Ltail15tiny:
        /* Copy up to 15 bytes of data.  Does not assume additional data
         * being copied.  */
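        /* Test count one bit at a time (8, 4, 2, 1) and copy exactly
         * that many bytes, so nothing is read or written past the end.  */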
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores.  */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities.  We know that
         * it can't overrun.  */
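        /* (count is at least 64 here and tmp2 at most 15, so this
         * 16-byte copy cannot run past the end of either buffer.)  */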
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63
2:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
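        /* count went negative in the subs above, but its low 6 bits still
         * equal the number of tail bytes left after the 64 just copied,
         * which is all .Ltail63 looks at.  */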
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lcpy_body_large:
        /* There are at least 128 bytes to copy.  */
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16           /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
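        /* The loop is software pipelined: each pass stores the 64 bytes
         * loaded on the previous pass while loading the next 64.  The
         * pre-bias above is what lets the writeback forms below advance
         * src and dst by 64 per iteration.  */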
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
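        /* The loop exits with one full set of 64 loaded bytes still in
         * A-D; store them, then step src and dst past everything copied
         * so far before handling any tail bytes.  */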
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
END(memcpy)