/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using NEON)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified
        /* This implementation requires ARM state.  */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
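
/* ALIGN(dst, 64) expands to the address-alignment qualifier "dst:64"
   (",:64" on the broken assemblers): a promise to VLD1/VST1 that the
   address is 64-bit aligned, permitting a faster access and faulting
   if the promise is broken.  */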

#define PC_OFFSET       8       /* PC pipeline compensation.  */
#define INSN_SIZE       4
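
/* In ARM state, reading PC yields the address of the current instruction
   plus 8 (PC_OFFSET).  The computed branches below have the form
   "add pc, pc, offset" with their jump table starting one instruction
   (INSN_SIZE) after the add, so table offsets are biased by
   INSN_SIZE - PC_OFFSET.  */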

/* Call parameters.  */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals.  */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define A_l     r2      /* Call-clobbered.  */
#define A_h     r3      /* Call-clobbered.  */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines  5

#ifdef USE_VFP
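        /* Each macro copies one 64-byte line through \vreg and d0-d2 with
           interleaved vstr/vldr pairs.  cpy_line_vfp additionally reloads
           \vreg from prefetch_lines * 64 - 32 bytes ahead of the current
           position, acting as a software prefetch; cpy_tail_vfp omits that
           look-ahead load for the final lines, when there is nothing
           further to fetch.  */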
        .macro cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it.  */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block.  */

.Ltail63unaligned:
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go.  */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go.  */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned.  */
        /* Cannot use VFP for unaligned data.  */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset.  */
        add     pc, pc, tmp1, lsl #1
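        /* src and dst were pre-advanced past the words to copy, so the
           entries below use negative offsets.  Each word needs an
           8-byte ldr/str pair of code, hence the lsl #1; PC_OFFSET and
           INSN_SIZE appear halved because the shift doubles them along
           with the word count.  */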

        ldr     tmp1, [src, #-60]       /* 15 words to go.  */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go.  */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go.  */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go.  */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go.  */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go.  */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go.  */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go.  */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

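        /* Shifting count left by 31 moves bit 1 into the carry flag and
           bit 0 into the sign flag, so the conditional halfword and byte
           copies below mop up the last 0-3 bytes without any branches.  */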
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch.  */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet.  */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop.  */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment.  */
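        /* dst << 29 moves the low three address bits to the top of the
           word; Z set means dst is already 64-bit aligned.  After the
           rsbs negation the flags describe the byte deficit to
           alignment: N carries bit 2 (copy one word), and two further
           shifts put bits 1 and 0 into C and N (halfword, then byte).  */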
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2.  */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2.  */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56]         /* 14 words to go.  */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48]         /* 12 words to go.  */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40]         /* 10 words to go.  */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32]         /* 8 words to go.  */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24]         /* 6 words to go.  */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16]         /* 4 words to go.  */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]          /* 2 words to go.  */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2.  */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency.  */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy.  */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go.  */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go.  */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go.  */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go.  */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go.  */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go.  */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go.  */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead.  */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:                        /* Count in tmp2.  */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go.  */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer.  */
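        /* d3-d7 hold the leading doubleword of each of the five lines
           currently in flight; d0-d2 carry the rest of the line being
           written out.  */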

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations.  */
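        /* The loop is software-pipelined ("SMS" presumably being
           software modulo scheduling): each iteration stores the
           doublewords loaded by the previous one, keeping loads and
           stores to different cache lines in flight together.  B, C
           and D live in callee-saved registers (r4-r9), so they are
           spilled into the 32-byte frame at [sp, #8]..[sp, #24]
           instead of being pushed and popped.  */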
        /* Pre-bias src and dst.  */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment.  */
        /* Bring DST to 64-bit alignment.  */
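        /* Same flag trick as in the mutually-aligned path above:
           dst << 29 exposes the low three address bits, and the negated
           result drives the conditional word/halfword/byte pre-copies.  */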
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
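        /* VLD1.8 makes no alignment assumptions, so it can read the
           misaligned src directly; the stores use the 64-bit alignment
           hint (via ALIGN) since dst is now doubleword aligned.  */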
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth.  */
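        /* As in the aligned long copy, but src may sit at any byte
           offset, so the loads are plain (unaligned-capable) LDRs
           while the stores can still be STRDs to the doubleword-aligned
           dst.  src is pre-biased by 4 and dst by 8 to keep the load
           and store offsets in step.  */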
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count.  */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs.  */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)