/* Copyright (c) 2013, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

      * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

      * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

      * Neither the name of Linaro Limited nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses
    LDRD/STRD support unaligned word accesses

 */
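/* Note: with GCC-style toolchains, building with -mfpu=neon (and a non-soft
   float ABI) defines __ARM_NEON__ and selects the NEON path below; a VFP
   build without NEON takes the VFP path; -mfloat-abi=soft defines __SOFTFP__
   and falls back to the integer-register-only path.  The exact flags depend
   on the toolchain in use. */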

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE     4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE     32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE     32

#endif
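/* FRAME_SIZE is just scratch stack space: every path spills tmp2 at [sp, #0],
   and the non-NEON paths additionally spill the B/C/D register pairs at
   [sp, #8]..[sp, #31] in the bulk-copy loops below, hence the larger frame. */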

/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif
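/* For example, "vst1.8 {d0-d3}, [ALIGN (dst, 64)]!" expands to
   "vst1.8 {d0-d3}, [dst:64]!" (or "[dst,:64]!" for old GAS), telling the
   assembler that dst is at least 64-bit aligned at that point. */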

#define PC_OFFSET       8       /* PC pipeline compensation. */
#define INSN_SIZE       4

/* Call parameters. */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals. */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l     r2      /* Call-clobbered. */
#define A_h     r3      /* Call-clobbered. */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate. */

#define prefetch_lines  5
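/* With prefetch_lines = 5 the VFP bulk-copy loop below keeps one doubleword
   from each of the next five 64-byte lines live in d3-d7, so loads run about
   320 bytes ahead of the stores. */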

#ifdef USE_VFP
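/* cpy_line_vfp copies one 64-byte line, interleaving stores of data already
   held in d0-d2/\vreg with loads of the next line, and refills \vreg from
   (prefetch_lines * 64 - 32) bytes ahead so that it acts as a software
   prefetch.  cpy_tail_vfp is identical except that it does not reload \vreg,
   and is used to drain the in-flight lines. */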
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

        .p2align 6
ENTRY(memcpy)

        mov     dst, dstin      /* Preserve dstin, we need to return it. */
        cmp     count, #64
        bge     .Lcpy_not_short
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

.Ltail63unaligned:
#ifdef USE_NEON
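        /* Computed jump: each vld1/vst1 pair below is two 4-byte instructions
           that copy 8 bytes.  The ADD to PC jumps (56 - (count & 0x38)) bytes
           into the 56-byte sequence, skipping the pairs that are not needed;
           the RSB constant folds in the 8-byte PC read-ahead (PC_OFFSET) and
           the width of the ADD itself (INSN_SIZE). */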
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go. */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data.  May not be aligned. */
        /* Cannot use VFP for unaligned data. */
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset. */
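        /* Here each word of data needs two instructions (LDR + STR, 8 bytes
           of code), so the computed byte count is doubled with LSL #1 to turn
           it into a code offset; PC_OFFSET and INSN_SIZE are pre-halved in
           the RSB above to compensate. */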
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]       /* 15 words to go. */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]       /* 14 words to go. */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]       /* 12 words to go. */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]       /* 10 words to go. */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]       /* 8 words to go. */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]       /* 6 words to go. */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]       /* 4 words to go. */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]        /* 2 words to go. */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

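        /* LSLS #31 moves bit 1 of count into the carry flag and bit 0 into
           bit 31 of the result, so the CS-conditional halfword copy and the
           NE-conditional byte copy below mop up the last 0-3 bytes. */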
        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]      /* Src is dead, use as a scratch. */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

.Lcpy_not_short:
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     .Lcpy_notaligned

#ifdef USE_VFP
        /* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
           that the FP pipeline is much better at streaming loads and
           stores.  This is outside the critical loop. */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 32-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring DST into full 64-bit alignment. */
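        /* LSLS #29 moves the low three bits of DST into the top of tmp2, so
           Z is set if DST is already 64-bit aligned.  After negation (RSBS)
           the byte count needed to reach alignment sits in the top bits: the
           MI test copies 4 bytes, and after a further LSLS #2 the CS and NE
           tests copy 2 and 1 bytes respectively. */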
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        blt     .Ltail63aligned

        cmp     tmp2, #512
        bge     .Lcpy_body_long

.Lcpy_body_medium:                      /* Count in tmp2. */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bge     1b
        tst     tmp2, #0x3f
        beq     .Ldone

.Ltail63aligned:                        /* Count in tmp2. */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go. */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go. */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go. */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go. */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go. */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go. */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go. */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bge     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

.Ltail63aligned:                        /* Count in tmp2. */
        /* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
           we know that the src and dest are 32-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that.  The bottom
           six bits still tell us how many bytes are left to copy. */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go. */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go. */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go. */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go. */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go. */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go. */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go. */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31         /* Count (tmp2) now dead. */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

.Ldone:
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

.Lcpy_body_long:                        /* Count in tmp2. */

        /* Long copy.  We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD.  Instead, read some data in advance of the current
           copy position into a register.  This should act like a PLD
           operation but we won't have to repeat the transfer. */
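        /* d3-d7 are primed below with one doubleword from each of the next
           five 64-byte lines (offsets 0 to 256); cpy_line_vfp then keeps
           rotating them forward so the loads stay prefetch_lines lines ahead
           of the stores. */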

        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blt     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bge     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       .Lcpy_body_medium
#else
        /* Long copy.  Use an SMS style loop to maximize the I/O
           bandwidth of the core.  We don't have enough spare registers
           to synthesise prefetching, so use PLD operations. */
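        /* "SMS style" here means a modulo-scheduled (software-pipelined)
           loop: each pass stores the 32 bytes loaded on the previous pass
           while loading the next 32, so the A/B/C/D pairs are always in
           flight and loads overlap stores. */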
        /* Pre-bias src and dst. */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     .Ltail63aligned
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

.Lcpy_notaligned:
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrmi   tmp2, [sp], #FRAME_SIZE
        bmi     .Ltail63unaligned
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bmi     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bpl     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximize the I/O bandwidth. */
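        /* src may be at any byte alignment here, so the loads use plain
           (unaligned-capable) LDR pairs rather than LDRD; dst has already
           been brought to 64-bit alignment, so the stores can still use
           STRD. */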
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64        /* Use tmp2 for count. */
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     .Ltail63unaligned
        bx      lr
END(memcpy)