/*
 * Copyright (c) 2013 ARM Ltd
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the company may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* Prototype: void *memcpy (void *dst, const void *src, size_t count). */

/* Use the version of memcpy implemented using LDRD and STRD.
   This version is tuned for Cortex-A15.
   It might not be the best for other ARMv7-A CPUs,
   but there is no predefine to distinguish between
   different CPUs in the same architecture,
   and this version is better than the plain memcpy provided in newlib.

   Therefore, we use this version for all ARMv7-A CPUs.  */

/* To make the same code compile for both ARM and Thumb instruction
   sets, switch to unified syntax at the beginning of this function.
   However, by using the same code, we may be missing optimization
   opportunities.  For instance, in LDRD/STRD instructions, the first
   destination register must be even and the second consecutive in
   ARM state, but not in Thumb state.  */

#include <machine/cpu-features.h>
#include <machine/asm.h>

        .syntax unified

ENTRY(memcpy)

        /* Assumes that n >= 0 and that dst and src are valid pointers.
           If there are at least 8 bytes to copy, use LDRD/STRD.
           If src and dst are misaligned with different offsets,
           first copy byte by byte until dst is aligned,
           and then copy using LDRD/STRD and shift if needed.
           When fewer than 8 bytes are left, copy a word and then
           byte by byte.  */
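
        /* For orientation, a rough C equivalent of the strategy above
           (an illustrative sketch only, not part of the build; the name
           memcpy_sketch and all locals are made up here):

           #include <stdint.h>
           #include <stddef.h>

           void *memcpy_sketch(void *dst, const void *src, size_t n) {
               unsigned char *d = dst;
               const unsigned char *s = src;
               // Copy byte by byte until dst is word-aligned.
               while (n > 0 && ((uintptr_t)d & 3u) != 0) {
                   *d++ = *s++;
                   n--;
               }
               // If src is now word-aligned too, copy a word at a time
               // (the assembly uses LDRD/STRD in unrolled 64-byte blocks).
               if (((uintptr_t)s & 3u) == 0) {
                   while (n >= 4) {
                       *(uint32_t *)d = *(const uint32_t *)s;
                       d += 4; s += 4; n -= 4;
                   }
               }
               // Otherwise the assembly merges shifted partial words
               // (see the miscopy macro below); here we simply fall
               // through to the byte loop.
               while (n > 0) {
                   *d++ = *s++;
                   n--;
               }
               return dst;
           }
        */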

        /* Save registers (r0 holds the return value):
           an optimized version of push {r0, r4, r5, r6, r7, lr}.
           To try to improve performance, the stack layout is changed,
           i.e., it does not keep the layout users expect
           (highest-numbered register at highest address).  */
        .save {r0, lr}
        push {r0, lr}
        .save {r4, r5}
        strd r4, r5, [sp, #-8]!
        .save {r6, r7}
        strd r6, r7, [sp, #-8]!

        /* TODO: Add debug frame directives.
           We don't need exception unwind directives, because the code below
           does not throw any exceptions and does not call any other functions.
           Generally, newlib functions like this lack debug information for
           assembler source.  */

        /* Get copying of tiny blocks out of the way first.  */
        /* Are there at least 4 bytes to copy?  */
        subs r2, r2, #4
        blt copy_less_than_4 /* If n < 4.  */

        /* Check word alignment.  */
        ands ip, r0, #3 /* ip = last 2 bits of dst.  */
        bne dst_not_word_aligned /* If dst is not word-aligned.  */

        /* Get here if dst is word-aligned.  */
        ands ip, r1, #3 /* ip = last 2 bits of src.  */
        bne src_not_word_aligned /* If src is not word-aligned.  */
word_aligned:
        /* Get here if both src and dst are word-aligned.
           The number of bytes remaining to copy is r2 + 4.  */

        /* Are there at least 64 bytes to copy?  */
        subs r2, r2, #60
        blt copy_less_than_64 /* If r2 + 4 < 64.  */

        /* First, align the destination buffer to 8 bytes,
           to make sure double loads and stores do not cross a cache-line
           boundary; such accesses are more expensive even when the data
           is in the cache (they require two load/store issue cycles
           instead of one).
           If only one of the buffers is not 8-byte aligned,
           it is more important to align dst than src,
           because stores that cross a cache-line boundary
           incur a larger penalty than loads.
           This check and realignment are only worth doing
           if there is a lot to copy.  */

        /* Get here if dst is word-aligned,
           i.e., the 2 least significant bits are 0.
           If dst is not two-word aligned (i.e., bit 2 of dst is set),
           then copy 1 word (4 bytes) to make it so.  */
        ands r3, r0, #4
        beq 11f /* If dst is already two-word aligned.  */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        blt copy_less_than_64

11:
        /* TODO: Align to cacheline (useful for PLD optimization).  */

        /* Every loop iteration copies 64 bytes.  */
1:
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        ldrd r4, r5, [r1, \offset]
        strd r4, r5, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 1b /* If there is more to copy.  */
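
        /* Note: .irp expands its body once per listed offset, so each
           iteration of the loop above is eight LDRD/STRD pairs covering
           bytes 0..63.  A rough C equivalent of one iteration (sketch
           only; dst0 and src0 are hypothetical byte pointers):

               for (int off = 0; off < 64; off += 8)   // eight 8-byte chunks
                   *(uint64_t *)(dst0 + off) = *(const uint64_t *)(src0 + off);
        */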

copy_less_than_64:

        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Restore the count if there are more than 7 bytes to copy.  */
        adds r2, r2, #56
        blt copy_less_than_8

        /* Copy 8 bytes at a time.  */
2:
        ldrd r4, r5, [r1], #8
        strd r4, r5, [r0], #8
        subs r2, r2, #8
        bge 2b /* If there is more to copy.  */

copy_less_than_8:

        /* Get here if there are fewer than 8 bytes to copy, -8 <= r2 < 0.
           Check if there is more to copy.  */
        cmn r2, #8
        beq return /* If r2 + 8 == 0.  */

        /* Restore the count if there are more than 3 bytes to copy.  */
        adds r2, r2, #4
        blt copy_less_than_4

        /* Copy 4 bytes.  */
        ldr r3, [r1], #4
        str r3, [r0], #4

copy_less_than_4:
        /* Get here if there are fewer than 4 bytes to copy, -4 <= r2 < 0.  */

        /* Restore the count, check if there is more to copy.  */
        adds r2, r2, #4
        beq return /* If r2 == 0.  */

        /* Get here with r2 in {1,2,3} = {01,10,11}.  */
        /* Logical shift left r2, insert 0s, update flags.  */
        lsls r2, r2, #31

        /* Copy byte by byte.
           Condition ne means bit 0 of r2 is set, i.e., r2 is 1 or 3.
           Condition cs means bit 1 of r2 is set, i.e., r2 is 2 or 3.  */
        itt ne
        ldrbne r3, [r1], #1
        strbne r3, [r0], #1

        itttt cs
        ldrbcs r4, [r1], #1
        ldrbcs r5, [r1]
        strbcs r4, [r0], #1
        strbcs r5, [r0]
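
        /* In C terms (sketch only; d, s, n are hypothetical): after
           lsls r2, r2, #31, Z is clear iff bit 0 of r2 was set and C is
           set iff bit 1 of r2 was set, so the two blocks above amount to:

               if (n & 1) *d++ = *s++;   // ne block: one byte
               if (n & 2) {              // cs block: two bytes
                   d[0] = s[0];
                   d[1] = s[1];
               }
        */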

return:
        /* Restore registers: an optimized version of
           pop {r0, r4, r5, r6, r7, pc}.  */
        /* This is the only return point of memcpy.  */
        ldrd r6, r7, [sp], #8
        ldrd r4, r5, [sp], #8
        pop {r0, pc}

#ifndef __ARM_FEATURE_UNALIGNED

        /* The following assembly macro implements misaligned copy in
           software.  It assumes that dst is word-aligned, src is at offset
           "pull" bits from a word boundary, push = 32 - pull, and the number
           of bytes that remain to copy is r2 + 4, r2 >= 0.  */

        /* In the code below, r2 is the number of bytes that remain to be
           written.  The number of bytes read is always larger, because we
           hold partial words in the shift queue.  */
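
        /* A rough C model of one 8-byte step of this shift queue, for the
           little-endian case (illustrative sketch only; queue, w0, w1 and
           the helpers are hypothetical; pull + push == 32):

               uint32_t q = queue;              // r5: leftover source bytes
               uint32_t w0 = load_word(src);    // aligned loads (one LDRD)
               uint32_t w1 = load_word(src + 4);
               uint32_t out0 = (q  >> pull) | (w0 << push);
               uint32_t out1 = (w0 >> pull) | (w1 << push);
               store_dword(dst, out0, out1);    // one STRD
               queue = w1;                      // keep the new leftover
               src += 8; dst += 8;
        */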

        .macro miscopy pull push shiftleft shiftright

        /* Align src to the previous word boundary.  */
        bic r1, r1, #3

        /* Initialize the shift queue.  */
        ldr r5, [r1], #4 /* Load a word from source.  */

        subs r2, r2, #4
        blt 6f /* Go to misaligned copy of fewer than 8 bytes.  */

        /* Get here if there are at least 8 bytes to copy.
           The number of bytes to copy is r2 + 8, r2 >= 0.  */

        subs r2, r2, #56
        blt 4f /* Go to misaligned copy of fewer than 64 bytes.  */

3:
        /* Get here if there are at least 64 bytes to copy.
           The number of bytes to copy is r2 + 64, r2 >= 0.  */

        /* Copy 64 bytes in every iteration.
           Use a partial word from the shift queue.  */
        .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1, \offset]
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0, \offset]
        .endr

        add r1, r1, #64
        add r0, r0, #64
        subs r2, r2, #64
        bge 3b

4:
        /* Get here if there are fewer than 64 bytes to copy (-64 <= r2 < 0)
           and they are misaligned.  */

        /* Restore the count if there are more than 7 bytes to copy.  */
        adds r2, r2, #56
        blt 6f /* Go to misaligned copy of fewer than 8 bytes.  */

5:
        /* Copy 8 bytes at a time.
           Use a partial word from the shift queue.  */
        mov r6, r5, \shiftleft #\pull
        ldrd r4, r5, [r1], #8
        orr r6, r6, r4, \shiftright #\push
        mov r7, r4, \shiftleft #\pull
        orr r7, r7, r5, \shiftright #\push
        strd r6, r7, [r0], #8

        subs r2, r2, #8
        bge 5b /* If there is more to copy.  */

6:
        /* Get here if there are fewer than 8 bytes to copy (-8 <= r2 < 0)
           and they are misaligned.  */

        /* Check if there is more to copy.  */
        cmn r2, #8
        beq return

        /* Check if there are fewer than 4 bytes to copy.  */
        cmn r2, #4

        itt lt
        /* Restore src offset from word alignment.  */
        sublt r1, r1, #(\push / 8)
        blt copy_less_than_4

        /* Use a partial word from the shift queue.  */
        mov r3, r5, \shiftleft #\pull
        /* Load a word from src, but without writeback
           (this word is not fully written to dst).  */
        ldr r5, [r1]

        /* Restore src offset from word alignment.  */
        add r1, r1, #(\pull / 8)

        /* Shift bytes to create one dst word and store it.  */
        orr r3, r3, r5, \shiftright #\push
        str r3, [r0], #4

        /* Use single-byte copying of the remaining bytes.  */
        b copy_less_than_4

        .endm

#endif /* not __ARM_FEATURE_UNALIGNED */

dst_not_word_aligned:

        /* Get here when dst is not word-aligned and ip has the last 2 bits
           of dst, i.e., ip is the offset of dst from a word boundary.
           The number of bytes that remain to copy is r2 + 4,
           i.e., there are at least 4 bytes to copy.
           Write a partial word (1 to 3 bytes), such that dst becomes
           word-aligned.  */

        /* If dst is at offset ip from a word boundary (with 0 < ip < 4),
           then there are (4 - ip) bytes to fill to align dst to the next
           word.  */
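
        /* In C terms (sketch only; d, s, n are hypothetical), the three
           conditional byte copies below amount to:

               size_t head = 4 - ((uintptr_t)d & 3u);   // ip: 1, 2 or 3
               n -= head;
               while (head--)
                   *d++ = *s++;
        */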
        rsb ip, ip, #4 /* ip = 4 - ip.  */
        cmp ip, #2

        /* Copy byte by byte with conditionals.  */
        itt gt
        ldrbgt r3, [r1], #1
        strbgt r3, [r0], #1

        itt ge
        ldrbge r4, [r1], #1
        strbge r4, [r0], #1

        ldrb lr, [r1], #1
        strb lr, [r0], #1

        /* Update the count.
           ip holds the number of bytes we have just copied.  */
        subs r2, r2, ip /* r2 = r2 - ip.  */
        blt copy_less_than_4 /* If r2 < ip.  */

        /* Get here if there are at least 4 bytes to copy.
           Check if src is aligned.  If src and dst were previously not
           word-aligned but congruent (same offset), then they are now
           both word-aligned, and we can copy the rest efficiently
           (without shifting).  */
        ands ip, r1, #3 /* ip = last 2 bits of src.  */
        beq word_aligned /* If r1 is word-aligned.  */

src_not_word_aligned:
        /* Get here when src is not word-aligned, but dst is word-aligned.
           The number of bytes that remain to copy is r2 + 4.  */

#ifdef __ARM_FEATURE_UNALIGNED
        /* Copy word by word using LDR when unaligned access is handled in
           hardware, i.e., SCTLR.A is clear (alignment checking disabled),
           so LDR and STR support unaligned access.  */
        subs r2, r2, #60
        blt 8f

7:
        /* Copy 64 bytes in every loop iteration.  */
        .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
        ldr r3, [r1, \offset]
        str r3, [r0, \offset]
        .endr

        add r0, r0, #64
        add r1, r1, #64
        subs r2, r2, #64
        bge 7b

8:
        /* Get here if there are fewer than 64 bytes to copy, -64 <= r2 < 0.
           Check if there are more than 3 bytes to copy.  */
        adds r2, r2, #60
        blt copy_less_than_4

9:
        /* Get here if there are fewer than 64 but at least 4 bytes to copy,
           where the number of bytes to copy is r2 + 4.  */
        ldr r3, [r1], #4
        str r3, [r0], #4
        subs r2, r2, #4
        bge 9b

        b copy_less_than_4

#else /* not __ARM_FEATURE_UNALIGNED */

        /* ip has the last 2 bits of src,
           i.e., ip is the offset of src from a word boundary, and ip > 0.
           Compute the shifts needed to copy from src to dst.  */
        cmp ip, #2
        beq miscopy_16_16 /* If ip == 2.  */
        bge miscopy_24_8 /* If ip == 3.  */

        /* Get here if ip == 1; fall through to miscopy_8_24.  */

        /* Endian-independent macros for shifting bytes within registers.  */

#ifndef __ARMEB__
miscopy_8_24:  miscopy pull=8  push=24 shiftleft=lsr shiftright=lsl
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
miscopy_24_8:  miscopy pull=24 push=8  shiftleft=lsr shiftright=lsl
#else /* __ARMEB__ */
miscopy_8_24:  miscopy pull=8  push=24 shiftleft=lsl shiftright=lsr
miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
miscopy_24_8:  miscopy pull=24 push=8  shiftleft=lsl shiftright=lsr
#endif /* not __ARMEB__ */

#endif /* not __ARM_FEATURE_UNALIGNED */

END(memcpy)