Blame - libc/arch-mips/string/memcpy.S - platform_bionic

blob: aabdfcfdce6e95d768636c22572856fcab1f9d8c [file] [log] [blame]

Raghu Gandham	405b802	2012-07-25 18:16:42 -0700	[diff] [blame^]	1	/*
				2	* Copyright (c) 2009
				3	* MIPS Technologies, Inc., California.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* 1. Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* 2. Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in the
				12	* documentation and/or other materials provided with the distribution.
				13	* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
				14	* contributors may be used to endorse or promote products derived from
				15	* this software without specific prior written permission.
				16	*
				17	* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
				18	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
				19	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
				20	* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
				21	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
				22	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
				23	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
				24	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
				25	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
				26	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				27	* SUCH DAMAGE.
				28	*/
				29
				30	/************************************************************************
				31	*
				32	* memcpy.S
				33	* Version: "043009"
				34	*
				35	************************************************************************/
				36
				37
				38	/************************************************************************
				39	* Include files
				40	************************************************************************/
				41
				42	#include "machine/asm.h"
				43
				44
				45	/*
				46	* This routine could be optimized for MIPS64. The current code only
				47	* uses MIPS32 instructions.
				48	*/
				49	#if defined(__MIPSEB__)
				50	# define LWHI lwl /* high part is left in big-endian */
				51	# define SWHI swl /* high part is left in big-endian */
				52	# define LWLO lwr /* low part is right in big-endian */
				53	# define SWLO swr /* low part is right in big-endian */
				54	#endif
				55
				56	#if defined(__MIPSEL__)
				57	# define LWHI lwr /* high part is right in little-endian */
				58	# define SWHI swr /* high part is right in little-endian */
				59	# define LWLO lwl /* low part is left in little-endian */
				60	# define SWLO swl /* low part is left in little-endian */
				61	#endif
				62
				63	LEAF(memcpy,0) /* a0 = dst, a1 = src, a2 = byte count; dst is returned in v0 */
				64
				65	.set noreorder # branch delay slots below are filled by hand
				66	.set noat # AT is used explicitly as a scratch register below
				67	/*
				68	* Below we handle the case where memcpy is called with overlapping src and dst.
				69	* Although memcpy is not required to handle this case, some parts of Android like Skia
				70	* rely on such usage. We call memmove to handle such cases.
				71	*/
				72	subu t0,a0,a1 # t0 = dst - src
				73	sra AT,t0,31 # AT = sign mask of t0 (0 or -1)
				74	xor t1,t0,AT
				75	subu t0,t1,AT # t0 = |dst - src|
				76	sltu AT,t0,a2 # AT = 1 when |dst - src| < count (unsigned compare)
				77	beq AT,zero,.Lmemcpy # distance >= count: the regions do not overlap
				78	la t9,memmove # placed after the branch under noreorder; NOTE(review): confirm la expands to one instruction here
				79	jr t9 # overlap: tail-call memmove with a0/a1/a2 untouched
				80	nop
				81	.Lmemcpy:
				82	slti AT,a2,8 # signed compare of the count against 8
				83	bne AT,zero,.Llast8 # short copies go straight to the byte loop
				84	move v0,a0 # (delay slot, runs on both paths) memcpy returns the dst pointer
				85
				86	# Test if the src and dst are word-aligned, or can be made word-aligned
				87	xor t8,a1,a0
				88	andi t8,t8,0x3 # t8 is a0/a1 word-displacement
				89
				90	bne t8,zero,.Lunaligned # src and dst differ in their low two bits
				91	negu a3,a0 # (delay slot) a3 = -dst; masked with 0x3 below to get the bytes up to alignment
				92
				93	andi a3,a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
				94	beq a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
				95	subu a2,a2,a3 # (delay slot) now a2 is the remaining bytes count
				96
				97	LWHI t8,0(a1) # copy the a3 leading bytes with one partial-word load/store pair
				98	addu a1,a1,a3
				99	SWHI t8,0(a0)
				100	addu a0,a0,a3
				101
				102	# Now the dst/src are mutually word-aligned with word-aligned addresses
				103	.Lchk16w:
				104	andi t8,a2,0x3f # any whole 64-byte chunks?
				105	# t8 is the byte count after 64-byte chunks
				106
				107	beq a2,t8,.Lchk8w # if a2==t8, no 64-byte chunks
				108	# There will be at most 1 32-byte chunk after it
				109	subu a3,a2,t8 # (delay slot) subtract from a2 the remainder
				110	# Here a3 counts bytes in 16w chunks
				111	addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
				112
				113	addu t0,a0,a2 # t0 is the "past the end" address
				114
				115	# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
				116	# the "t0-32" address
				117	# This means: for x=128 the last "safe" a0 address is "t0-160"
				118	# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
				119	# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
				120	subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address
				121
				122	pref 0,0(a1) # bring the first line of src, addr 0
				123	pref 0,32(a1) # bring the second line of src, addr 32
				124	pref 0,64(a1) # bring the third line of src, addr 64
				125	pref 30,32(a0) # safe, as we have at least 64 bytes ahead
				126	# In case the a0 > t9 don't use "pref 30" at all
				127	sgtu v1,a0,t9 # v1 = 1 when a0 is already past the last safe address
				128	bgtz v1,.Lloop16w # skip "pref 30,64(a0)" for too short arrays
				129	nop
				130	# otherwise, start with using pref30
				131	pref 30,64(a0)
				132	.Lloop16w:
				133	pref 0,96(a1)
				134	lw t0,0(a1)
				135	bgtz v1,.Lskip_pref30_96 # skip "pref 30,96(a0)"
				136	lw t1,4(a1) # (delay slot)
				137	pref 30,96(a0) # continue setting up the dest, addr 96
				138	.Lskip_pref30_96:
				139	lw t2,8(a1)
				140	lw t3,12(a1)
				141	lw t4,16(a1)
				142	lw t5,20(a1)
				143	lw t6,24(a1)
				144	lw t7,28(a1)
				145	pref 0,128(a1) # bring the next lines of src, addr 128
				146
				147	sw t0,0(a0)
				148	sw t1,4(a0)
				149	sw t2,8(a0)
				150	sw t3,12(a0)
				151	sw t4,16(a0)
				152	sw t5,20(a0)
				153	sw t6,24(a0)
				154	sw t7,28(a0)
				155
				156	lw t0,32(a1)
				157	bgtz v1,.Lskip_pref30_128 # skip "pref 30,128(a0)"
				158	lw t1,36(a1) # (delay slot)
				159	pref 30,128(a0) # continue setting up the dest, addr 128
				160	.Lskip_pref30_128:
				161	lw t2,40(a1)
				162	lw t3,44(a1)
				163	lw t4,48(a1)
				164	lw t5,52(a1)
				165	lw t6,56(a1)
				166	lw t7,60(a1)
				167	pref 0, 160(a1) # bring the next lines of src, addr 160
				168
				169	sw t0,32(a0)
				170	sw t1,36(a0)
				171	sw t2,40(a0)
				172	sw t3,44(a0)
				173	sw t4,48(a0)
				174	sw t5,52(a0)
				175	sw t6,56(a0)
				176	sw t7,60(a0)
				177
				178	addiu a0,a0,64 # adding 64 to dest
				179	sgtu v1,a0,t9 # refresh the "skip pref 30" flag for the next iteration
				180	bne a0,a3,.Lloop16w # loop until dst reaches the end of the 64-byte chunks
				181	addiu a1,a1,64 # (delay slot) adding 64 to src
				182	move a2,t8 # a2 = bytes left after the 64-byte chunks
				183
				184	# Here we have src and dest word-aligned but less than 64-bytes to go
				185
				186	.Lchk8w:
				187	pref 0, 0x0(a1)
				188	andi t8,a2,0x1f # is there a 32-byte chunk?
				189	# the t8 is the remainder count past 32-bytes
				190	beq a2,t8,.Lchk1w # when a2=t8, no 32-byte chunk
				191	nop
				192
				193	lw t0,0(a1)
				194	lw t1,4(a1)
				195	lw t2,8(a1)
				196	lw t3,12(a1)
				197	lw t4,16(a1)
				198	lw t5,20(a1)
				199	lw t6,24(a1)
				200	lw t7,28(a1)
				201	addiu a1,a1,32
				202
				203	sw t0,0(a0)
				204	sw t1,4(a0)
				205	sw t2,8(a0)
				206	sw t3,12(a0)
				207	sw t4,16(a0)
				208	sw t5,20(a0)
				209	sw t6,24(a0)
				210	sw t7,28(a0)
				211	addiu a0,a0,32
				212
				213	.Lchk1w:
				214	andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
				215	beq a2,t8,.Llast8 # no whole words left
				216	subu a3,t8,a2 # (delay slot) a3 is count of bytes in 1w chunks
				217	addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
				218
				219	# copying in words (4-byte chunks)
				220	.LwordCopy_loop:
				221	lw t3,0(a1) # the first t3 may be equal t0 ... optimize?
				222	addiu a1,a1,4
				223	addiu a0,a0,4
				224	bne a0,a3,.LwordCopy_loop
				225	sw t3,-4(a0) # (delay slot) store to the pre-increment dst
				226
				227	# For the last (<8) bytes
				228	.Llast8:
				229	blez a2,.Lleave # nothing left to copy
				230	addu a3,a0,a2 # (delay slot) a3 is the dst address just past the last byte
				231	.Llast8loop:
				232	lb v1,0(a1)
				233	addiu a1,a1,1
				234	addiu a0,a0,1
				235	bne a0,a3,.Llast8loop
				236	sb v1,-1(a0) # (delay slot) store to the pre-increment dst
				237
				238	.Lleave:
				239	j ra # return; v0 was set to the original dst at .Lmemcpy
				240	nop
				241
				242	#
				243	# UNALIGNED case
				244	#
				245
				246	.Lunaligned:
				247	# got here with a3="negu a0"
				248	andi a3,a3,0x3 # test if the a0 is word aligned
				249	beqz a3,.Lua_chk16w
				250	subu a2,a2,a3 # (delay slot) bytes left after initial a3 bytes
				251
				252	LWHI v1,0(a1) # LWHI+LWLO together load one word from the unaligned src
				253	LWLO v1,3(a1)
				254	addu a1,a1,a3 # a3 may be here 1, 2 or 3
				255	SWHI v1,0(a0) # partial-word store of the leading bytes up to the dst word boundary
				256	addu a0,a0,a3 # below the dst will be word aligned (NOTE1)
				257
				258	.Lua_chk16w:
				259	andi t8,a2,0x3f # any whole 64-byte chunks?
				260	# t8 is the byte count after 64-byte chunks
				261	beq a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
				262	# There will be at most 1 32-byte chunk after it
				263	subu a3,a2,t8 # (delay slot) subtract from a2 the remainder
				264	# Here a3 counts bytes in 16w chunks
				265	addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
				266
				267	addu t0,a0,a2 # t0 is the "past the end" address
				268
				269	subu t9,t0,160 # t9 is the "last safe pref 30,128(a0)" address
				270
				271	pref 0,0(a1) # bring the first line of src, addr 0
				272	pref 0,32(a1) # bring the second line of src, addr 32
				273	pref 0,64(a1) # bring the third line of src, addr 64
				274	pref 30,32(a0) # safe, as we have at least 64 bytes ahead
				275	# In case the a0 > t9 don't use "pref 30" at all
				276	sgtu v1,a0,t9 # v1 = 1 when a0 is already past the last safe address
				277	bgtz v1,.Lua_loop16w # skip "pref 30,64(a0)" for too short arrays
				278	nop
				279	# otherwise, start with using pref30
				280	pref 30,64(a0)
				281	.Lua_loop16w:
				282	pref 0,96(a1)
				283	LWHI t0,0(a1)
				284	LWLO t0,3(a1)
				285	LWHI t1,4(a1)
				286	bgtz v1,.Lua_skip_pref30_96 # skip "pref 30,96(a0)"
				287	LWLO t1,7(a1) # (delay slot)
				288	pref 30,96(a0) # continue setting up the dest, addr 96
				289	.Lua_skip_pref30_96:
				290	LWHI t2,8(a1)
				291	LWLO t2,11(a1)
				292	LWHI t3,12(a1)
				293	LWLO t3,15(a1)
				294	LWHI t4,16(a1)
				295	LWLO t4,19(a1)
				296	LWHI t5,20(a1)
				297	LWLO t5,23(a1)
				298	LWHI t6,24(a1)
				299	LWLO t6,27(a1)
				300	LWHI t7,28(a1)
				301	LWLO t7,31(a1)
				302	pref 0,128(a1) # bring the next lines of src, addr 128
				303
				304	sw t0,0(a0)
				305	sw t1,4(a0)
				306	sw t2,8(a0)
				307	sw t3,12(a0)
				308	sw t4,16(a0)
				309	sw t5,20(a0)
				310	sw t6,24(a0)
				311	sw t7,28(a0)
				312
				313	LWHI t0,32(a1)
				314	LWLO t0,35(a1)
				315	LWHI t1,36(a1)
				316	bgtz v1,.Lua_skip_pref30_128 # skip "pref 30,128(a0)"
				317	LWLO t1,39(a1) # (delay slot)
				318	pref 30,128(a0) # continue setting up the dest, addr 128
				319	.Lua_skip_pref30_128:
				320	LWHI t2,40(a1)
				321	LWLO t2,43(a1)
				322	LWHI t3,44(a1)
				323	LWLO t3,47(a1)
				324	LWHI t4,48(a1)
				325	LWLO t4,51(a1)
				326	LWHI t5,52(a1)
				327	LWLO t5,55(a1)
				328	LWHI t6,56(a1)
				329	LWLO t6,59(a1)
				330	LWHI t7,60(a1)
				331	LWLO t7,63(a1)
				332	pref 0, 160(a1) # bring the next lines of src, addr 160
				333
				334	sw t0,32(a0)
				335	sw t1,36(a0)
				336	sw t2,40(a0)
				337	sw t3,44(a0)
				338	sw t4,48(a0)
				339	sw t5,52(a0)
				340	sw t6,56(a0)
				341	sw t7,60(a0)
				342
				343	addiu a0,a0,64 # adding 64 to dest
				344	sgtu v1,a0,t9 # refresh the "skip pref 30" flag for the next iteration
				345	bne a0,a3,.Lua_loop16w # loop until dst reaches the end of the 64-byte chunks
				346	addiu a1,a1,64 # (delay slot) adding 64 to src
				347	move a2,t8 # a2 = bytes left after the 64-byte chunks
				348
				349	# Here the dest is word-aligned (the src is not) and less than 64 bytes remain
				350
				351	.Lua_chk8w:
				352	pref 0, 0x0(a1)
				353	andi t8,a2,0x1f # is there a 32-byte chunk?
				354	# the t8 is the remainder count
				355	beq a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
				356	nop
				357
				358	LWHI t0,0(a1)
				359	LWLO t0,3(a1)
				360	LWHI t1,4(a1)
				361	LWLO t1,7(a1)
				362	LWHI t2,8(a1)
				363	LWLO t2,11(a1)
				364	LWHI t3,12(a1)
				365	LWLO t3,15(a1)
				366	LWHI t4,16(a1)
				367	LWLO t4,19(a1)
				368	LWHI t5,20(a1)
				369	LWLO t5,23(a1)
				370	LWHI t6,24(a1)
				371	LWLO t6,27(a1)
				372	LWHI t7,28(a1)
				373	LWLO t7,31(a1)
				374	addiu a1,a1,32
				375
				376	sw t0,0(a0)
				377	sw t1,4(a0)
				378	sw t2,8(a0)
				379	sw t3,12(a0)
				380	sw t4,16(a0)
				381	sw t5,20(a0)
				382	sw t6,24(a0)
				383	sw t7,28(a0)
				384	addiu a0,a0,32
				385
				386	.Lua_chk1w:
				387	andi a2,t8,0x3 # now a2 is the remainder past 1w chunks
				388	beq a2,t8,.Lua_smallCopy # no whole words left
				389	subu a3,t8,a2 # (delay slot) a3 is count of bytes in 1w chunks
				390	addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
				391
				392	# copying in words (4-byte chunks)
				393	.Lua_wordCopy_loop:
				394	LWHI v1,0(a1)
				395	LWLO v1,3(a1)
				396	addiu a1,a1,4
				397	addiu a0,a0,4 # note: dst=a0 is word aligned here, see NOTE1
				398	bne a0,a3,.Lua_wordCopy_loop
				399	sw v1,-4(a0) # (delay slot) store to the pre-increment dst
				400
				401	# Now less than 4 bytes (value in a2) left to copy
				402	.Lua_smallCopy:
				403	beqz a2,.Lleave # nothing left to copy
				404	addu a3,a0,a2 # (delay slot) a3 is the dst address just past the last byte
				405	.Lua_smallCopy_loop:
				406	lb v1,0(a1)
				407	addiu a1,a1,1
				408	addiu a0,a0,1
				409	bne a0,a3,.Lua_smallCopy_loop
				410	sb v1,-1(a0) # (delay slot) store to the pre-increment dst
				411
				412	j ra # return; v0 was set to the original dst at .Lmemcpy
				413	nop
				414
				415	.set at # let the assembler use AT again
				416	.set reorder # let the assembler reorder and fill delay slots again
				417
				418	END(memcpy)
				419
				420
				421	/************************************************************************
				422	* Implementation : Static functions
				423	************************************************************************/