/*
 * Copyright (c) 2012-2015
 * MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef __ANDROID__
# include <private/bionic_asm.h>
# define USE_MEMMOVE_FOR_OVERLAP
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
# include <sysdep.h>
# include <regdef.h>
# include <sys/asm.h>
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _COMPILING_NEWLIB
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
# include <regdef.h>
# include <sys/asm.h>
#endif

/* Check to see if the MIPS architecture we are compiling for supports
 * prefetching.
 */

#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
# ifndef DISABLE_PREFETCH
#  define USE_PREFETCH
# endif
#endif

#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
# ifndef DISABLE_DOUBLE
#  define USE_DOUBLE
# endif
#endif


#if __mips_isa_rev > 5
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
#  undef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
# endif
# define R6_CODE
#endif
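
/*
 * Note: MIPS release 6 removes the lwl/lwr/swl/swr (and ldl/ldr/sdl/sdr)
 * partial load/store instructions, so the R6_CODE paths below handle
 * misaligned data with the align/dalign instruction instead.
 */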

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
# if _MIPS_SIM == _ABIO32
#  define L(label) $L ## label
# else
#  define L(label) .L ## label
# endif
#endif

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
# if _MIPS_SIM == _ABIO32
#  define PTR_ADDIU	addiu
# else
#  define PTR_ADDIU	daddiu
# endif
#endif

/* Some asm.h files do not have the PTR_SRA macro definition.  */
#ifndef PTR_SRA
# if _MIPS_SIM == _ABIO32
#  define PTR_SRA	sra
# else
#  define PTR_SRA	dsra
# endif
#endif

/* New R6 instructions that may not be in asm.h.  */
#ifndef PTR_LSA
# if _MIPS_SIM == _ABIO32
#  define PTR_LSA	lsa
# else
#  define PTR_LSA	dlsa
# endif
#endif
/*
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
 * prefetches appears to offer a slight performance advantage.
 *
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
 * or PREFETCH_STORE_STREAMED offers a large performance advantage
 * but PREPAREFORSTORE has some special restrictions to consider.
 *
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache, it just allocates a cache line and zeroes
 * it out.  This means that if you do not write to the entire cache
 * line before writing it out to memory some data will get zeroed out
 * when the cache line is written back to memory and data will be lost.
 *
 * Also if you are using this memcpy to copy overlapping buffers it may
 * not behave correctly when using the 'prepare for store' hint.  If you
 * use the 'prepare for store' prefetch on a memory area that is in the
 * memcpy source (as well as the memcpy destination), then you will get
 * some data zeroed out before you have a chance to read it and data will
 * be lost.
 *
 * If you are going to use this memcpy routine with the 'prepare for store'
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
 * the problem of running memcpy on overlapping buffers.
 *
 * There are ifdef'ed sections of this memcpy to make sure that it does not
 * do prefetches on cache lines that are not going to be completely written.
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
 * 32 bytes; if the cache line is larger it will not work correctly.
 */

#ifdef USE_PREFETCH
# define PREFETCH_HINT_LOAD		0
# define PREFETCH_HINT_STORE		1
# define PREFETCH_HINT_LOAD_STREAMED	4
# define PREFETCH_HINT_STORE_STREAMED	5
# define PREFETCH_HINT_LOAD_RETAINED	6
# define PREFETCH_HINT_STORE_RETAINED	7
# define PREFETCH_HINT_WRITEBACK_INVAL	25
# define PREFETCH_HINT_PREPAREFORSTORE	30

/*
 * If we have not picked out what hints to use at this point use the
 * standard load and store prefetch hints.
 */
# ifndef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
# endif
# ifndef PREFETCH_LOAD_HINT
#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
# endif

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual
 * prefetch brings in 32 bytes.
 */

# ifdef USE_DOUBLE
#  define PREFETCH_CHUNK 64
#  define PREFETCH_FOR_LOAD(chunk, reg) \
    pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
    pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
    pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
    pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
# else
#  define PREFETCH_CHUNK 32
#  define PREFETCH_FOR_LOAD(chunk, reg) \
    pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
    pref PREFETCH_STORE_HINT, (chunk)*32(reg)
# endif
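
/* For example, without USE_DOUBLE, PREFETCH_FOR_LOAD(2, a1) expands to a
 * single "pref PREFETCH_LOAD_HINT, (2)*32(a1)", i.e. a hint for the 32-byte
 * chunk 64 bytes past a1; with USE_DOUBLE it issues two pref instructions
 * covering the 64-byte chunk that starts 128 bytes past a1.  */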
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch; it must not be less
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
 * used then MAX_PREFETCH_SIZE does not matter.  */
# define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
 * we actually do two prefetches in that case, one 32 bytes after the other.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
# else
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
# endif
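/* With the values above, PREFETCH_LIMIT evaluates to (5 * 64) + 32 + 128 =
 * 480 bytes when USE_DOUBLE is set and to (5 * 32) + 128 = 288 bytes
 * otherwise.  */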
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  We start copies with an offset
 * of 4 to avoid this situation when using PREPAREFORSTORE.  */
#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
# endif
#else /* USE_PREFETCH not defined */
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
# define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
# define REG4 t4
# define REG5 t5
# define REG6 t6
# define REG7 t7
#else
# define REG4 ta0
# define REG5 ta1
# define REG6 ta2
# define REG7 ta3
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
 * The C_ prefix stands for CHUNK and is used to avoid macro name
 * conflicts with system header files.  */

#ifdef USE_DOUBLE
# define C_ST	sd
# define C_LD	ld
# if __MIPSEB
#  define C_LDHI	ldl	/* high part is left in big-endian */
#  define C_STHI	sdl	/* high part is left in big-endian */
#  define C_LDLO	ldr	/* low part is right in big-endian */
#  define C_STLO	sdr	/* low part is right in big-endian */
# else
#  define C_LDHI	ldr	/* high part is right in little-endian */
#  define C_STHI	sdr	/* high part is right in little-endian */
#  define C_LDLO	ldl	/* low part is left in little-endian */
#  define C_STLO	sdl	/* low part is left in little-endian */
# endif
# define C_ALIGN	dalign	/* r6 align instruction */
#else
# define C_ST	sw
# define C_LD	lw
# if __MIPSEB
#  define C_LDHI	lwl	/* high part is left in big-endian */
#  define C_STHI	swl	/* high part is left in big-endian */
#  define C_LDLO	lwr	/* low part is right in big-endian */
#  define C_STLO	swr	/* low part is right in big-endian */
# else
#  define C_LDHI	lwr	/* high part is right in little-endian */
#  define C_STHI	swr	/* high part is right in little-endian */
#  define C_LDLO	lwl	/* low part is left in little-endian */
#  define C_STLO	swl	/* low part is left in little-endian */
# endif
# define C_ALIGN	align	/* r6 align instruction */
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8
# define NSIZEMASK 0x3f
# define NSIZEDMASK 0x7f
#else
# define NSIZE 4
# define NSIZEMASK 0x1f
# define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)
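/* For example, with USE_DOUBLE (NSIZE == 8) UNIT(3) is byte offset 24 and
 * UNITM1(3) is 23; without USE_DOUBLE (NSIZE == 4) they are 12 and 11.  */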

#ifdef __ANDROID__
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
        .set    nomips16
        .set    noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage.  We call memmove to handle such cases.
 */
#ifdef USE_MEMMOVE_FOR_OVERLAP
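/* Compute the distance between dst and src, i.e. |a0 - a1|, without a
 * branch (xor/subtract with the sign word in t2).  If that distance is
 * smaller than the byte count a2, the buffers overlap and we tail-call
 * memmove instead of copying here.  */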
        PTR_SUBU t0,a0,a1
        PTR_SRA t2,t0,31
        xor     t1,t0,t2
        PTR_SUBU t0,t1,t2
        sltu    t2,t0,a2
        beq     t2,zero,L(memcpy)
        nop
#if defined(__LP64__)
        daddiu  sp,sp,-8
        SETUP_GP64(0,MEMCPY_NAME)
        LA      t9,memmove
        RESTORE_GP64
        jr      t9
        daddiu  sp,sp,8
#else
        LA      t9,memmove
        jr      t9
        nop
#endif
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
        slti    t2,a2,(2 * NSIZE)
        bne     t2,zero,L(lastb)
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
        move    v0,zero
#else
        move    v0,a0
#endif

#ifndef R6_CODE

/*
 * If src and dst have different alignments, go to L(unaligned); if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
        xor     t8,a1,a0
        andi    t8,t8,(NSIZE-1)         /* t8 is a0/a1 word-displacement */
        bne     t8,zero,L(unaligned)
        PTR_SUBU a3, zero, a0

        andi    a3,a3,(NSIZE-1)         /* copy a3 bytes to align a0/a1 */
        beq     a3,zero,L(aligned)      /* if a3=0, it is already aligned */
        PTR_SUBU a2,a2,a3               /* a2 is the remaining bytes count */

        C_LDHI  t8,0(a1)
        PTR_ADDU a1,a1,a3
        C_STHI  t8,0(a0)
        PTR_ADDU a0,a0,a3
#else /* R6_CODE */

/*
 * Align the destination and hope that the source gets aligned too.  If it
 * doesn't, we jump to L(r6_unaligned*) to do unaligned copies using the r6
 * align instruction.
 */
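/* The low three bits of the destination index a table of compact branches:
 * a misalignment of k dispatches to L(lb<8-k>), which copies the leading
 * 8-k bytes one at a time (the labels fall through into each other), after
 * which a0/a1/a2 are adjusted and the destination is 8-byte aligned.  */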
        andi    t8,a0,7
        lapc    t9,L(atable)
        PTR_LSA t9,t8,t9,2
        jrc     t9
L(atable):
        bc      L(lb0)
        bc      L(lb7)
        bc      L(lb6)
        bc      L(lb5)
        bc      L(lb4)
        bc      L(lb3)
        bc      L(lb2)
        bc      L(lb1)
L(lb7):
        lb      a3, 6(a1)
        sb      a3, 6(a0)
L(lb6):
        lb      a3, 5(a1)
        sb      a3, 5(a0)
L(lb5):
        lb      a3, 4(a1)
        sb      a3, 4(a0)
L(lb4):
        lb      a3, 3(a1)
        sb      a3, 3(a0)
L(lb3):
        lb      a3, 2(a1)
        sb      a3, 2(a0)
L(lb2):
        lb      a3, 1(a1)
        sb      a3, 1(a0)
L(lb1):
        lb      a3, 0(a1)
        sb      a3, 0(a0)

        li      t9,8
        subu    t8,t9,t8
        PTR_SUBU a2,a2,t8
        PTR_ADDU a0,a0,t8
        PTR_ADDU a1,a1,t8
L(lb0):

        andi    t8,a1,(NSIZE-1)
        lapc    t9,L(jtable)
        PTR_LSA t9,t8,t9,2
        jrc     t9
L(jtable):
        bc      L(aligned)
        bc      L(r6_unaligned1)
        bc      L(r6_unaligned2)
        bc      L(r6_unaligned3)
# ifdef USE_DOUBLE
        bc      L(r6_unaligned4)
        bc      L(r6_unaligned5)
        bc      L(r6_unaligned6)
        bc      L(r6_unaligned7)
# endif
#endif /* R6_CODE */

L(aligned):

/*
 * Now dst/src are both aligned to (word or double word) boundaries.  Set a2
 * to count how many bytes we have to copy after all the 64/128 byte chunks
 * are copied and a3 to the dst pointer after all the 64/128 byte chunks
 * have been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */

        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
        beq     a2,t8,L(chkw)    /* if a2==t8, no 64-byte/128-byte chunks */
        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */

/* When in the loop we may prefetch with the 'prepare to store' hint; in
 * that case a0+x should not be past the "t0-32" address.  This means:
 * for x=128 the last "safe" a0 address is "t0-160".  Alternatively, for
 * x=64 the last "safe" a0 address is "t0-96".  In the current version we
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
 */
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
        PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
#endif
        PREFETCH_FOR_LOAD  (0, a1)
        PREFETCH_FOR_LOAD  (1, a1)
        PREFETCH_FOR_LOAD  (2, a1)
        PREFETCH_FOR_LOAD  (3, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PREFETCH_FOR_STORE (1, a0)
        PREFETCH_FOR_STORE (2, a0)
        PREFETCH_FOR_STORE (3, a0)
#endif
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
        sltu    v1,t9,a0
        bgtz    v1,L(skip_set)
        nop
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(skip_set):
# else
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
# endif
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
# ifdef USE_DOUBLE
        PTR_ADDIU v0,v0,32
# endif
#endif
L(loop16w):
        C_LD    t0,UNIT(0)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0                /* If a0 > t9 don't use next prefetch */
        bgtz    v1,L(skip_pref)
#endif
        C_LD    t1,UNIT(1)(a1)
#ifndef R6_CODE
        PREFETCH_FOR_STORE (4, a0)
        PREFETCH_FOR_STORE (5, a0)
#else
        PREFETCH_FOR_STORE (2, a0)
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
# ifdef USE_DOUBLE
        PTR_ADDIU v0,v0,32
# endif
#endif
L(skip_pref):
        C_LD    REG2,UNIT(2)(a1)
        C_LD    REG3,UNIT(3)(a1)
        C_LD    REG4,UNIT(4)(a1)
        C_LD    REG5,UNIT(5)(a1)
        C_LD    REG6,UNIT(6)(a1)
        C_LD    REG7,UNIT(7)(a1)
#ifndef R6_CODE
        PREFETCH_FOR_LOAD (4, a1)
#else
        PREFETCH_FOR_LOAD (3, a1)
#endif
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)

        C_LD    t0,UNIT(8)(a1)
        C_LD    t1,UNIT(9)(a1)
        C_LD    REG2,UNIT(10)(a1)
        C_LD    REG3,UNIT(11)(a1)
        C_LD    REG4,UNIT(12)(a1)
        C_LD    REG5,UNIT(13)(a1)
        C_LD    REG6,UNIT(14)(a1)
        C_LD    REG7,UNIT(15)(a1)
#ifndef R6_CODE
        PREFETCH_FOR_LOAD (5, a1)
#endif
        C_ST    t0,UNIT(8)(a0)
        C_ST    t1,UNIT(9)(a0)
        C_ST    REG2,UNIT(10)(a0)
        C_ST    REG3,UNIT(11)(a0)
        C_ST    REG4,UNIT(12)(a0)
        C_ST    REG5,UNIT(13)(a0)
        C_ST    REG6,UNIT(14)(a0)
        C_ST    REG7,UNIT(15)(a0)
        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
        bne     a0,a3,L(loop16w)
        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
        move    a2,t8

/* Here we have src and dest word-aligned but less than 64 bytes (or
 * 128 bytes) to go.  Check for a 32 (or 64) byte chunk and copy it if
 * there is one.  Otherwise jump down to L(chk1w) to handle the tail end
 * of the copy.
 */

L(chkw):
        PREFETCH_FOR_LOAD (0, a1)
        andi    t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk?  */
                                /* t8 is the remainder count past 32-bytes */
        beq     a2,t8,L(chk1w)  /* When a2=t8, no 32-byte chunk */
        nop
        C_LD    t0,UNIT(0)(a1)
        C_LD    t1,UNIT(1)(a1)
        C_LD    REG2,UNIT(2)(a1)
        C_LD    REG3,UNIT(3)(a1)
        C_LD    REG4,UNIT(4)(a1)
        C_LD    REG5,UNIT(5)(a1)
        C_LD    REG6,UNIT(6)(a1)
        C_LD    REG7,UNIT(7)(a1)
        PTR_ADDIU a1,a1,UNIT(8)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
        beq     a2,t8,L(lastb)
        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
        C_LD    REG3,UNIT(0)(a1)
        PTR_ADDIU a0,a0,UNIT(1)
        PTR_ADDIU a1,a1,UNIT(1)
        bne     a0,a3,L(wordCopy_loop)
        C_ST    REG3,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(lastb):
        blez    a2,L(leave)
        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
L(lastbloop):
        lb      v1,0(a1)
        PTR_ADDIU a0,a0,1
        PTR_ADDIU a1,a1,1
        bne     a0,a3,L(lastbloop)
        sb      v1,-1(a0)
L(leave):
        j       ra
        nop

#ifndef R6_CODE
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
        andi    a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
        beqz    a3,L(ua_chk16w) /* if a3=0, it is already aligned */
        PTR_SUBU a2,a2,a3       /* a2 is the remaining bytes count */

        C_LDHI  v1,UNIT(0)(a1)
        C_LDLO  v1,UNITM1(1)(a1)
        PTR_ADDU a1,a1,a3
        C_STHI  v1,UNIT(0)(a0)
        PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned.  Set a2 to count
 * how many bytes we have to copy after all the 64/128 byte chunks are
 * copied and a3 to the dst pointer after all the 64/128 byte chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */

L(ua_chk16w):
        andi    t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
        beq     a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
        PTR_SUBU a3,a2,t8        /* subtract from a2 the remainder */
        PTR_ADDU a3,a0,a3        /* Now a3 is the final dst after loop */

# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        PTR_ADDU t0,a0,a2               /* t0 is the "past the end" address */
        PTR_SUBU t9,t0,PREFETCH_LIMIT   /* t9 is the "last safe pref" address */
# endif
        PREFETCH_FOR_LOAD  (0, a1)
        PREFETCH_FOR_LOAD  (1, a1)
        PREFETCH_FOR_LOAD  (2, a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
        PREFETCH_FOR_STORE (1, a0)
        PREFETCH_FOR_STORE (2, a0)
        PREFETCH_FOR_STORE (3, a0)
# endif
# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0
        bgtz    v1,L(ua_skip_set)
        nop
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(ua_skip_set):
#  else
        PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#  endif
# endif
L(ua_loop16w):
        PREFETCH_FOR_LOAD (3, a1)
        C_LDHI  t0,UNIT(0)(a1)
        C_LDHI  t1,UNIT(1)(a1)
        C_LDHI  REG2,UNIT(2)(a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
        sltu    v1,t9,a0
        bgtz    v1,L(ua_skip_pref)
# endif
        C_LDHI  REG3,UNIT(3)(a1)
        PREFETCH_FOR_STORE (4, a0)
        PREFETCH_FOR_STORE (5, a0)
L(ua_skip_pref):
        C_LDHI  REG4,UNIT(4)(a1)
        C_LDHI  REG5,UNIT(5)(a1)
        C_LDHI  REG6,UNIT(6)(a1)
        C_LDHI  REG7,UNIT(7)(a1)
        C_LDLO  t0,UNITM1(1)(a1)
        C_LDLO  t1,UNITM1(2)(a1)
        C_LDLO  REG2,UNITM1(3)(a1)
        C_LDLO  REG3,UNITM1(4)(a1)
        C_LDLO  REG4,UNITM1(5)(a1)
        C_LDLO  REG5,UNITM1(6)(a1)
        C_LDLO  REG6,UNITM1(7)(a1)
        C_LDLO  REG7,UNITM1(8)(a1)
        PREFETCH_FOR_LOAD (4, a1)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        C_LDHI  t0,UNIT(8)(a1)
        C_LDHI  t1,UNIT(9)(a1)
        C_LDHI  REG2,UNIT(10)(a1)
        C_LDHI  REG3,UNIT(11)(a1)
        C_LDHI  REG4,UNIT(12)(a1)
        C_LDHI  REG5,UNIT(13)(a1)
        C_LDHI  REG6,UNIT(14)(a1)
        C_LDHI  REG7,UNIT(15)(a1)
        C_LDLO  t0,UNITM1(9)(a1)
        C_LDLO  t1,UNITM1(10)(a1)
        C_LDLO  REG2,UNITM1(11)(a1)
        C_LDLO  REG3,UNITM1(12)(a1)
        C_LDLO  REG4,UNITM1(13)(a1)
        C_LDLO  REG5,UNITM1(14)(a1)
        C_LDLO  REG6,UNITM1(15)(a1)
        C_LDLO  REG7,UNITM1(16)(a1)
        PREFETCH_FOR_LOAD (5, a1)
        C_ST    t0,UNIT(8)(a0)
        C_ST    t1,UNIT(9)(a0)
        C_ST    REG2,UNIT(10)(a0)
        C_ST    REG3,UNIT(11)(a0)
        C_ST    REG4,UNIT(12)(a0)
        C_ST    REG5,UNIT(13)(a0)
        C_ST    REG6,UNIT(14)(a0)
        C_ST    REG7,UNIT(15)(a0)
        PTR_ADDIU a0,a0,UNIT(16)        /* adding 64/128 to dest */
        bne     a0,a3,L(ua_loop16w)
        PTR_ADDIU a1,a1,UNIT(16)        /* adding 64/128 to src */
        move    a2,t8

/* Here we have the dst word-aligned but less than 64 bytes (or 128 bytes)
 * to go.  Check for a 32 (or 64) byte chunk and copy it if there is one.
 * Otherwise jump down to L(ua_chk1w) to handle the tail end of the copy.  */

L(ua_chkw):
        PREFETCH_FOR_LOAD (0, a1)
        andi    t8,a2,NSIZEMASK   /* Is there a 32-byte/64-byte chunk?  */
                                  /* t8 is the remainder count past 32-bytes */
        beq     a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
        nop
        C_LDHI  t0,UNIT(0)(a1)
        C_LDHI  t1,UNIT(1)(a1)
        C_LDHI  REG2,UNIT(2)(a1)
        C_LDHI  REG3,UNIT(3)(a1)
        C_LDHI  REG4,UNIT(4)(a1)
        C_LDHI  REG5,UNIT(5)(a1)
        C_LDHI  REG6,UNIT(6)(a1)
        C_LDHI  REG7,UNIT(7)(a1)
        C_LDLO  t0,UNITM1(1)(a1)
        C_LDLO  t1,UNITM1(2)(a1)
        C_LDLO  REG2,UNITM1(3)(a1)
        C_LDLO  REG3,UNITM1(4)(a1)
        C_LDLO  REG4,UNITM1(5)(a1)
        C_LDLO  REG5,UNITM1(6)(a1)
        C_LDLO  REG6,UNITM1(7)(a1)
        C_LDLO  REG7,UNITM1(8)(a1)
        PTR_ADDIU a1,a1,UNIT(8)
        C_ST    t0,UNIT(0)(a0)
        C_ST    t1,UNIT(1)(a0)
        C_ST    REG2,UNIT(2)(a0)
        C_ST    REG3,UNIT(3)(a0)
        C_ST    REG4,UNIT(4)(a0)
        C_ST    REG5,UNIT(5)(a0)
        C_ST    REG6,UNIT(6)(a0)
        C_ST    REG7,UNIT(7)(a0)
        PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
        andi    a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
        beq     a2,t8,L(ua_smallCopy)
        PTR_SUBU a3,t8,a2       /* a3 is count of bytes in one (d)word chunks */
        PTR_ADDU a3,a0,a3       /* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
        C_LDHI  v1,UNIT(0)(a1)
        C_LDLO  v1,UNITM1(1)(a1)
        PTR_ADDIU a0,a0,UNIT(1)
        PTR_ADDIU a1,a1,UNIT(1)
        bne     a0,a3,L(ua_wordCopy_loop)
        C_ST    v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
        beqz    a2,L(leave)
        PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
L(ua_smallCopy_loop):
        lb      v1,0(a1)
        PTR_ADDIU a0,a0,1
        PTR_ADDIU a1,a1,1
        bne     a0,a3,L(ua_smallCopy_loop)
        sb      v1,-1(a0)

        j       ra
        nop

#else /* R6_CODE */

# if __MIPSEB
#  define SWAP_REGS(X,Y) X, Y
#  define ALIGN_OFFSET(N) (N)
# else
#  define SWAP_REGS(X,Y) Y, X
#  define ALIGN_OFFSET(N) (NSIZE-N)
# endif
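
/* The copy macro below keeps the source reads aligned: each iteration loads
 * the two adjacent aligned (d)words that straddle the wanted data, then uses
 * the R6 align/dalign instruction (via C_ALIGN, with SWAP_REGS and
 * ALIGN_OFFSET selecting operand order and byte offset for the current
 * endianness) to extract one destination-sized chunk, which is then written
 * with a normal aligned store.  */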
# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
        andi    REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
        beq     REG7, a2, L(lastb); /* Check for bytes to copy by word */      \
        PTR_SUBU a3, a2, REG7;  /* a3 is number of bytes to be copied in */    \
                                /* (d)word chunks.  */                         \
        move    a2, REG7;       /* a2 is # of bytes to copy byte by byte */    \
                                /* after word loop is finished.  */            \
        PTR_ADDU REG6, a0, a3;  /* REG6 is the dst address after loop.  */     \
        PTR_SUBU REG2, a1, t8;  /* REG2 is the aligned src address.  */        \
        PTR_ADDU a1, a1, a3;    /* a1 is addr of source after word loop.  */   \
        C_LD    t0, UNIT(0)(REG2);  /* Load first part of source.  */          \
L(r6_ua_wordcopy##BYTEOFFSET):                                                 \
        C_LD    t1, UNIT(1)(REG2);  /* Load second part of source.  */         \
        C_ALIGN REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET);              \
        PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.  */      \
        PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.  */ \
        move    t0, t1;         /* Move second part of source to first.  */    \
        bne     a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);                        \
        C_ST    REG3, UNIT(-1)(a0);                                            \
        j       L(lastb);                                                      \
        nop

/* We are generating R6 code, the destination is (d)word aligned and the
   source is not.  t8 holds the byte misalignment of the source
   (1 to NSIZE-1).  */

L(r6_unaligned1):
        R6_UNALIGNED_WORD_COPY(1)
L(r6_unaligned2):
        R6_UNALIGNED_WORD_COPY(2)
L(r6_unaligned3):
        R6_UNALIGNED_WORD_COPY(3)
# ifdef USE_DOUBLE
L(r6_unaligned4):
        R6_UNALIGNED_WORD_COPY(4)
L(r6_unaligned5):
        R6_UNALIGNED_WORD_COPY(5)
L(r6_unaligned6):
        R6_UNALIGNED_WORD_COPY(6)
L(r6_unaligned7):
        R6_UNALIGNED_WORD_COPY(7)
# endif
#endif /* R6_CODE */

        .set    at
        .set    reorder
END(MEMCPY_NAME)
#ifndef __ANDROID__
# ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
# endif
#endif