/*
 * Copyright (c) 2012-2015
 * MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef __ANDROID__
# include <private/bionic_asm.h>
# define USE_MEMMOVE_FOR_OVERLAP
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _LIBC
# include <sysdep.h>
# include <regdef.h>
# include <sys/asm.h>
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#elif _COMPILING_NEWLIB
# include "machine/asm.h"
# include "machine/regdef.h"
# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
# define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
#else
# include <regdef.h>
# include <sys/asm.h>
#endif

/* Check to see if the MIPS architecture we are compiling for supports
 * prefetching.
 */

#if (__mips == 4) || (__mips == 5) || (__mips == 32) || (__mips == 64)
# ifndef DISABLE_PREFETCH
#  define USE_PREFETCH
# endif
#endif

#if defined(_MIPS_SIM) && ((_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32))
# ifndef DISABLE_DOUBLE
#  define USE_DOUBLE
# endif
#endif


#if __mips_isa_rev > 5
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
#  undef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
# endif
# define R6_CODE
#endif

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
# if _MIPS_SIM == _ABIO32
#  define L(label) $L ## label
# else
#  define L(label) .L ## label
# endif
#endif

/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
# if _MIPS_SIM == _ABIO32
#  define PTR_ADDIU	addiu
# else
#  define PTR_ADDIU	daddiu
# endif
#endif

/* Some asm.h files do not have the PTR_SRA macro definition.  */
#ifndef PTR_SRA
# if _MIPS_SIM == _ABIO32
#  define PTR_SRA	sra
# else
#  define PTR_SRA	dsra
# endif
#endif

/* New R6 instructions that may not be in asm.h.  */
#ifndef PTR_LSA
# if _MIPS_SIM == _ABIO32
#  define PTR_LSA	lsa
# else
#  define PTR_LSA	dlsa
# endif
#endif

/*
 * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
 * prefetches appears to offer a slight performance advantage.
 *
 * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
 * or PREFETCH_STORE_STREAMED offers a large performance advantage,
 * but PREPAREFORSTORE has some special restrictions to consider.
 *
 * Prefetch with the 'prepare for store' hint does not copy a memory
 * location into the cache, it just allocates a cache line and zeros
 * it out.  This means that if you do not write to the entire cache
 * line before writing it out to memory some data will get zeroed out
 * when the cache line is written back to memory and data will be lost.
 *
 * Also, if you are using this memcpy to copy overlapping buffers it may
 * not behave correctly when using the 'prepare for store' hint.  If you
 * use the 'prepare for store' prefetch on a memory area that is in the
 * memcpy source (as well as the memcpy destination), then you will get
 * some data zeroed out before you have a chance to read it and data will
 * be lost.
 *
 * If you are going to use this memcpy routine with the 'prepare for store'
 * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
 * the problem of running memcpy on overlapping buffers.
 *
 * There are ifdef'ed sections of this memcpy to make sure that it does not
 * do prefetches on cache lines that are not going to be completely written.
 * This code is only needed and only used when PREFETCH_STORE_HINT is set to
 * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
 * 32 bytes and if the cache line is larger it will not work correctly.
 */
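/*
 * Illustrative sketch of the hazard above (assuming the 32-byte cache line
 * mentioned in this file; the byte counts are made up for the example):
 *
 *   prepare-for-store prefetch of line X  -> line X is allocated and zeroed
 *   store only 24 of the 32 bytes of X    -> 8 bytes of X are still zero
 *   line X is written back to memory      -> those 8 bytes of data are lost
 *
 * This is why the loops below only issue PREPAREFORSTORE prefetches for
 * cache lines they are guaranteed to fill completely.
 */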

#ifdef USE_PREFETCH
# define PREFETCH_HINT_LOAD		0
# define PREFETCH_HINT_STORE		1
# define PREFETCH_HINT_LOAD_STREAMED	4
# define PREFETCH_HINT_STORE_STREAMED	5
# define PREFETCH_HINT_LOAD_RETAINED	6
# define PREFETCH_HINT_STORE_RETAINED	7
# define PREFETCH_HINT_WRITEBACK_INVAL	25
# define PREFETCH_HINT_PREPAREFORSTORE	30

/*
 * If we have not picked out what hints to use at this point use the
 * standard load and store prefetch hints.
 */
# ifndef PREFETCH_STORE_HINT
#  define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
# endif
# ifndef PREFETCH_LOAD_HINT
#  define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
# endif

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual
 * prefetch brings in 32 bytes.
 */

# ifdef USE_DOUBLE
#  define PREFETCH_CHUNK 64
#  define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \
	pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREFETCH_STORE_HINT, (chunk)*64(reg); \
	pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg)
# else
#  define PREFETCH_CHUNK 32
#  define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
#  define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREFETCH_STORE_HINT, (chunk)*32(reg)
# endif
/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less
 * than PREFETCH_CHUNK, the assumed size of each prefetch.  If the real size
 * of a prefetch is greater than MAX_PREFETCH_SIZE and the PREPAREFORSTORE
 * hint is used, the code will not work correctly.  If PREPAREFORSTORE is not
 * used then MAX_PREFETCH_SIZE does not matter.  */
# define MAX_PREFETCH_SIZE 128
/* PREFETCH_LIMIT is set based on the fact that we never use an offset greater
 * than 5 on a STORE prefetch and that a single prefetch can never be larger
 * than MAX_PREFETCH_SIZE.  We add the extra 32 when USE_DOUBLE is set because
 * we actually do two prefetches in that case, one 32 bytes after the other.  */
# ifdef USE_DOUBLE
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE
# else
#  define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE
# endif
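/*
 * For reference, with the definitions above this works out to:
 *   with USE_DOUBLE:     PREFETCH_LIMIT = 5*64 + 32 + 128 = 480 bytes
 *   without USE_DOUBLE:  PREFETCH_LIMIT = 5*32 + 128      = 288 bytes
 */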
# if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \
    && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE)
/* We cannot handle this because the initial prefetches may fetch bytes that
 * are before the buffer being copied.  We start copies with an offset
 * of 4 to avoid this situation when using PREPAREFORSTORE.  */
#  error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small."
# endif
#else /* USE_PREFETCH not defined */
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif
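/*
 * As a concrete example of the macros above (illustrative only, and only
 * when USE_PREFETCH is in effect): PREFETCH_FOR_STORE (4, a0) expands to
 *   pref PREFETCH_STORE_HINT, 128(a0)
 * without USE_DOUBLE, and to
 *   pref PREFETCH_STORE_HINT, 256(a0); pref PREFETCH_STORE_HINT, 288(a0)
 * with USE_DOUBLE, i.e. it always prefetches the chunk 4*PREFETCH_CHUNK
 * bytes ahead of a0.
 */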

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
# define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#if defined(_MIPS_SIM) && (_MIPS_SIM == _ABIO32 || _MIPS_SIM == _ABIO64)
# define REG4 t4
# define REG5 t5
# define REG6 t6
# define REG7 t7
#else
# define REG4 ta0
# define REG5 ta1
# define REG6 ta2
# define REG7 ta3
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.
 * The C_ prefix stands for CHUNK and is used to avoid macro name
 * conflicts with system header files.  */

#ifdef USE_DOUBLE
# define C_ST	sd
# define C_LD	ld
# if __MIPSEB
#  define C_LDHI	ldl	/* high part is left in big-endian */
#  define C_STHI	sdl	/* high part is left in big-endian */
#  define C_LDLO	ldr	/* low part is right in big-endian */
#  define C_STLO	sdr	/* low part is right in big-endian */
# else
#  define C_LDHI	ldr	/* high part is right in little-endian */
#  define C_STHI	sdr	/* high part is right in little-endian */
#  define C_LDLO	ldl	/* low part is left in little-endian */
#  define C_STLO	sdl	/* low part is left in little-endian */
# endif
# define C_ALIGN	dalign	/* r6 align instruction */
#else
# define C_ST	sw
# define C_LD	lw
# if __MIPSEB
#  define C_LDHI	lwl	/* high part is left in big-endian */
#  define C_STHI	swl	/* high part is left in big-endian */
#  define C_LDLO	lwr	/* low part is right in big-endian */
#  define C_STLO	swr	/* low part is right in big-endian */
# else
#  define C_LDHI	lwr	/* high part is right in little-endian */
#  define C_STHI	swr	/* high part is right in little-endian */
#  define C_LDLO	lwl	/* low part is left in little-endian */
#  define C_STLO	swl	/* low part is left in little-endian */
# endif
# define C_ALIGN	align	/* r6 align instruction */
#endif
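/*
 * For orientation (a sketch, not new behavior): the unaligned paths below
 * load one possibly-unaligned (d)word with the pair
 *   C_LDHI reg, 0(p)
 *   C_LDLO reg, NSIZE-1(p)
 * where each instruction supplies the part of the (d)word on its side of
 * the alignment boundary and the hardware merges the two into reg.
 */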

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8
# define NSIZEMASK 0x3f
# define NSIZEDMASK 0x7f
#else
# define NSIZE 4
# define NSIZEMASK 0x1f
# define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)
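/*
 * With these definitions UNIT(16) is 64 bytes in word mode and 128 bytes
 * with USE_DOUBLE, which is where the "64/128 byte chunks" in the comments
 * below come from; UNITM1(1) is the offset of the last byte of the first
 * (d)word.
 */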

#ifdef __ANDROID__
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage.  We call memmove to handle such cases.
 */
#ifdef USE_MEMMOVE_FOR_OVERLAP
	PTR_SUBU t0,a0,a1
	PTR_SRA	t2,t0,31
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2
	sltu	t2,t0,a2
	beq	t2,zero,L(memcpy)
	nop
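/*
 * In effect the sequence above computes t0 = |a0 - a1| and, roughly in C:
 *   if ((uintptr_t)(a0 > a1 ? a0 - a1 : a1 - a0) >= a2) goto memcpy;
 * i.e. non-overlapping buffers take the normal path and overlapping ones
 * fall through to the memmove tail call below.
 */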
#if defined(__LP64__)
	daddiu	sp,sp,-8
	SETUP_GP64(0,MEMCPY_NAME)
	LA	t9,memmove
	RESTORE_GP64
	jr	t9
	daddiu	sp,sp,8
#else
	LA	t9,memmove
	jr	t9
	nop
#endif
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
	move	v0,zero
#else
	move	v0,a0
#endif

#ifndef R6_CODE

/*
 * If src and dst have different alignments, go to L(unaligned).  If they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	PTR_SUBU a3, zero, a0

	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1 */
	beq	a3,zero,L(aligned)	/* if a3==0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */

	C_LDHI	t8,0(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3

#else /* R6_CODE */

/*
 * Align the destination and hope that the source gets aligned too.  If it
 * doesn't we jump to L(r6_unaligned*) to do unaligned copies using the r6
 * align instruction.
 */
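/*
 * Rough sketch of the dispatch below: t8 = a0 & 7 indexes L(atable).  A zero
 * remainder branches straight to L(lb0); a remainder of k branches into the
 * L(lb*) chain that copies the 8-k leading bytes one at a time and then
 * advances a0/a1 (and shrinks a2) by 8-k, leaving the destination 8-byte
 * aligned.
 */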
	andi	t8,a0,7
	lapc	t9,L(atable)
	PTR_LSA	t9,t8,t9,2
	jrc	t9
L(atable):
	bc	L(lb0)
	bc	L(lb7)
	bc	L(lb6)
	bc	L(lb5)
	bc	L(lb4)
	bc	L(lb3)
	bc	L(lb2)
	bc	L(lb1)
L(lb7):
	lb	a3, 6(a1)
	sb	a3, 6(a0)
L(lb6):
	lb	a3, 5(a1)
	sb	a3, 5(a0)
L(lb5):
	lb	a3, 4(a1)
	sb	a3, 4(a0)
L(lb4):
	lb	a3, 3(a1)
	sb	a3, 3(a0)
L(lb3):
	lb	a3, 2(a1)
	sb	a3, 2(a0)
L(lb2):
	lb	a3, 1(a1)
	sb	a3, 1(a0)
L(lb1):
	lb	a3, 0(a1)
	sb	a3, 0(a0)

	li	t9,8
	subu	t8,t9,t8
	PTR_SUBU a2,a2,t8
	PTR_ADDU a0,a0,t8
	PTR_ADDU a1,a1,t8
L(lb0):

	andi	t8,a1,(NSIZE-1)
	lapc	t9,L(jtable)
	PTR_LSA	t9,t8,t9,2
	jrc	t9
L(jtable):
	bc	L(aligned)
	bc	L(r6_unaligned1)
	bc	L(r6_unaligned2)
	bc	L(r6_unaligned3)
# ifdef USE_DOUBLE
	bc	L(r6_unaligned4)
	bc	L(r6_unaligned5)
	bc	L(r6_unaligned6)
	bc	L(r6_unaligned7)
# endif
#endif /* R6_CODE */

L(aligned):

/*
 * Now dst/src are both aligned to (word or double-word) boundaries.  Set a2
 * to count how many bytes we have to copy after all the 64/128 byte chunks
 * are copied and a3 to the dst pointer after all the 64/128 byte chunks
 * have been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */

	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

/* When in the loop we may prefetch with the 'prepare to store' hint,
 * in which case a0+x should not be past the "t0-32" address.  This
 * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
 * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
 * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
 */
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
#endif
	PREFETCH_FOR_LOAD (0, a1)
	PREFETCH_FOR_LOAD (1, a1)
	PREFETCH_FOR_LOAD (2, a1)
	PREFETCH_FOR_LOAD (3, a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
#endif
#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
# if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
	sltu	v1,t9,a0
	bgtz	v1,L(skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(skip_set):
# else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
# endif
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \
    && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3)
# ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
# endif
#endif
L(loop16w):
	C_LD	t0,UNIT(0)(a1)
#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(skip_pref)
#endif
	C_LD	t1,UNIT(1)(a1)
#ifndef R6_CODE
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
#else
	PREFETCH_FOR_STORE (2, a0)
#endif
#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH)
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5)
# ifdef USE_DOUBLE
	PTR_ADDIU v0,v0,32
# endif
#endif
L(skip_pref):
	C_LD	REG2,UNIT(2)(a1)
	C_LD	REG3,UNIT(3)(a1)
	C_LD	REG4,UNIT(4)(a1)
	C_LD	REG5,UNIT(5)(a1)
	C_LD	REG6,UNIT(6)(a1)
	C_LD	REG7,UNIT(7)(a1)
#ifndef R6_CODE
	PREFETCH_FOR_LOAD (4, a1)
#else
	PREFETCH_FOR_LOAD (3, a1)
#endif
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)

	C_LD	t0,UNIT(8)(a1)
	C_LD	t1,UNIT(9)(a1)
	C_LD	REG2,UNIT(10)(a1)
	C_LD	REG3,UNIT(11)(a1)
	C_LD	REG4,UNIT(12)(a1)
	C_LD	REG5,UNIT(13)(a1)
	C_LD	REG6,UNIT(14)(a1)
	C_LD	REG7,UNIT(15)(a1)
#ifndef R6_CODE
	PREFETCH_FOR_LOAD (5, a1)
#endif
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dest word-aligned but less than 64 or 128 bytes
 * to go.  Check for a 32(64) byte chunk and copy it if there is one.
 * Otherwise jump down to L(chk1w) to handle the tail end of the copy.
 */

Nikola Veljkovic38f2eaa2015-05-26 12:06:09 +0200540L(chkw):
541 PREFETCH_FOR_LOAD (0, a1)
542 andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
543 /* The t8 is the reminder count past 32-bytes */
544 beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
545 nop
546 C_LD t0,UNIT(0)(a1)
547 C_LD t1,UNIT(1)(a1)
548 C_LD REG2,UNIT(2)(a1)
549 C_LD REG3,UNIT(3)(a1)
550 C_LD REG4,UNIT(4)(a1)
551 C_LD REG5,UNIT(5)(a1)
552 C_LD REG6,UNIT(6)(a1)
553 C_LD REG7,UNIT(7)(a1)
554 PTR_ADDIU a1,a1,UNIT(8)
555 C_ST t0,UNIT(0)(a0)
556 C_ST t1,UNIT(1)(a0)
557 C_ST REG2,UNIT(2)(a0)
558 C_ST REG3,UNIT(3)(a0)
559 C_ST REG4,UNIT(4)(a0)
560 C_ST REG5,UNIT(5)(a0)
561 C_ST REG6,UNIT(6)(a0)
562 C_ST REG7,UNIT(7)(a0)
563 PTR_ADDIU a0,a0,UNIT(8)
Raghu Gandham405b8022012-07-25 18:16:42 -0700564
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(lastb)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	C_LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	C_ST	REG3,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)
L(leave):
	j	ra
	nop

#ifndef R6_CODE
/*
 * UNALIGNED case, got here with a3 = "negu a0".
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
	beqz	a3,L(ua_chk16w)	/* if a3==0, it is already aligned */
	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */

	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	C_STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now the destination (but not the source) is aligned.
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	/* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	/* Now a3 is the final dst after loop */

# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
# endif
	PREFETCH_FOR_LOAD (0, a1)
	PREFETCH_FOR_LOAD (1, a1)
	PREFETCH_FOR_LOAD (2, a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE)
	PREFETCH_FOR_STORE (1, a0)
	PREFETCH_FOR_STORE (2, a0)
	PREFETCH_FOR_STORE (3, a0)
# endif
# if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH)
#  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_skip_set)
	nop
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
L(ua_skip_set):
#  else
	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1)
#  endif
# endif
L(ua_loop16w):
	PREFETCH_FOR_LOAD (3, a1)
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
# if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_skip_pref)
# endif
	C_LDHI	REG3,UNIT(3)(a1)
	PREFETCH_FOR_STORE (4, a0)
	PREFETCH_FOR_STORE (5, a0)
L(ua_skip_pref):
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
	PREFETCH_FOR_LOAD (4, a1)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	C_LDHI	t0,UNIT(8)(a1)
	C_LDHI	t1,UNIT(9)(a1)
	C_LDHI	REG2,UNIT(10)(a1)
	C_LDHI	REG3,UNIT(11)(a1)
	C_LDHI	REG4,UNIT(12)(a1)
	C_LDHI	REG5,UNIT(13)(a1)
	C_LDHI	REG6,UNIT(14)(a1)
	C_LDHI	REG7,UNIT(15)(a1)
	C_LDLO	t0,UNITM1(9)(a1)
	C_LDLO	t1,UNITM1(10)(a1)
	C_LDLO	REG2,UNITM1(11)(a1)
	C_LDLO	REG3,UNITM1(12)(a1)
	C_LDLO	REG4,UNITM1(13)(a1)
	C_LDLO	REG5,UNITM1(14)(a1)
	C_LDLO	REG6,UNITM1(15)(a1)
	C_LDLO	REG7,UNITM1(16)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	C_ST	t0,UNIT(8)(a0)
	C_ST	t1,UNIT(9)(a0)
	C_ST	REG2,UNIT(10)(a0)
	C_ST	REG3,UNIT(11)(a0)
	C_ST	REG4,UNIT(12)(a0)
	C_ST	REG5,UNIT(13)(a0)
	C_ST	REG6,UNIT(14)(a0)
	C_ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have dst word-aligned (the src is still unaligned) but less than
 * 64 or 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
 * the copy.  */

L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk?  */
				  /* t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(ua_chk1w) /* When a2==t8, no 32-byte chunk */
	nop
	C_LDHI	t0,UNIT(0)(a1)
	C_LDHI	t1,UNIT(1)(a1)
	C_LDHI	REG2,UNIT(2)(a1)
	C_LDHI	REG3,UNIT(3)(a1)
	C_LDHI	REG4,UNIT(4)(a1)
	C_LDHI	REG5,UNIT(5)(a1)
	C_LDHI	REG6,UNIT(6)(a1)
	C_LDHI	REG7,UNIT(7)(a1)
	C_LDLO	t0,UNITM1(1)(a1)
	C_LDLO	t1,UNITM1(2)(a1)
	C_LDLO	REG2,UNITM1(3)(a1)
	C_LDLO	REG3,UNITM1(4)(a1)
	C_LDLO	REG4,UNITM1(5)(a1)
	C_LDLO	REG5,UNITM1(6)(a1)
	C_LDLO	REG6,UNITM1(7)(a1)
	C_LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	C_ST	t0,UNIT(0)(a0)
	C_ST	t1,UNIT(1)(a0)
	C_ST	REG2,UNIT(2)(a0)
	C_ST	REG3,UNIT(3)(a0)
	C_ST	REG4,UNIT(4)(a0)
	C_ST	REG5,UNIT(5)(a0)
	C_ST	REG6,UNIT(6)(a0)
	C_ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	C_LDHI	v1,UNIT(0)(a1)
	C_LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a0,a0,UNIT(1)
	PTR_ADDIU a1,a1,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	C_ST	v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)
	PTR_ADDIU a0,a0,1
	PTR_ADDIU a1,a1,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)

	j	ra
	nop

#else /* R6_CODE */

# if __MIPSEB
#  define SWAP_REGS(X,Y) X, Y
#  define ALIGN_OFFSET(N) (N)
# else
#  define SWAP_REGS(X,Y) Y, X
#  define ALIGN_OFFSET(N) (NSIZE-N)
# endif
# define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to copy byte by byte. */ \
	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word */ \
	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in */ \
				/* (d)word chunks.  */ \
	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte */ \
				/* after word loop is finished.  */ \
	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.  */ \
	PTR_SUBU REG2, a1, t8;	/* REG2 is the aligned src address.  */ \
	PTR_ADDU a1, a1, a3;	/* a1 is addr of source after word loop.  */ \
	C_LD	t0, UNIT(0)(REG2);  /* Load first part of source.  */ \
L(r6_ua_wordcopy##BYTEOFFSET): \
	C_LD	t1, UNIT(1)(REG2);  /* Load second part of source.  */ \
	C_ALIGN	REG3, SWAP_REGS(t1,t0), ALIGN_OFFSET(BYTEOFFSET); \
	PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.  */ \
	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.  */ \
	move	t0, t1;		/* Move second part of source to first.  */ \
	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET); \
	C_ST	REG3, UNIT(-1)(a0); \
	j	L(lastb); \
	nop
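/*
 * Rough idea of R6_UNALIGNED_WORD_COPY (a sketch of the macro above): each
 * iteration keeps two consecutive aligned (d)words of the source in t0/t1,
 * uses the R6 align/dalign instruction to extract the unaligned (d)word
 * that spans them (BYTEOFFSET is the source misalignment), stores it, and
 * moves t1 into t0 so that only one new load is needed per (d)word stored.
 */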

	/* We are generating R6 code, the destination is 4 byte aligned and
	   the source is not 4 byte aligned.  t8 is 1, 2, or 3 depending on the
	   alignment of the source.  */

L(r6_unaligned1):
	R6_UNALIGNED_WORD_COPY(1)
L(r6_unaligned2):
	R6_UNALIGNED_WORD_COPY(2)
L(r6_unaligned3):
	R6_UNALIGNED_WORD_COPY(3)
# ifdef USE_DOUBLE
L(r6_unaligned4):
	R6_UNALIGNED_WORD_COPY(4)
L(r6_unaligned5):
	R6_UNALIGNED_WORD_COPY(5)
L(r6_unaligned6):
	R6_UNALIGNED_WORD_COPY(6)
L(r6_unaligned7):
	R6_UNALIGNED_WORD_COPY(7)
# endif
#endif /* R6_CODE */

	.set	at
	.set	reorder
END(MEMCPY_NAME)
#ifndef __ANDROID__
# ifdef _LIBC
libc_hidden_builtin_def (MEMCPY_NAME)
# endif
#endif