/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"


/*
 * This routine could be optimized for MIPS64.  The current code only
 * uses MIPS32 instructions.
 */

/*
 * Endian-neutral names for the partial-word load/store instructions.
 *
 * The copy code below always applies LWHI/SWHI to the first byte address of
 * a word (offset 0) and LWLO to its last byte address (offset 3), e.g.
 *
 *	LWHI	v1,0(a1)
 *	LWLO	v1,3(a1)
 *
 * Which of lwl/lwr (swl/swr) plays which role depends on the byte order,
 * hence the two definitions.
 */
#if defined(__MIPSEB__)
#  define LWHI	lwl		/* high part is left in big-endian	*/
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define LWLO	lwr		/* low part is right in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#endif

#if defined(__MIPSEL__)
#  define LWHI	lwr		/* high part is right in little-endian	*/
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define LWLO	lwl		/* low part is left in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#endif

/*
 * Without one of the two byte-order macros the names above stay undefined
 * and every use would only surface later as an unknown-opcode error.
 * Fail early with a readable message instead.
 */
#if !defined(__MIPSEB__) && !defined(__MIPSEL__)
#  error "memcpy.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:     a0 = dst, a1 = src, a2 = n (byte count, unsigned)
 * Out:    v0 = dst (when the copy is handed to memmove, v0 is whatever
 *         memmove returns)
 * Scratch: AT, v1, a0-a3, t0-t9.  sp and ra are never written here.
 *
 * Strategy:
 *   1. If |dst - src| < n the buffers overlap: tail-call memmove.
 *   2. n < 8: plain byte loop (.Llast8).
 *   3. dst and src have the same alignment within a word: copy the 0-3
 *      leading bytes with one LWHI/SWHI pair, then 64-byte blocks, at most
 *      one 32-byte block, whole words, and finally the trailing bytes.
 *   4. Otherwise (.Lunaligned): same structure, but dst is word-aligned
 *      first and every source word is gathered with an LWHI/LWLO pair.
 *
 * The whole body is assembled under ".set noreorder": the instruction that
 * follows each branch or jump is its delay slot and is executed whether or
 * not the branch is taken.  Delay-slot instructions are indented by one
 * extra space.
 */
LEAF(memcpy,0)

	.set	noreorder
	.set	noat
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android like Skia
 * rely on such usage. We call memmove to handle such cases.
 */
	subu	t0,a0,a1
	sra	AT,t0,31		# AT = 0 or -1: sign mask of (dst - src)
	xor	t1,t0,AT
	subu	t0,t1,AT		# t0 = |dst - src|
	sltu	AT,t0,a2		# AT = 1 when |dst - src| < n, i.e. overlap
	beq	AT,zero,.Lmemcpy
	 move	v0,a0			# memcpy returns the dst pointer
	/*
	 * "la" is an assembler macro that may expand to more than one
	 * instruction, so it must not sit in a branch delay slot while
	 * noreorder is in effect.
	 */
	la	t9,memmove
	jr	t9			# tail call: a0-a2 and ra are untouched
	 nop
.Lmemcpy:
	sltiu	AT,a2,8			# n is unsigned, so compare it unsigned
	bne	AT,zero,.Llast8
	 nop

# Test if the src and dst are word-aligned, or can be made word-aligned
	xor	t8,a1,a0
	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement

	bne	t8,zero,.Lunaligned
	 negu	a3,a0

	andi	a3,a3,0x3	# we need to copy a3 bytes to make a0/a1 aligned
	beq	a3,zero,.Lchk16w	# when a3=0 then the dst (a0) is word-aligned
	 subu	a2,a2,a3	# now a2 is the remaining bytes count

	LWHI	t8,0(a1)
	addu	a1,a1,a3
	SWHI	t8,0(a0)
	addu	a0,a0,a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
.Lchk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks

	beq	a2,t8,.Lchk8w	# if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
# the "t0-32" address
# This means: for x=128 the last "safe" a0 address is "t0-160"
# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 do not use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lloop16w	# skip "pref 30,64(a0)" for too short arrays
	 nop
# otherwise, start with using pref30
	pref	30,64(a0)
.Lloop16w:
	pref	0,96(a1)
	lw	t0,0(a1)
	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
	 lw	t1,4(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lskip_pref30_96:
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	lw	t0,32(a1)
	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
	 lw	t1,36(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lskip_pref30_128:
	lw	t2,40(a1)
	lw	t3,44(a1)
	lw	t4,48(a1)
	lw	t5,52(a1)
	lw	t6,56(a1)
	lw	t7,60(a1)
	pref	0,160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9	# re-evaluate the "pref 30" guard for the next pass
	bne	a0,a3,.Lloop16w
	 addiu	a1,a1,64	# adding 64 to src
	move	a2,t8

# Here we have src and dest word-aligned but less than 64-bytes to go

.Lchk8w:
	pref	0,0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count past 32-bytes
	beq	a2,t8,.Lchk1w	# when a2=t8, no 32-byte chunk
	 nop

	lw	t0,0(a1)
	lw	t1,4(a1)
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Llast8
	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.LwordCopy_loop:
	lw	t3,0(a1)	# the first t3 may be equal t0 ... optimize?
	addiu	a1,a1,4
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	t3,-4(a0)

# For the last (<8) bytes
.Llast8:
	beqz	a2,.Lleave	# a2 is an unsigned count below 8 here
	 addu	a3,a0,a2	# a3 is the last dst address
.Llast8loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Llast8loop
	 sb	v1,-1(a0)

.Lleave:
	j	ra
	 nop

#
# UNALIGNED case
#

.Lunaligned:
	# got here with a3="negu a0"
	andi	a3,a3,0x3	# test if the a0 is word aligned
	beqz	a3,.Lua_chk16w
	 subu	a2,a2,a3	# bytes left after initial a3 bytes

	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addu	a1,a1,a3	# a3 may be here 1, 2 or 3
	SWHI	v1,0(a0)
	addu	a0,a0,a3	# below the dst will be word aligned (NOTE1)

.Lua_chk16w:
	andi	t8,a2,0x3f	# any whole 64-byte chunks?
				# t8 is the byte count after 64-byte chunks
	beq	a2,t8,.Lua_chk8w	# if a2==t8, no 64-byte chunks
				# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8	# subtract from a2 the remainder
				# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2	# t0 is the "past the end" address

	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)		# bring the first line of src, addr 0
	pref	0,32(a1)	# bring the second line of src, addr 32
	pref	0,64(a1)	# bring the third line of src, addr 64
	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
# In case the a0 > t9 do not use "pref 30" at all
	sgtu	v1,a0,t9
	bgtz	v1,.Lua_loop16w	# skip "pref 30,64(a0)" for too short arrays
	 nop
# otherwise, start with using pref30
	pref	30,64(a0)
.Lua_loop16w:
	pref	0,96(a1)
	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	bgtz	v1,.Lua_skip_pref30_96
	 LWLO	t1,7(a1)
	pref	30,96(a0)	# continue setting up the dest, addr 96
.Lua_skip_pref30_96:
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	pref	0,128(a1)	# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	LWHI	t0,32(a1)
	LWLO	t0,35(a1)
	LWHI	t1,36(a1)
	bgtz	v1,.Lua_skip_pref30_128
	 LWLO	t1,39(a1)
	pref	30,128(a0)	# continue setting up the dest, addr 128
.Lua_skip_pref30_128:
	LWHI	t2,40(a1)
	LWLO	t2,43(a1)
	LWHI	t3,44(a1)
	LWLO	t3,47(a1)
	LWHI	t4,48(a1)
	LWLO	t4,51(a1)
	LWHI	t5,52(a1)
	LWLO	t5,55(a1)
	LWHI	t6,56(a1)
	LWLO	t6,59(a1)
	LWHI	t7,60(a1)
	LWLO	t7,63(a1)
	pref	0,160(a1)	# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64	# adding 64 to dest
	sgtu	v1,a0,t9	# re-evaluate the "pref 30" guard for the next pass
	bne	a0,a3,.Lua_loop16w
	 addiu	a1,a1,64	# adding 64 to src
	move	a2,t8

# Here the dst is word-aligned (the src is not) and less than 64 bytes remain

.Lua_chk8w:
	pref	0,0x0(a1)
	andi	t8,a2,0x1f	# is there a 32-byte chunk?
				# the t8 is the remainder count
	beq	a2,t8,.Lua_chk1w	# when a2=t8, no 32-byte chunk
	 nop

	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	LWLO	t1,7(a1)
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lua_chk1w:
	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Lua_smallCopy
	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks

# copying in words (4-byte chunks)
.Lua_wordCopy_loop:
	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addiu	a1,a1,4
	addiu	a0,a0,4		# note: dst=a0 is word aligned here, see NOTE1
	bne	a0,a3,.Lua_wordCopy_loop
	 sw	v1,-4(a0)

# Now less than 4 bytes (value in a2) left to copy
.Lua_smallCopy:
	beqz	a2,.Lleave
	 addu	a3,a0,a2	# a3 is the last dst address
.Lua_smallCopy_loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Lua_smallCopy_loop
	 sb	v1,-1(a0)

	j	ra
	 nop

	.set	at
	.set	reorder

END(memcpy)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/