/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#include "machine/asm.h"


/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * Endian-neutral names for the MIPS unaligned load/store instructions.
 *
 *   LWHI / SWHI  access the part of a word that starts at the given
 *                (lowest) byte address and runs up to the next word boundary.
 *   LWLO / SWLO  access the part of a word that ends at the given
 *                (highest) byte address.
 *
 * A full unaligned word at address p is therefore read with the pair
 *     LWHI reg,0(p) ; LWLO reg,3(p)
 * SWLO is defined for symmetry; the memcpy routine in this file does not
 * use it.
 */
#if defined(__MIPSEB__)
#  define LWHI	lwl		/* high part is left in big-endian	*/
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define LWLO	lwr		/* low part is right in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#endif

#if defined(__MIPSEL__)
#  define LWHI	lwr		/* high part is right in little-endian	*/
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define LWLO	lwl		/* low part is left in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#endif

/* Fail with a clear message instead of "unrecognized opcode LWHI" later on. */
#if !defined(__MIPSEB__) && !defined(__MIPSEL__)
#  error "memcpy.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/*
 * void *memcpy(void *dst, const void *src, size_t n)
 *
 * In:    a0 = dst, a1 = src, a2 = n (byte count)
 * Out:   v0 = the original dst (set in the delay slot right after .Lmemcpy)
 * Uses:  a0-a3, t0-t9, v1 and AT as scratch (AT explicitly, hence ".set noat")
 *
 * The whole routine is assembled under ".set noreorder": the instruction
 * written directly after every branch/jump is its delay slot and executes
 * whether or not the branch is taken.  Delay-slot instructions are indented
 * by one extra space below.  Do not reorder instructions around branches.
 *
 * Outline:
 *   1. If |dst - src| < n the buffers overlap: tail-call memmove.
 *   2. If n < 8: plain byte loop (.Llast8).
 *   3. If src and dst have the same alignment within a word ("aligned" path):
 *        - copy 1..3 head bytes so that both pointers become word-aligned,
 *        - copy 64-byte chunks with lw/sw plus prefetching,
 *        - copy at most one 32-byte chunk,
 *        - copy remaining whole words, then the remaining (<4) bytes.
 *   4. Otherwise (.Lunaligned): same shape, but only dst is made
 *      word-aligned; every source word is read with an LWHI/LWLO pair and
 *      written with a plain sw.
 *
 * Prefetching: "pref 0" is used on the source, "pref 30" on the destination.
 * As the comments in the 64-byte loops explain, "pref 30" must never be
 * issued for a cache line that extends past the end of dst, so it is
 * switched off (v1 != 0) once a0 gets within 160 bytes of the end.
 */
LEAF(memcpy,0)

	.set	noreorder
	.set	noat
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android like Skia
 * rely on such usage. We call memmove to handle such cases.
 */
	subu	t0,a0,a1		# t0 = dst - src
	sra	AT,t0,31		# AT = 0 or -1: sign mask of the difference
	xor	t1,t0,AT
	subu	t0,t1,AT		# t0 = |dst - src|
	sltu	AT,t0,a2		# AT = 1 when |dst - src| < n (ranges overlap)
	beq	AT,zero,.Lmemcpy	# no overlap: do the copy here
	# Delay slot: executed on both paths.  On the .Lmemcpy path t9 is
	# rewritten before it is next read, so the early load is harmless.
	# NOTE(review): "la" is an assembler macro.  If it expands to more
	# than one instruction only the first one sits in the delay slot; the
	# rest lands on the fall-through path, which is the only path that
	# needs t9.  Under PIC it presumably loads through gp, which LEAF()
	# is assumed to set up -- confirm against machine/asm.h.
	 la	t9,memmove
	jr	t9			# tail call: memmove returns to our caller
	 nop
.Lmemcpy:
	slti	AT,a2,8
	bne	AT,zero,.Llast8		# fewer than 8 bytes: byte loop only
	 move	v0,a0			# memcpy returns the dst pointer

	# Test if the src and dst are word-aligned, or can be made word-aligned
	xor	t8,a1,a0
	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement

	bne	t8,zero,.Lunaligned	# different alignment within a word
	 negu	a3,a0			# (delay slot) a3 = -dst, used by both paths

	andi	a3,a3,0x3		# we need to copy a3 bytes to make a0/a1 aligned
	beq	a3,zero,.Lchk16w	# when a3=0 then the dst (a0) is word-aligned
	 subu	a2,a2,a3		# now a2 is the remaining byte count

	# Copy the 1..3 head bytes up to the next word boundary.  src and dst
	# share the same low two address bits here, so one partial load and
	# one partial store move exactly a3 bytes.
	LWHI	t8,0(a1)
	addu	a1,a1,a3
	SWHI	t8,0(a0)
	addu	a0,a0,a3

	# Now the dst/src are mutually word-aligned with word-aligned addresses
.Lchk16w:
	andi	t8,a2,0x3f		# any whole 64-byte chunks?
					# t8 is the byte count after 64-byte chunks

	beq	a2,t8,.Lchk8w		# if a2==t8, no 64-byte chunks
					# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8		# subtract from a2 the remainder
					# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3		# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2		# t0 is the "past the end" address

	# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
	# the "t0-32" address
	# This means: for x=128 the last "safe" a0 address is "t0-160"
	# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
	# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
	subu	t9,t0,160		# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)			# bring the first line of src, addr 0
	pref	0,32(a1)		# bring the second line of src, addr 32
	pref	0,64(a1)		# bring the third line of src, addr 64
	pref	30,32(a0)		# safe, as we have at least 64 bytes ahead
	# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9		# v1 = 1 -> destination prefetch disabled
	bgtz	v1,.Lloop16w		# skip "pref 30,64(a0)" for too short arrays
	 nop
	# otherwise, start with using pref30
	pref	30,64(a0)

	# Main aligned loop: 64 bytes per iteration, as two 32-byte halves
	# (eight loads into t0-t7, then eight stores).
.Lloop16w:
	pref	0,96(a1)
	lw	t0,0(a1)
	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
	 lw	t1,4(a1)		# (delay slot) always executed
	pref	30,96(a0)		# continue setting up the dest, addr 96
.Lskip_pref30_96:
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	pref	0,128(a1)		# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	lw	t0,32(a1)
	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
	 lw	t1,36(a1)		# (delay slot) always executed
	pref	30,128(a0)		# continue setting up the dest, addr 128
.Lskip_pref30_128:
	lw	t2,40(a1)
	lw	t3,44(a1)
	lw	t4,48(a1)
	lw	t5,52(a1)
	lw	t6,56(a1)
	lw	t7,60(a1)
	pref	0,160(a1)		# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64		# adding 64 to dest
	sgtu	v1,a0,t9		# re-evaluate whether "pref 30" is still safe
	bne	a0,a3,.Lloop16w
	 addiu	a1,a1,64		# adding 64 to src
	move	a2,t8			# a2 = bytes left after the 64-byte chunks

	# Here we have src and dest word-aligned but less than 64-bytes to go

.Lchk8w:
	pref	0,0x0(a1)
	andi	t8,a2,0x1f		# is there a 32-byte chunk?
					# the t8 is the remainder count past 32-bytes
	beq	a2,t8,.Lchk1w		# when a2=t8, no 32-byte chunk
	 nop

	lw	t0,0(a1)
	lw	t1,4(a1)
	lw	t2,8(a1)
	lw	t3,12(a1)
	lw	t4,16(a1)
	lw	t5,20(a1)
	lw	t6,24(a1)
	lw	t7,28(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	a2,t8,0x3		# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Llast8		# no whole word left
	 subu	a3,t8,a2		# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3		# now a3 is the dst address past the 1w chunks

	# copying in words (4-byte chunks)
.LwordCopy_loop:
	lw	t3,0(a1)		# the first t3 may be equal t0 ... optimize?
	addiu	a1,a1,4
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	t3,-4(a0)		# (delay slot) a0 was already advanced

	# For the last (<8) bytes
.Llast8:
	blez	a2,.Lleave		# nothing left to copy
	 addu	a3,a0,a2		# a3 is one past the last dst byte
.Llast8loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Llast8loop
	 sb	v1,-1(a0)		# (delay slot) a0 was already advanced

.Lleave:
	j	ra
	 nop

#
# UNALIGNED case
#
# src and dst differ in their low two address bits.  dst is brought to a word
# boundary; from then on every source word is read with an LWHI/LWLO pair and
# stored with an ordinary sw.
#

.Lunaligned:
	# got here with a3="negu a0"
	andi	a3,a3,0x3		# test if the a0 is word aligned
	beqz	a3,.Lua_chk16w
	 subu	a2,a2,a3		# bytes left after initial a3 bytes

	# Read a full unaligned source word, then store only the part that
	# reaches up to the next dst word boundary (a3 bytes).
	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addu	a1,a1,a3		# a3 may be here 1, 2 or 3
	SWHI	v1,0(a0)
	addu	a0,a0,a3		# below the dst will be word aligned (NOTE1)

.Lua_chk16w:
	andi	t8,a2,0x3f		# any whole 64-byte chunks?
					# t8 is the byte count after 64-byte chunks
	beq	a2,t8,.Lua_chk8w	# if a2==t8, no 64-byte chunks
					# There will be at most 1 32-byte chunk after it
	 subu	a3,a2,t8		# subtract from a2 the remainder
					# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3		# Now a3 is the final dst after 64-byte chunks

	addu	t0,a0,a2		# t0 is the "past the end" address

	subu	t9,t0,160		# t9 is the "last safe pref 30,128(a0)" address

	pref	0,0(a1)			# bring the first line of src, addr 0
	pref	0,32(a1)		# bring the second line of src, addr 32
	pref	0,64(a1)		# bring the third line of src, addr 64
	pref	30,32(a0)		# safe, as we have at least 64 bytes ahead
	# In case the a0 > t9 don't use "pref 30" at all
	sgtu	v1,a0,t9		# v1 = 1 -> destination prefetch disabled
	bgtz	v1,.Lua_loop16w		# skip "pref 30,64(a0)" for too short arrays
	 nop
	# otherwise, start with using pref30
	pref	30,64(a0)

	# Main unaligned loop: 64 bytes per iteration, as two 32-byte halves.
.Lua_loop16w:
	pref	0,96(a1)
	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	bgtz	v1,.Lua_skip_pref30_96
	 LWLO	t1,7(a1)		# (delay slot) always executed
	pref	30,96(a0)		# continue setting up the dest, addr 96
.Lua_skip_pref30_96:
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	pref	0,128(a1)		# bring the next lines of src, addr 128

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)

	LWHI	t0,32(a1)
	LWLO	t0,35(a1)
	LWHI	t1,36(a1)
	bgtz	v1,.Lua_skip_pref30_128
	 LWLO	t1,39(a1)		# (delay slot) always executed
	pref	30,128(a0)		# continue setting up the dest, addr 128
.Lua_skip_pref30_128:
	LWHI	t2,40(a1)
	LWLO	t2,43(a1)
	LWHI	t3,44(a1)
	LWLO	t3,47(a1)
	LWHI	t4,48(a1)
	LWLO	t4,51(a1)
	LWHI	t5,52(a1)
	LWLO	t5,55(a1)
	LWHI	t6,56(a1)
	LWLO	t6,59(a1)
	LWHI	t7,60(a1)
	LWLO	t7,63(a1)
	pref	0,160(a1)		# bring the next lines of src, addr 160

	sw	t0,32(a0)
	sw	t1,36(a0)
	sw	t2,40(a0)
	sw	t3,44(a0)
	sw	t4,48(a0)
	sw	t5,52(a0)
	sw	t6,56(a0)
	sw	t7,60(a0)

	addiu	a0,a0,64		# adding 64 to dest
	sgtu	v1,a0,t9		# re-evaluate whether "pref 30" is still safe
	bne	a0,a3,.Lua_loop16w
	 addiu	a1,a1,64		# adding 64 to src
	move	a2,t8			# a2 = bytes left after the 64-byte chunks

	# Here dst is word-aligned (src is not) and less than 64 bytes remain

.Lua_chk8w:
	pref	0,0x0(a1)
	andi	t8,a2,0x1f		# is there a 32-byte chunk?
					# the t8 is the remainder count
	beq	a2,t8,.Lua_chk1w	# when a2=t8, no 32-byte chunk
	 nop

	LWHI	t0,0(a1)
	LWLO	t0,3(a1)
	LWHI	t1,4(a1)
	LWLO	t1,7(a1)
	LWHI	t2,8(a1)
	LWLO	t2,11(a1)
	LWHI	t3,12(a1)
	LWLO	t3,15(a1)
	LWHI	t4,16(a1)
	LWLO	t4,19(a1)
	LWHI	t5,20(a1)
	LWLO	t5,23(a1)
	LWHI	t6,24(a1)
	LWLO	t6,27(a1)
	LWHI	t7,28(a1)
	LWLO	t7,31(a1)
	addiu	a1,a1,32

	sw	t0,0(a0)
	sw	t1,4(a0)
	sw	t2,8(a0)
	sw	t3,12(a0)
	sw	t4,16(a0)
	sw	t5,20(a0)
	sw	t6,24(a0)
	sw	t7,28(a0)
	addiu	a0,a0,32

.Lua_chk1w:
	andi	a2,t8,0x3		# now a2 is the remainder past 1w chunks
	beq	a2,t8,.Lua_smallCopy	# no whole word left
	 subu	a3,t8,a2		# a3 is count of bytes in 1w chunks
	addu	a3,a0,a3		# now a3 is the dst address past the 1w chunks

	# copying in words (4-byte chunks)
.Lua_wordCopy_loop:
	LWHI	v1,0(a1)
	LWLO	v1,3(a1)
	addiu	a1,a1,4
	addiu	a0,a0,4			# note: dst=a0 is word aligned here, see NOTE1
	bne	a0,a3,.Lua_wordCopy_loop
	 sw	v1,-4(a0)		# (delay slot) a0 was already advanced

	# Now less than 4 bytes (value in a2) left to copy
.Lua_smallCopy:
	beqz	a2,.Lleave		# nothing left to copy
	 addu	a3,a0,a2		# a3 is one past the last dst byte
.Lua_smallCopy_loop:
	lb	v1,0(a1)
	addiu	a1,a1,1
	addiu	a0,a0,1
	bne	a0,a3,.Lua_smallCopy_loop
	 sb	v1,-1(a0)		# (delay slot) a0 was already advanced

	j	ra
	 nop

	.set	at
	.set	reorder

END(memcpy)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/