blob: b4773dfb1679f828a8543191acbad75041c327f7 [file] [log] [blame]
Bruce Beare8ff1a272010-03-04 11:03:37 -08001/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Symbol name given to the routine emitted by ENTRY (MEMCPY) further down.
   The #ifndef guard lets a definition made before this point override the
   default name.  */
31#ifndef MEMCPY
32# define MEMCPY ssse3_memcpy5
33#endif
34
/* L(label) pastes the ".L" prefix onto LABEL, so a label written as L(x)
   in this file becomes the assembler symbol .Lx.  */
35#ifndef L
36# define L(label) .L##label
37#endif
38
/* ALIGN (n) expands to ".p2align n"; N is a power-of-two exponent, so
   ALIGN (4) requests 16-byte alignment.  */
39#ifndef ALIGN
40# define ALIGN(n) .p2align n
41#endif
42
/* Thin wrappers that map the cfi_* spellings used in this file onto the
   assembler's .cfi_* directives of the same name.  Each one is guarded by
   #ifndef, so a definition made before this point takes precedence.  */
43#ifndef cfi_startproc
44# define cfi_startproc .cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc .cfi_endproc
49#endif
50
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
53#endif
54
55#ifndef cfi_restore
Bruce Beare124a5422010-10-11 12:24:41 -070056# define cfi_restore(reg) .cfi_restore reg
Bruce Beare8ff1a272010-03-04 11:03:37 -080057#endif
58
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
61#endif
62
/* remember_state / restore_state save and reinstate the current CFI rules;
   the code below uses them around blocks that are entered by a jump with
   registers still pushed.  */
Bruce Beare124a5422010-10-11 12:24:41 -070063#ifndef cfi_remember_state
64# define cfi_remember_state .cfi_remember_state
65#endif
66
67#ifndef cfi_restore_state
68# define cfi_restore_state .cfi_restore_state
69#endif
70
/* ENTRY (name) opens a function: it types NAME as a function symbol, makes
   it global, aligns it to 16 bytes (.p2align 4), emits the label itself and
   starts a CFI region.  Close the function with END (name).  */
Bruce Beare8ff1a272010-03-04 11:03:37 -080071#ifndef ENTRY
72# define ENTRY(name) \
73 .type name, @function; \
74 .globl name; \
75 .p2align 4; \
76name: \
77 cfi_startproc
78#endif
79
/* END (name) closes the CFI region and records the size of NAME as the
   distance from the NAME label to the current location.  */
80#ifndef END
81# define END(name) \
82 cfi_endproc; \
83 .size name, .-name
84#endif
85
/* Offsets of the three stack arguments, expressed relative to PARMS (the
   offset of the first argument; PARMS itself is defined further down and
   depends on SHARED).  Consecutive arguments are 4 bytes apart.  With
   USE_AS_BCOPY the source is the first argument and the destination the
   second; otherwise the destination comes first.  LEN is always third.  */
86#ifdef USE_AS_BCOPY
87# define SRC PARMS
88# define DEST SRC+4
89# define LEN DEST+4
90#else
91# define DEST PARMS
92# define SRC DEST+4
93# define LEN SRC+4
94#endif
95
/* CFI annotation matching a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0.  Emits no instruction, so it
   can also be used on its own to re-declare a push that is still in effect
   on a path the assembler sees out of order.  */
96#define CFI_PUSH(REG) \
97 cfi_adjust_cfa_offset (4); \
98 cfi_rel_offset (REG, 0)
99
/* CFI annotation matching a 4-byte pop of REG: the CFA offset shrinks by 4
   and REG is marked as restored.  Emits no instruction.  */
100#define CFI_POP(REG) \
101 cfi_adjust_cfa_offset (-4); \
102 cfi_restore (REG)
103
/* PUSH / POP: the pushl / popl instruction together with its CFI
   annotation.  */
104#define PUSH(REG) pushl REG; CFI_PUSH (REG)
105#define POP(REG) popl REG; CFI_POP (REG)
106
/* Position-independent build.  EBX serves as the base register for the
   PC-relative address computations below, so ENTRANCE saves it and every
   return path restores it.  */
107#ifdef SHARED
/* The first argument lies above the return address and the saved EBX.  */
108# define PARMS 8 /* Preserve EBX. */
109# define ENTRANCE PUSH (%ebx);
/* RETURN_END restores EBX and returns.  RETURN does the same and then
   re-emits the CFI for the pushed EBX, which is what code placed after
   the ret in the same function needs.  */
110# define RETURN_END POP (%ebx); ret
111# define RETURN RETURN_END; CFI_PUSH (%ebx)
/* Jump-table entry for label I in the table that starts at B: stored as
   the offset of I from the start of the table.  */
112# define JMPTBL(I, B) I - B
113
114/* Load an entry in a jump table into EBX and branch to it. TABLE is a
115 jump table with relative offsets. INDEX is a register that contains the
116 index into the jump table. SCALE is the scale of INDEX.
    The thunk leaves the address of the first addl in EBX; that is also
    what "." evaluates to in "TABLE - .", so the addl yields the address
    of TABLE. */
117# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
118 /* We first load PC into EBX. */ \
119 call __i686.get_pc_thunk.bx; \
120 /* Get the address of the jump table. */ \
121 addl $(TABLE - .), %ebx; \
122 /* Get the entry and convert the relative offset to the \
123 absolute address. */ \
124 addl (%ebx,INDEX,SCALE), %ebx; \
125 /* We loaded the jump table. Go. */ \
126 jmp *%ebx
127
/* First half of a split table jump: turn EBX into the address of TABLE
   without calling the thunk.  Correct only where EBX already equals the
   address of this addl instruction, e.g. as the first instruction at a
   label that was just reached through the "jmp *%ebx" above.  EBX must
   then stay untouched until BRANCH_TO_JMPTBL_ENTRY_TAIL.  */
128# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
Bruce Beare124a5422010-10-11 12:24:41 -0700129 addl $(TABLE - .), %ebx
130
/* Second half of a split table jump: EBX holds the table address; add the
   selected relative entry to it and branch.  */
Bruce Beare8ff1a272010-03-04 11:03:37 -0800131# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
132 addl (%ebx,INDEX,SCALE), %ebx; \
133 /* We loaded the jump table. Go. */ \
134 jmp *%ebx
135
/* PC thunk: copies its own return address (the address of the instruction
   following the call) from the top of the stack into EBX and returns.
   Placed in its own .gnu.linkonce.t.* section and marked hidden.  */
136 .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
137 .globl __i686.get_pc_thunk.bx
138 .hidden __i686.get_pc_thunk.bx
139 ALIGN (4)
140 .type __i686.get_pc_thunk.bx,@function
141__i686.get_pc_thunk.bx:
142 movl (%esp), %ebx
143 ret
/* Non-PIC build: nothing is pushed on entry, so only the return address
   lies below the first argument, ENTRANCE is empty and both RETURN forms
   are a plain ret.  */
144#else
145# define PARMS 4
146# define ENTRANCE
147# define RETURN_END ret
148# define RETURN RETURN_END
/* Jump-table entry for label I: the address of I itself (B is unused).  */
149# define JMPTBL(I, B) I
150
151/* Branch to an entry in a jump table. TABLE is a jump table with
152 absolute addresses. INDEX is a register that contains the index into the
153 jump table. SCALE is the scale of INDEX. */
154# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
155 jmp *TABLE(,INDEX,SCALE)
156
/* No table base needs to be computed here, so the VALUE half expands to
   nothing and the TAIL half is the whole indirect jump.  */
Bruce Beare124a5422010-10-11 12:24:41 -0700157# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800158
159# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
160 jmp *TABLE(,INDEX,SCALE)
161#endif
162
163 .section .text.ssse3,"ax",@progbits
164ENTRY (MEMCPY)
165 ENTRANCE
166 movl LEN(%esp), %ecx
167 movl SRC(%esp), %eax
168 movl DEST(%esp), %edx
169
170#ifdef USE_AS_MEMMOVE
171 cmp %eax, %edx
172 jb L(copy_forward)
173 je L(fwd_write_0bytes)
174 cmp $32, %ecx
175 jae L(memmove_bwd)
176 jmp L(bk_write_less32bytes_2)
177L(memmove_bwd):
178 add %ecx, %eax
179 cmp %eax, %edx
180 movl SRC(%esp), %eax
181 jb L(copy_backward)
182
183L(copy_forward):
184#endif
185 cmp $48, %ecx
186 jae L(48bytesormore)
187
188L(fwd_write_less32bytes):
189#ifndef USE_AS_MEMMOVE
190 cmp %dl, %al
191 jb L(bk_write)
192#endif
193 add %ecx, %edx
194 add %ecx, %eax
195 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
196#ifndef USE_AS_MEMMOVE
197L(bk_write):
198 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
199#endif
200
201 ALIGN (4)
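202/* ECX >= 48 here. EDX (destination) may have any alignment: it is advanced to the next 16-byte boundary below, and the skipped head is covered by the unaligned 16-byte copy through XMM0. */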
203L(48bytesormore):
204 movdqu (%eax), %xmm0
205 PUSH (%edi)
206 movl %edx, %edi
207 and $-16, %edx
208 PUSH (%esi)
Bruce Beare124a5422010-10-11 12:24:41 -0700209 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800210 add $16, %edx
211 movl %edi, %esi
212 sub %edx, %edi
213 add %edi, %ecx
214 sub %edi, %eax
215
216#ifdef SHARED_CACHE_SIZE_HALF
217 cmp $SHARED_CACHE_SIZE_HALF, %ecx
218#else
219# ifdef SHARED
220 call __i686.get_pc_thunk.bx
221 add $_GLOBAL_OFFSET_TABLE_, %ebx
222 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
223# else
224 cmp __x86_shared_cache_size_half, %ecx
225# endif
226#endif
227
228 mov %eax, %edi
229 jae L(large_page)
230 and $0xf, %edi
231 jz L(shl_0)
232
233 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
234
Bruce Beare124a5422010-10-11 12:24:41 -0700235 cfi_restore_state
236 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800237 ALIGN (4)
238L(shl_0):
239 movdqu %xmm0, (%esi)
240 xor %edi, %edi
241 POP (%esi)
242 cmp $127, %ecx
243 ja L(shl_0_gobble)
244 lea -32(%ecx), %ecx
245L(shl_0_loop):
246 movdqa (%eax, %edi), %xmm0
247 movdqa 16(%eax, %edi), %xmm1
248 sub $32, %ecx
249 movdqa %xmm0, (%edx, %edi)
250 movdqa %xmm1, 16(%edx, %edi)
251 lea 32(%edi), %edi
252 jb L(shl_0_end)
253
254 movdqa (%eax, %edi), %xmm0
255 movdqa 16(%eax, %edi), %xmm1
256 sub $32, %ecx
257 movdqa %xmm0, (%edx, %edi)
258 movdqa %xmm1, 16(%edx, %edi)
259 lea 32(%edi), %edi
260 jb L(shl_0_end)
261
262 movdqa (%eax, %edi), %xmm0
263 movdqa 16(%eax, %edi), %xmm1
264 sub $32, %ecx
265 movdqa %xmm0, (%edx, %edi)
266 movdqa %xmm1, 16(%edx, %edi)
267 lea 32(%edi), %edi
268 jb L(shl_0_end)
269
270 movdqa (%eax, %edi), %xmm0
271 movdqa 16(%eax, %edi), %xmm1
272 sub $32, %ecx
273 movdqa %xmm0, (%edx, %edi)
274 movdqa %xmm1, 16(%edx, %edi)
275 lea 32(%edi), %edi
276L(shl_0_end):
277 lea 32(%ecx), %ecx
278 add %ecx, %edi
279 add %edi, %edx
280 add %edi, %eax
281 POP (%edi)
282 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
283
Bruce Beare124a5422010-10-11 12:24:41 -0700284 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800285L(shl_0_gobble):
286
287#ifdef DATA_CACHE_SIZE_HALF
288 cmp $DATA_CACHE_SIZE_HALF, %ecx
289#else
290# ifdef SHARED
291 call __i686.get_pc_thunk.bx
292 add $_GLOBAL_OFFSET_TABLE_, %ebx
293 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
294# else
295 cmp __x86_data_cache_size_half, %ecx
296# endif
297#endif
298
299 POP (%edi)
300 lea -128(%ecx), %ecx
301 jae L(shl_0_gobble_mem_loop)
302L(shl_0_gobble_cache_loop):
303 movdqa (%eax), %xmm0
304 movdqa 0x10(%eax), %xmm1
305 movdqa 0x20(%eax), %xmm2
306 movdqa 0x30(%eax), %xmm3
307 movdqa 0x40(%eax), %xmm4
308 movdqa 0x50(%eax), %xmm5
309 movdqa 0x60(%eax), %xmm6
310 movdqa 0x70(%eax), %xmm7
311 lea 0x80(%eax), %eax
312 sub $128, %ecx
313 movdqa %xmm0, (%edx)
314 movdqa %xmm1, 0x10(%edx)
315 movdqa %xmm2, 0x20(%edx)
316 movdqa %xmm3, 0x30(%edx)
317 movdqa %xmm4, 0x40(%edx)
318 movdqa %xmm5, 0x50(%edx)
319 movdqa %xmm6, 0x60(%edx)
320 movdqa %xmm7, 0x70(%edx)
321 lea 0x80(%edx), %edx
322
323 jae L(shl_0_gobble_cache_loop)
324 cmp $-0x40, %ecx
325 lea 0x80(%ecx), %ecx
326 jl L(shl_0_cache_less_64bytes)
327
328 movdqa (%eax), %xmm0
329 sub $0x40, %ecx
330 movdqa 0x10(%eax), %xmm1
331
332 movdqa %xmm0, (%edx)
333 movdqa %xmm1, 0x10(%edx)
334
335 movdqa 0x20(%eax), %xmm0
336 movdqa 0x30(%eax), %xmm1
337 add $0x40, %eax
338
339 movdqa %xmm0, 0x20(%edx)
340 movdqa %xmm1, 0x30(%edx)
341 add $0x40, %edx
342L(shl_0_cache_less_64bytes):
343 cmp $0x20, %ecx
344 jb L(shl_0_cache_less_32bytes)
345 movdqa (%eax), %xmm0
346 sub $0x20, %ecx
347 movdqa 0x10(%eax), %xmm1
348 add $0x20, %eax
349 movdqa %xmm0, (%edx)
350 movdqa %xmm1, 0x10(%edx)
351 add $0x20, %edx
352L(shl_0_cache_less_32bytes):
353 cmp $0x10, %ecx
354 jb L(shl_0_cache_less_16bytes)
355 sub $0x10, %ecx
356 movdqa (%eax), %xmm0
357 add $0x10, %eax
358 movdqa %xmm0, (%edx)
359 add $0x10, %edx
360L(shl_0_cache_less_16bytes):
361 add %ecx, %edx
362 add %ecx, %eax
363 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
364
365
366 ALIGN (4)
367L(shl_0_gobble_mem_loop):
368 prefetcht0 0x1c0(%eax)
369 prefetcht0 0x280(%eax)
370 prefetcht0 0x1c0(%edx)
371
372 movdqa (%eax), %xmm0
373 movdqa 0x10(%eax), %xmm1
374 movdqa 0x20(%eax), %xmm2
375 movdqa 0x30(%eax), %xmm3
376 movdqa 0x40(%eax), %xmm4
377 movdqa 0x50(%eax), %xmm5
378 movdqa 0x60(%eax), %xmm6
379 movdqa 0x70(%eax), %xmm7
380 lea 0x80(%eax), %eax
381 sub $0x80, %ecx
382 movdqa %xmm0, (%edx)
383 movdqa %xmm1, 0x10(%edx)
384 movdqa %xmm2, 0x20(%edx)
385 movdqa %xmm3, 0x30(%edx)
386 movdqa %xmm4, 0x40(%edx)
387 movdqa %xmm5, 0x50(%edx)
388 movdqa %xmm6, 0x60(%edx)
389 movdqa %xmm7, 0x70(%edx)
390 lea 0x80(%edx), %edx
391
392 jae L(shl_0_gobble_mem_loop)
393 cmp $-0x40, %ecx
394 lea 0x80(%ecx), %ecx
395 jl L(shl_0_mem_less_64bytes)
396
397 movdqa (%eax), %xmm0
398 sub $0x40, %ecx
399 movdqa 0x10(%eax), %xmm1
400
401 movdqa %xmm0, (%edx)
402 movdqa %xmm1, 0x10(%edx)
403
404 movdqa 0x20(%eax), %xmm0
405 movdqa 0x30(%eax), %xmm1
406 add $0x40, %eax
407
408 movdqa %xmm0, 0x20(%edx)
409 movdqa %xmm1, 0x30(%edx)
410 add $0x40, %edx
411L(shl_0_mem_less_64bytes):
412 cmp $0x20, %ecx
413 jb L(shl_0_mem_less_32bytes)
414 movdqa (%eax), %xmm0
415 sub $0x20, %ecx
416 movdqa 0x10(%eax), %xmm1
417 add $0x20, %eax
418 movdqa %xmm0, (%edx)
419 movdqa %xmm1, 0x10(%edx)
420 add $0x20, %edx
421L(shl_0_mem_less_32bytes):
422 cmp $0x10, %ecx
423 jb L(shl_0_mem_less_16bytes)
424 sub $0x10, %ecx
425 movdqa (%eax), %xmm0
426 add $0x10, %eax
427 movdqa %xmm0, (%edx)
428 add $0x10, %edx
429L(shl_0_mem_less_16bytes):
430 add %ecx, %edx
431 add %ecx, %eax
432 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
433
Bruce Beare124a5422010-10-11 12:24:41 -0700434 cfi_restore_state
435 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800436 ALIGN (4)
437L(shl_1):
438 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
439 lea -1(%eax), %eax
440 movaps (%eax), %xmm1
441 xor %edi, %edi
442 lea -32(%ecx), %ecx
443 movdqu %xmm0, (%esi)
444 POP (%esi)
445L(shl_1_loop):
446
447 movdqa 16(%eax, %edi), %xmm2
448 sub $32, %ecx
449 movdqa 32(%eax, %edi), %xmm3
450 movdqa %xmm3, %xmm4
451 palignr $1, %xmm2, %xmm3
452 palignr $1, %xmm1, %xmm2
453 lea 32(%edi), %edi
454 movdqa %xmm2, -32(%edx, %edi)
455 movdqa %xmm3, -16(%edx, %edi)
456
457 jb L(shl_1_end)
458
459 movdqa 16(%eax, %edi), %xmm2
460 sub $32, %ecx
461 movdqa 32(%eax, %edi), %xmm3
462 movdqa %xmm3, %xmm1
463 palignr $1, %xmm2, %xmm3
464 palignr $1, %xmm4, %xmm2
465 lea 32(%edi), %edi
466 movdqa %xmm2, -32(%edx, %edi)
467 movdqa %xmm3, -16(%edx, %edi)
468
469 jae L(shl_1_loop)
470
471L(shl_1_end):
472 lea 32(%ecx), %ecx
473 add %ecx, %edi
474 add %edi, %edx
475 lea 1(%edi, %eax), %eax
476 POP (%edi)
477 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
478
Bruce Beare124a5422010-10-11 12:24:41 -0700479 cfi_restore_state
480 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800481 ALIGN (4)
482L(shl_2):
483 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
484 lea -2(%eax), %eax
485 movaps (%eax), %xmm1
486 xor %edi, %edi
487 lea -32(%ecx), %ecx
488 movdqu %xmm0, (%esi)
489 POP (%esi)
490L(shl_2_loop):
491
492 movdqa 16(%eax, %edi), %xmm2
493 sub $32, %ecx
494 movdqa 32(%eax, %edi), %xmm3
495 movdqa %xmm3, %xmm4
496 palignr $2, %xmm2, %xmm3
497 palignr $2, %xmm1, %xmm2
498 lea 32(%edi), %edi
499 movdqa %xmm2, -32(%edx, %edi)
500 movdqa %xmm3, -16(%edx, %edi)
501
502 jb L(shl_2_end)
503
504 movdqa 16(%eax, %edi), %xmm2
505 sub $32, %ecx
506 movdqa 32(%eax, %edi), %xmm3
507 movdqa %xmm3, %xmm1
508 palignr $2, %xmm2, %xmm3
509 palignr $2, %xmm4, %xmm2
510 lea 32(%edi), %edi
511 movdqa %xmm2, -32(%edx, %edi)
512 movdqa %xmm3, -16(%edx, %edi)
513
514 jae L(shl_2_loop)
515
516L(shl_2_end):
517 lea 32(%ecx), %ecx
518 add %ecx, %edi
519 add %edi, %edx
520 lea 2(%edi, %eax), %eax
521 POP (%edi)
522 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
523
Bruce Beare124a5422010-10-11 12:24:41 -0700524 cfi_restore_state
525 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800526 ALIGN (4)
527L(shl_3):
528 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
529 lea -3(%eax), %eax
530 movaps (%eax), %xmm1
531 xor %edi, %edi
532 lea -32(%ecx), %ecx
533 movdqu %xmm0, (%esi)
534 POP (%esi)
535L(shl_3_loop):
536
537 movdqa 16(%eax, %edi), %xmm2
538 sub $32, %ecx
539 movdqa 32(%eax, %edi), %xmm3
540 movdqa %xmm3, %xmm4
541 palignr $3, %xmm2, %xmm3
542 palignr $3, %xmm1, %xmm2
543 lea 32(%edi), %edi
544 movdqa %xmm2, -32(%edx, %edi)
545 movdqa %xmm3, -16(%edx, %edi)
546
547 jb L(shl_3_end)
548
549 movdqa 16(%eax, %edi), %xmm2
550 sub $32, %ecx
551 movdqa 32(%eax, %edi), %xmm3
552 movdqa %xmm3, %xmm1
553 palignr $3, %xmm2, %xmm3
554 palignr $3, %xmm4, %xmm2
555 lea 32(%edi), %edi
556 movdqa %xmm2, -32(%edx, %edi)
557 movdqa %xmm3, -16(%edx, %edi)
558
559 jae L(shl_3_loop)
560
561L(shl_3_end):
562 lea 32(%ecx), %ecx
563 add %ecx, %edi
564 add %edi, %edx
565 lea 3(%edi, %eax), %eax
566 POP (%edi)
567 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
568
Bruce Beare124a5422010-10-11 12:24:41 -0700569 cfi_restore_state
570 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800571 ALIGN (4)
572L(shl_4):
573 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
574 lea -4(%eax), %eax
575 movaps (%eax), %xmm1
576 xor %edi, %edi
577 lea -32(%ecx), %ecx
578 movdqu %xmm0, (%esi)
579 POP (%esi)
580L(shl_4_loop):
581
582 movdqa 16(%eax, %edi), %xmm2
583 sub $32, %ecx
584 movdqa 32(%eax, %edi), %xmm3
585 movdqa %xmm3, %xmm4
586 palignr $4, %xmm2, %xmm3
587 palignr $4, %xmm1, %xmm2
588 lea 32(%edi), %edi
589 movdqa %xmm2, -32(%edx, %edi)
590 movdqa %xmm3, -16(%edx, %edi)
591
592 jb L(shl_4_end)
593
594 movdqa 16(%eax, %edi), %xmm2
595 sub $32, %ecx
596 movdqa 32(%eax, %edi), %xmm3
597 movdqa %xmm3, %xmm1
598 palignr $4, %xmm2, %xmm3
599 palignr $4, %xmm4, %xmm2
600 lea 32(%edi), %edi
601 movdqa %xmm2, -32(%edx, %edi)
602 movdqa %xmm3, -16(%edx, %edi)
603
604 jae L(shl_4_loop)
605
606L(shl_4_end):
607 lea 32(%ecx), %ecx
608 add %ecx, %edi
609 add %edi, %edx
610 lea 4(%edi, %eax), %eax
611 POP (%edi)
612 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
613
Bruce Beare124a5422010-10-11 12:24:41 -0700614 cfi_restore_state
615 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800616 ALIGN (4)
617L(shl_5):
618 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
619 lea -5(%eax), %eax
620 movaps (%eax), %xmm1
621 xor %edi, %edi
622 lea -32(%ecx), %ecx
623 movdqu %xmm0, (%esi)
624 POP (%esi)
625L(shl_5_loop):
626
627 movdqa 16(%eax, %edi), %xmm2
628 sub $32, %ecx
629 movdqa 32(%eax, %edi), %xmm3
630 movdqa %xmm3, %xmm4
631 palignr $5, %xmm2, %xmm3
632 palignr $5, %xmm1, %xmm2
633 lea 32(%edi), %edi
634 movdqa %xmm2, -32(%edx, %edi)
635 movdqa %xmm3, -16(%edx, %edi)
636
637 jb L(shl_5_end)
638
639 movdqa 16(%eax, %edi), %xmm2
640 sub $32, %ecx
641 movdqa 32(%eax, %edi), %xmm3
642 movdqa %xmm3, %xmm1
643 palignr $5, %xmm2, %xmm3
644 palignr $5, %xmm4, %xmm2
645 lea 32(%edi), %edi
646 movdqa %xmm2, -32(%edx, %edi)
647 movdqa %xmm3, -16(%edx, %edi)
648
649 jae L(shl_5_loop)
650
651L(shl_5_end):
652 lea 32(%ecx), %ecx
653 add %ecx, %edi
654 add %edi, %edx
655 lea 5(%edi, %eax), %eax
656 POP (%edi)
657 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
658
Bruce Beare124a5422010-10-11 12:24:41 -0700659 cfi_restore_state
660 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800661 ALIGN (4)
662L(shl_6):
663 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
664 lea -6(%eax), %eax
665 movaps (%eax), %xmm1
666 xor %edi, %edi
667 lea -32(%ecx), %ecx
668 movdqu %xmm0, (%esi)
669 POP (%esi)
670L(shl_6_loop):
671
672 movdqa 16(%eax, %edi), %xmm2
673 sub $32, %ecx
674 movdqa 32(%eax, %edi), %xmm3
675 movdqa %xmm3, %xmm4
676 palignr $6, %xmm2, %xmm3
677 palignr $6, %xmm1, %xmm2
678 lea 32(%edi), %edi
679 movdqa %xmm2, -32(%edx, %edi)
680 movdqa %xmm3, -16(%edx, %edi)
681
682 jb L(shl_6_end)
683
684 movdqa 16(%eax, %edi), %xmm2
685 sub $32, %ecx
686 movdqa 32(%eax, %edi), %xmm3
687 movdqa %xmm3, %xmm1
688 palignr $6, %xmm2, %xmm3
689 palignr $6, %xmm4, %xmm2
690 lea 32(%edi), %edi
691 movdqa %xmm2, -32(%edx, %edi)
692 movdqa %xmm3, -16(%edx, %edi)
693
694 jae L(shl_6_loop)
695
696L(shl_6_end):
697 lea 32(%ecx), %ecx
698 add %ecx, %edi
699 add %edi, %edx
700 lea 6(%edi, %eax), %eax
701 POP (%edi)
702 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
703
Bruce Beare124a5422010-10-11 12:24:41 -0700704 cfi_restore_state
705 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800706 ALIGN (4)
707L(shl_7):
708 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
709 lea -7(%eax), %eax
710 movaps (%eax), %xmm1
711 xor %edi, %edi
712 lea -32(%ecx), %ecx
713 movdqu %xmm0, (%esi)
714 POP (%esi)
715L(shl_7_loop):
716
717 movdqa 16(%eax, %edi), %xmm2
718 sub $32, %ecx
719 movdqa 32(%eax, %edi), %xmm3
720 movdqa %xmm3, %xmm4
721 palignr $7, %xmm2, %xmm3
722 palignr $7, %xmm1, %xmm2
723 lea 32(%edi), %edi
724 movdqa %xmm2, -32(%edx, %edi)
725 movdqa %xmm3, -16(%edx, %edi)
726
727 jb L(shl_7_end)
728
729 movdqa 16(%eax, %edi), %xmm2
730 sub $32, %ecx
731 movdqa 32(%eax, %edi), %xmm3
732 movdqa %xmm3, %xmm1
733 palignr $7, %xmm2, %xmm3
734 palignr $7, %xmm4, %xmm2
735 lea 32(%edi), %edi
736 movdqa %xmm2, -32(%edx, %edi)
737 movdqa %xmm3, -16(%edx, %edi)
738
739 jae L(shl_7_loop)
740
741L(shl_7_end):
742 lea 32(%ecx), %ecx
743 add %ecx, %edi
744 add %edi, %edx
745 lea 7(%edi, %eax), %eax
746 POP (%edi)
747 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
748
Bruce Beare124a5422010-10-11 12:24:41 -0700749 cfi_restore_state
750 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800751 ALIGN (4)
752L(shl_8):
753 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
754 lea -8(%eax), %eax
755 movaps (%eax), %xmm1
756 xor %edi, %edi
757 lea -32(%ecx), %ecx
758 movdqu %xmm0, (%esi)
759 POP (%esi)
760L(shl_8_loop):
761
762 movdqa 16(%eax, %edi), %xmm2
763 sub $32, %ecx
764 movdqa 32(%eax, %edi), %xmm3
765 movdqa %xmm3, %xmm4
766 palignr $8, %xmm2, %xmm3
767 palignr $8, %xmm1, %xmm2
768 lea 32(%edi), %edi
769 movdqa %xmm2, -32(%edx, %edi)
770 movdqa %xmm3, -16(%edx, %edi)
771
772 jb L(shl_8_end)
773
774 movdqa 16(%eax, %edi), %xmm2
775 sub $32, %ecx
776 movdqa 32(%eax, %edi), %xmm3
777 movdqa %xmm3, %xmm1
778 palignr $8, %xmm2, %xmm3
779 palignr $8, %xmm4, %xmm2
780 lea 32(%edi), %edi
781 movdqa %xmm2, -32(%edx, %edi)
782 movdqa %xmm3, -16(%edx, %edi)
783
784 jae L(shl_8_loop)
785
786L(shl_8_end):
787 lea 32(%ecx), %ecx
788 add %ecx, %edi
789 add %edi, %edx
790 lea 8(%edi, %eax), %eax
791 POP (%edi)
792 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
793
Bruce Beare124a5422010-10-11 12:24:41 -0700794 cfi_restore_state
795 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800796 ALIGN (4)
797L(shl_9):
798 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
799 lea -9(%eax), %eax
800 movaps (%eax), %xmm1
801 xor %edi, %edi
802 lea -32(%ecx), %ecx
803 movdqu %xmm0, (%esi)
804 POP (%esi)
805L(shl_9_loop):
806
807 movdqa 16(%eax, %edi), %xmm2
808 sub $32, %ecx
809 movdqa 32(%eax, %edi), %xmm3
810 movdqa %xmm3, %xmm4
811 palignr $9, %xmm2, %xmm3
812 palignr $9, %xmm1, %xmm2
813 lea 32(%edi), %edi
814 movdqa %xmm2, -32(%edx, %edi)
815 movdqa %xmm3, -16(%edx, %edi)
816
817 jb L(shl_9_end)
818
819 movdqa 16(%eax, %edi), %xmm2
820 sub $32, %ecx
821 movdqa 32(%eax, %edi), %xmm3
822 movdqa %xmm3, %xmm1
823 palignr $9, %xmm2, %xmm3
824 palignr $9, %xmm4, %xmm2
825 lea 32(%edi), %edi
826 movdqa %xmm2, -32(%edx, %edi)
827 movdqa %xmm3, -16(%edx, %edi)
828
829 jae L(shl_9_loop)
830
831L(shl_9_end):
832 lea 32(%ecx), %ecx
833 add %ecx, %edi
834 add %edi, %edx
835 lea 9(%edi, %eax), %eax
836 POP (%edi)
837 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
838
Bruce Beare124a5422010-10-11 12:24:41 -0700839 cfi_restore_state
840 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800841 ALIGN (4)
842L(shl_10):
843 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
844 lea -10(%eax), %eax
845 movaps (%eax), %xmm1
846 xor %edi, %edi
847 lea -32(%ecx), %ecx
848 movdqu %xmm0, (%esi)
849 POP (%esi)
850L(shl_10_loop):
851
852 movdqa 16(%eax, %edi), %xmm2
853 sub $32, %ecx
854 movdqa 32(%eax, %edi), %xmm3
855 movdqa %xmm3, %xmm4
856 palignr $10, %xmm2, %xmm3
857 palignr $10, %xmm1, %xmm2
858 lea 32(%edi), %edi
859 movdqa %xmm2, -32(%edx, %edi)
860 movdqa %xmm3, -16(%edx, %edi)
861
862 jb L(shl_10_end)
863
864 movdqa 16(%eax, %edi), %xmm2
865 sub $32, %ecx
866 movdqa 32(%eax, %edi), %xmm3
867 movdqa %xmm3, %xmm1
868 palignr $10, %xmm2, %xmm3
869 palignr $10, %xmm4, %xmm2
870 lea 32(%edi), %edi
871 movdqa %xmm2, -32(%edx, %edi)
872 movdqa %xmm3, -16(%edx, %edi)
873
874 jae L(shl_10_loop)
875
876L(shl_10_end):
877 lea 32(%ecx), %ecx
878 add %ecx, %edi
879 add %edi, %edx
880 lea 10(%edi, %eax), %eax
881 POP (%edi)
882 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
883
Bruce Beare124a5422010-10-11 12:24:41 -0700884 cfi_restore_state
885 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800886 ALIGN (4)
887L(shl_11):
888 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
889 lea -11(%eax), %eax
890 movaps (%eax), %xmm1
891 xor %edi, %edi
892 lea -32(%ecx), %ecx
893 movdqu %xmm0, (%esi)
894 POP (%esi)
895L(shl_11_loop):
896
897 movdqa 16(%eax, %edi), %xmm2
898 sub $32, %ecx
899 movdqa 32(%eax, %edi), %xmm3
900 movdqa %xmm3, %xmm4
901 palignr $11, %xmm2, %xmm3
902 palignr $11, %xmm1, %xmm2
903 lea 32(%edi), %edi
904 movdqa %xmm2, -32(%edx, %edi)
905 movdqa %xmm3, -16(%edx, %edi)
906
907 jb L(shl_11_end)
908
909 movdqa 16(%eax, %edi), %xmm2
910 sub $32, %ecx
911 movdqa 32(%eax, %edi), %xmm3
912 movdqa %xmm3, %xmm1
913 palignr $11, %xmm2, %xmm3
914 palignr $11, %xmm4, %xmm2
915 lea 32(%edi), %edi
916 movdqa %xmm2, -32(%edx, %edi)
917 movdqa %xmm3, -16(%edx, %edi)
918
919 jae L(shl_11_loop)
920
921L(shl_11_end):
922 lea 32(%ecx), %ecx
923 add %ecx, %edi
924 add %edi, %edx
925 lea 11(%edi, %eax), %eax
926 POP (%edi)
927 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
928
Bruce Beare124a5422010-10-11 12:24:41 -0700929 cfi_restore_state
930 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800931 ALIGN (4)
932L(shl_12):
933 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
934 lea -12(%eax), %eax
935 movaps (%eax), %xmm1
936 xor %edi, %edi
937 lea -32(%ecx), %ecx
938 movdqu %xmm0, (%esi)
939 POP (%esi)
940L(shl_12_loop):
941
942 movdqa 16(%eax, %edi), %xmm2
943 sub $32, %ecx
944 movdqa 32(%eax, %edi), %xmm3
945 movdqa %xmm3, %xmm4
946 palignr $12, %xmm2, %xmm3
947 palignr $12, %xmm1, %xmm2
948 lea 32(%edi), %edi
949 movdqa %xmm2, -32(%edx, %edi)
950 movdqa %xmm3, -16(%edx, %edi)
951
952 jb L(shl_12_end)
953
954 movdqa 16(%eax, %edi), %xmm2
955 sub $32, %ecx
956 movdqa 32(%eax, %edi), %xmm3
957 movdqa %xmm3, %xmm1
958 palignr $12, %xmm2, %xmm3
959 palignr $12, %xmm4, %xmm2
960 lea 32(%edi), %edi
961 movdqa %xmm2, -32(%edx, %edi)
962 movdqa %xmm3, -16(%edx, %edi)
963
964 jae L(shl_12_loop)
965
966L(shl_12_end):
967 lea 32(%ecx), %ecx
968 add %ecx, %edi
969 add %edi, %edx
970 lea 12(%edi, %eax), %eax
971 POP (%edi)
972 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
973
Bruce Beare124a5422010-10-11 12:24:41 -0700974 cfi_restore_state
975 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -0800976 ALIGN (4)
977L(shl_13):
978 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
979 lea -13(%eax), %eax
980 movaps (%eax), %xmm1
981 xor %edi, %edi
982 lea -32(%ecx), %ecx
983 movdqu %xmm0, (%esi)
984 POP (%esi)
985L(shl_13_loop):
986
987 movdqa 16(%eax, %edi), %xmm2
988 sub $32, %ecx
989 movdqa 32(%eax, %edi), %xmm3
990 movdqa %xmm3, %xmm4
991 palignr $13, %xmm2, %xmm3
992 palignr $13, %xmm1, %xmm2
993 lea 32(%edi), %edi
994 movdqa %xmm2, -32(%edx, %edi)
995 movdqa %xmm3, -16(%edx, %edi)
996
997 jb L(shl_13_end)
998
999 movdqa 16(%eax, %edi), %xmm2
1000 sub $32, %ecx
1001 movdqa 32(%eax, %edi), %xmm3
1002 movdqa %xmm3, %xmm1
1003 palignr $13, %xmm2, %xmm3
1004 palignr $13, %xmm4, %xmm2
1005 lea 32(%edi), %edi
1006 movdqa %xmm2, -32(%edx, %edi)
1007 movdqa %xmm3, -16(%edx, %edi)
1008
1009 jae L(shl_13_loop)
1010
1011L(shl_13_end):
1012 lea 32(%ecx), %ecx
1013 add %ecx, %edi
1014 add %edi, %edx
1015 lea 13(%edi, %eax), %eax
1016 POP (%edi)
1017 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1018
Bruce Beare124a5422010-10-11 12:24:41 -07001019 cfi_restore_state
1020 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -08001021 ALIGN (4)
1022L(shl_14):
1023 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1024 lea -14(%eax), %eax
1025 movaps (%eax), %xmm1
1026 xor %edi, %edi
1027 lea -32(%ecx), %ecx
1028 movdqu %xmm0, (%esi)
1029 POP (%esi)
1030L(shl_14_loop):
1031
1032 movdqa 16(%eax, %edi), %xmm2
1033 sub $32, %ecx
1034 movdqa 32(%eax, %edi), %xmm3
1035 movdqa %xmm3, %xmm4
1036 palignr $14, %xmm2, %xmm3
1037 palignr $14, %xmm1, %xmm2
1038 lea 32(%edi), %edi
1039 movdqa %xmm2, -32(%edx, %edi)
1040 movdqa %xmm3, -16(%edx, %edi)
1041
1042 jb L(shl_14_end)
1043
1044 movdqa 16(%eax, %edi), %xmm2
1045 sub $32, %ecx
1046 movdqa 32(%eax, %edi), %xmm3
1047 movdqa %xmm3, %xmm1
1048 palignr $14, %xmm2, %xmm3
1049 palignr $14, %xmm4, %xmm2
1050 lea 32(%edi), %edi
1051 movdqa %xmm2, -32(%edx, %edi)
1052 movdqa %xmm3, -16(%edx, %edi)
1053
1054 jae L(shl_14_loop)
1055
1056L(shl_14_end):
1057 lea 32(%ecx), %ecx
1058 add %ecx, %edi
1059 add %edi, %edx
1060 lea 14(%edi, %eax), %eax
1061 POP (%edi)
1062 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1063
Bruce Beare124a5422010-10-11 12:24:41 -07001064 cfi_restore_state
1065 cfi_remember_state
Bruce Beare8ff1a272010-03-04 11:03:37 -08001066 ALIGN (4)
1067L(shl_15):
1068 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1069 lea -15(%eax), %eax
1070 movaps (%eax), %xmm1
1071 xor %edi, %edi
1072 lea -32(%ecx), %ecx
1073 movdqu %xmm0, (%esi)
1074 POP (%esi)
1075L(shl_15_loop):
1076
1077 movdqa 16(%eax, %edi), %xmm2
1078 sub $32, %ecx
1079 movdqa 32(%eax, %edi), %xmm3
1080 movdqa %xmm3, %xmm4
1081 palignr $15, %xmm2, %xmm3
1082 palignr $15, %xmm1, %xmm2
1083 lea 32(%edi), %edi
1084 movdqa %xmm2, -32(%edx, %edi)
1085 movdqa %xmm3, -16(%edx, %edi)
1086
1087 jb L(shl_15_end)
1088
1089 movdqa 16(%eax, %edi), %xmm2
1090 sub $32, %ecx
1091 movdqa 32(%eax, %edi), %xmm3
1092 movdqa %xmm3, %xmm1
1093 palignr $15, %xmm2, %xmm3
1094 palignr $15, %xmm4, %xmm2
1095 lea 32(%edi), %edi
1096 movdqa %xmm2, -32(%edx, %edi)
1097 movdqa %xmm3, -16(%edx, %edi)
1098
1099 jae L(shl_15_loop)
1100
1101L(shl_15_end):
1102 lea 32(%ecx), %ecx
1103 add %ecx, %edi
1104 add %edi, %edx
1105 lea 15(%edi, %eax), %eax
1106 POP (%edi)
1107 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1108
1109
1110 ALIGN (4)
1111L(fwd_write_44bytes):
1112 movl -44(%eax), %ecx
1113 movl %ecx, -44(%edx)
1114L(fwd_write_40bytes):
1115 movl -40(%eax), %ecx
1116 movl %ecx, -40(%edx)
1117L(fwd_write_36bytes):
1118 movl -36(%eax), %ecx
1119 movl %ecx, -36(%edx)
1120L(fwd_write_32bytes):
1121 movl -32(%eax), %ecx
1122 movl %ecx, -32(%edx)
1123L(fwd_write_28bytes):
1124 movl -28(%eax), %ecx
1125 movl %ecx, -28(%edx)
1126L(fwd_write_24bytes):
1127 movl -24(%eax), %ecx
1128 movl %ecx, -24(%edx)
1129L(fwd_write_20bytes):
1130 movl -20(%eax), %ecx
1131 movl %ecx, -20(%edx)
1132L(fwd_write_16bytes):
1133 movl -16(%eax), %ecx
1134 movl %ecx, -16(%edx)
1135L(fwd_write_12bytes):
1136 movl -12(%eax), %ecx
1137 movl %ecx, -12(%edx)
1138L(fwd_write_8bytes):
1139 movl -8(%eax), %ecx
1140 movl %ecx, -8(%edx)
1141L(fwd_write_4bytes):
1142 movl -4(%eax), %ecx
1143 movl %ecx, -4(%edx)
1144L(fwd_write_0bytes):
1145#ifndef USE_AS_BCOPY
1146# ifdef USE_AS_MEMPCPY
1147 movl %edx, %eax
1148# else
1149 movl DEST(%esp), %eax
1150# endif
1151#endif
1152 RETURN
1153
1154 ALIGN (4)
1155L(fwd_write_5bytes):
1156 movl -5(%eax), %ecx
1157 movl -4(%eax), %eax
1158 movl %ecx, -5(%edx)
1159 movl %eax, -4(%edx)
1160#ifndef USE_AS_BCOPY
1161# ifdef USE_AS_MEMPCPY
1162 movl %edx, %eax
1163# else
1164 movl DEST(%esp), %eax
1165# endif
1166#endif
1167 RETURN
1168
1169 ALIGN (4)
1170L(fwd_write_45bytes):
1171 movl -45(%eax), %ecx
1172 movl %ecx, -45(%edx)
1173L(fwd_write_41bytes):
1174 movl -41(%eax), %ecx
1175 movl %ecx, -41(%edx)
1176L(fwd_write_37bytes):
1177 movl -37(%eax), %ecx
1178 movl %ecx, -37(%edx)
1179L(fwd_write_33bytes):
1180 movl -33(%eax), %ecx
1181 movl %ecx, -33(%edx)
1182L(fwd_write_29bytes):
1183 movl -29(%eax), %ecx
1184 movl %ecx, -29(%edx)
1185L(fwd_write_25bytes):
1186 movl -25(%eax), %ecx
1187 movl %ecx, -25(%edx)
1188L(fwd_write_21bytes):
1189 movl -21(%eax), %ecx
1190 movl %ecx, -21(%edx)
1191L(fwd_write_17bytes):
1192 movl -17(%eax), %ecx
1193 movl %ecx, -17(%edx)
1194L(fwd_write_13bytes):
1195 movl -13(%eax), %ecx
1196 movl %ecx, -13(%edx)
1197L(fwd_write_9bytes):
1198 movl -9(%eax), %ecx
1199 movl %ecx, -9(%edx)
1200 movl -5(%eax), %ecx
1201 movl %ecx, -5(%edx)
1202L(fwd_write_1bytes):
1203 movzbl -1(%eax), %ecx
1204 movb %cl, -1(%edx)
1205#ifndef USE_AS_BCOPY
1206# ifdef USE_AS_MEMPCPY
1207 movl %edx, %eax
1208# else
1209 movl DEST(%esp), %eax
1210# endif
1211#endif
1212 RETURN
1213
/* Forward tail-copy ladder for lengths N with N % 4 == 2
   (46, 42, ..., 6, 2).
   %eax and %edx point one past the last source and destination byte.
   Control enters at L(fwd_write_Nbytes); every rung copies one dword
   and falls through to the next, finishing with the last 16-bit word.
   Only %ecx is used as scratch.  Unless built as bcopy, %eax is loaded
   with the return value before RETURN.  */
	ALIGN (4)
L(fwd_write_46bytes):
	movl	-46(%eax), %ecx
	movl	%ecx, -46(%edx)
L(fwd_write_42bytes):
	movl	-42(%eax), %ecx
	movl	%ecx, -42(%edx)
L(fwd_write_38bytes):
	movl	-38(%eax), %ecx
	movl	%ecx, -38(%edx)
L(fwd_write_34bytes):
	movl	-34(%eax), %ecx
	movl	%ecx, -34(%edx)
L(fwd_write_30bytes):
	movl	-30(%eax), %ecx
	movl	%ecx, -30(%edx)
L(fwd_write_26bytes):
	movl	-26(%eax), %ecx
	movl	%ecx, -26(%edx)
L(fwd_write_22bytes):
	movl	-22(%eax), %ecx
	movl	%ecx, -22(%edx)
L(fwd_write_18bytes):
	movl	-18(%eax), %ecx
	movl	%ecx, -18(%edx)
L(fwd_write_14bytes):
	movl	-14(%eax), %ecx
	movl	%ecx, -14(%edx)
L(fwd_write_10bytes):
	movl	-10(%eax), %ecx
	movl	%ecx, -10(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	/* mempcpy variant: %edx is the end of the destination.  */
	movl	%edx, %eax
# else
	/* Otherwise reload the DEST stack slot (macro defined elsewhere).  */
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
1259
/* Forward tail-copy ladder for lengths N with N % 4 == 3
   (47, 43, ..., 7, 3).
   %eax and %edx point one past the last source and destination byte.
   Control enters at L(fwd_write_Nbytes); every rung copies one dword
   and falls through to the next, finishing with a 16-bit word plus a
   byte.  Unless built as bcopy, %eax is loaded with the return value.
   This is the last forward tail routine, hence RETURN_END rather than
   RETURN (both macros are defined elsewhere in the file).  */
	ALIGN (4)
L(fwd_write_47bytes):
	movl	-47(%eax), %ecx
	movl	%ecx, -47(%edx)
L(fwd_write_43bytes):
	movl	-43(%eax), %ecx
	movl	%ecx, -43(%edx)
L(fwd_write_39bytes):
	movl	-39(%eax), %ecx
	movl	%ecx, -39(%edx)
L(fwd_write_35bytes):
	movl	-35(%eax), %ecx
	movl	%ecx, -35(%edx)
L(fwd_write_31bytes):
	movl	-31(%eax), %ecx
	movl	%ecx, -31(%edx)
L(fwd_write_27bytes):
	movl	-27(%eax), %ecx
	movl	%ecx, -27(%edx)
L(fwd_write_23bytes):
	movl	-23(%eax), %ecx
	movl	%ecx, -23(%edx)
L(fwd_write_19bytes):
	movl	-19(%eax), %ecx
	movl	%ecx, -19(%edx)
L(fwd_write_15bytes):
	movl	-15(%eax), %ecx
	movl	%ecx, -15(%edx)
L(fwd_write_11bytes):
	movl	-11(%eax), %ecx
	movl	%ecx, -11(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
L(fwd_write_3bytes):
	/* Both loads come before the stores; the byte load reuses %eax,
	   whose pointer value is not needed afterwards.  */
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	/* mempcpy variant: %edx is the end of the destination.  */
	movl	%edx, %eax
# else
	/* Otherwise reload the DEST stack slot (macro defined elsewhere).  */
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08001307
/* Bulk forward copy using non-temporal stores (movntdq).

   NOTE(review): the entry contract is established by code outside this
   part of the file.  From the instructions below it appears to be:
     %eax  = source cursor (loaded with unaligned movdqu),
     %edx  = destination cursor (movntdq requires 16-byte alignment),
     %ecx  = byte count still to copy,
     %xmm0 = an already-loaded 16-byte block whose destination is %esi,
     %esi / %edi saved on the stack and restored here via POP.
   Confirm against the code that branches to L(large_page).

   The routine copies 128 bytes per loop iteration, then at most one
   64-byte and one 32-byte block, and hands the remaining 0..31 bytes
   to the forward tail routines through L(table_48bytes_fwd).  */
	cfi_restore_state
	cfi_remember_state
	ALIGN (4)
L(large_page):
	movdqu	(%eax), %xmm1
	lea	16(%eax), %eax
	movdqu	%xmm0, (%esi)
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	POP (%esi)
	/* -0x90 = -0x10 (presumably the 16 bytes just stored from %xmm1)
	   plus a -0x80 bias.  With the bias, "sub $0x80 / jae" below keeps
	   looping only while another full 128-byte block remains.  */
	lea	-0x90(%ecx), %ecx
	POP (%edi)
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	/* The flags set by this sub are consumed by the jae after the
	   stores; movntdq and lea leave the flags untouched.  */
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* %ecx is still biased by -0x80 here.  Compare it (signed) with
	   -0x40, i.e. test "remaining < 64", then remove the bias with lea
	   so the flags survive until the branch.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Advance both cursors past the remaining %ecx (< 32) bytes: the
	   forward tail routines address the data with negative offsets
	   from the end pointers.  */
	add	%ecx, %edx
	add	%ecx, %eax
	/* Fence the weakly-ordered non-temporal stores before the tail is
	   written with ordinary stores.  */
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1373
1374
/* Backward tail-copy ladder for lengths N with N % 4 == 0
   (44, 40, ..., 4, 0).
   %eax and %edx point at the first source and destination byte (all
   accesses use non-negative offsets).  Control enters at
   L(bk_write_Nbytes); every rung copies the highest outstanding dword
   and falls through to the next, so the data is written from the top
   address downwards.  Only %ecx is used as scratch.  */
	ALIGN (4)
L(bk_write_44bytes):
	movl	40(%eax), %ecx
	movl	%ecx, 40(%edx)
L(bk_write_40bytes):
	movl	36(%eax), %ecx
	movl	%ecx, 36(%edx)
L(bk_write_36bytes):
	movl	32(%eax), %ecx
	movl	%ecx, 32(%edx)
L(bk_write_32bytes):
	movl	28(%eax), %ecx
	movl	%ecx, 28(%edx)
L(bk_write_28bytes):
	movl	24(%eax), %ecx
	movl	%ecx, 24(%edx)
L(bk_write_24bytes):
	movl	20(%eax), %ecx
	movl	%ecx, 20(%edx)
L(bk_write_20bytes):
	movl	16(%eax), %ecx
	movl	%ecx, 16(%edx)
L(bk_write_16bytes):
	movl	12(%eax), %ecx
	movl	%ecx, 12(%edx)
L(bk_write_12bytes):
	movl	8(%eax), %ecx
	movl	%ecx, 8(%edx)
L(bk_write_8bytes):
	movl	4(%eax), %ecx
	movl	%ecx, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
#ifndef USE_AS_BCOPY
	/* Return value: the DEST stack slot, plus the LEN stack slot for
	   the mempcpy variant (both macros are defined elsewhere).  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
1418
/* Backward tail-copy ladder for lengths N with N % 4 == 1
   (45, 41, ..., 5, 1).
   %eax and %edx point at the first source and destination byte.
   Control enters at L(bk_write_Nbytes); every rung copies the highest
   outstanding dword and falls through to the next, finishing with the
   single byte at offset 0.  Only %ecx is used as scratch.  */
	ALIGN (4)
L(bk_write_45bytes):
	movl	41(%eax), %ecx
	movl	%ecx, 41(%edx)
L(bk_write_41bytes):
	movl	37(%eax), %ecx
	movl	%ecx, 37(%edx)
L(bk_write_37bytes):
	movl	33(%eax), %ecx
	movl	%ecx, 33(%edx)
L(bk_write_33bytes):
	movl	29(%eax), %ecx
	movl	%ecx, 29(%edx)
L(bk_write_29bytes):
	movl	25(%eax), %ecx
	movl	%ecx, 25(%edx)
L(bk_write_25bytes):
	movl	21(%eax), %ecx
	movl	%ecx, 21(%edx)
L(bk_write_21bytes):
	movl	17(%eax), %ecx
	movl	%ecx, 17(%edx)
L(bk_write_17bytes):
	movl	13(%eax), %ecx
	movl	%ecx, 13(%edx)
L(bk_write_13bytes):
	movl	9(%eax), %ecx
	movl	%ecx, 9(%edx)
L(bk_write_9bytes):
	movl	5(%eax), %ecx
	movl	%ecx, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	/* Return value: the DEST stack slot, plus the LEN stack slot for
	   the mempcpy variant (both macros are defined elsewhere).  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
1464
/* Backward tail-copy ladder for lengths N with N % 4 == 2
   (46, 42, ..., 6, 2).
   %eax and %edx point at the first source and destination byte.
   Control enters at L(bk_write_Nbytes); every rung copies the highest
   outstanding dword and falls through to the next, finishing with the
   16-bit word at offset 0.  Only %ecx is used as scratch.  */
	ALIGN (4)
L(bk_write_46bytes):
	movl	42(%eax), %ecx
	movl	%ecx, 42(%edx)
L(bk_write_42bytes):
	movl	38(%eax), %ecx
	movl	%ecx, 38(%edx)
L(bk_write_38bytes):
	movl	34(%eax), %ecx
	movl	%ecx, 34(%edx)
L(bk_write_34bytes):
	movl	30(%eax), %ecx
	movl	%ecx, 30(%edx)
L(bk_write_30bytes):
	movl	26(%eax), %ecx
	movl	%ecx, 26(%edx)
L(bk_write_26bytes):
	movl	22(%eax), %ecx
	movl	%ecx, 22(%edx)
L(bk_write_22bytes):
	movl	18(%eax), %ecx
	movl	%ecx, 18(%edx)
L(bk_write_18bytes):
	movl	14(%eax), %ecx
	movl	%ecx, 14(%edx)
L(bk_write_14bytes):
	movl	10(%eax), %ecx
	movl	%ecx, 10(%edx)
L(bk_write_10bytes):
	movl	6(%eax), %ecx
	movl	%ecx, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	/* Return value: the DEST stack slot, plus the LEN stack slot for
	   the mempcpy variant (both macros are defined elsewhere).  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
1510
/* Backward tail-copy ladder for lengths N with N % 4 == 3
   (47, 43, ..., 7, 3).
   %eax and %edx point at the first source and destination byte.
   Control enters at L(bk_write_Nbytes); every rung copies the highest
   outstanding dword and falls through to the next, finishing with the
   16-bit word at offset 1 and the byte at offset 0.
   This is the last backward tail routine, hence RETURN_END rather than
   RETURN (both macros are defined elsewhere in the file).  */
	ALIGN (4)
L(bk_write_47bytes):
	movl	43(%eax), %ecx
	movl	%ecx, 43(%edx)
L(bk_write_43bytes):
	movl	39(%eax), %ecx
	movl	%ecx, 39(%edx)
L(bk_write_39bytes):
	movl	35(%eax), %ecx
	movl	%ecx, 35(%edx)
L(bk_write_35bytes):
	movl	31(%eax), %ecx
	movl	%ecx, 31(%edx)
L(bk_write_31bytes):
	movl	27(%eax), %ecx
	movl	%ecx, 27(%edx)
L(bk_write_27bytes):
	movl	23(%eax), %ecx
	movl	%ecx, 23(%edx)
L(bk_write_23bytes):
	movl	19(%eax), %ecx
	movl	%ecx, 19(%edx)
L(bk_write_19bytes):
	movl	15(%eax), %ecx
	movl	%ecx, 15(%edx)
L(bk_write_15bytes):
	movl	11(%eax), %ecx
	movl	%ecx, 11(%edx)
L(bk_write_11bytes):
	movl	7(%eax), %ecx
	movl	%ecx, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	/* The last load reuses %eax; the source pointer is not needed
	   after it.  */
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	/* Return value: the DEST stack slot, plus the LEN stack slot for
	   the mempcpy variant (both macros are defined elsewhere).  */
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END
1558
1559
/* Jump tables, placed in their own read-only section.
   Each table holds one 32-bit entry per index, built with the JMPTBL
   macro (defined elsewhere in the file) from a target label and the
   table's own label.  The tables are consumed through
   BRANCH_TO_JMPTBL_ENTRY with a scale of 4.

     L(table_48bytes_fwd)   index = tail length 0..47, forward routines
     L(shl_table)           index = 0..15, L(shl_N) routines
     L(table_48_bytes_bwd)  index = tail length 0..47, backward routines

   Entry order must match the index exactly; do not reorder.  */
	.pushsection .rodata.ssse3,"a",@progbits
	ALIGN (2)
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))

	ALIGN (2)
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))

	ALIGN (2)
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
1683
#ifdef USE_AS_MEMMOVE
/* Backward (descending-address) copy, built only for memmove.

   Entry, as implied by the pointer arithmetic below:
     %eax = source start, %edx = destination start, %ecx = length.
   %esi is saved with PUSH and used as the source cursor; %eax becomes
   scratch.  Both cursors are first moved to one past the end of their
   buffers and then walk downwards:
     1. L(bk_align) peels 1 and/or 2 bytes until the destination end is
        4-byte aligned (skipped when 8 or fewer bytes are left).
     2. L(bk_write_more64bytes) peels up to three dwords until the
        destination is 16-byte aligned, then L(bk_ssse3_cpy) moves 64
        bytes per iteration with aligned stores.
     3. L(bk_write_more32bytes) moves one 32-byte block.
     4. L(bk_write_less32bytes) rewinds both pointers to the start of
        the remaining bytes, restores %esi and dispatches through
        L(table_48_bytes_bwd) to the bk_write ladders.  */
	ALIGN (4)
L(copy_backward):
	PUSH (%esi)
	movl	%eax, %esi
	lea	(%ecx,%edx,1), %edx
	lea	(%ecx,%esi,1), %esi
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy 32 bytes at a time.  */
	sub	$32, %ecx
	movl	-4(%esi), %eax
	movl	%eax, -4(%edx)
	movl	-8(%esi), %eax
	movl	%eax, -8(%edx)
	movl	-12(%esi), %eax
	movl	%eax, -12(%edx)
	movl	-16(%esi), %eax
	movl	%eax, -16(%edx)
	movl	-20(%esi), %eax
	movl	%eax, -20(%edx)
	movl	-24(%esi), %eax
	movl	%eax, -24(%edx)
	movl	-28(%esi), %eax
	movl	%eax, -28(%edx)
	movl	-32(%esi), %eax
	movl	%eax, -32(%edx)
	sub	$32, %edx
	sub	$32, %esi

L(bk_write_less32bytes):
	/* Turn the end cursors into start pointers for the remaining
	   %ecx bytes: the bk_write ladders use non-negative offsets from
	   %eax (source) and %edx (destination).  */
	movl	%esi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%esi)
L(bk_write_less32bytes_2):
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* NOTE(review): CFI_PUSH is defined elsewhere; presumably it
	   re-declares %esi as pushed for the unwind info, because the code
	   below runs before the POP (%esi) above -- confirm against the
	   macro definition.  */
	CFI_PUSH (%esi)
	ALIGN (4)
L(bk_align):
	/* With 8 or fewer bytes left, skip alignment and go to the tail.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	sub	$1, %esi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%esi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	sub	$2, %esi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%esi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	ALIGN (4)
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  So (EDX & 15) is 4, 8 or
   12 and at most three dword steps are needed to reach 16-byte
   alignment.  */
L(bk_ssse3_align):
	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %esi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%esi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment steps removed at most 12 of the >= 64 bytes, so at
	   least 52 are left: enough for the 32-byte block when fewer than
	   64 remain.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

L(bk_ssse3_cpy):
	/* 64 bytes per iteration, highest block first.  Loads are
	   unaligned (movdqu); stores are aligned (movdqa) because %edx is
	   16-byte aligned here.  */
	sub	$64, %esi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%esi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%esi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%esi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%esi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
1811
1812END (MEMCPY)