blob: 6b9040266234a4bf92fb4b1ede045c2928e117ee [file] [log] [blame]
Bruce Beare8ff1a272010-03-04 11:03:37 -08001/*
2Copyright (c) 2010, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Symbol name of the routine assembled from this file.  A build may
   predefine MEMCPY to emit the same code under a different name.  */
31#ifndef MEMCPY
32# define MEMCPY ssse3_memcpy5
33#endif
34
/* L(label) pastes ".L" in front of LABEL, so every internal branch target
   in this file becomes a .L-prefixed (non-exported) label.  */
35#ifndef L
36# define L(label) .L##label
37#endif
38
/* ALIGN(n) expands to .p2align n, i.e. pads to a 2^n-byte boundary.  */
39#ifndef ALIGN
40# define ALIGN(n) .p2align n
41#endif
42
/* Spellings for the assembler's call-frame-information directives.  Each
   definition below is a fallback that applies only when the build has not
   already supplied a macro of the same name.  */
43#ifndef cfi_startproc
44# define cfi_startproc .cfi_startproc
45#endif
46
47#ifndef cfi_endproc
48# define cfi_endproc .cfi_endproc
49#endif
50
/* Record that REG is saved at offset OFF.  */
51#ifndef cfi_rel_offset
52# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
53#endif
54
/* Record that REG holds its caller's value again.  */
55#ifndef cfi_restore
56# define cfi_restore(reg) .cfi_restore (reg)
57#endif
58
/* Adjust the recorded CFA offset by OFF bytes.  */
59#ifndef cfi_adjust_cfa_offset
60# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
61#endif
62
/* ENTRY(name) opens a function: it gives NAME the ELF type "function",
   makes it global, aligns it to 16 bytes (.p2align 4), emits the label
   and starts the CFI region.  */
63#ifndef ENTRY
64# define ENTRY(name) \
65 .type name, @function; \
66 .globl name; \
67 .p2align 4; \
68name: \
69 cfi_startproc
70#endif
71
/* END(name) closes what ENTRY opened: it ends the CFI region and sets the
   symbol size to the distance from NAME to the current location.  */
72#ifndef END
73# define END(name) \
74 cfi_endproc; \
75 .size name, .-name
76#endif
77
/* ESP-relative offsets of the three stack arguments.  PARMS (defined
   further down, per SHARED / non-SHARED build) is the offset of the first
   argument.  With USE_AS_BCOPY the source is the first argument and the
   destination the second; otherwise the destination comes first.  The
   length is always third.  These offsets are only correct while ESP is
   where ENTRANCE left it, i.e. with no further pushes outstanding.  */
78#ifdef USE_AS_BCOPY
79# define SRC PARMS
80# define DEST SRC+4
81# define LEN DEST+4
82#else
83# define DEST PARMS
84# define SRC DEST+4
85# define LEN SRC+4
86#endif
87
/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0.  Emits no instruction.  */
88#define CFI_PUSH(REG) \
89 cfi_adjust_cfa_offset (4); \
90 cfi_rel_offset (REG, 0)
91
/* Unwind bookkeeping for the matching pop: the CFA offset shrinks by 4 and
   REG is marked as restored.  Emits no instruction.  */
92#define CFI_POP(REG) \
93 cfi_adjust_cfa_offset (-4); \
94 cfi_restore (REG)
95
/* pushl / popl paired with the corresponding unwind bookkeeping.  */
96#define PUSH(REG) pushl REG; CFI_PUSH (REG)
97#define POP(REG) popl REG; CFI_POP (REG)
98
/* SHARED (position-independent) build.  EBX is pushed on entry because the
   code below uses it as scratch for PC-relative addressing, so the first
   argument sits at 8(%esp).  RETURN_END pops EBX and returns.  RETURN does
   the same and then re-applies CFI_PUSH (%ebx), so that code placed after
   a return in the middle of the function is again described as running
   with EBX pushed.  JMPTBL(I, B) expands to the difference I - B.  */
99#ifdef SHARED
100# define PARMS 8 /* Preserve EBX. */
101# define ENTRANCE PUSH (%ebx);
102# define RETURN_END POP (%ebx); ret
103# define RETURN RETURN_END; CFI_PUSH (%ebx)
104# define JMPTBL(I, B) I - B
105
106/* Load an entry in a jump table into EBX and branch to it. TABLE is a
107 jump table with relative offsets. INDEX is a register that contains
108 the index into the jump table. SCALE is the scale of INDEX. */
109# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
110 /* We first load PC into EBX. */ \
111 call __i686.get_pc_thunk.bx; \
112 /* Get the address of the jump table. */ \
113 addl $(TABLE - .), %ebx; \
114 /* Get the entry and convert the relative offset to the \
115 absolute address. */ \
116 addl (%ebx,INDEX,SCALE), %ebx; \
117 /* We loaded the jump table. Go. */ \
118 jmp *%ebx
119
/* The same dispatch split in two.  _VALUE only adds TABLE - . to EBX, so
   it produces the table address only when EBX already holds the address
   of that addl instruction.  _TAIL then adds the selected entry to EBX
   and jumps.  */
120# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
121 addl $(TABLE - .), %ebx
122
123# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
124 addl (%ebx,INDEX,SCALE), %ebx; \
125 /* We loaded the jump table. Go. */ \
126 jmp *%ebx
127
/* PC thunk: copies its own return address (the address of the instruction
   following the call) into EBX.  It lives in a link-once section and is
   marked hidden.  */
128 .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
129 .globl __i686.get_pc_thunk.bx
130 .hidden __i686.get_pc_thunk.bx
131 ALIGN (4)
132 .type __i686.get_pc_thunk.bx,@function
133__i686.get_pc_thunk.bx:
134 movl (%esp), %ebx
135 ret
136#else
/* Non-SHARED build.  Nothing is pushed on entry, so the first argument
   sits at 4(%esp), RETURN and RETURN_END are a plain ret, and JMPTBL(I, B)
   expands to I itself.  */
137# define PARMS 4
138# define ENTRANCE
139# define RETURN_END ret
140# define RETURN RETURN_END
141# define JMPTBL(I, B) I
142
143/* Branch to an entry in a jump table. TABLE is a jump table of
144 absolute addresses. INDEX is a register that contains the index into
145 the jump table. SCALE is the scale of INDEX. */
146# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
147 jmp *TABLE(,INDEX,SCALE)
148
/* No table address has to be computed in this build: _VALUE expands to
   nothing and _TAIL is the complete indirect jump.  */
149# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
150
151# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
152 jmp *TABLE(,INDEX,SCALE)
153#endif
154
/* Entry point.  Loads the stack arguments into
     ECX = length, EAX = source, EDX = destination
   and dispatches on the length (and, when built as memmove, on how the
   two buffers are ordered).  */
155 .section .text.ssse3,"ax",@progbits
156ENTRY (MEMCPY)
157 ENTRANCE
158 movl LEN(%esp), %ecx
159 movl SRC(%esp), %eax
160 movl DEST(%esp), %edx
161
162#ifdef USE_AS_MEMMOVE
/* memmove only.  The cmp computes dest - src (unsigned):
     dest <  src : go to the forward copy;
     dest == src : jump straight to the return sequence;
     dest >  src : lengths below 32 go to bk_write_less32bytes_2 (defined
                   outside this excerpt), the rest to memmove_bwd.  */
163 cmp %eax, %edx
164 jb L(copy_forward)
165 je L(fwd_write_0bytes)
166 cmp $32, %ecx
167 jae L(memmove_bwd)
168 jmp L(bk_write_less32bytes_2)
/* dest > src and length >= 32.  Compare dest with src + length.  EAX is
   reloaded with the source before the branch; movl does not change the
   flags, so jb still tests the cmp.  dest < src + length means the
   destination starts inside the source range, and copy_backward (outside
   this excerpt) is taken; otherwise fall through to the forward copy.  */
169L(memmove_bwd):
170 add %ecx, %eax
171 cmp %eax, %edx
172 movl SRC(%esp), %eax
173 jb L(copy_backward)
174
175L(copy_forward):
176#endif
/* Lengths of 48 bytes or more take the SSE path.  */
177 cmp $48, %ecx
178 jae L(48bytesormore)
179
/* Control falls through to here with fewer than 48 bytes to copy (the
   "32" in the label name does not match the cmp $48 above).  */
180L(fwd_write_less32bytes):
181#ifndef USE_AS_MEMMOVE
/* Choose between the two tail tables from the low address bytes: when the
   low byte of the source is below the low byte of the destination
   (unsigned), use bk_write.  The reason for this test is not stated in
   this file.  */
182 cmp %dl, %al
183 jb L(bk_write)
184#endif
/* Advance both pointers to one past the last byte and dispatch through
   table_48bytes_fwd with the length as index.  The table is outside this
   excerpt; presumably its entries are the fwd_write_<N>bytes labels,
   which address the data with negative offsets from EAX and EDX.  */
185 add %ecx, %edx
186 add %ecx, %eax
187 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
188#ifndef USE_AS_MEMMOVE
/* Here the pointers are left at the start of the buffers and the dispatch
   goes through table_48_bytes_bwd (also outside this excerpt).  */
189L(bk_write):
190 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
191#endif
192
193 ALIGN (4)
194/* Reached from the cmp $48 / jae above, so ECX >= 48.  EDX is not assumed
   to be aligned: the first 16 source bytes are kept in XMM0 (the paths
   branched to from here store them to the original destination with an
   unaligned store) and EDX is moved up to the next 16-byte boundary.  */
195L(48bytesormore):
196 movdqu (%eax), %xmm0
197 PUSH (%edi)
198 movl %edx, %edi
199 and $-16, %edx
200 PUSH (%esi)
201 add $16, %edx
202 movl %edi, %esi
/* Now ESI = original destination and EDX = (dest & -16) + 16.
   EDI = dest - EDX, a value in [-16, -1].  Adding it to ECX and
   subtracting it from EAX removes the same 1..16 head bytes from the
   length and steps the source past them.  */
203 sub %edx, %edi
204 add %edi, %ecx
205 sub %edi, %eax
206
/* Compare the remaining length with SHARED_CACHE_SIZE_HALF when the build
   defines it, otherwise with the variable __x86_shared_cache_size_half
   (addressed GOT-relative through EBX in SHARED builds).  */
207#ifdef SHARED_CACHE_SIZE_HALF
208 cmp $SHARED_CACHE_SIZE_HALF, %ecx
209#else
210# ifdef SHARED
211 call __i686.get_pc_thunk.bx
212 add $_GLOBAL_OFFSET_TABLE_, %ebx
213 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
214# else
215 cmp __x86_shared_cache_size_half, %ecx
216# endif
217#endif
218
/* The mov leaves the flags of the cmp intact.  At or above the threshold
   go to large_page.  Otherwise dispatch on the low four bits of the
   source address: 0 goes to shl_0, anything else through shl_table
   (outside this excerpt).  */
219 mov %eax, %edi
220 jae L(large_page)
221 and $0xf, %edi
222 jz L(shl_0)
223
224 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
225
226 ALIGN (4)
/* Entered from 48bytesormore when the low four bits of EAX are zero, so
   source and destination are both 16-byte aligned.  Stores the saved head
   (XMM0) at the original destination, then copies in 32-byte steps with
   aligned loads and stores.  More than 127 remaining bytes go to
   shl_0_gobble instead.  */
227L(shl_0):
228 movdqu %xmm0, (%esi)
229 xor %edi, %edi
230 POP (%esi)
231 cmp $127, %ecx
232 ja L(shl_0_gobble)
/* Bias the count by -32: the borrow from each "sub $32" below then means
   that fewer than 32 bytes remain after the step being executed.  EDI is
   the running offset into both buffers.  */
233 lea -32(%ecx), %ecx
/* Four unrolled 32-byte steps; there is no backward branch despite the
   label name.  In each step the sub sets the flags, and the movdqa stores
   and the lea leave them alone, so jb tests the sub.  */
234L(shl_0_loop):
235 movdqa (%eax, %edi), %xmm0
236 movdqa 16(%eax, %edi), %xmm1
237 sub $32, %ecx
238 movdqa %xmm0, (%edx, %edi)
239 movdqa %xmm1, 16(%edx, %edi)
240 lea 32(%edi), %edi
241 jb L(shl_0_end)
242
243 movdqa (%eax, %edi), %xmm0
244 movdqa 16(%eax, %edi), %xmm1
245 sub $32, %ecx
246 movdqa %xmm0, (%edx, %edi)
247 movdqa %xmm1, 16(%edx, %edi)
248 lea 32(%edi), %edi
249 jb L(shl_0_end)
250
251 movdqa (%eax, %edi), %xmm0
252 movdqa 16(%eax, %edi), %xmm1
253 sub $32, %ecx
254 movdqa %xmm0, (%edx, %edi)
255 movdqa %xmm1, 16(%edx, %edi)
256 lea 32(%edi), %edi
257 jb L(shl_0_end)
258
259 movdqa (%eax, %edi), %xmm0
260 movdqa 16(%eax, %edi), %xmm1
261 sub $32, %ecx
262 movdqa %xmm0, (%edx, %edi)
263 movdqa %xmm1, 16(%edx, %edi)
264 lea 32(%edi), %edi
/* Undo the bias so ECX is the number of tail bytes (0..31 when arriving
   through one of the jb branches).  EDI becomes bytes copied plus tail;
   adding it moves EDX and EAX to one past the end of the data.  Restore
   EDI and finish through table_48bytes_fwd indexed by the tail length.  */
265L(shl_0_end):
266 lea 32(%ecx), %ecx
267 add %ecx, %edi
268 add %edi, %edx
269 add %edi, %eax
270 POP (%edi)
271 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
272
273L(shl_0_gobble):
274
275#ifdef DATA_CACHE_SIZE_HALF
276 cmp $DATA_CACHE_SIZE_HALF, %ecx
277#else
278# ifdef SHARED
279 call __i686.get_pc_thunk.bx
280 add $_GLOBAL_OFFSET_TABLE_, %ebx
281 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
282# else
283 cmp __x86_data_cache_size_half, %ecx
284# endif
285#endif
286
287 POP (%edi)
288 lea -128(%ecx), %ecx
289 jae L(shl_0_gobble_mem_loop)
290L(shl_0_gobble_cache_loop):
291 movdqa (%eax), %xmm0
292 movdqa 0x10(%eax), %xmm1
293 movdqa 0x20(%eax), %xmm2
294 movdqa 0x30(%eax), %xmm3
295 movdqa 0x40(%eax), %xmm4
296 movdqa 0x50(%eax), %xmm5
297 movdqa 0x60(%eax), %xmm6
298 movdqa 0x70(%eax), %xmm7
299 lea 0x80(%eax), %eax
300 sub $128, %ecx
301 movdqa %xmm0, (%edx)
302 movdqa %xmm1, 0x10(%edx)
303 movdqa %xmm2, 0x20(%edx)
304 movdqa %xmm3, 0x30(%edx)
305 movdqa %xmm4, 0x40(%edx)
306 movdqa %xmm5, 0x50(%edx)
307 movdqa %xmm6, 0x60(%edx)
308 movdqa %xmm7, 0x70(%edx)
309 lea 0x80(%edx), %edx
310
311 jae L(shl_0_gobble_cache_loop)
312 cmp $-0x40, %ecx
313 lea 0x80(%ecx), %ecx
314 jl L(shl_0_cache_less_64bytes)
315
316 movdqa (%eax), %xmm0
317 sub $0x40, %ecx
318 movdqa 0x10(%eax), %xmm1
319
320 movdqa %xmm0, (%edx)
321 movdqa %xmm1, 0x10(%edx)
322
323 movdqa 0x20(%eax), %xmm0
324 movdqa 0x30(%eax), %xmm1
325 add $0x40, %eax
326
327 movdqa %xmm0, 0x20(%edx)
328 movdqa %xmm1, 0x30(%edx)
329 add $0x40, %edx
330L(shl_0_cache_less_64bytes):
331 cmp $0x20, %ecx
332 jb L(shl_0_cache_less_32bytes)
333 movdqa (%eax), %xmm0
334 sub $0x20, %ecx
335 movdqa 0x10(%eax), %xmm1
336 add $0x20, %eax
337 movdqa %xmm0, (%edx)
338 movdqa %xmm1, 0x10(%edx)
339 add $0x20, %edx
340L(shl_0_cache_less_32bytes):
341 cmp $0x10, %ecx
342 jb L(shl_0_cache_less_16bytes)
343 sub $0x10, %ecx
344 movdqa (%eax), %xmm0
345 add $0x10, %eax
346 movdqa %xmm0, (%edx)
347 add $0x10, %edx
348L(shl_0_cache_less_16bytes):
349 add %ecx, %edx
350 add %ecx, %eax
351 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
352
353
354 ALIGN (4)
355L(shl_0_gobble_mem_loop):
356 prefetcht0 0x1c0(%eax)
357 prefetcht0 0x280(%eax)
358 prefetcht0 0x1c0(%edx)
359
360 movdqa (%eax), %xmm0
361 movdqa 0x10(%eax), %xmm1
362 movdqa 0x20(%eax), %xmm2
363 movdqa 0x30(%eax), %xmm3
364 movdqa 0x40(%eax), %xmm4
365 movdqa 0x50(%eax), %xmm5
366 movdqa 0x60(%eax), %xmm6
367 movdqa 0x70(%eax), %xmm7
368 lea 0x80(%eax), %eax
369 sub $0x80, %ecx
370 movdqa %xmm0, (%edx)
371 movdqa %xmm1, 0x10(%edx)
372 movdqa %xmm2, 0x20(%edx)
373 movdqa %xmm3, 0x30(%edx)
374 movdqa %xmm4, 0x40(%edx)
375 movdqa %xmm5, 0x50(%edx)
376 movdqa %xmm6, 0x60(%edx)
377 movdqa %xmm7, 0x70(%edx)
378 lea 0x80(%edx), %edx
379
380 jae L(shl_0_gobble_mem_loop)
381 cmp $-0x40, %ecx
382 lea 0x80(%ecx), %ecx
383 jl L(shl_0_mem_less_64bytes)
384
385 movdqa (%eax), %xmm0
386 sub $0x40, %ecx
387 movdqa 0x10(%eax), %xmm1
388
389 movdqa %xmm0, (%edx)
390 movdqa %xmm1, 0x10(%edx)
391
392 movdqa 0x20(%eax), %xmm0
393 movdqa 0x30(%eax), %xmm1
394 add $0x40, %eax
395
396 movdqa %xmm0, 0x20(%edx)
397 movdqa %xmm1, 0x30(%edx)
398 add $0x40, %edx
399L(shl_0_mem_less_64bytes):
400 cmp $0x20, %ecx
401 jb L(shl_0_mem_less_32bytes)
402 movdqa (%eax), %xmm0
403 sub $0x20, %ecx
404 movdqa 0x10(%eax), %xmm1
405 add $0x20, %eax
406 movdqa %xmm0, (%edx)
407 movdqa %xmm1, 0x10(%edx)
408 add $0x20, %edx
409L(shl_0_mem_less_32bytes):
410 cmp $0x10, %ecx
411 jb L(shl_0_mem_less_16bytes)
412 sub $0x10, %ecx
413 movdqa (%eax), %xmm0
414 add $0x10, %eax
415 movdqa %xmm0, (%edx)
416 add $0x10, %edx
417L(shl_0_mem_less_16bytes):
418 add %ecx, %edx
419 add %ecx, %eax
420 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
421
422
423 ALIGN (4)
/* Copy loop for a source that is 1 byte past a 16-byte boundary
   (presumably entry 1 of shl_table; the table is outside this excerpt).
   The source is read as whole aligned 16-byte chunks, and palignr $1
   joins each pair of neighbouring chunks and shifts the pair right by one
   byte, producing the 16 bytes that belong in each aligned destination
   slot.  */
424L(shl_1):
/* SHARED builds: adds table_48bytes_fwd - . to EBX for the _TAIL dispatch
   at the end.  This yields the table address provided EBX holds the
   address of this label on arrival, which is the case when it is reached
   through BRANCH_TO_JMPTBL_ENTRY (that macro ends in jmp *%ebx).
   Non-SHARED builds: expands to nothing.  */
425 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
/* Step EAX back by the misalignment and load that chunk into XMM1 as the
   initial "previous chunk".  EDI is the running offset, ECX is biased by
   -32, and the saved head in XMM0 goes to the original destination.  */
426 lea -1(%eax), %eax
427 movaps (%eax), %xmm1
428 xor %edi, %edi
429 lea -32(%ecx), %ecx
430 movdqu %xmm0, (%esi)
431 POP (%esi)
432L(shl_1_loop):
433
/* First half: the previous chunk is in XMM1.  Load the next two chunks
   into XMM2 and XMM3 and keep a copy of XMM3 in XMM4, which is the
   previous chunk for the second half.  The sub sets the flags that jb
   tests; the instructions in between do not modify them.  */
434 movdqa 16(%eax, %edi), %xmm2
435 sub $32, %ecx
436 movdqa 32(%eax, %edi), %xmm3
437 movdqa %xmm3, %xmm4
438 palignr $1, %xmm2, %xmm3
439 palignr $1, %xmm1, %xmm2
440 lea 32(%edi), %edi
441 movdqa %xmm2, -32(%edx, %edi)
442 movdqa %xmm3, -16(%edx, %edi)
443
444 jb L(shl_1_end)
445
/* Second half: same as above with the roles of XMM1 and XMM4 swapped, so
   XMM1 again holds the previous chunk when the loop repeats.  */
446 movdqa 16(%eax, %edi), %xmm2
447 sub $32, %ecx
448 movdqa 32(%eax, %edi), %xmm3
449 movdqa %xmm3, %xmm1
450 palignr $1, %xmm2, %xmm3
451 palignr $1, %xmm4, %xmm2
452 lea 32(%edi), %edi
453 movdqa %xmm2, -32(%edx, %edi)
454 movdqa %xmm3, -16(%edx, %edi)
455
456 jae L(shl_1_loop)
457
/* Undo the -32 bias so ECX is the tail length, move EDX and EAX to one
   past the end of the data (the lea also adds back the 1 byte that was
   subtracted from EAX above), restore EDI and dispatch on the tail.  */
458L(shl_1_end):
459 lea 32(%ecx), %ecx
460 add %ecx, %edi
461 add %edi, %edx
462 lea 1(%edi, %eax), %eax
463 POP (%edi)
464 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
465
466 ALIGN (4)
467L(shl_2):
468 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
469 lea -2(%eax), %eax
470 movaps (%eax), %xmm1
471 xor %edi, %edi
472 lea -32(%ecx), %ecx
473 movdqu %xmm0, (%esi)
474 POP (%esi)
475L(shl_2_loop):
476
477 movdqa 16(%eax, %edi), %xmm2
478 sub $32, %ecx
479 movdqa 32(%eax, %edi), %xmm3
480 movdqa %xmm3, %xmm4
481 palignr $2, %xmm2, %xmm3
482 palignr $2, %xmm1, %xmm2
483 lea 32(%edi), %edi
484 movdqa %xmm2, -32(%edx, %edi)
485 movdqa %xmm3, -16(%edx, %edi)
486
487 jb L(shl_2_end)
488
489 movdqa 16(%eax, %edi), %xmm2
490 sub $32, %ecx
491 movdqa 32(%eax, %edi), %xmm3
492 movdqa %xmm3, %xmm1
493 palignr $2, %xmm2, %xmm3
494 palignr $2, %xmm4, %xmm2
495 lea 32(%edi), %edi
496 movdqa %xmm2, -32(%edx, %edi)
497 movdqa %xmm3, -16(%edx, %edi)
498
499 jae L(shl_2_loop)
500
501L(shl_2_end):
502 lea 32(%ecx), %ecx
503 add %ecx, %edi
504 add %edi, %edx
505 lea 2(%edi, %eax), %eax
506 POP (%edi)
507 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
508
509 ALIGN (4)
510L(shl_3):
511 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
512 lea -3(%eax), %eax
513 movaps (%eax), %xmm1
514 xor %edi, %edi
515 lea -32(%ecx), %ecx
516 movdqu %xmm0, (%esi)
517 POP (%esi)
518L(shl_3_loop):
519
520 movdqa 16(%eax, %edi), %xmm2
521 sub $32, %ecx
522 movdqa 32(%eax, %edi), %xmm3
523 movdqa %xmm3, %xmm4
524 palignr $3, %xmm2, %xmm3
525 palignr $3, %xmm1, %xmm2
526 lea 32(%edi), %edi
527 movdqa %xmm2, -32(%edx, %edi)
528 movdqa %xmm3, -16(%edx, %edi)
529
530 jb L(shl_3_end)
531
532 movdqa 16(%eax, %edi), %xmm2
533 sub $32, %ecx
534 movdqa 32(%eax, %edi), %xmm3
535 movdqa %xmm3, %xmm1
536 palignr $3, %xmm2, %xmm3
537 palignr $3, %xmm4, %xmm2
538 lea 32(%edi), %edi
539 movdqa %xmm2, -32(%edx, %edi)
540 movdqa %xmm3, -16(%edx, %edi)
541
542 jae L(shl_3_loop)
543
544L(shl_3_end):
545 lea 32(%ecx), %ecx
546 add %ecx, %edi
547 add %edi, %edx
548 lea 3(%edi, %eax), %eax
549 POP (%edi)
550 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
551
552 ALIGN (4)
553L(shl_4):
554 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
555 lea -4(%eax), %eax
556 movaps (%eax), %xmm1
557 xor %edi, %edi
558 lea -32(%ecx), %ecx
559 movdqu %xmm0, (%esi)
560 POP (%esi)
561L(shl_4_loop):
562
563 movdqa 16(%eax, %edi), %xmm2
564 sub $32, %ecx
565 movdqa 32(%eax, %edi), %xmm3
566 movdqa %xmm3, %xmm4
567 palignr $4, %xmm2, %xmm3
568 palignr $4, %xmm1, %xmm2
569 lea 32(%edi), %edi
570 movdqa %xmm2, -32(%edx, %edi)
571 movdqa %xmm3, -16(%edx, %edi)
572
573 jb L(shl_4_end)
574
575 movdqa 16(%eax, %edi), %xmm2
576 sub $32, %ecx
577 movdqa 32(%eax, %edi), %xmm3
578 movdqa %xmm3, %xmm1
579 palignr $4, %xmm2, %xmm3
580 palignr $4, %xmm4, %xmm2
581 lea 32(%edi), %edi
582 movdqa %xmm2, -32(%edx, %edi)
583 movdqa %xmm3, -16(%edx, %edi)
584
585 jae L(shl_4_loop)
586
587L(shl_4_end):
588 lea 32(%ecx), %ecx
589 add %ecx, %edi
590 add %edi, %edx
591 lea 4(%edi, %eax), %eax
592 POP (%edi)
593 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
594
595 ALIGN (4)
596L(shl_5):
597 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
598 lea -5(%eax), %eax
599 movaps (%eax), %xmm1
600 xor %edi, %edi
601 lea -32(%ecx), %ecx
602 movdqu %xmm0, (%esi)
603 POP (%esi)
604L(shl_5_loop):
605
606 movdqa 16(%eax, %edi), %xmm2
607 sub $32, %ecx
608 movdqa 32(%eax, %edi), %xmm3
609 movdqa %xmm3, %xmm4
610 palignr $5, %xmm2, %xmm3
611 palignr $5, %xmm1, %xmm2
612 lea 32(%edi), %edi
613 movdqa %xmm2, -32(%edx, %edi)
614 movdqa %xmm3, -16(%edx, %edi)
615
616 jb L(shl_5_end)
617
618 movdqa 16(%eax, %edi), %xmm2
619 sub $32, %ecx
620 movdqa 32(%eax, %edi), %xmm3
621 movdqa %xmm3, %xmm1
622 palignr $5, %xmm2, %xmm3
623 palignr $5, %xmm4, %xmm2
624 lea 32(%edi), %edi
625 movdqa %xmm2, -32(%edx, %edi)
626 movdqa %xmm3, -16(%edx, %edi)
627
628 jae L(shl_5_loop)
629
630L(shl_5_end):
631 lea 32(%ecx), %ecx
632 add %ecx, %edi
633 add %edi, %edx
634 lea 5(%edi, %eax), %eax
635 POP (%edi)
636 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
637
638
639 ALIGN (4)
640L(shl_6):
641 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
642 lea -6(%eax), %eax
643 movaps (%eax), %xmm1
644 xor %edi, %edi
645 lea -32(%ecx), %ecx
646 movdqu %xmm0, (%esi)
647 POP (%esi)
648L(shl_6_loop):
649
650 movdqa 16(%eax, %edi), %xmm2
651 sub $32, %ecx
652 movdqa 32(%eax, %edi), %xmm3
653 movdqa %xmm3, %xmm4
654 palignr $6, %xmm2, %xmm3
655 palignr $6, %xmm1, %xmm2
656 lea 32(%edi), %edi
657 movdqa %xmm2, -32(%edx, %edi)
658 movdqa %xmm3, -16(%edx, %edi)
659
660 jb L(shl_6_end)
661
662 movdqa 16(%eax, %edi), %xmm2
663 sub $32, %ecx
664 movdqa 32(%eax, %edi), %xmm3
665 movdqa %xmm3, %xmm1
666 palignr $6, %xmm2, %xmm3
667 palignr $6, %xmm4, %xmm2
668 lea 32(%edi), %edi
669 movdqa %xmm2, -32(%edx, %edi)
670 movdqa %xmm3, -16(%edx, %edi)
671
672 jae L(shl_6_loop)
673
674L(shl_6_end):
675 lea 32(%ecx), %ecx
676 add %ecx, %edi
677 add %edi, %edx
678 lea 6(%edi, %eax), %eax
679 POP (%edi)
680 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
681
682 ALIGN (4)
683L(shl_7):
684 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
685 lea -7(%eax), %eax
686 movaps (%eax), %xmm1
687 xor %edi, %edi
688 lea -32(%ecx), %ecx
689 movdqu %xmm0, (%esi)
690 POP (%esi)
691L(shl_7_loop):
692
693 movdqa 16(%eax, %edi), %xmm2
694 sub $32, %ecx
695 movdqa 32(%eax, %edi), %xmm3
696 movdqa %xmm3, %xmm4
697 palignr $7, %xmm2, %xmm3
698 palignr $7, %xmm1, %xmm2
699 lea 32(%edi), %edi
700 movdqa %xmm2, -32(%edx, %edi)
701 movdqa %xmm3, -16(%edx, %edi)
702
703 jb L(shl_7_end)
704
705 movdqa 16(%eax, %edi), %xmm2
706 sub $32, %ecx
707 movdqa 32(%eax, %edi), %xmm3
708 movdqa %xmm3, %xmm1
709 palignr $7, %xmm2, %xmm3
710 palignr $7, %xmm4, %xmm2
711 lea 32(%edi), %edi
712 movdqa %xmm2, -32(%edx, %edi)
713 movdqa %xmm3, -16(%edx, %edi)
714
715 jae L(shl_7_loop)
716
717L(shl_7_end):
718 lea 32(%ecx), %ecx
719 add %ecx, %edi
720 add %edi, %edx
721 lea 7(%edi, %eax), %eax
722 POP (%edi)
723 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
724
725 ALIGN (4)
726L(shl_8):
727 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
728 lea -8(%eax), %eax
729 movaps (%eax), %xmm1
730 xor %edi, %edi
731 lea -32(%ecx), %ecx
732 movdqu %xmm0, (%esi)
733 POP (%esi)
734L(shl_8_loop):
735
736 movdqa 16(%eax, %edi), %xmm2
737 sub $32, %ecx
738 movdqa 32(%eax, %edi), %xmm3
739 movdqa %xmm3, %xmm4
740 palignr $8, %xmm2, %xmm3
741 palignr $8, %xmm1, %xmm2
742 lea 32(%edi), %edi
743 movdqa %xmm2, -32(%edx, %edi)
744 movdqa %xmm3, -16(%edx, %edi)
745
746 jb L(shl_8_end)
747
748 movdqa 16(%eax, %edi), %xmm2
749 sub $32, %ecx
750 movdqa 32(%eax, %edi), %xmm3
751 movdqa %xmm3, %xmm1
752 palignr $8, %xmm2, %xmm3
753 palignr $8, %xmm4, %xmm2
754 lea 32(%edi), %edi
755 movdqa %xmm2, -32(%edx, %edi)
756 movdqa %xmm3, -16(%edx, %edi)
757
758 jae L(shl_8_loop)
759
760L(shl_8_end):
761 lea 32(%ecx), %ecx
762 add %ecx, %edi
763 add %edi, %edx
764 lea 8(%edi, %eax), %eax
765 POP (%edi)
766 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
767
768 ALIGN (4)
769L(shl_9):
770 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
771 lea -9(%eax), %eax
772 movaps (%eax), %xmm1
773 xor %edi, %edi
774 lea -32(%ecx), %ecx
775 movdqu %xmm0, (%esi)
776 POP (%esi)
777L(shl_9_loop):
778
779 movdqa 16(%eax, %edi), %xmm2
780 sub $32, %ecx
781 movdqa 32(%eax, %edi), %xmm3
782 movdqa %xmm3, %xmm4
783 palignr $9, %xmm2, %xmm3
784 palignr $9, %xmm1, %xmm2
785 lea 32(%edi), %edi
786 movdqa %xmm2, -32(%edx, %edi)
787 movdqa %xmm3, -16(%edx, %edi)
788
789 jb L(shl_9_end)
790
791 movdqa 16(%eax, %edi), %xmm2
792 sub $32, %ecx
793 movdqa 32(%eax, %edi), %xmm3
794 movdqa %xmm3, %xmm1
795 palignr $9, %xmm2, %xmm3
796 palignr $9, %xmm4, %xmm2
797 lea 32(%edi), %edi
798 movdqa %xmm2, -32(%edx, %edi)
799 movdqa %xmm3, -16(%edx, %edi)
800
801 jae L(shl_9_loop)
802
803L(shl_9_end):
804 lea 32(%ecx), %ecx
805 add %ecx, %edi
806 add %edi, %edx
807 lea 9(%edi, %eax), %eax
808 POP (%edi)
809 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
810
811 ALIGN (4)
812L(shl_10):
813 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
814 lea -10(%eax), %eax
815 movaps (%eax), %xmm1
816 xor %edi, %edi
817 lea -32(%ecx), %ecx
818 movdqu %xmm0, (%esi)
819 POP (%esi)
820L(shl_10_loop):
821
822 movdqa 16(%eax, %edi), %xmm2
823 sub $32, %ecx
824 movdqa 32(%eax, %edi), %xmm3
825 movdqa %xmm3, %xmm4
826 palignr $10, %xmm2, %xmm3
827 palignr $10, %xmm1, %xmm2
828 lea 32(%edi), %edi
829 movdqa %xmm2, -32(%edx, %edi)
830 movdqa %xmm3, -16(%edx, %edi)
831
832 jb L(shl_10_end)
833
834 movdqa 16(%eax, %edi), %xmm2
835 sub $32, %ecx
836 movdqa 32(%eax, %edi), %xmm3
837 movdqa %xmm3, %xmm1
838 palignr $10, %xmm2, %xmm3
839 palignr $10, %xmm4, %xmm2
840 lea 32(%edi), %edi
841 movdqa %xmm2, -32(%edx, %edi)
842 movdqa %xmm3, -16(%edx, %edi)
843
844 jae L(shl_10_loop)
845
846L(shl_10_end):
847 lea 32(%ecx), %ecx
848 add %ecx, %edi
849 add %edi, %edx
850 lea 10(%edi, %eax), %eax
851 POP (%edi)
852 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
853
854 ALIGN (4)
855L(shl_11):
856 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
857 lea -11(%eax), %eax
858 movaps (%eax), %xmm1
859 xor %edi, %edi
860 lea -32(%ecx), %ecx
861 movdqu %xmm0, (%esi)
862 POP (%esi)
863L(shl_11_loop):
864
865 movdqa 16(%eax, %edi), %xmm2
866 sub $32, %ecx
867 movdqa 32(%eax, %edi), %xmm3
868 movdqa %xmm3, %xmm4
869 palignr $11, %xmm2, %xmm3
870 palignr $11, %xmm1, %xmm2
871 lea 32(%edi), %edi
872 movdqa %xmm2, -32(%edx, %edi)
873 movdqa %xmm3, -16(%edx, %edi)
874
875 jb L(shl_11_end)
876
877 movdqa 16(%eax, %edi), %xmm2
878 sub $32, %ecx
879 movdqa 32(%eax, %edi), %xmm3
880 movdqa %xmm3, %xmm1
881 palignr $11, %xmm2, %xmm3
882 palignr $11, %xmm4, %xmm2
883 lea 32(%edi), %edi
884 movdqa %xmm2, -32(%edx, %edi)
885 movdqa %xmm3, -16(%edx, %edi)
886
887 jae L(shl_11_loop)
888
889L(shl_11_end):
890 lea 32(%ecx), %ecx
891 add %ecx, %edi
892 add %edi, %edx
893 lea 11(%edi, %eax), %eax
894 POP (%edi)
895 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
896
897 ALIGN (4)
898L(shl_12):
899 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
900 lea -12(%eax), %eax
901 movaps (%eax), %xmm1
902 xor %edi, %edi
903 lea -32(%ecx), %ecx
904 movdqu %xmm0, (%esi)
905 POP (%esi)
906L(shl_12_loop):
907
908 movdqa 16(%eax, %edi), %xmm2
909 sub $32, %ecx
910 movdqa 32(%eax, %edi), %xmm3
911 movdqa %xmm3, %xmm4
912 palignr $12, %xmm2, %xmm3
913 palignr $12, %xmm1, %xmm2
914 lea 32(%edi), %edi
915 movdqa %xmm2, -32(%edx, %edi)
916 movdqa %xmm3, -16(%edx, %edi)
917
918 jb L(shl_12_end)
919
920 movdqa 16(%eax, %edi), %xmm2
921 sub $32, %ecx
922 movdqa 32(%eax, %edi), %xmm3
923 movdqa %xmm3, %xmm1
924 palignr $12, %xmm2, %xmm3
925 palignr $12, %xmm4, %xmm2
926 lea 32(%edi), %edi
927 movdqa %xmm2, -32(%edx, %edi)
928 movdqa %xmm3, -16(%edx, %edi)
929
930 jae L(shl_12_loop)
931
932L(shl_12_end):
933 lea 32(%ecx), %ecx
934 add %ecx, %edi
935 add %edi, %edx
936 lea 12(%edi, %eax), %eax
937 POP (%edi)
938 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
939
940 ALIGN (4)
941L(shl_13):
942 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
943 lea -13(%eax), %eax
944 movaps (%eax), %xmm1
945 xor %edi, %edi
946 lea -32(%ecx), %ecx
947 movdqu %xmm0, (%esi)
948 POP (%esi)
949L(shl_13_loop):
950
951 movdqa 16(%eax, %edi), %xmm2
952 sub $32, %ecx
953 movdqa 32(%eax, %edi), %xmm3
954 movdqa %xmm3, %xmm4
955 palignr $13, %xmm2, %xmm3
956 palignr $13, %xmm1, %xmm2
957 lea 32(%edi), %edi
958 movdqa %xmm2, -32(%edx, %edi)
959 movdqa %xmm3, -16(%edx, %edi)
960
961 jb L(shl_13_end)
962
963 movdqa 16(%eax, %edi), %xmm2
964 sub $32, %ecx
965 movdqa 32(%eax, %edi), %xmm3
966 movdqa %xmm3, %xmm1
967 palignr $13, %xmm2, %xmm3
968 palignr $13, %xmm4, %xmm2
969 lea 32(%edi), %edi
970 movdqa %xmm2, -32(%edx, %edi)
971 movdqa %xmm3, -16(%edx, %edi)
972
973 jae L(shl_13_loop)
974
975L(shl_13_end):
976 lea 32(%ecx), %ecx
977 add %ecx, %edi
978 add %edi, %edx
979 lea 13(%edi, %eax), %eax
980 POP (%edi)
981 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
982
983 ALIGN (4)
984L(shl_14):
985 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
986 lea -14(%eax), %eax
987 movaps (%eax), %xmm1
988 xor %edi, %edi
989 lea -32(%ecx), %ecx
990 movdqu %xmm0, (%esi)
991 POP (%esi)
992L(shl_14_loop):
993
994 movdqa 16(%eax, %edi), %xmm2
995 sub $32, %ecx
996 movdqa 32(%eax, %edi), %xmm3
997 movdqa %xmm3, %xmm4
998 palignr $14, %xmm2, %xmm3
999 palignr $14, %xmm1, %xmm2
1000 lea 32(%edi), %edi
1001 movdqa %xmm2, -32(%edx, %edi)
1002 movdqa %xmm3, -16(%edx, %edi)
1003
1004 jb L(shl_14_end)
1005
1006 movdqa 16(%eax, %edi), %xmm2
1007 sub $32, %ecx
1008 movdqa 32(%eax, %edi), %xmm3
1009 movdqa %xmm3, %xmm1
1010 palignr $14, %xmm2, %xmm3
1011 palignr $14, %xmm4, %xmm2
1012 lea 32(%edi), %edi
1013 movdqa %xmm2, -32(%edx, %edi)
1014 movdqa %xmm3, -16(%edx, %edi)
1015
1016 jae L(shl_14_loop)
1017
1018L(shl_14_end):
1019 lea 32(%ecx), %ecx
1020 add %ecx, %edi
1021 add %edi, %edx
1022 lea 14(%edi, %eax), %eax
1023 POP (%edi)
1024 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1025
1026
1027 ALIGN (4)
1028L(shl_15):
1029 BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
1030 lea -15(%eax), %eax
1031 movaps (%eax), %xmm1
1032 xor %edi, %edi
1033 lea -32(%ecx), %ecx
1034 movdqu %xmm0, (%esi)
1035 POP (%esi)
1036L(shl_15_loop):
1037
1038 movdqa 16(%eax, %edi), %xmm2
1039 sub $32, %ecx
1040 movdqa 32(%eax, %edi), %xmm3
1041 movdqa %xmm3, %xmm4
1042 palignr $15, %xmm2, %xmm3
1043 palignr $15, %xmm1, %xmm2
1044 lea 32(%edi), %edi
1045 movdqa %xmm2, -32(%edx, %edi)
1046 movdqa %xmm3, -16(%edx, %edi)
1047
1048 jb L(shl_15_end)
1049
1050 movdqa 16(%eax, %edi), %xmm2
1051 sub $32, %ecx
1052 movdqa 32(%eax, %edi), %xmm3
1053 movdqa %xmm3, %xmm1
1054 palignr $15, %xmm2, %xmm3
1055 palignr $15, %xmm4, %xmm2
1056 lea 32(%edi), %edi
1057 movdqa %xmm2, -32(%edx, %edi)
1058 movdqa %xmm3, -16(%edx, %edi)
1059
1060 jae L(shl_15_loop)
1061
1062L(shl_15_end):
1063 lea 32(%ecx), %ecx
1064 add %ecx, %edi
1065 add %edi, %edx
1066 lea 15(%edi, %eax), %eax
1067 POP (%edi)
1068 BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
1069
1070
1071 ALIGN (4)
1072L(fwd_write_44bytes):
1073 movl -44(%eax), %ecx
1074 movl %ecx, -44(%edx)
1075L(fwd_write_40bytes):
1076 movl -40(%eax), %ecx
1077 movl %ecx, -40(%edx)
1078L(fwd_write_36bytes):
1079 movl -36(%eax), %ecx
1080 movl %ecx, -36(%edx)
1081L(fwd_write_32bytes):
1082 movl -32(%eax), %ecx
1083 movl %ecx, -32(%edx)
1084L(fwd_write_28bytes):
1085 movl -28(%eax), %ecx
1086 movl %ecx, -28(%edx)
1087L(fwd_write_24bytes):
1088 movl -24(%eax), %ecx
1089 movl %ecx, -24(%edx)
1090L(fwd_write_20bytes):
1091 movl -20(%eax), %ecx
1092 movl %ecx, -20(%edx)
1093L(fwd_write_16bytes):
1094 movl -16(%eax), %ecx
1095 movl %ecx, -16(%edx)
1096L(fwd_write_12bytes):
1097 movl -12(%eax), %ecx
1098 movl %ecx, -12(%edx)
1099L(fwd_write_8bytes):
1100 movl -8(%eax), %ecx
1101 movl %ecx, -8(%edx)
1102L(fwd_write_4bytes):
1103 movl -4(%eax), %ecx
1104 movl %ecx, -4(%edx)
1105L(fwd_write_0bytes):
1106#ifndef USE_AS_BCOPY
1107# ifdef USE_AS_MEMPCPY
1108 movl %edx, %eax
1109# else
1110 movl DEST(%esp), %eax
1111# endif
1112#endif
1113 RETURN
1114
1115 ALIGN (4)
1116L(fwd_write_5bytes):
1117 movl -5(%eax), %ecx
1118 movl -4(%eax), %eax
1119 movl %ecx, -5(%edx)
1120 movl %eax, -4(%edx)
1121#ifndef USE_AS_BCOPY
1122# ifdef USE_AS_MEMPCPY
1123 movl %edx, %eax
1124# else
1125 movl DEST(%esp), %eax
1126# endif
1127#endif
1128 RETURN
1129
1130 ALIGN (4)
1131L(fwd_write_45bytes):
1132 movl -45(%eax), %ecx
1133 movl %ecx, -45(%edx)
1134L(fwd_write_41bytes):
1135 movl -41(%eax), %ecx
1136 movl %ecx, -41(%edx)
1137L(fwd_write_37bytes):
1138 movl -37(%eax), %ecx
1139 movl %ecx, -37(%edx)
1140L(fwd_write_33bytes):
1141 movl -33(%eax), %ecx
1142 movl %ecx, -33(%edx)
1143L(fwd_write_29bytes):
1144 movl -29(%eax), %ecx
1145 movl %ecx, -29(%edx)
1146L(fwd_write_25bytes):
1147 movl -25(%eax), %ecx
1148 movl %ecx, -25(%edx)
1149L(fwd_write_21bytes):
1150 movl -21(%eax), %ecx
1151 movl %ecx, -21(%edx)
1152L(fwd_write_17bytes):
1153 movl -17(%eax), %ecx
1154 movl %ecx, -17(%edx)
1155L(fwd_write_13bytes):
1156 movl -13(%eax), %ecx
1157 movl %ecx, -13(%edx)
1158L(fwd_write_9bytes):
1159 movl -9(%eax), %ecx
1160 movl %ecx, -9(%edx)
1161 movl -5(%eax), %ecx
1162 movl %ecx, -5(%edx)
1163L(fwd_write_1bytes):
1164 movzbl -1(%eax), %ecx
1165 movb %cl, -1(%edx)
1166#ifndef USE_AS_BCOPY
1167# ifdef USE_AS_MEMPCPY
1168 movl %edx, %eax
1169# else
1170 movl DEST(%esp), %eax
1171# endif
1172#endif
1173 RETURN
1174
1175 ALIGN (4)
1176L(fwd_write_46bytes):
1177 movl -46(%eax), %ecx
1178 movl %ecx, -46(%edx)
1179L(fwd_write_42bytes):
1180 movl -42(%eax), %ecx
1181 movl %ecx, -42(%edx)
1182L(fwd_write_38bytes):
1183 movl -38(%eax), %ecx
1184 movl %ecx, -38(%edx)
1185L(fwd_write_34bytes):
1186 movl -34(%eax), %ecx
1187 movl %ecx, -34(%edx)
1188L(fwd_write_30bytes):
1189 movl -30(%eax), %ecx
1190 movl %ecx, -30(%edx)
1191L(fwd_write_26bytes):
1192 movl -26(%eax), %ecx
1193 movl %ecx, -26(%edx)
1194L(fwd_write_22bytes):
1195 movl -22(%eax), %ecx
1196 movl %ecx, -22(%edx)
1197L(fwd_write_18bytes):
1198 movl -18(%eax), %ecx
1199 movl %ecx, -18(%edx)
1200L(fwd_write_14bytes):
1201 movl -14(%eax), %ecx
1202 movl %ecx, -14(%edx)
1203L(fwd_write_10bytes):
1204 movl -10(%eax), %ecx
1205 movl %ecx, -10(%edx)
1206L(fwd_write_6bytes):
1207 movl -6(%eax), %ecx
1208 movl %ecx, -6(%edx)
1209L(fwd_write_2bytes):
1210 movzwl -2(%eax), %ecx
1211 movw %cx, -2(%edx)
1212#ifndef USE_AS_BCOPY
1213# ifdef USE_AS_MEMPCPY
1214 movl %edx, %eax
1215# else
1216 movl DEST(%esp), %eax
1217# endif
1218#endif
1219 RETURN
1220
1221 ALIGN (4)
1222L(fwd_write_47bytes):
1223 movl -47(%eax), %ecx
1224 movl %ecx, -47(%edx)
1225L(fwd_write_43bytes):
1226 movl -43(%eax), %ecx
1227 movl %ecx, -43(%edx)
1228L(fwd_write_39bytes):
1229 movl -39(%eax), %ecx
1230 movl %ecx, -39(%edx)
1231L(fwd_write_35bytes):
1232 movl -35(%eax), %ecx
1233 movl %ecx, -35(%edx)
1234L(fwd_write_31bytes):
1235 movl -31(%eax), %ecx
1236 movl %ecx, -31(%edx)
1237L(fwd_write_27bytes):
1238 movl -27(%eax), %ecx
1239 movl %ecx, -27(%edx)
1240L(fwd_write_23bytes):
1241 movl -23(%eax), %ecx
1242 movl %ecx, -23(%edx)
1243L(fwd_write_19bytes):
1244 movl -19(%eax), %ecx
1245 movl %ecx, -19(%edx)
1246L(fwd_write_15bytes):
1247 movl -15(%eax), %ecx
1248 movl %ecx, -15(%edx)
1249L(fwd_write_11bytes):
1250 movl -11(%eax), %ecx
1251 movl %ecx, -11(%edx)
1252L(fwd_write_7bytes):
1253 movl -7(%eax), %ecx
1254 movl %ecx, -7(%edx)
1255L(fwd_write_3bytes):
1256 movzwl -3(%eax), %ecx
1257 movzbl -1(%eax), %eax
1258 movw %cx, -3(%edx)
1259 movb %al, -1(%edx)
1260#ifndef USE_AS_BCOPY
1261# ifdef USE_AS_MEMPCPY
1262 movl %edx, %eax
1263# else
1264 movl DEST(%esp), %eax
1265# endif
1266#endif
1267 RETURN
1268
1269 ALIGN (4)
/* Taken from 48bytesormore when the remaining length is at or above the
   shared-cache threshold compared there.  Copies with unaligned loads
   (movdqu) and non-temporal stores (movntdq) to the 16-byte-aligned
   destination in EDX.  */
1270L(large_page):
/* Load the first 16 bytes for the aligned destination slot, store the
   saved head (XMM0) at the original destination, then store the slot.  */
1271 movdqu (%eax), %xmm1
1272 lea 16(%eax), %eax
1273 movdqu %xmm0, (%esi)
1274 movntdq %xmm1, (%edx)
1275 lea 16(%edx), %edx
1276 POP (%esi)
/* ECX -= 0x90: 16 for the slot just copied plus a bias of 128, so that the
   borrow from "sub $0x80" in the loop means fewer than 128 bytes remain
   after the current iteration.  */
1277 lea -0x90(%ecx), %ecx
1278 POP (%edi)
/* Main loop: 128 bytes per iteration.  The movntdq stores and the lea do
   not change the flags, so jae tests the sub.  */
1279L(large_page_loop):
1280 movdqu (%eax), %xmm0
1281 movdqu 0x10(%eax), %xmm1
1282 movdqu 0x20(%eax), %xmm2
1283 movdqu 0x30(%eax), %xmm3
1284 movdqu 0x40(%eax), %xmm4
1285 movdqu 0x50(%eax), %xmm5
1286 movdqu 0x60(%eax), %xmm6
1287 movdqu 0x70(%eax), %xmm7
1288 lea 0x80(%eax), %eax
1289
1290 sub $0x80, %ecx
1291 movntdq %xmm0, (%edx)
1292 movntdq %xmm1, 0x10(%edx)
1293 movntdq %xmm2, 0x20(%edx)
1294 movntdq %xmm3, 0x30(%edx)
1295 movntdq %xmm4, 0x40(%edx)
1296 movntdq %xmm5, 0x50(%edx)
1297 movntdq %xmm6, 0x60(%edx)
1298 movntdq %xmm7, 0x70(%edx)
1299 lea 0x80(%edx), %edx
1300 jae L(large_page_loop)
/* ECX is now (bytes left) - 128.  The cmp sets the flags for the signed
   test "fewer than 64 bytes left"; the lea removes the bias without
   touching them.  */
1301 cmp $-0x40, %ecx
1302 lea 0x80(%ecx), %ecx
1303 jl L(large_page_less_64bytes)
1304
/* One 64-byte block.  */
1305 movdqu (%eax), %xmm0
1306 movdqu 0x10(%eax), %xmm1
1307 movdqu 0x20(%eax), %xmm2
1308 movdqu 0x30(%eax), %xmm3
1309 lea 0x40(%eax), %eax
1310
1311 movntdq %xmm0, (%edx)
1312 movntdq %xmm1, 0x10(%edx)
1313 movntdq %xmm2, 0x20(%edx)
1314 movntdq %xmm3, 0x30(%edx)
1315 lea 0x40(%edx), %edx
1316 sub $0x40, %ecx
/* One 32-byte block if at least 32 bytes are left.  */
1317L(large_page_less_64bytes):
1318 cmp $32, %ecx
1319 jb L(large_page_less_32bytes)
1320 movdqu (%eax), %xmm0
1321 movdqu 0x10(%eax), %xmm1
1322 lea 0x20(%eax), %eax
1323 movntdq %xmm0, (%edx)
1324 movntdq %xmm1, 0x10(%edx)
1325 lea 0x20(%edx), %edx
1326 sub $0x20, %ecx
/* Fewer than 32 bytes left.  Move both pointers to one past the end of
   the data, issue sfence after the non-temporal stores, and copy the tail
   through table_48bytes_fwd indexed by its length.  */
1327L(large_page_less_32bytes):
1328 add %ecx, %edx
1329 add %ecx, %eax
1330 sfence
1331 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
1332
1333
/* Backward tail handlers for byte counts that are a multiple of 4
   (44, 40, ..., 4, 0).  These labels are the targets of the matching entries
   in L(table_48_bytes_bwd).  Offsets are non-negative, so EAX and EDX are
   expected to point at the start of the remaining source and destination
   bytes.  Each handler copies one dword through ECX, highest offset first,
   and falls through to the handler for four bytes fewer.  */
1334 ALIGN (4)
1335L(bk_write_44bytes):
1336 movl 40(%eax), %ecx
1337 movl %ecx, 40(%edx)
1338L(bk_write_40bytes):
1339 movl 36(%eax), %ecx
1340 movl %ecx, 36(%edx)
1341L(bk_write_36bytes):
1342 movl 32(%eax), %ecx
1343 movl %ecx, 32(%edx)
1344L(bk_write_32bytes):
1345 movl 28(%eax), %ecx
1346 movl %ecx, 28(%edx)
1347L(bk_write_28bytes):
1348 movl 24(%eax), %ecx
1349 movl %ecx, 24(%edx)
1350L(bk_write_24bytes):
1351 movl 20(%eax), %ecx
1352 movl %ecx, 20(%edx)
1353L(bk_write_20bytes):
1354 movl 16(%eax), %ecx
1355 movl %ecx, 16(%edx)
1356L(bk_write_16bytes):
1357 movl 12(%eax), %ecx
1358 movl %ecx, 12(%edx)
1359L(bk_write_12bytes):
1360 movl 8(%eax), %ecx
1361 movl %ecx, 8(%edx)
1362L(bk_write_8bytes):
1363 movl 4(%eax), %ecx
1364 movl %ecx, 4(%edx)
1365L(bk_write_4bytes):
1366 movl (%eax), %ecx
1367 movl %ecx, (%edx)
/* Nothing left to copy.  Return value: none loaded when built as bcopy;
   otherwise EAX = DEST(%esp), plus LEN(%esp) when built as mempcpy.  DEST
   and LEN are defined outside this section -- presumably the stack slots of
   the destination and length arguments; confirm against their definitions.  */
1368L(bk_write_0bytes):
1369#ifndef USE_AS_BCOPY
1370 movl DEST(%esp), %eax
1371# ifdef USE_AS_MEMPCPY
1372 movl LEN(%esp), %ecx
1373 add %ecx, %eax
1374# endif
1375#endif
1376 RETURN
1377
/* Backward tail handlers for byte counts of the form 4k + 1 (45, 41, ..., 5,
   1).  These labels are the targets of the matching entries in
   L(table_48_bytes_bwd).  EAX and EDX are expected to point at the start of
   the remaining source and destination bytes.  Each handler copies one dword
   through ECX, highest offset first, and falls through; the chain ends by
   copying the single byte at offset 0.  */
1378 ALIGN (4)
1379L(bk_write_45bytes):
1380 movl 41(%eax), %ecx
1381 movl %ecx, 41(%edx)
1382L(bk_write_41bytes):
1383 movl 37(%eax), %ecx
1384 movl %ecx, 37(%edx)
1385L(bk_write_37bytes):
1386 movl 33(%eax), %ecx
1387 movl %ecx, 33(%edx)
1388L(bk_write_33bytes):
1389 movl 29(%eax), %ecx
1390 movl %ecx, 29(%edx)
1391L(bk_write_29bytes):
1392 movl 25(%eax), %ecx
1393 movl %ecx, 25(%edx)
1394L(bk_write_25bytes):
1395 movl 21(%eax), %ecx
1396 movl %ecx, 21(%edx)
1397L(bk_write_21bytes):
1398 movl 17(%eax), %ecx
1399 movl %ecx, 17(%edx)
1400L(bk_write_17bytes):
1401 movl 13(%eax), %ecx
1402 movl %ecx, 13(%edx)
1403L(bk_write_13bytes):
1404 movl 9(%eax), %ecx
1405 movl %ecx, 9(%edx)
1406L(bk_write_9bytes):
1407 movl 5(%eax), %ecx
1408 movl %ecx, 5(%edx)
1409L(bk_write_5bytes):
1410 movl 1(%eax), %ecx
1411 movl %ecx, 1(%edx)
/* Final byte at offset 0.  */
1412L(bk_write_1bytes):
1413 movzbl (%eax), %ecx
1414 movb %cl, (%edx)
/* Return value: none loaded when built as bcopy; otherwise EAX = DEST(%esp),
   plus LEN(%esp) when built as mempcpy.  DEST and LEN are defined outside
   this section -- presumably the argument stack slots; confirm there.  */
1415#ifndef USE_AS_BCOPY
1416 movl DEST(%esp), %eax
1417# ifdef USE_AS_MEMPCPY
1418 movl LEN(%esp), %ecx
1419 add %ecx, %eax
1420# endif
1421#endif
1422 RETURN
1423
/* Backward tail handlers for byte counts of the form 4k + 2 (46, 42, ..., 6,
   2).  These labels are the targets of the matching entries in
   L(table_48_bytes_bwd).  EAX and EDX are expected to point at the start of
   the remaining source and destination bytes.  Each handler copies one dword
   through ECX, highest offset first, and falls through; the chain ends by
   copying the 16-bit word at offset 0.  */
1424 ALIGN (4)
1425L(bk_write_46bytes):
1426 movl 42(%eax), %ecx
1427 movl %ecx, 42(%edx)
1428L(bk_write_42bytes):
1429 movl 38(%eax), %ecx
1430 movl %ecx, 38(%edx)
1431L(bk_write_38bytes):
1432 movl 34(%eax), %ecx
1433 movl %ecx, 34(%edx)
1434L(bk_write_34bytes):
1435 movl 30(%eax), %ecx
1436 movl %ecx, 30(%edx)
1437L(bk_write_30bytes):
1438 movl 26(%eax), %ecx
1439 movl %ecx, 26(%edx)
1440L(bk_write_26bytes):
1441 movl 22(%eax), %ecx
1442 movl %ecx, 22(%edx)
1443L(bk_write_22bytes):
1444 movl 18(%eax), %ecx
1445 movl %ecx, 18(%edx)
1446L(bk_write_18bytes):
1447 movl 14(%eax), %ecx
1448 movl %ecx, 14(%edx)
1449L(bk_write_14bytes):
1450 movl 10(%eax), %ecx
1451 movl %ecx, 10(%edx)
1452L(bk_write_10bytes):
1453 movl 6(%eax), %ecx
1454 movl %ecx, 6(%edx)
1455L(bk_write_6bytes):
1456 movl 2(%eax), %ecx
1457 movl %ecx, 2(%edx)
/* Final 16-bit word at offset 0.  */
1458L(bk_write_2bytes):
1459 movzwl (%eax), %ecx
1460 movw %cx, (%edx)
/* Return value: none loaded when built as bcopy; otherwise EAX = DEST(%esp),
   plus LEN(%esp) when built as mempcpy.  DEST and LEN are defined outside
   this section -- presumably the argument stack slots; confirm there.  */
1461#ifndef USE_AS_BCOPY
1462 movl DEST(%esp), %eax
1463# ifdef USE_AS_MEMPCPY
1464 movl LEN(%esp), %ecx
1465 add %ecx, %eax
1466# endif
1467#endif
1468 RETURN
1469
/* Backward tail handlers for byte counts of the form 4k + 3 (47, 43, ..., 7,
   3).  These labels are the targets of the matching entries in
   L(table_48_bytes_bwd).  EAX and EDX are expected to point at the start of
   the remaining source and destination bytes.  Each handler copies one dword
   through ECX, highest offset first, and falls through; the chain ends with
   a word at offset 1 followed by a byte at offset 0.  */
1470 ALIGN (4)
1471L(bk_write_47bytes):
1472 movl 43(%eax), %ecx
1473 movl %ecx, 43(%edx)
1474L(bk_write_43bytes):
1475 movl 39(%eax), %ecx
1476 movl %ecx, 39(%edx)
1477L(bk_write_39bytes):
1478 movl 35(%eax), %ecx
1479 movl %ecx, 35(%edx)
1480L(bk_write_35bytes):
1481 movl 31(%eax), %ecx
1482 movl %ecx, 31(%edx)
1483L(bk_write_31bytes):
1484 movl 27(%eax), %ecx
1485 movl %ecx, 27(%edx)
1486L(bk_write_27bytes):
1487 movl 23(%eax), %ecx
1488 movl %ecx, 23(%edx)
1489L(bk_write_23bytes):
1490 movl 19(%eax), %ecx
1491 movl %ecx, 19(%edx)
1492L(bk_write_19bytes):
1493 movl 15(%eax), %ecx
1494 movl %ecx, 15(%edx)
1495L(bk_write_15bytes):
1496 movl 11(%eax), %ecx
1497 movl %ecx, 11(%edx)
1498L(bk_write_11bytes):
1499 movl 7(%eax), %ecx
1500 movl %ecx, 7(%edx)
1501L(bk_write_7bytes):
1502 movl 3(%eax), %ecx
1503 movl %ecx, 3(%edx)
/* Last three bytes, high to low: the word at offset 1 is loaded and stored
   before the byte at offset 0 is loaded.  The byte load overwrites EAX, so
   the source pointer is dead afterwards.  */
1504L(bk_write_3bytes):
1505 movzwl 1(%eax), %ecx
1506 movw %cx, 1(%edx)
1507 movzbl (%eax), %eax
1508 movb %al, (%edx)
/* Return value: none loaded when built as bcopy; otherwise EAX = DEST(%esp),
   plus LEN(%esp) when built as mempcpy.  DEST and LEN are defined outside
   this section -- presumably the argument stack slots; confirm there.
   RETURN_END (rather than RETURN) is used for this last return before the
   jump tables; see its definition earlier in the file for the difference.  */
1509#ifndef USE_AS_BCOPY
1510 movl DEST(%esp), %eax
1511# ifdef USE_AS_MEMPCPY
1512 movl LEN(%esp), %ecx
1513 add %ecx, %eax
1514# endif
1515#endif
1516 RETURN_END
1517
1518
/* The jump tables live in their own read-only data section; the matching
   .popsection follows the last table.  */
1519 .pushsection .rodata.ssse3,"a",@progbits
/* Jump table for the forward tail copy.  Entry N is the JMPTBL-encoded
   target L(fwd_write_Nbytes) for N = 0..47, one 4-byte .int per entry, and
   the table is aligned with ALIGN (2) (.p2align 2 with the default ALIGN
   definition at the top of the file).  L(large_page) dispatches through it
   with BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4), passing the
   leftover byte count in ECX.  JMPTBL and BRANCH_TO_JMPTBL_ENTRY are defined
   earlier in the file -- presumably the entry is either an absolute address
   or an offset from the table base given as second argument; confirm
   there.  */
1520 ALIGN (2)
1521L(table_48bytes_fwd):
1522 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
1523 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
1524 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
1525 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
1526 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
1527 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
1528 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
1529 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
1530 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
1531 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
1532 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
1533 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
1534 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
1535 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
1536 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
1537 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
1538 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
1539 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
1540 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
1541 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
1542 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
1543 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
1544 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
1545 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
1546 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
1547 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
1548 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
1549 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
1550 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
1551 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
1552 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
1553 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
1554 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
1555 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
1556 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
1557 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
1558 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
1559 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
1560 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
1561 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
1562 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
1563 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
1564 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
1565 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
1566 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
1567 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
1568 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
1569 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
1570
/* Jump table with sixteen 4-byte entries, L(shl_0) .. L(shl_15), each
   encoded with JMPTBL relative to L(shl_table).  Neither the L(shl_N)
   targets nor the dispatch site are in this part of the file --
   NOTE(review): presumably indexed by the source misalignment modulo 16;
   confirm against the code that branches through this table.  */
1571 ALIGN (2)
1572L(shl_table):
1573 .int JMPTBL (L(shl_0), L(shl_table))
1574 .int JMPTBL (L(shl_1), L(shl_table))
1575 .int JMPTBL (L(shl_2), L(shl_table))
1576 .int JMPTBL (L(shl_3), L(shl_table))
1577 .int JMPTBL (L(shl_4), L(shl_table))
1578 .int JMPTBL (L(shl_5), L(shl_table))
1579 .int JMPTBL (L(shl_6), L(shl_table))
1580 .int JMPTBL (L(shl_7), L(shl_table))
1581 .int JMPTBL (L(shl_8), L(shl_table))
1582 .int JMPTBL (L(shl_9), L(shl_table))
1583 .int JMPTBL (L(shl_10), L(shl_table))
1584 .int JMPTBL (L(shl_11), L(shl_table))
1585 .int JMPTBL (L(shl_12), L(shl_table))
1586 .int JMPTBL (L(shl_13), L(shl_table))
1587 .int JMPTBL (L(shl_14), L(shl_table))
1588 .int JMPTBL (L(shl_15), L(shl_table))
1589
/* Jump table for the backward tail copy.  Entry N is the JMPTBL-encoded
   target L(bk_write_Nbytes) for N = 0..47, one 4-byte .int per entry.
   L(bk_write_less32bytes_2) dispatches through it with
   BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4), passing the
   leftover byte count in ECX.  Note the spelling differs from the forward
   table: table_48_bytes_bwd versus table_48bytes_fwd.  */
1590 ALIGN (2)
1591L(table_48_bytes_bwd):
1592 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
1593 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
1594 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
1595 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
1596 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
1597 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
1598 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
1599 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
1600 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
1601 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
1602 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
1603 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
1604 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
1605 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
1606 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
1607 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
1608 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
1609 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
1610 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
1611 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
1612 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
1613 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
1614 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
1615 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
1616 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
1617 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
1618 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
1619 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
1620 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
1621 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
1622 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
1623 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
1624 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
1625 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
1626 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
1627 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
1628 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
1629 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
1630 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
1631 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
1632 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
1633 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
1634 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
1635 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
1636 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
1637 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
1638 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
1639 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
1640
/* Leave the jump-table data section and resume the previous section.  */
1641 .popsection
1642
1643#ifdef USE_AS_MEMMOVE
/* Backward (high address to low address) copy, built only when
   USE_AS_MEMMOVE is defined.
   On entry, as used below: EAX = source start, EDX = destination start,
   ECX = byte count.  ESI is saved with PUSH and used as the source cursor.
   Both cursors are first moved to one past the last byte; every later step
   decrements ESI, EDX and ECX by the number of bytes it copied.  */
1644 ALIGN (4)
1645L(copy_backward):
1646 PUSH (%esi)
1647 movl %eax, %esi
1648 lea (%ecx,%edx,1),%edx
1649 lea (%ecx,%esi,1),%esi
/* If the destination end is not 4-byte aligned, L(bk_align) fixes that and
   jumps back to L(bk_aligned_4) (or goes straight to the tail dispatch for
   very short copies).  */
1650 testl $0x3, %edx
1651 jnz L(bk_align)
1652
/* Destination end is 4-byte aligned: 64 bytes or more take the SSE path.  */
1653L(bk_aligned_4):
1654 cmp $64, %ecx
1655 jae L(bk_write_more64bytes)
1656
/* Fewer than 64 bytes left (also the exit target of the SSE loop).  */
1657L(bk_write_64bytesless):
1658 cmp $32, %ecx
1659 jb L(bk_write_less32bytes)
1660
1661L(bk_write_more32bytes):
1662 /* Copy one 32-byte block, highest dword first (there is no loop here). */
1663 sub $32, %ecx
1664 movl -4(%esi), %eax
1665 movl %eax, -4(%edx)
1666 movl -8(%esi), %eax
1667 movl %eax, -8(%edx)
1668 movl -12(%esi), %eax
1669 movl %eax, -12(%edx)
1670 movl -16(%esi), %eax
1671 movl %eax, -16(%edx)
1672 movl -20(%esi), %eax
1673 movl %eax, -20(%edx)
1674 movl -24(%esi), %eax
1675 movl %eax, -24(%edx)
1676 movl -28(%esi), %eax
1677 movl %eax, -28(%edx)
1678 movl -32(%esi), %eax
1679 movl %eax, -32(%edx)
1680 sub $32, %edx
1681 sub $32, %esi
1682
/* Tail dispatch.  Subtracting the remaining count turns the end cursors into
   start pointers (EAX = source, EDX = destination), which is what the
   bk_write_* handlers address with non-negative offsets.  ESI is restored
   before the indirect branch; ECX selects the handler.  */
1683L(bk_write_less32bytes):
1684 movl %esi, %eax
1685 sub %ecx, %edx
1686 sub %ecx, %eax
1687 POP (%esi)
1688L(bk_write_less32bytes_2):
1689 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
1690
/* Reached from L(copy_backward) when the destination end pointer (EDX) is
   not 4-byte aligned.  Copies with 8 bytes or fewer skip alignment and go
   straight to the tail dispatch.  Otherwise one byte and/or one 16-bit word
   is copied backward so that EDX becomes 4-byte aligned, then control
   returns to L(bk_aligned_4).  ESI/EDX/ECX are the source cursor,
   destination cursor and remaining count.  */
1691 ALIGN (4)
1692L(bk_align):
1693 cmp $8, %ecx
1694 jbe L(bk_write_less32bytes)
1695 testl $1, %edx
1696 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
1697 then (EDX & 2) must be != 0. */
1698 jz L(bk_got2)
/* EDX is odd: copy one byte so that it becomes even.  */
1699 sub $1, %esi
1700 sub $1, %ecx
1701 sub $1, %edx
1702 movzbl (%esi), %eax
1703 movb %al, (%edx)
1704
/* If bit 1 is clear as well, EDX is now 4-byte aligned.  */
1705 testl $2, %edx
1706 jz L(bk_aligned_4)
1707
/* EDX is even but not a multiple of 4: copy one 16-bit word.  */
1708L(bk_got2):
1709 sub $2, %esi
1710 sub $2, %ecx
1711 sub $2, %edx
1712 movzwl (%esi), %eax
1713 movw %ax, (%edx)
1714 jmp L(bk_aligned_4)
1715
/* Backward copy of 64 bytes or more; entered from L(bk_aligned_4), so EDX
   (destination end) is already 4-byte aligned.  Up to three dwords are
   copied to bring EDX to a 16-byte boundary, then 64-byte blocks are moved
   with unaligned loads (movdqu) and aligned stores (movdqa), highest 16
   bytes first.  ESI/EDX/ECX are the source cursor, destination cursor and
   remaining count.  */
1716 ALIGN (4)
1717L(bk_write_more64bytes):
1718 /* Check whether the destination end pointer is 16-byte aligned. */
1719 testl $15, %edx
1720 jz L(bk_ssse3_cpy_pre)
1721
1722/* EDX is aligned 4 bytes, but not 16 bytes. */
1723L(bk_ssse3_align):
1724 sub $4, %esi
1725 sub $4, %ecx
1726 sub $4, %edx
1727 movl (%esi), %eax
1728 movl %eax, (%edx)
1729
1730 testl $15, %edx
1731 jz L(bk_ssse3_cpy_pre)
1732
1733 sub $4, %esi
1734 sub $4, %ecx
1735 sub $4, %edx
1736 movl (%esi), %eax
1737 movl %eax, (%edx)
1738
1739 testl $15, %edx
1740 jz L(bk_ssse3_cpy_pre)
1741
/* Third dword: a 4-byte aligned pointer is at most 12 bytes above a 16-byte
   boundary, so no further test is needed after this step.  */
1742 sub $4, %esi
1743 sub $4, %ecx
1744 sub $4, %edx
1745 movl (%esi), %eax
1746 movl %eax, (%edx)
1747
/* The alignment steps may have dropped the count below 64 (but not below
   52), in which case the 32-byte block copy takes over.  */
1748L(bk_ssse3_cpy_pre):
1749 cmp $64, %ecx
1750 jb L(bk_write_more32bytes)
1751
/* Main loop: 64 bytes per iteration.  Each 16-byte chunk is loaded and
   stored before the next lower chunk is loaded.  */
1752L(bk_ssse3_cpy):
1753 sub $64, %esi
1754 sub $64, %ecx
1755 sub $64, %edx
1756 movdqu 0x30(%esi), %xmm3
1757 movdqa %xmm3, 0x30(%edx)
1758 movdqu 0x20(%esi), %xmm2
1759 movdqa %xmm2, 0x20(%edx)
1760 movdqu 0x10(%esi), %xmm1
1761 movdqa %xmm1, 0x10(%edx)
1762 movdqu (%esi), %xmm0
1763 movdqa %xmm0, (%edx)
1764 cmp $64, %ecx
1765 jae L(bk_ssse3_cpy)
/* Fewer than 64 bytes left: finish with the scalar paths.  */
1766 jmp L(bk_write_64bytesless)
1767
1768#endif
1769
1770END (MEMCPY)