blob: b0612a6040592ef3da8e43c7316466559dd3af45 [file] [log] [blame]
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.

 * Neither the name of Intel Corporation nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
/* Symbol name under which the copy routine below is emitted.  If MEMCPY
   is already defined when this point is reached, that definition is kept
   and the routine is emitted under that name instead.  */
#ifndef MEMCPY
# define MEMCPY		ssse3_memcpy5
#endif
34
/* L(label) pastes the ".L" prefix onto LABEL.  Every branch target in
   this file is written through L() so that it is a ".L"-prefixed
   (assembler-local, non-exported) label.  */
#ifndef L
# define L(label)	.L##label
#endif
38
/* Thin wrappers around the assembler's .cfi_* directives.  Each wrapper
   is only defined if the name is not already a macro, so an earlier
   definition (for example, one that expands to nothing) takes
   precedence.  */

/* Open the unwind-information region of a function.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

/* Close the unwind-information region of a function.  */
#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* Record that REG has been saved OFF bytes above the current stack
   pointer.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* Record that REG holds its caller's value again.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* Record that the distance from the stack pointer to the canonical
   frame address has changed by OFF bytes.  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
58
/* ENTRY (name): start a global function called NAME.  Marks the symbol
   as a function, exports it, aligns it to 16 bytes (.p2align 4), emits
   the label and opens the unwind-information region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END (name): finish the function opened with ENTRY (name).  Closes the
   unwind-information region and records the symbol size as the distance
   from NAME to the current location.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
73
/* %esp-relative offsets of the three stack arguments.  PARMS (defined
   further down, depending on whether EBX is pushed on entry) is the
   offset of the first argument; the other two follow at 4-byte steps.

   With USE_AS_BCOPY the first two arguments are swapped: the source
   pointer comes first and the destination second.  Otherwise the order
   is destination, source, length.  */
#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
83
/* Unwind annotations describing a 4-byte push of REG: the CFA offset
   grows by 4 and REG is recorded as saved at the new top of stack.
   Emits directives only, no instructions.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind annotations describing a 4-byte pop of REG: the CFA offset
   shrinks by 4 and REG is recorded as restored.  Emits directives only,
   no instructions.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl / popl of REG together with the matching unwind annotations, so
   the stack adjustment and its description cannot drift apart.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
94
/* Build-mode dependent definitions.

   PARMS		%esp-relative offset of the first stack argument once
			ENTRANCE has run.
   ENTRANCE		function prologue.
   RETURN_END		function epilogue followed by ret.
   RETURN		like RETURN_END, for a return that is followed by
			more code of the same function.
   JMPTBL (I, B)	value stored in a jump table based at B for target I.
   BRANCH_TO_JMPTBL_ENTRY
			indirect branch through such a table.  */
#if (defined SHARED || defined __PIC__)
/* Position-independent build: EBX is used as the PIC/scratch register,
   so it is saved on entry and restored before every ret.  The extra
   push moves the arguments 4 bytes further from %esp, hence PARMS 8.  */
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
/* After the ret, re-emit the "EBX is pushed" unwind annotations
   (directives only) so that the code following this return in the file
   is still described with EBX on the stack.  */
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
/* Table entries are offsets relative to the table base B.  */
# define JMPTBL(I, B)	I - B
/* NOTE(review): presumably guards against a toolchain that predefines
   __i686 as a macro, which would otherwise be expanded inside the thunk
   symbol name below -- confirm.  */
# undef __i686

/* Call the get-PC thunk for register X (e.g. "bx").  The thunk is not
   defined in this part of the file; it is expected to leave the address
   of the instruction following the call in that register.  */
# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register containing the
   index into the jump table.  SCALE is the scale of INDEX.
   Clobbers EBX and the flags.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
	SETUP_PIC_REG(bx);					\
    /* Get the address of the jump table.  */			\
	addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
	addl	(%ebx, INDEX, SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
	jmp	*%ebx
#else

/* Non-PIC build: EBX is not touched, so there is no prologue and the
   first argument sits directly above the return address.  */
# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
/* Table entries are the absolute addresses of the targets.  */
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register containing the index into the
   jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
	jmp	*TABLE(, INDEX, SCALE)
#endif
134
135 .section .text.ssse3,"ax",@progbits
136ENTRY (MEMCPY)
137 ENTRANCE
138 movl LEN(%esp), %ecx
139 movl SRC(%esp), %eax
140 movl DEST(%esp), %edx
141
142#ifdef USE_AS_MEMMOVE
143 cmp %eax, %edx
144 jb L(copy_forward)
145 je L(fwd_write_0bytes)
146 cmp $32, %ecx
147 jae L(memmove_bwd)
148 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400149
150 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800151L(memmove_bwd):
152 add %ecx, %eax
153 cmp %eax, %edx
154 movl SRC(%esp), %eax
155 jb L(copy_backward)
156
157L(copy_forward):
158#endif
159 cmp $48, %ecx
160 jae L(48bytesormore)
161
162L(fwd_write_less32bytes):
163#ifndef USE_AS_MEMMOVE
164 cmp %dl, %al
165 jb L(bk_write)
166#endif
167 add %ecx, %edx
168 add %ecx, %eax
169 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
170#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400171 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800172L(bk_write):
173 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
174#endif
175
Jack Renc47703a2012-02-14 12:01:52 +0400176 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800177L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400178#ifndef USE_AS_MEMMOVE
179 movlpd (%eax), %xmm0
180 movlpd 8(%eax), %xmm1
181 movlpd %xmm0, (%edx)
182 movlpd %xmm1, 8(%edx)
183#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800184 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400185#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800186 PUSH (%edi)
187 movl %edx, %edi
188 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800189 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800190 sub %edx, %edi
191 add %edi, %ecx
192 sub %edi, %eax
193
194#ifdef SHARED_CACHE_SIZE_HALF
195 cmp $SHARED_CACHE_SIZE_HALF, %ecx
196#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800197# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400198 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800199 add $_GLOBAL_OFFSET_TABLE_, %ebx
200 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
201# else
202 cmp __x86_shared_cache_size_half, %ecx
203# endif
204#endif
205
206 mov %eax, %edi
207 jae L(large_page)
208 and $0xf, %edi
209 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800210 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
211
Jack Renc47703a2012-02-14 12:01:52 +0400212 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800213L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400214#ifdef USE_AS_MEMMOVE
215 movl DEST+4(%esp), %edi
216 movdqu %xmm0, (%edi)
217#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800218 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800219 cmp $127, %ecx
220 ja L(shl_0_gobble)
221 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400222
223 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800224L(shl_0_loop):
225 movdqa (%eax, %edi), %xmm0
226 movdqa 16(%eax, %edi), %xmm1
227 sub $32, %ecx
228 movdqa %xmm0, (%edx, %edi)
229 movdqa %xmm1, 16(%edx, %edi)
230 lea 32(%edi), %edi
231 jb L(shl_0_end)
232
233 movdqa (%eax, %edi), %xmm0
234 movdqa 16(%eax, %edi), %xmm1
235 sub $32, %ecx
236 movdqa %xmm0, (%edx, %edi)
237 movdqa %xmm1, 16(%edx, %edi)
238 lea 32(%edi), %edi
239 jb L(shl_0_end)
240
241 movdqa (%eax, %edi), %xmm0
242 movdqa 16(%eax, %edi), %xmm1
243 sub $32, %ecx
244 movdqa %xmm0, (%edx, %edi)
245 movdqa %xmm1, 16(%edx, %edi)
246 lea 32(%edi), %edi
247 jb L(shl_0_end)
248
249 movdqa (%eax, %edi), %xmm0
250 movdqa 16(%eax, %edi), %xmm1
251 sub $32, %ecx
252 movdqa %xmm0, (%edx, %edi)
253 movdqa %xmm1, 16(%edx, %edi)
254 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400255
Bruce Beare8ff1a272010-03-04 11:03:37 -0800256L(shl_0_end):
257 lea 32(%ecx), %ecx
258 add %ecx, %edi
259 add %edi, %edx
260 add %edi, %eax
261 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400262 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800263
Bruce Beare124a5422010-10-11 12:24:41 -0700264 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800265
Jack Renc47703a2012-02-14 12:01:52 +0400266 .p2align 4
267L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800268#ifdef DATA_CACHE_SIZE_HALF
269 cmp $DATA_CACHE_SIZE_HALF, %ecx
270#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800271# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400272 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800273 add $_GLOBAL_OFFSET_TABLE_, %ebx
274 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
275# else
276 cmp __x86_data_cache_size_half, %ecx
277# endif
278#endif
Jack Renc47703a2012-02-14 12:01:52 +0400279 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800280 lea -128(%ecx), %ecx
281 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400282
283 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800284L(shl_0_gobble_cache_loop):
285 movdqa (%eax), %xmm0
286 movdqa 0x10(%eax), %xmm1
287 movdqa 0x20(%eax), %xmm2
288 movdqa 0x30(%eax), %xmm3
289 movdqa 0x40(%eax), %xmm4
290 movdqa 0x50(%eax), %xmm5
291 movdqa 0x60(%eax), %xmm6
292 movdqa 0x70(%eax), %xmm7
293 lea 0x80(%eax), %eax
294 sub $128, %ecx
295 movdqa %xmm0, (%edx)
296 movdqa %xmm1, 0x10(%edx)
297 movdqa %xmm2, 0x20(%edx)
298 movdqa %xmm3, 0x30(%edx)
299 movdqa %xmm4, 0x40(%edx)
300 movdqa %xmm5, 0x50(%edx)
301 movdqa %xmm6, 0x60(%edx)
302 movdqa %xmm7, 0x70(%edx)
303 lea 0x80(%edx), %edx
304
305 jae L(shl_0_gobble_cache_loop)
306 cmp $-0x40, %ecx
307 lea 0x80(%ecx), %ecx
308 jl L(shl_0_cache_less_64bytes)
309
310 movdqa (%eax), %xmm0
311 sub $0x40, %ecx
312 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800313 movdqa %xmm0, (%edx)
314 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800315 movdqa 0x20(%eax), %xmm0
316 movdqa 0x30(%eax), %xmm1
317 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800318 movdqa %xmm0, 0x20(%edx)
319 movdqa %xmm1, 0x30(%edx)
320 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400321
Bruce Beare8ff1a272010-03-04 11:03:37 -0800322L(shl_0_cache_less_64bytes):
323 cmp $0x20, %ecx
324 jb L(shl_0_cache_less_32bytes)
325 movdqa (%eax), %xmm0
326 sub $0x20, %ecx
327 movdqa 0x10(%eax), %xmm1
328 add $0x20, %eax
329 movdqa %xmm0, (%edx)
330 movdqa %xmm1, 0x10(%edx)
331 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400332
Bruce Beare8ff1a272010-03-04 11:03:37 -0800333L(shl_0_cache_less_32bytes):
334 cmp $0x10, %ecx
335 jb L(shl_0_cache_less_16bytes)
336 sub $0x10, %ecx
337 movdqa (%eax), %xmm0
338 add $0x10, %eax
339 movdqa %xmm0, (%edx)
340 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400341
Bruce Beare8ff1a272010-03-04 11:03:37 -0800342L(shl_0_cache_less_16bytes):
343 add %ecx, %edx
344 add %ecx, %eax
345 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
346
Jack Renc47703a2012-02-14 12:01:52 +0400347 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800348L(shl_0_gobble_mem_loop):
349 prefetcht0 0x1c0(%eax)
350 prefetcht0 0x280(%eax)
351 prefetcht0 0x1c0(%edx)
352
353 movdqa (%eax), %xmm0
354 movdqa 0x10(%eax), %xmm1
355 movdqa 0x20(%eax), %xmm2
356 movdqa 0x30(%eax), %xmm3
357 movdqa 0x40(%eax), %xmm4
358 movdqa 0x50(%eax), %xmm5
359 movdqa 0x60(%eax), %xmm6
360 movdqa 0x70(%eax), %xmm7
361 lea 0x80(%eax), %eax
362 sub $0x80, %ecx
363 movdqa %xmm0, (%edx)
364 movdqa %xmm1, 0x10(%edx)
365 movdqa %xmm2, 0x20(%edx)
366 movdqa %xmm3, 0x30(%edx)
367 movdqa %xmm4, 0x40(%edx)
368 movdqa %xmm5, 0x50(%edx)
369 movdqa %xmm6, 0x60(%edx)
370 movdqa %xmm7, 0x70(%edx)
371 lea 0x80(%edx), %edx
372
373 jae L(shl_0_gobble_mem_loop)
374 cmp $-0x40, %ecx
375 lea 0x80(%ecx), %ecx
376 jl L(shl_0_mem_less_64bytes)
377
378 movdqa (%eax), %xmm0
379 sub $0x40, %ecx
380 movdqa 0x10(%eax), %xmm1
381
382 movdqa %xmm0, (%edx)
383 movdqa %xmm1, 0x10(%edx)
384
385 movdqa 0x20(%eax), %xmm0
386 movdqa 0x30(%eax), %xmm1
387 add $0x40, %eax
388
389 movdqa %xmm0, 0x20(%edx)
390 movdqa %xmm1, 0x30(%edx)
391 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400392
Bruce Beare8ff1a272010-03-04 11:03:37 -0800393L(shl_0_mem_less_64bytes):
394 cmp $0x20, %ecx
395 jb L(shl_0_mem_less_32bytes)
396 movdqa (%eax), %xmm0
397 sub $0x20, %ecx
398 movdqa 0x10(%eax), %xmm1
399 add $0x20, %eax
400 movdqa %xmm0, (%edx)
401 movdqa %xmm1, 0x10(%edx)
402 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400403
Bruce Beare8ff1a272010-03-04 11:03:37 -0800404L(shl_0_mem_less_32bytes):
405 cmp $0x10, %ecx
406 jb L(shl_0_mem_less_16bytes)
407 sub $0x10, %ecx
408 movdqa (%eax), %xmm0
409 add $0x10, %eax
410 movdqa %xmm0, (%edx)
411 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400412
Bruce Beare8ff1a272010-03-04 11:03:37 -0800413L(shl_0_mem_less_16bytes):
414 add %ecx, %edx
415 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400416 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800417
Jack Renc47703a2012-02-14 12:01:52 +0400418 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800419L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400420#ifndef USE_AS_MEMMOVE
421 movaps -1(%eax), %xmm1
422#else
423 movl DEST+4(%esp), %edi
424 movaps -1(%eax), %xmm1
425 movdqu %xmm0, (%edi)
426#endif
427#ifdef DATA_CACHE_SIZE_HALF
428 cmp $DATA_CACHE_SIZE_HALF, %ecx
429#else
430# if (defined SHARED || defined __PIC__)
431 SETUP_PIC_REG(bx)
432 add $_GLOBAL_OFFSET_TABLE_, %ebx
433 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
434# else
435 cmp __x86_data_cache_size_half, %ecx
436# endif
437#endif
438 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800439
Jack Renc47703a2012-02-14 12:01:52 +0400440 lea -64(%ecx), %ecx
441
442 .p2align 4
443L(Shl1LoopStart):
444 prefetcht0 0x1c0(%eax)
445 prefetcht0 0x1c0(%edx)
446 movaps 15(%eax), %xmm2
447 movaps 31(%eax), %xmm3
448 movaps 47(%eax), %xmm4
449 movaps 63(%eax), %xmm5
450 movaps %xmm5, %xmm7
451 palignr $1, %xmm4, %xmm5
452 palignr $1, %xmm3, %xmm4
453 movaps %xmm5, 48(%edx)
454 palignr $1, %xmm2, %xmm3
455 lea 64(%eax), %eax
456 palignr $1, %xmm1, %xmm2
457 movaps %xmm4, 32(%edx)
458 movaps %xmm3, 16(%edx)
459 movaps %xmm7, %xmm1
460 movaps %xmm2, (%edx)
461 lea 64(%edx), %edx
462 sub $64, %ecx
463 ja L(Shl1LoopStart)
464
465L(Shl1LoopLeave):
466 add $32, %ecx
467 jle L(shl_end_0)
468
469 movaps 15(%eax), %xmm2
470 movaps 31(%eax), %xmm3
471 palignr $1, %xmm2, %xmm3
472 palignr $1, %xmm1, %xmm2
473 movaps %xmm2, (%edx)
474 movaps %xmm3, 16(%edx)
475 lea 32(%edx, %ecx), %edx
476 lea 32(%eax, %ecx), %eax
477 POP (%edi)
478 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
479
480 CFI_PUSH (%edi)
481
482 .p2align 4
483L(sh_1_no_prefetch):
484 lea -32(%ecx), %ecx
485 lea -1(%eax), %eax
486 xor %edi, %edi
487
488 .p2align 4
489L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800490 movdqa 16(%eax, %edi), %xmm2
491 sub $32, %ecx
492 movdqa 32(%eax, %edi), %xmm3
493 movdqa %xmm3, %xmm4
494 palignr $1, %xmm2, %xmm3
495 palignr $1, %xmm1, %xmm2
496 lea 32(%edi), %edi
497 movdqa %xmm2, -32(%edx, %edi)
498 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400499 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800500
501 movdqa 16(%eax, %edi), %xmm2
502 sub $32, %ecx
503 movdqa 32(%eax, %edi), %xmm3
504 movdqa %xmm3, %xmm1
505 palignr $1, %xmm2, %xmm3
506 palignr $1, %xmm4, %xmm2
507 lea 32(%edi), %edi
508 movdqa %xmm2, -32(%edx, %edi)
509 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400510 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800511
Jack Renc47703a2012-02-14 12:01:52 +0400512L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800513 lea 32(%ecx), %ecx
514 add %ecx, %edi
515 add %edi, %edx
516 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400517 POP (%edi)
518 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800519
Jack Renc47703a2012-02-14 12:01:52 +0400520 CFI_PUSH (%edi)
521
522 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800523L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400524#ifndef USE_AS_MEMMOVE
525 movaps -2(%eax), %xmm1
526#else
527 movl DEST+4(%esp), %edi
528 movaps -2(%eax), %xmm1
529 movdqu %xmm0, (%edi)
530#endif
531#ifdef DATA_CACHE_SIZE_HALF
532 cmp $DATA_CACHE_SIZE_HALF, %ecx
533#else
534# if (defined SHARED || defined __PIC__)
535 SETUP_PIC_REG(bx)
536 add $_GLOBAL_OFFSET_TABLE_, %ebx
537 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
538# else
539 cmp __x86_data_cache_size_half, %ecx
540# endif
541#endif
542 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800543
Jack Renc47703a2012-02-14 12:01:52 +0400544 lea -64(%ecx), %ecx
545
546 .p2align 4
547L(Shl2LoopStart):
548 prefetcht0 0x1c0(%eax)
549 prefetcht0 0x1c0(%edx)
550 movaps 14(%eax), %xmm2
551 movaps 30(%eax), %xmm3
552 movaps 46(%eax), %xmm4
553 movaps 62(%eax), %xmm5
554 movaps %xmm5, %xmm7
555 palignr $2, %xmm4, %xmm5
556 palignr $2, %xmm3, %xmm4
557 movaps %xmm5, 48(%edx)
558 palignr $2, %xmm2, %xmm3
559 lea 64(%eax), %eax
560 palignr $2, %xmm1, %xmm2
561 movaps %xmm4, 32(%edx)
562 movaps %xmm3, 16(%edx)
563 movaps %xmm7, %xmm1
564 movaps %xmm2, (%edx)
565 lea 64(%edx), %edx
566 sub $64, %ecx
567 ja L(Shl2LoopStart)
568
569L(Shl2LoopLeave):
570 add $32, %ecx
571 jle L(shl_end_0)
572
573 movaps 14(%eax), %xmm2
574 movaps 30(%eax), %xmm3
575 palignr $2, %xmm2, %xmm3
576 palignr $2, %xmm1, %xmm2
577 movaps %xmm2, (%edx)
578 movaps %xmm3, 16(%edx)
579 lea 32(%edx, %ecx), %edx
580 lea 32(%eax, %ecx), %eax
581 POP (%edi)
582 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
583
584 CFI_PUSH (%edi)
585
586 .p2align 4
587L(sh_2_no_prefetch):
588 lea -32(%ecx), %ecx
589 lea -2(%eax), %eax
590 xor %edi, %edi
591
592 .p2align 4
593L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800594 movdqa 16(%eax, %edi), %xmm2
595 sub $32, %ecx
596 movdqa 32(%eax, %edi), %xmm3
597 movdqa %xmm3, %xmm4
598 palignr $2, %xmm2, %xmm3
599 palignr $2, %xmm1, %xmm2
600 lea 32(%edi), %edi
601 movdqa %xmm2, -32(%edx, %edi)
602 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400603 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800604
605 movdqa 16(%eax, %edi), %xmm2
606 sub $32, %ecx
607 movdqa 32(%eax, %edi), %xmm3
608 movdqa %xmm3, %xmm1
609 palignr $2, %xmm2, %xmm3
610 palignr $2, %xmm4, %xmm2
611 lea 32(%edi), %edi
612 movdqa %xmm2, -32(%edx, %edi)
613 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400614 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800615
Jack Renc47703a2012-02-14 12:01:52 +0400616L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800617 lea 32(%ecx), %ecx
618 add %ecx, %edi
619 add %edi, %edx
620 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400621 POP (%edi)
622 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800623
Jack Renc47703a2012-02-14 12:01:52 +0400624 CFI_PUSH (%edi)
625
626 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800627L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400628#ifndef USE_AS_MEMMOVE
629 movaps -3(%eax), %xmm1
630#else
631 movl DEST+4(%esp), %edi
632 movaps -3(%eax), %xmm1
633 movdqu %xmm0, (%edi)
634#endif
635#ifdef DATA_CACHE_SIZE_HALF
636 cmp $DATA_CACHE_SIZE_HALF, %ecx
637#else
638# if (defined SHARED || defined __PIC__)
639 SETUP_PIC_REG(bx)
640 add $_GLOBAL_OFFSET_TABLE_, %ebx
641 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
642# else
643 cmp __x86_data_cache_size_half, %ecx
644# endif
645#endif
646 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800647
Jack Renc47703a2012-02-14 12:01:52 +0400648 lea -64(%ecx), %ecx
649
650 .p2align 4
651L(Shl3LoopStart):
652 prefetcht0 0x1c0(%eax)
653 prefetcht0 0x1c0(%edx)
654 movaps 13(%eax), %xmm2
655 movaps 29(%eax), %xmm3
656 movaps 45(%eax), %xmm4
657 movaps 61(%eax), %xmm5
658 movaps %xmm5, %xmm7
659 palignr $3, %xmm4, %xmm5
660 palignr $3, %xmm3, %xmm4
661 movaps %xmm5, 48(%edx)
662 palignr $3, %xmm2, %xmm3
663 lea 64(%eax), %eax
664 palignr $3, %xmm1, %xmm2
665 movaps %xmm4, 32(%edx)
666 movaps %xmm3, 16(%edx)
667 movaps %xmm7, %xmm1
668 movaps %xmm2, (%edx)
669 lea 64(%edx), %edx
670 sub $64, %ecx
671 ja L(Shl3LoopStart)
672
673L(Shl3LoopLeave):
674 add $32, %ecx
675 jle L(shl_end_0)
676
677 movaps 13(%eax), %xmm2
678 movaps 29(%eax), %xmm3
679 palignr $3, %xmm2, %xmm3
680 palignr $3, %xmm1, %xmm2
681 movaps %xmm2, (%edx)
682 movaps %xmm3, 16(%edx)
683 lea 32(%edx, %ecx), %edx
684 lea 32(%eax, %ecx), %eax
685 POP (%edi)
686 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
687
688 CFI_PUSH (%edi)
689
690 .p2align 4
691L(sh_3_no_prefetch):
692 lea -32(%ecx), %ecx
693 lea -3(%eax), %eax
694 xor %edi, %edi
695
696 .p2align 4
697L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800698 movdqa 16(%eax, %edi), %xmm2
699 sub $32, %ecx
700 movdqa 32(%eax, %edi), %xmm3
701 movdqa %xmm3, %xmm4
702 palignr $3, %xmm2, %xmm3
703 palignr $3, %xmm1, %xmm2
704 lea 32(%edi), %edi
705 movdqa %xmm2, -32(%edx, %edi)
706 movdqa %xmm3, -16(%edx, %edi)
707
Jack Renc47703a2012-02-14 12:01:52 +0400708 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800709
710 movdqa 16(%eax, %edi), %xmm2
711 sub $32, %ecx
712 movdqa 32(%eax, %edi), %xmm3
713 movdqa %xmm3, %xmm1
714 palignr $3, %xmm2, %xmm3
715 palignr $3, %xmm4, %xmm2
716 lea 32(%edi), %edi
717 movdqa %xmm2, -32(%edx, %edi)
718 movdqa %xmm3, -16(%edx, %edi)
719
Jack Renc47703a2012-02-14 12:01:52 +0400720 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800721
Jack Renc47703a2012-02-14 12:01:52 +0400722L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800723 lea 32(%ecx), %ecx
724 add %ecx, %edi
725 add %edi, %edx
726 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400727 POP (%edi)
728 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800729
Jack Renc47703a2012-02-14 12:01:52 +0400730 CFI_PUSH (%edi)
731
732 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800733L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400734#ifndef USE_AS_MEMMOVE
735 movaps -4(%eax), %xmm1
736#else
737 movl DEST+4(%esp), %edi
738 movaps -4(%eax), %xmm1
739 movdqu %xmm0, (%edi)
740#endif
741#ifdef DATA_CACHE_SIZE_HALF
742 cmp $DATA_CACHE_SIZE_HALF, %ecx
743#else
744# if (defined SHARED || defined __PIC__)
745 SETUP_PIC_REG(bx)
746 add $_GLOBAL_OFFSET_TABLE_, %ebx
747 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
748# else
749 cmp __x86_data_cache_size_half, %ecx
750# endif
751#endif
752 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800753
Jack Renc47703a2012-02-14 12:01:52 +0400754 lea -64(%ecx), %ecx
755
756 .p2align 4
757L(Shl4LoopStart):
758 prefetcht0 0x1c0(%eax)
759 prefetcht0 0x1c0(%edx)
760 movaps 12(%eax), %xmm2
761 movaps 28(%eax), %xmm3
762 movaps 44(%eax), %xmm4
763 movaps 60(%eax), %xmm5
764 movaps %xmm5, %xmm7
765 palignr $4, %xmm4, %xmm5
766 palignr $4, %xmm3, %xmm4
767 movaps %xmm5, 48(%edx)
768 palignr $4, %xmm2, %xmm3
769 lea 64(%eax), %eax
770 palignr $4, %xmm1, %xmm2
771 movaps %xmm4, 32(%edx)
772 movaps %xmm3, 16(%edx)
773 movaps %xmm7, %xmm1
774 movaps %xmm2, (%edx)
775 lea 64(%edx), %edx
776 sub $64, %ecx
777 ja L(Shl4LoopStart)
778
779L(Shl4LoopLeave):
780 add $32, %ecx
781 jle L(shl_end_0)
782
783 movaps 12(%eax), %xmm2
784 movaps 28(%eax), %xmm3
785 palignr $4, %xmm2, %xmm3
786 palignr $4, %xmm1, %xmm2
787 movaps %xmm2, (%edx)
788 movaps %xmm3, 16(%edx)
789 lea 32(%edx, %ecx), %edx
790 lea 32(%eax, %ecx), %eax
791 POP (%edi)
792 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
793
794 CFI_PUSH (%edi)
795
796 .p2align 4
797L(sh_4_no_prefetch):
798 lea -32(%ecx), %ecx
799 lea -4(%eax), %eax
800 xor %edi, %edi
801
802 .p2align 4
803L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800804 movdqa 16(%eax, %edi), %xmm2
805 sub $32, %ecx
806 movdqa 32(%eax, %edi), %xmm3
807 movdqa %xmm3, %xmm4
808 palignr $4, %xmm2, %xmm3
809 palignr $4, %xmm1, %xmm2
810 lea 32(%edi), %edi
811 movdqa %xmm2, -32(%edx, %edi)
812 movdqa %xmm3, -16(%edx, %edi)
813
Jack Renc47703a2012-02-14 12:01:52 +0400814 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800815
816 movdqa 16(%eax, %edi), %xmm2
817 sub $32, %ecx
818 movdqa 32(%eax, %edi), %xmm3
819 movdqa %xmm3, %xmm1
820 palignr $4, %xmm2, %xmm3
821 palignr $4, %xmm4, %xmm2
822 lea 32(%edi), %edi
823 movdqa %xmm2, -32(%edx, %edi)
824 movdqa %xmm3, -16(%edx, %edi)
825
Jack Renc47703a2012-02-14 12:01:52 +0400826 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800827
Jack Renc47703a2012-02-14 12:01:52 +0400828L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800829 lea 32(%ecx), %ecx
830 add %ecx, %edi
831 add %edi, %edx
832 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400833 POP (%edi)
834 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800835
Jack Renc47703a2012-02-14 12:01:52 +0400836 CFI_PUSH (%edi)
837
838 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800839L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400840#ifndef USE_AS_MEMMOVE
841 movaps -5(%eax), %xmm1
842#else
843 movl DEST+4(%esp), %edi
844 movaps -5(%eax), %xmm1
845 movdqu %xmm0, (%edi)
846#endif
847#ifdef DATA_CACHE_SIZE_HALF
848 cmp $DATA_CACHE_SIZE_HALF, %ecx
849#else
850# if (defined SHARED || defined __PIC__)
851 SETUP_PIC_REG(bx)
852 add $_GLOBAL_OFFSET_TABLE_, %ebx
853 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
854# else
855 cmp __x86_data_cache_size_half, %ecx
856# endif
857#endif
858 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800859
Jack Renc47703a2012-02-14 12:01:52 +0400860 lea -64(%ecx), %ecx
861
862 .p2align 4
863L(Shl5LoopStart):
864 prefetcht0 0x1c0(%eax)
865 prefetcht0 0x1c0(%edx)
866 movaps 11(%eax), %xmm2
867 movaps 27(%eax), %xmm3
868 movaps 43(%eax), %xmm4
869 movaps 59(%eax), %xmm5
870 movaps %xmm5, %xmm7
871 palignr $5, %xmm4, %xmm5
872 palignr $5, %xmm3, %xmm4
873 movaps %xmm5, 48(%edx)
874 palignr $5, %xmm2, %xmm3
875 lea 64(%eax), %eax
876 palignr $5, %xmm1, %xmm2
877 movaps %xmm4, 32(%edx)
878 movaps %xmm3, 16(%edx)
879 movaps %xmm7, %xmm1
880 movaps %xmm2, (%edx)
881 lea 64(%edx), %edx
882 sub $64, %ecx
883 ja L(Shl5LoopStart)
884
885L(Shl5LoopLeave):
886 add $32, %ecx
887 jle L(shl_end_0)
888
889 movaps 11(%eax), %xmm2
890 movaps 27(%eax), %xmm3
891 palignr $5, %xmm2, %xmm3
892 palignr $5, %xmm1, %xmm2
893 movaps %xmm2, (%edx)
894 movaps %xmm3, 16(%edx)
895 lea 32(%edx, %ecx), %edx
896 lea 32(%eax, %ecx), %eax
897 POP (%edi)
898 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
899
900 CFI_PUSH (%edi)
901
902 .p2align 4
903L(sh_5_no_prefetch):
904 lea -32(%ecx), %ecx
905 lea -5(%eax), %eax
906 xor %edi, %edi
907
908 .p2align 4
909L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800910 movdqa 16(%eax, %edi), %xmm2
911 sub $32, %ecx
912 movdqa 32(%eax, %edi), %xmm3
913 movdqa %xmm3, %xmm4
914 palignr $5, %xmm2, %xmm3
915 palignr $5, %xmm1, %xmm2
916 lea 32(%edi), %edi
917 movdqa %xmm2, -32(%edx, %edi)
918 movdqa %xmm3, -16(%edx, %edi)
919
Jack Renc47703a2012-02-14 12:01:52 +0400920 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800921
922 movdqa 16(%eax, %edi), %xmm2
923 sub $32, %ecx
924 movdqa 32(%eax, %edi), %xmm3
925 movdqa %xmm3, %xmm1
926 palignr $5, %xmm2, %xmm3
927 palignr $5, %xmm4, %xmm2
928 lea 32(%edi), %edi
929 movdqa %xmm2, -32(%edx, %edi)
930 movdqa %xmm3, -16(%edx, %edi)
931
Jack Renc47703a2012-02-14 12:01:52 +0400932 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800933
Jack Renc47703a2012-02-14 12:01:52 +0400934L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800935 lea 32(%ecx), %ecx
936 add %ecx, %edi
937 add %edi, %edx
938 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400939 POP (%edi)
940 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800941
Jack Renc47703a2012-02-14 12:01:52 +0400942 CFI_PUSH (%edi)
943
944 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800945L(shl_6):
Jack Renc47703a2012-02-14 12:01:52 +0400946#ifndef USE_AS_MEMMOVE
947 movaps -6(%eax), %xmm1
948#else
949 movl DEST+4(%esp), %edi
950 movaps -6(%eax), %xmm1
951 movdqu %xmm0, (%edi)
952#endif
953#ifdef DATA_CACHE_SIZE_HALF
954 cmp $DATA_CACHE_SIZE_HALF, %ecx
955#else
956# if (defined SHARED || defined __PIC__)
957 SETUP_PIC_REG(bx)
958 add $_GLOBAL_OFFSET_TABLE_, %ebx
959 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
960# else
961 cmp __x86_data_cache_size_half, %ecx
962# endif
963#endif
964 jb L(sh_6_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800965
Jack Renc47703a2012-02-14 12:01:52 +0400966 lea -64(%ecx), %ecx
967
968 .p2align 4
969L(Shl6LoopStart):
970 prefetcht0 0x1c0(%eax)
971 prefetcht0 0x1c0(%edx)
972 movaps 10(%eax), %xmm2
973 movaps 26(%eax), %xmm3
974 movaps 42(%eax), %xmm4
975 movaps 58(%eax), %xmm5
976 movaps %xmm5, %xmm7
977 palignr $6, %xmm4, %xmm5
978 palignr $6, %xmm3, %xmm4
979 movaps %xmm5, 48(%edx)
980 palignr $6, %xmm2, %xmm3
981 lea 64(%eax), %eax
982 palignr $6, %xmm1, %xmm2
983 movaps %xmm4, 32(%edx)
984 movaps %xmm3, 16(%edx)
985 movaps %xmm7, %xmm1
986 movaps %xmm2, (%edx)
987 lea 64(%edx), %edx
988 sub $64, %ecx
989 ja L(Shl6LoopStart)
990
991L(Shl6LoopLeave):
992 add $32, %ecx
993 jle L(shl_end_0)
994
995 movaps 10(%eax), %xmm2
996 movaps 26(%eax), %xmm3
997 palignr $6, %xmm2, %xmm3
998 palignr $6, %xmm1, %xmm2
999 movaps %xmm2, (%edx)
1000 movaps %xmm3, 16(%edx)
1001 lea 32(%edx, %ecx), %edx
1002 lea 32(%eax, %ecx), %eax
1003 POP (%edi)
1004 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1005
/*
 * sh_6_no_prefetch: palignr $6 copy loop without prefetch instructions;
 * the jb ahead of Shl6LoopStart branches here when the count compares
 * below half the data cache size.
 * Setup: %ecx is biased by -32, %eax is moved back by 6 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 must already hold the chunk that precedes the first load --
 * presumably loaded by the shl_6 entry, which lies before this chunk.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1006 CFI_PUSH (%edi)
1007
1008 .p2align 4
1009L(sh_6_no_prefetch):
1010 lea -32(%ecx), %ecx
1011 lea -6(%eax), %eax
1012 xor %edi, %edi
1013
1014 .p2align 4
1015L(sh_6_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001016 movdqa 16(%eax, %edi), %xmm2
1017 sub $32, %ecx
1018 movdqa 32(%eax, %edi), %xmm3
1019 movdqa %xmm3, %xmm4
1020 palignr $6, %xmm2, %xmm3
1021 palignr $6, %xmm1, %xmm2
1022 lea 32(%edi), %edi
1023 movdqa %xmm2, -32(%edx, %edi)
1024 movdqa %xmm3, -16(%edx, %edi)
1025
Jack Renc47703a2012-02-14 12:01:52 +04001026 jb L(sh_6_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001027
1028 movdqa 16(%eax, %edi), %xmm2
1029 sub $32, %ecx
1030 movdqa 32(%eax, %edi), %xmm3
1031 movdqa %xmm3, %xmm1
1032 palignr $6, %xmm2, %xmm3
1033 palignr $6, %xmm4, %xmm2
1034 lea 32(%edi), %edi
1035 movdqa %xmm2, -32(%edx, %edi)
1036 movdqa %xmm3, -16(%edx, %edi)
1037
Jack Renc47703a2012-02-14 12:01:52 +04001038 jae L(sh_6_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001039
Jack Renc47703a2012-02-14 12:01:52 +04001040L(sh_6_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 6 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001041 lea 32(%ecx), %ecx
1042 add %ecx, %edi
1043 add %edi, %edx
1044 lea 6(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001045 POP (%edi)
1046 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001047
/*
 * shl_7: forward copy path built around palignr $7, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001048 CFI_PUSH (%edi)
1049
1050 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001051L(shl_7):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 7. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001052#ifndef USE_AS_MEMMOVE
1053 movaps -7(%eax), %xmm1
1054#else
1055 movl DEST+4(%esp), %edi
1056 movaps -7(%eax), %xmm1
1057 movdqu %xmm0, (%edi)
1058#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1059#ifdef DATA_CACHE_SIZE_HALF
1060 cmp $DATA_CACHE_SIZE_HALF, %ecx
1061#else
1062# if (defined SHARED || defined __PIC__)
1063 SETUP_PIC_REG(bx)
1064 add $_GLOBAL_OFFSET_TABLE_, %ebx
1065 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1066# else
1067 cmp __x86_data_cache_size_half, %ecx
1068# endif
1069#endif
1070 jb L(sh_7_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001071
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001072 lea -64(%ecx), %ecx
1073
1074 .p2align 4
1075L(Shl7LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $7 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1076 prefetcht0 0x1c0(%eax)
1077 prefetcht0 0x1c0(%edx)
1078 movaps 9(%eax), %xmm2
1079 movaps 25(%eax), %xmm3
1080 movaps 41(%eax), %xmm4
1081 movaps 57(%eax), %xmm5
1082 movaps %xmm5, %xmm7
1083 palignr $7, %xmm4, %xmm5
1084 palignr $7, %xmm3, %xmm4
1085 movaps %xmm5, 48(%edx)
1086 palignr $7, %xmm2, %xmm3
1087 lea 64(%eax), %eax
1088 palignr $7, %xmm1, %xmm2
1089 movaps %xmm4, 32(%edx)
1090 movaps %xmm3, 16(%edx)
1091 movaps %xmm7, %xmm1
1092 movaps %xmm2, (%edx)
1093 lea 64(%edx), %edx
1094 sub $64, %ecx
1095 ja L(Shl7LoopStart)
1096
1097L(Shl7LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1098 add $32, %ecx
1099 jle L(shl_end_0)
1100
1101 movaps 9(%eax), %xmm2
1102 movaps 25(%eax), %xmm3
1103 palignr $7, %xmm2, %xmm3
1104 palignr $7, %xmm1, %xmm2
1105 movaps %xmm2, (%edx)
1106 movaps %xmm3, 16(%edx)
1107 lea 32(%edx, %ecx), %edx
1108 lea 32(%eax, %ecx), %eax
1109 POP (%edi)
1110 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1111
/*
 * sh_7_no_prefetch: palignr $7 copy loop without prefetch instructions;
 * shl_7 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 7 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 7, loaded by shl_7.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1112 CFI_PUSH (%edi)
1113
1114 .p2align 4
1115L(sh_7_no_prefetch):
1116 lea -32(%ecx), %ecx
1117 lea -7(%eax), %eax
1118 xor %edi, %edi
1119
1120 .p2align 4
1121L(sh_7_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001122 movdqa 16(%eax, %edi), %xmm2
1123 sub $32, %ecx
1124 movdqa 32(%eax, %edi), %xmm3
1125 movdqa %xmm3, %xmm4
1126 palignr $7, %xmm2, %xmm3
1127 palignr $7, %xmm1, %xmm2
1128 lea 32(%edi), %edi
1129 movdqa %xmm2, -32(%edx, %edi)
1130 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001131 jb L(sh_7_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001132
1133 movdqa 16(%eax, %edi), %xmm2
1134 sub $32, %ecx
1135 movdqa 32(%eax, %edi), %xmm3
1136 movdqa %xmm3, %xmm1
1137 palignr $7, %xmm2, %xmm3
1138 palignr $7, %xmm4, %xmm2
1139 lea 32(%edi), %edi
1140 movdqa %xmm2, -32(%edx, %edi)
1141 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001142 jae L(sh_7_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001143
Jack Renc47703a2012-02-14 12:01:52 +04001144L(sh_7_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 7 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001145 lea 32(%ecx), %ecx
1146 add %ecx, %edi
1147 add %edi, %edx
1148 lea 7(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001149 POP (%edi)
1150 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001151
/*
 * shl_8: forward copy path built around palignr $8, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001152 CFI_PUSH (%edi)
1153
1154 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001155L(shl_8):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 8. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001156#ifndef USE_AS_MEMMOVE
1157 movaps -8(%eax), %xmm1
1158#else
1159 movl DEST+4(%esp), %edi
1160 movaps -8(%eax), %xmm1
1161 movdqu %xmm0, (%edi)
1162#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1163#ifdef DATA_CACHE_SIZE_HALF
1164 cmp $DATA_CACHE_SIZE_HALF, %ecx
1165#else
1166# if (defined SHARED || defined __PIC__)
1167 SETUP_PIC_REG(bx)
1168 add $_GLOBAL_OFFSET_TABLE_, %ebx
1169 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1170# else
1171 cmp __x86_data_cache_size_half, %ecx
1172# endif
1173#endif
1174 jb L(sh_8_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001175
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001176 lea -64(%ecx), %ecx
1177
1178 .p2align 4
1179L(Shl8LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $8 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1180 prefetcht0 0x1c0(%eax)
1181 prefetcht0 0x1c0(%edx)
1182 movaps 8(%eax), %xmm2
1183 movaps 24(%eax), %xmm3
1184 movaps 40(%eax), %xmm4
1185 movaps 56(%eax), %xmm5
1186 movaps %xmm5, %xmm7
1187 palignr $8, %xmm4, %xmm5
1188 palignr $8, %xmm3, %xmm4
1189 movaps %xmm5, 48(%edx)
1190 palignr $8, %xmm2, %xmm3
1191 lea 64(%eax), %eax
1192 palignr $8, %xmm1, %xmm2
1193 movaps %xmm4, 32(%edx)
1194 movaps %xmm3, 16(%edx)
1195 movaps %xmm7, %xmm1
1196 movaps %xmm2, (%edx)
1197 lea 64(%edx), %edx
1198 sub $64, %ecx
1199 ja L(Shl8LoopStart)
1200
/*
 * NOTE(review): this label is spelled LoopLeave8 while the neighbouring
 * paths name theirs Shl7LoopLeave, Shl9LoopLeave, ...; left unchanged
 * because references from outside this chunk cannot be ruled out.
 */
1201L(LoopLeave8):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1202 add $32, %ecx
1203 jle L(shl_end_0)
1204
1205 movaps 8(%eax), %xmm2
1206 movaps 24(%eax), %xmm3
1207 palignr $8, %xmm2, %xmm3
1208 palignr $8, %xmm1, %xmm2
1209 movaps %xmm2, (%edx)
1210 movaps %xmm3, 16(%edx)
1211 lea 32(%edx, %ecx), %edx
1212 lea 32(%eax, %ecx), %eax
1213 POP (%edi)
1214 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1215
/*
 * sh_8_no_prefetch: palignr $8 copy loop without prefetch instructions;
 * shl_8 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 8 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 8, loaded by shl_8.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1216 CFI_PUSH (%edi)
1217
1218 .p2align 4
1219L(sh_8_no_prefetch):
1220 lea -32(%ecx), %ecx
1221 lea -8(%eax), %eax
1222 xor %edi, %edi
1223
1224 .p2align 4
1225L(sh_8_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001226 movdqa 16(%eax, %edi), %xmm2
1227 sub $32, %ecx
1228 movdqa 32(%eax, %edi), %xmm3
1229 movdqa %xmm3, %xmm4
1230 palignr $8, %xmm2, %xmm3
1231 palignr $8, %xmm1, %xmm2
1232 lea 32(%edi), %edi
1233 movdqa %xmm2, -32(%edx, %edi)
1234 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001235 jb L(sh_8_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001236
1237 movdqa 16(%eax, %edi), %xmm2
1238 sub $32, %ecx
1239 movdqa 32(%eax, %edi), %xmm3
1240 movdqa %xmm3, %xmm1
1241 palignr $8, %xmm2, %xmm3
1242 palignr $8, %xmm4, %xmm2
1243 lea 32(%edi), %edi
1244 movdqa %xmm2, -32(%edx, %edi)
1245 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001246 jae L(sh_8_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001247
Jack Renc47703a2012-02-14 12:01:52 +04001248L(sh_8_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 8 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001249 lea 32(%ecx), %ecx
1250 add %ecx, %edi
1251 add %edi, %edx
1252 lea 8(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001253 POP (%edi)
1254 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001255
/*
 * shl_9: forward copy path built around palignr $9, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001256 CFI_PUSH (%edi)
1257
1258 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001259L(shl_9):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 9. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001260#ifndef USE_AS_MEMMOVE
1261 movaps -9(%eax), %xmm1
1262#else
1263 movl DEST+4(%esp), %edi
1264 movaps -9(%eax), %xmm1
1265 movdqu %xmm0, (%edi)
1266#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1267#ifdef DATA_CACHE_SIZE_HALF
1268 cmp $DATA_CACHE_SIZE_HALF, %ecx
1269#else
1270# if (defined SHARED || defined __PIC__)
1271 SETUP_PIC_REG(bx)
1272 add $_GLOBAL_OFFSET_TABLE_, %ebx
1273 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1274# else
1275 cmp __x86_data_cache_size_half, %ecx
1276# endif
1277#endif
1278 jb L(sh_9_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001279
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001280 lea -64(%ecx), %ecx
1281
1282 .p2align 4
1283L(Shl9LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $9 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1284 prefetcht0 0x1c0(%eax)
1285 prefetcht0 0x1c0(%edx)
1286 movaps 7(%eax), %xmm2
1287 movaps 23(%eax), %xmm3
1288 movaps 39(%eax), %xmm4
1289 movaps 55(%eax), %xmm5
1290 movaps %xmm5, %xmm7
1291 palignr $9, %xmm4, %xmm5
1292 palignr $9, %xmm3, %xmm4
1293 movaps %xmm5, 48(%edx)
1294 palignr $9, %xmm2, %xmm3
1295 lea 64(%eax), %eax
1296 palignr $9, %xmm1, %xmm2
1297 movaps %xmm4, 32(%edx)
1298 movaps %xmm3, 16(%edx)
1299 movaps %xmm7, %xmm1
1300 movaps %xmm2, (%edx)
1301 lea 64(%edx), %edx
1302 sub $64, %ecx
1303 ja L(Shl9LoopStart)
1304
1305L(Shl9LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1306 add $32, %ecx
1307 jle L(shl_end_0)
1308
1309 movaps 7(%eax), %xmm2
1310 movaps 23(%eax), %xmm3
1311 palignr $9, %xmm2, %xmm3
1312 palignr $9, %xmm1, %xmm2
1313
1314 movaps %xmm2, (%edx)
1315 movaps %xmm3, 16(%edx)
1316 lea 32(%edx, %ecx), %edx
1317 lea 32(%eax, %ecx), %eax
1318 POP (%edi)
1319 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1320
/*
 * sh_9_no_prefetch: palignr $9 copy loop without prefetch instructions;
 * shl_9 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 9 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 9, loaded by shl_9.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1321 CFI_PUSH (%edi)
1322
1323 .p2align 4
1324L(sh_9_no_prefetch):
1325 lea -32(%ecx), %ecx
1326 lea -9(%eax), %eax
1327 xor %edi, %edi
1328
1329 .p2align 4
1330L(sh_9_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001331 movdqa 16(%eax, %edi), %xmm2
1332 sub $32, %ecx
1333 movdqa 32(%eax, %edi), %xmm3
1334 movdqa %xmm3, %xmm4
1335 palignr $9, %xmm2, %xmm3
1336 palignr $9, %xmm1, %xmm2
1337 lea 32(%edi), %edi
1338 movdqa %xmm2, -32(%edx, %edi)
1339 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001340 jb L(sh_9_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001341
1342 movdqa 16(%eax, %edi), %xmm2
1343 sub $32, %ecx
1344 movdqa 32(%eax, %edi), %xmm3
1345 movdqa %xmm3, %xmm1
1346 palignr $9, %xmm2, %xmm3
1347 palignr $9, %xmm4, %xmm2
1348 lea 32(%edi), %edi
1349 movdqa %xmm2, -32(%edx, %edi)
1350 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001351 jae L(sh_9_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001352
Jack Renc47703a2012-02-14 12:01:52 +04001353L(sh_9_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 9 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001354 lea 32(%ecx), %ecx
1355 add %ecx, %edi
1356 add %edi, %edx
1357 lea 9(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001358 POP (%edi)
1359 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001360
/*
 * shl_10: forward copy path built around palignr $10, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001361 CFI_PUSH (%edi)
1362
1363 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001364L(shl_10):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 10. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001365#ifndef USE_AS_MEMMOVE
1366 movaps -10(%eax), %xmm1
1367#else
1368 movl DEST+4(%esp), %edi
1369 movaps -10(%eax), %xmm1
1370 movdqu %xmm0, (%edi)
1371#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1372#ifdef DATA_CACHE_SIZE_HALF
1373 cmp $DATA_CACHE_SIZE_HALF, %ecx
1374#else
1375# if (defined SHARED || defined __PIC__)
1376 SETUP_PIC_REG(bx)
1377 add $_GLOBAL_OFFSET_TABLE_, %ebx
1378 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1379# else
1380 cmp __x86_data_cache_size_half, %ecx
1381# endif
1382#endif
1383 jb L(sh_10_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001384
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001385 lea -64(%ecx), %ecx
1386
1387 .p2align 4
1388L(Shl10LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $10 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1389 prefetcht0 0x1c0(%eax)
1390 prefetcht0 0x1c0(%edx)
1391 movaps 6(%eax), %xmm2
1392 movaps 22(%eax), %xmm3
1393 movaps 38(%eax), %xmm4
1394 movaps 54(%eax), %xmm5
1395 movaps %xmm5, %xmm7
1396 palignr $10, %xmm4, %xmm5
1397 palignr $10, %xmm3, %xmm4
1398 movaps %xmm5, 48(%edx)
1399 palignr $10, %xmm2, %xmm3
1400 lea 64(%eax), %eax
1401 palignr $10, %xmm1, %xmm2
1402 movaps %xmm4, 32(%edx)
1403 movaps %xmm3, 16(%edx)
1404 movaps %xmm7, %xmm1
1405 movaps %xmm2, (%edx)
1406 lea 64(%edx), %edx
1407 sub $64, %ecx
1408 ja L(Shl10LoopStart)
1409
1410L(Shl10LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1411 add $32, %ecx
1412 jle L(shl_end_0)
1413
1414 movaps 6(%eax), %xmm2
1415 movaps 22(%eax), %xmm3
1416 palignr $10, %xmm2, %xmm3
1417 palignr $10, %xmm1, %xmm2
1418
1419 movaps %xmm2, (%edx)
1420 movaps %xmm3, 16(%edx)
1421 lea 32(%edx, %ecx), %edx
1422 lea 32(%eax, %ecx), %eax
1423 POP (%edi)
1424 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1425
/*
 * sh_10_no_prefetch: palignr $10 copy loop without prefetch instructions;
 * shl_10 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 10 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 10, loaded by shl_10.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1426 CFI_PUSH (%edi)
1427
1428 .p2align 4
1429L(sh_10_no_prefetch):
1430 lea -32(%ecx), %ecx
1431 lea -10(%eax), %eax
1432 xor %edi, %edi
1433
1434 .p2align 4
1435L(sh_10_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001436 movdqa 16(%eax, %edi), %xmm2
1437 sub $32, %ecx
1438 movdqa 32(%eax, %edi), %xmm3
1439 movdqa %xmm3, %xmm4
1440 palignr $10, %xmm2, %xmm3
1441 palignr $10, %xmm1, %xmm2
1442 lea 32(%edi), %edi
1443 movdqa %xmm2, -32(%edx, %edi)
1444 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001445 jb L(sh_10_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001446
1447 movdqa 16(%eax, %edi), %xmm2
1448 sub $32, %ecx
1449 movdqa 32(%eax, %edi), %xmm3
1450 movdqa %xmm3, %xmm1
1451 palignr $10, %xmm2, %xmm3
1452 palignr $10, %xmm4, %xmm2
1453 lea 32(%edi), %edi
1454 movdqa %xmm2, -32(%edx, %edi)
1455 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001456 jae L(sh_10_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001457
Jack Renc47703a2012-02-14 12:01:52 +04001458L(sh_10_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 10 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001459 lea 32(%ecx), %ecx
1460 add %ecx, %edi
1461 add %edi, %edx
1462 lea 10(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001463 POP (%edi)
1464 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001465
/*
 * shl_11: forward copy path built around palignr $11, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001466 CFI_PUSH (%edi)
1467
1468 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001469L(shl_11):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 11. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001470#ifndef USE_AS_MEMMOVE
1471 movaps -11(%eax), %xmm1
1472#else
1473 movl DEST+4(%esp), %edi
1474 movaps -11(%eax), %xmm1
1475 movdqu %xmm0, (%edi)
1476#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1477#ifdef DATA_CACHE_SIZE_HALF
1478 cmp $DATA_CACHE_SIZE_HALF, %ecx
1479#else
1480# if (defined SHARED || defined __PIC__)
1481 SETUP_PIC_REG(bx)
1482 add $_GLOBAL_OFFSET_TABLE_, %ebx
1483 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1484# else
1485 cmp __x86_data_cache_size_half, %ecx
1486# endif
1487#endif
1488 jb L(sh_11_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001489
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001490 lea -64(%ecx), %ecx
1491
1492 .p2align 4
1493L(Shl11LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $11 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1494 prefetcht0 0x1c0(%eax)
1495 prefetcht0 0x1c0(%edx)
1496 movaps 5(%eax), %xmm2
1497 movaps 21(%eax), %xmm3
1498 movaps 37(%eax), %xmm4
1499 movaps 53(%eax), %xmm5
1500 movaps %xmm5, %xmm7
1501 palignr $11, %xmm4, %xmm5
1502 palignr $11, %xmm3, %xmm4
1503 movaps %xmm5, 48(%edx)
1504 palignr $11, %xmm2, %xmm3
1505 lea 64(%eax), %eax
1506 palignr $11, %xmm1, %xmm2
1507 movaps %xmm4, 32(%edx)
1508 movaps %xmm3, 16(%edx)
1509 movaps %xmm7, %xmm1
1510 movaps %xmm2, (%edx)
1511 lea 64(%edx), %edx
1512 sub $64, %ecx
1513 ja L(Shl11LoopStart)
1514
1515L(Shl11LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1516 add $32, %ecx
1517 jle L(shl_end_0)
1518
1519 movaps 5(%eax), %xmm2
1520 movaps 21(%eax), %xmm3
1521 palignr $11, %xmm2, %xmm3
1522 palignr $11, %xmm1, %xmm2
1523
1524 movaps %xmm2, (%edx)
1525 movaps %xmm3, 16(%edx)
1526 lea 32(%edx, %ecx), %edx
1527 lea 32(%eax, %ecx), %eax
1528 POP (%edi)
1529 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1530
/*
 * sh_11_no_prefetch: palignr $11 copy loop without prefetch instructions;
 * shl_11 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 11 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 11, loaded by shl_11.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1531 CFI_PUSH (%edi)
1532
1533 .p2align 4
1534L(sh_11_no_prefetch):
1535 lea -32(%ecx), %ecx
1536 lea -11(%eax), %eax
1537 xor %edi, %edi
1538
1539 .p2align 4
1540L(sh_11_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001541 movdqa 16(%eax, %edi), %xmm2
1542 sub $32, %ecx
1543 movdqa 32(%eax, %edi), %xmm3
1544 movdqa %xmm3, %xmm4
1545 palignr $11, %xmm2, %xmm3
1546 palignr $11, %xmm1, %xmm2
1547 lea 32(%edi), %edi
1548 movdqa %xmm2, -32(%edx, %edi)
1549 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001550 jb L(sh_11_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001551
1552 movdqa 16(%eax, %edi), %xmm2
1553 sub $32, %ecx
1554 movdqa 32(%eax, %edi), %xmm3
1555 movdqa %xmm3, %xmm1
1556 palignr $11, %xmm2, %xmm3
1557 palignr $11, %xmm4, %xmm2
1558 lea 32(%edi), %edi
1559 movdqa %xmm2, -32(%edx, %edi)
1560 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001561 jae L(sh_11_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001562
Jack Renc47703a2012-02-14 12:01:52 +04001563L(sh_11_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 11 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001564 lea 32(%ecx), %ecx
1565 add %ecx, %edi
1566 add %edi, %edx
1567 lea 11(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001568 POP (%edi)
1569 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001570
/*
 * shl_12: forward copy path built around palignr $12, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001571 CFI_PUSH (%edi)
1572
1573 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001574L(shl_12):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 12. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001575#ifndef USE_AS_MEMMOVE
1576 movaps -12(%eax), %xmm1
1577#else
1578 movl DEST+4(%esp), %edi
1579 movaps -12(%eax), %xmm1
1580 movdqu %xmm0, (%edi)
1581#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1582#ifdef DATA_CACHE_SIZE_HALF
1583 cmp $DATA_CACHE_SIZE_HALF, %ecx
1584#else
1585# if (defined SHARED || defined __PIC__)
1586 SETUP_PIC_REG(bx)
1587 add $_GLOBAL_OFFSET_TABLE_, %ebx
1588 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1589# else
1590 cmp __x86_data_cache_size_half, %ecx
1591# endif
1592#endif
1593 jb L(sh_12_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001594
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001595 lea -64(%ecx), %ecx
1596
1597 .p2align 4
1598L(Shl12LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $12 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1599 prefetcht0 0x1c0(%eax)
1600 prefetcht0 0x1c0(%edx)
1601 movaps 4(%eax), %xmm2
1602 movaps 20(%eax), %xmm3
1603 movaps 36(%eax), %xmm4
1604 movaps 52(%eax), %xmm5
1605 movaps %xmm5, %xmm7
1606 palignr $12, %xmm4, %xmm5
1607 palignr $12, %xmm3, %xmm4
1608 movaps %xmm5, 48(%edx)
1609 palignr $12, %xmm2, %xmm3
1610 lea 64(%eax), %eax
1611 palignr $12, %xmm1, %xmm2
1612 movaps %xmm4, 32(%edx)
1613 movaps %xmm3, 16(%edx)
1614 movaps %xmm7, %xmm1
1615 movaps %xmm2, (%edx)
1616 lea 64(%edx), %edx
1617 sub $64, %ecx
1618 ja L(Shl12LoopStart)
1619
1620L(Shl12LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1621 add $32, %ecx
1622 jle L(shl_end_0)
1623
1624 movaps 4(%eax), %xmm2
1625 movaps 20(%eax), %xmm3
1626 palignr $12, %xmm2, %xmm3
1627 palignr $12, %xmm1, %xmm2
1628
1629 movaps %xmm2, (%edx)
1630 movaps %xmm3, 16(%edx)
1631 lea 32(%edx, %ecx), %edx
1632 lea 32(%eax, %ecx), %eax
1633 POP (%edi)
1634 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1635
/*
 * sh_12_no_prefetch: palignr $12 copy loop without prefetch instructions;
 * shl_12 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 12 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 12, loaded by shl_12.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1636 CFI_PUSH (%edi)
1637
1638 .p2align 4
1639L(sh_12_no_prefetch):
1640 lea -32(%ecx), %ecx
1641 lea -12(%eax), %eax
1642 xor %edi, %edi
1643
1644 .p2align 4
1645L(sh_12_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001646 movdqa 16(%eax, %edi), %xmm2
1647 sub $32, %ecx
1648 movdqa 32(%eax, %edi), %xmm3
1649 movdqa %xmm3, %xmm4
1650 palignr $12, %xmm2, %xmm3
1651 palignr $12, %xmm1, %xmm2
1652 lea 32(%edi), %edi
1653 movdqa %xmm2, -32(%edx, %edi)
1654 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001655 jb L(sh_12_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001656
1657 movdqa 16(%eax, %edi), %xmm2
1658 sub $32, %ecx
1659 movdqa 32(%eax, %edi), %xmm3
1660 movdqa %xmm3, %xmm1
1661 palignr $12, %xmm2, %xmm3
1662 palignr $12, %xmm4, %xmm2
1663 lea 32(%edi), %edi
1664 movdqa %xmm2, -32(%edx, %edi)
1665 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001666 jae L(sh_12_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001667
Jack Renc47703a2012-02-14 12:01:52 +04001668L(sh_12_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 12 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001669 lea 32(%ecx), %ecx
1670 add %ecx, %edi
1671 add %edi, %edx
1672 lea 12(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001673 POP (%edi)
1674 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001675
/*
 * shl_13: forward copy path built around palignr $13, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001676 CFI_PUSH (%edi)
1677
1678 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001679L(shl_13):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 13. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001680#ifndef USE_AS_MEMMOVE
1681 movaps -13(%eax), %xmm1
1682#else
1683 movl DEST+4(%esp), %edi
1684 movaps -13(%eax), %xmm1
1685 movdqu %xmm0, (%edi)
1686#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1687#ifdef DATA_CACHE_SIZE_HALF
1688 cmp $DATA_CACHE_SIZE_HALF, %ecx
1689#else
1690# if (defined SHARED || defined __PIC__)
1691 SETUP_PIC_REG(bx)
1692 add $_GLOBAL_OFFSET_TABLE_, %ebx
1693 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1694# else
1695 cmp __x86_data_cache_size_half, %ecx
1696# endif
1697#endif
1698 jb L(sh_13_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001699
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001700 lea -64(%ecx), %ecx
1701
1702 .p2align 4
1703L(Shl13LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $13 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1704 prefetcht0 0x1c0(%eax)
1705 prefetcht0 0x1c0(%edx)
1706 movaps 3(%eax), %xmm2
1707 movaps 19(%eax), %xmm3
1708 movaps 35(%eax), %xmm4
1709 movaps 51(%eax), %xmm5
1710 movaps %xmm5, %xmm7
1711 palignr $13, %xmm4, %xmm5
1712 palignr $13, %xmm3, %xmm4
1713 movaps %xmm5, 48(%edx)
1714 palignr $13, %xmm2, %xmm3
1715 lea 64(%eax), %eax
1716 palignr $13, %xmm1, %xmm2
1717 movaps %xmm4, 32(%edx)
1718 movaps %xmm3, 16(%edx)
1719 movaps %xmm7, %xmm1
1720 movaps %xmm2, (%edx)
1721 lea 64(%edx), %edx
1722 sub $64, %ecx
1723 ja L(Shl13LoopStart)
1724
1725L(Shl13LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1726 add $32, %ecx
1727 jle L(shl_end_0)
1728
1729 movaps 3(%eax), %xmm2
1730 movaps 19(%eax), %xmm3
1731 palignr $13, %xmm2, %xmm3
1732 palignr $13, %xmm1, %xmm2
1733
1734 movaps %xmm2, (%edx)
1735 movaps %xmm3, 16(%edx)
1736 lea 32(%edx, %ecx), %edx
1737 lea 32(%eax, %ecx), %eax
1738 POP (%edi)
1739 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1740
/*
 * sh_13_no_prefetch: palignr $13 copy loop without prefetch instructions;
 * shl_13 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 13 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 13, loaded by shl_13.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1741 CFI_PUSH (%edi)
1742
1743 .p2align 4
1744L(sh_13_no_prefetch):
1745 lea -32(%ecx), %ecx
1746 lea -13(%eax), %eax
1747 xor %edi, %edi
1748
1749 .p2align 4
1750L(sh_13_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001751 movdqa 16(%eax, %edi), %xmm2
1752 sub $32, %ecx
1753 movdqa 32(%eax, %edi), %xmm3
1754 movdqa %xmm3, %xmm4
1755 palignr $13, %xmm2, %xmm3
1756 palignr $13, %xmm1, %xmm2
1757 lea 32(%edi), %edi
1758 movdqa %xmm2, -32(%edx, %edi)
1759 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001760 jb L(sh_13_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001761
1762 movdqa 16(%eax, %edi), %xmm2
1763 sub $32, %ecx
1764 movdqa 32(%eax, %edi), %xmm3
1765 movdqa %xmm3, %xmm1
1766 palignr $13, %xmm2, %xmm3
1767 palignr $13, %xmm4, %xmm2
1768 lea 32(%edi), %edi
1769 movdqa %xmm2, -32(%edx, %edi)
1770 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001771 jae L(sh_13_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001772
Jack Renc47703a2012-02-14 12:01:52 +04001773L(sh_13_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 13 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001774 lea 32(%ecx), %ecx
1775 add %ecx, %edi
1776 add %edi, %edx
1777 lea 13(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001778 POP (%edi)
1779 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001780
/*
 * shl_14: forward copy path built around palignr $14, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001781 CFI_PUSH (%edi)
1782
1783 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001784L(shl_14):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 14. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001785#ifndef USE_AS_MEMMOVE
1786 movaps -14(%eax), %xmm1
1787#else
1788 movl DEST+4(%esp), %edi
1789 movaps -14(%eax), %xmm1
1790 movdqu %xmm0, (%edi)
1791#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1792#ifdef DATA_CACHE_SIZE_HALF
1793 cmp $DATA_CACHE_SIZE_HALF, %ecx
1794#else
1795# if (defined SHARED || defined __PIC__)
1796 SETUP_PIC_REG(bx)
1797 add $_GLOBAL_OFFSET_TABLE_, %ebx
1798 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1799# else
1800 cmp __x86_data_cache_size_half, %ecx
1801# endif
1802#endif
1803 jb L(sh_14_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001804
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001805 lea -64(%ecx), %ecx
1806
1807 .p2align 4
1808L(Shl14LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $14 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1809 prefetcht0 0x1c0(%eax)
1810 prefetcht0 0x1c0(%edx)
1811 movaps 2(%eax), %xmm2
1812 movaps 18(%eax), %xmm3
1813 movaps 34(%eax), %xmm4
1814 movaps 50(%eax), %xmm5
1815 movaps %xmm5, %xmm7
1816 palignr $14, %xmm4, %xmm5
1817 palignr $14, %xmm3, %xmm4
1818 movaps %xmm5, 48(%edx)
1819 palignr $14, %xmm2, %xmm3
1820 lea 64(%eax), %eax
1821 palignr $14, %xmm1, %xmm2
1822 movaps %xmm4, 32(%edx)
1823 movaps %xmm3, 16(%edx)
1824 movaps %xmm7, %xmm1
1825 movaps %xmm2, (%edx)
1826 lea 64(%edx), %edx
1827 sub $64, %ecx
1828 ja L(Shl14LoopStart)
1829
1830L(Shl14LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1831 add $32, %ecx
1832 jle L(shl_end_0)
1833
1834 movaps 2(%eax), %xmm2
1835 movaps 18(%eax), %xmm3
1836 palignr $14, %xmm2, %xmm3
1837 palignr $14, %xmm1, %xmm2
1838
1839 movaps %xmm2, (%edx)
1840 movaps %xmm3, 16(%edx)
1841 lea 32(%edx, %ecx), %edx
1842 lea 32(%eax, %ecx), %eax
1843 POP (%edi)
1844 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1845
/*
 * sh_14_no_prefetch: palignr $14 copy loop without prefetch instructions;
 * shl_14 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 14 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 14, loaded by shl_14.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1846 CFI_PUSH (%edi)
1847
1848 .p2align 4
1849L(sh_14_no_prefetch):
1850 lea -32(%ecx), %ecx
1851 lea -14(%eax), %eax
1852 xor %edi, %edi
1853
1854 .p2align 4
1855L(sh_14_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001856 movdqa 16(%eax, %edi), %xmm2
1857 sub $32, %ecx
1858 movdqa 32(%eax, %edi), %xmm3
1859 movdqa %xmm3, %xmm4
1860 palignr $14, %xmm2, %xmm3
1861 palignr $14, %xmm1, %xmm2
1862 lea 32(%edi), %edi
1863 movdqa %xmm2, -32(%edx, %edi)
1864 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001865 jb L(sh_14_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001866
1867 movdqa 16(%eax, %edi), %xmm2
1868 sub $32, %ecx
1869 movdqa 32(%eax, %edi), %xmm3
1870 movdqa %xmm3, %xmm1
1871 palignr $14, %xmm2, %xmm3
1872 palignr $14, %xmm4, %xmm2
1873 lea 32(%edi), %edi
1874 movdqa %xmm2, -32(%edx, %edi)
1875 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001876 jae L(sh_14_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001877
Jack Renc47703a2012-02-14 12:01:52 +04001878L(sh_14_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 14 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001879 lea 32(%ecx), %ecx
1880 add %ecx, %edi
1881 add %edi, %edx
1882 lea 14(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001883 POP (%edi)
1884 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001885
/*
 * shl_15: forward copy path built around palignr $15, with a prefetching
 * 64-byte main loop.
 * Registers as used below: %eax = source pointer, %edx = destination
 * pointer (written with movaps), %ecx = byte count, %xmm1 = previously
 * loaded source chunk carried into the next merge.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001886 CFI_PUSH (%edi)
1887
1888 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001889L(shl_15):
/*
 * Prime %xmm1 with the 16 bytes at %eax - 15. The memmove build also
 * stores %xmm0 through the pointer read from DEST+4(%esp); what %xmm0
 * holds is set up outside this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001890#ifndef USE_AS_MEMMOVE
1891 movaps -15(%eax), %xmm1
1892#else
1893 movl DEST+4(%esp), %edi
1894 movaps -15(%eax), %xmm1
1895 movdqu %xmm0, (%edi)
1896#endif
/* Counts below half the data cache size branch to the loop that has no prefetch. */
1897#ifdef DATA_CACHE_SIZE_HALF
1898 cmp $DATA_CACHE_SIZE_HALF, %ecx
1899#else
1900# if (defined SHARED || defined __PIC__)
1901 SETUP_PIC_REG(bx)
1902 add $_GLOBAL_OFFSET_TABLE_, %ebx
1903 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1904# else
1905 cmp __x86_data_cache_size_half, %ecx
1906# endif
1907#endif
1908 jb L(sh_15_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001909
/* Bias %ecx by -64 for the sub/ja test at the bottom of the loop. */
Jack Renc47703a2012-02-14 12:01:52 +04001910 lea -64(%ecx), %ecx
1911
1912 .p2align 4
1913L(Shl15LoopStart):
/*
 * Per pass: prefetch 0x1c0 bytes ahead of both pointers, load four
 * 16-byte chunks, merge each with its predecessor via palignr $15 and
 * store 64 bytes at (%edx). %xmm7 keeps the last loaded chunk, which
 * becomes %xmm1 for the next pass.
 */
1914 prefetcht0 0x1c0(%eax)
1915 prefetcht0 0x1c0(%edx)
1916 movaps 1(%eax), %xmm2
1917 movaps 17(%eax), %xmm3
1918 movaps 33(%eax), %xmm4
1919 movaps 49(%eax), %xmm5
1920 movaps %xmm5, %xmm7
1921 palignr $15, %xmm4, %xmm5
1922 palignr $15, %xmm3, %xmm4
1923 movaps %xmm5, 48(%edx)
1924 palignr $15, %xmm2, %xmm3
1925 lea 64(%eax), %eax
1926 palignr $15, %xmm1, %xmm2
1927 movaps %xmm4, 32(%edx)
1928 movaps %xmm3, 16(%edx)
1929 movaps %xmm7, %xmm1
1930 movaps %xmm2, (%edx)
1931 lea 64(%edx), %edx
1932 sub $64, %ecx
1933 ja L(Shl15LoopStart)
1934
1935L(Shl15LoopLeave):
/*
 * add $32 turns the biased count into (bytes left - 32). If that is <= 0,
 * shl_end_0 finishes; otherwise copy 32 more bytes, step both pointers by
 * 32 + %ecx and dispatch on %ecx through table_48bytes_fwd.
 */
1936 add $32, %ecx
1937 jle L(shl_end_0)
1938
1939 movaps 1(%eax), %xmm2
1940 movaps 17(%eax), %xmm3
1941 palignr $15, %xmm2, %xmm3
1942 palignr $15, %xmm1, %xmm2
1943
1944 movaps %xmm2, (%edx)
1945 movaps %xmm3, 16(%edx)
1946 lea 32(%edx, %ecx), %edx
1947 lea 32(%eax, %ecx), %eax
1948 POP (%edi)
1949 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1950
/*
 * sh_15_no_prefetch: palignr $15 copy loop without prefetch instructions;
 * shl_15 branches here when the count compares below half the data cache
 * size.
 * Setup: %ecx is biased by -32, %eax is moved back by 15 (the loop loads
 * with movdqa at 16 and 32 past %eax + %edi), and %edi is zeroed to serve
 * as the running offset applied to both %eax and %edx.
 * %xmm1 already holds the chunk at (incoming %eax) - 15, loaded by shl_15.
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
1951 CFI_PUSH (%edi)
1952
1953 .p2align 4
1954L(sh_15_no_prefetch):
1955 lea -32(%ecx), %ecx
1956 lea -15(%eax), %eax
1957 xor %edi, %edi
1958
1959 .p2align 4
1960L(sh_15_no_prefetch_loop):
/*
 * Unrolled twice, 32 bytes per half. The first half merges against %xmm1
 * and saves its last loaded chunk in %xmm4; the second half merges
 * against %xmm4 and saves its last loaded chunk in %xmm1. jb / jae test
 * the flags of the sub $32, %ecx issued earlier in the same half.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001961 movdqa 16(%eax, %edi), %xmm2
1962 sub $32, %ecx
1963 movdqa 32(%eax, %edi), %xmm3
1964 movdqa %xmm3, %xmm4
1965 palignr $15, %xmm2, %xmm3
1966 palignr $15, %xmm1, %xmm2
1967 lea 32(%edi), %edi
1968 movdqa %xmm2, -32(%edx, %edi)
1969 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001970 jb L(sh_15_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001971
1972 movdqa 16(%eax, %edi), %xmm2
1973 sub $32, %ecx
1974 movdqa 32(%eax, %edi), %xmm3
1975 movdqa %xmm3, %xmm1
1976 palignr $15, %xmm2, %xmm3
1977 palignr $15, %xmm4, %xmm2
1978 lea 32(%edi), %edi
1979 movdqa %xmm2, -32(%edx, %edi)
1980 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001981 jae L(sh_15_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001982
Jack Renc47703a2012-02-14 12:01:52 +04001983L(sh_15_end_no_prefetch_loop):
/*
 * Remove the -32 bias, fold the leftover count into %edi, advance %edx by
 * that total and %eax by the total plus 15 (undoing the earlier move
 * back), then dispatch on %ecx through table_48bytes_fwd.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08001984 lea 32(%ecx), %ecx
1985 add %ecx, %edi
1986 add %edi, %edx
1987 lea 15(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001988 POP (%edi)
1989 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001990
/*
 * shl_end_0: shared exit reached from the Shl*LoopLeave code (jle) when
 * add $32 left %ecx <= 0. Adds 32 back to %ecx, advances %edx and %eax by
 * the resulting count, and dispatches on %ecx through table_48bytes_fwd
 * after POP (%edi).
 * NOTE(review): the leading CFI_PUSH (%edi) follows code that ended with
 * POP (%edi); presumably it re-marks %edi as saved for the unwind info of
 * this path -- macro is defined elsewhere, confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04001991 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001992
Jack Renc47703a2012-02-14 12:01:52 +04001993 .p2align 4
1994L(shl_end_0):
1995 lea 32(%ecx), %ecx
1996 lea (%edx, %ecx), %edx
1997 lea (%eax, %ecx), %eax
1998 POP (%edi)
1999 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
2000
/*
 * fwd_write_44bytes .. fwd_write_4bytes: fall-through chain of tail
 * handlers. Each label copies 8 bytes from a negative offset off %eax to
 * the same offset off %edx through %xmm0 and falls into the handler for
 * 8 fewer bytes; fwd_write_4bytes copies the last 4 bytes through %ecx.
 * All addressing is negative, so %eax and %edx are expected to point just
 * past the bytes being copied.
 * Before RETURN, unless built as USE_AS_BCOPY, %eax is set to %edx
 * (USE_AS_MEMPCPY) or to the value at DEST(%esp).
 * NOTE(review): presumably these labels are entries of
 * table_48bytes_fwd; the table is not part of this chunk.
 */
2001 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002002L(fwd_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002003 movq -44(%eax), %xmm0
2004 movq %xmm0, -44(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002005L(fwd_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002006 movq -36(%eax), %xmm0
2007 movq %xmm0, -36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002008L(fwd_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002009 movq -28(%eax), %xmm0
2010 movq %xmm0, -28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002011L(fwd_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002012 movq -20(%eax), %xmm0
2013 movq %xmm0, -20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002014L(fwd_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002015 movq -12(%eax), %xmm0
2016 movq %xmm0, -12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002017L(fwd_write_4bytes):
2018 movl -4(%eax), %ecx
2019 movl %ecx, -4(%edx)
/* Select the return value in %eax according to the build flavour. */
Jack Renc47703a2012-02-14 12:01:52 +04002020#ifndef USE_AS_BCOPY
2021# ifdef USE_AS_MEMPCPY
2022 movl %edx, %eax
2023# else
2024 movl DEST(%esp), %eax
2025# endif
2026#endif
2027 RETURN
2028
/*
 * fwd_write_40bytes .. fwd_write_0bytes: fall-through chain of tail
 * handlers for counts that are multiples of 8. Each label copies 8 bytes
 * from a negative offset off %eax to the same offset off %edx through
 * %xmm0 and falls into the handler for 8 fewer bytes; fwd_write_0bytes
 * copies nothing and only produces the return value.
 * All addressing is negative, so %eax and %edx are expected to point just
 * past the bytes being copied.
 * Before RETURN, unless built as USE_AS_BCOPY, %eax is set to %edx
 * (USE_AS_MEMPCPY) or to the value at DEST(%esp).
 * NOTE(review): presumably these labels are entries of
 * table_48bytes_fwd; the table is not part of this chunk.
 */
2029 .p2align 4
2030L(fwd_write_40bytes):
2031 movq -40(%eax), %xmm0
2032 movq %xmm0, -40(%edx)
2033L(fwd_write_32bytes):
2034 movq -32(%eax), %xmm0
2035 movq %xmm0, -32(%edx)
2036L(fwd_write_24bytes):
2037 movq -24(%eax), %xmm0
2038 movq %xmm0, -24(%edx)
2039L(fwd_write_16bytes):
2040 movq -16(%eax), %xmm0
2041 movq %xmm0, -16(%edx)
2042L(fwd_write_8bytes):
2043 movq -8(%eax), %xmm0
2044 movq %xmm0, -8(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002045L(fwd_write_0bytes):
/* Select the return value in %eax according to the build flavour. */
2046#ifndef USE_AS_BCOPY
2047# ifdef USE_AS_MEMPCPY
2048 movl %edx, %eax
2049# else
2050 movl DEST(%esp), %eax
2051# endif
2052#endif
2053 RETURN
2054
/*
 * fwd_write_5bytes: copies the final 5 bytes with two overlapping 4-byte
 * moves (offsets -5 and -4). Both loads are issued before either store,
 * and %eax itself is reused as the second scratch register, so the source
 * pointer is gone after the second load.
 * Before RETURN, unless built as USE_AS_BCOPY, %eax is set to %edx
 * (USE_AS_MEMPCPY) or to the value at DEST(%esp).
 * NOTE(review): presumably an entry of table_48bytes_fwd; the table is
 * not part of this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002055 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002056L(fwd_write_5bytes):
2057 movl -5(%eax), %ecx
2058 movl -4(%eax), %eax
2059 movl %ecx, -5(%edx)
2060 movl %eax, -4(%edx)
/* Select the return value in %eax according to the build flavour. */
2061#ifndef USE_AS_BCOPY
2062# ifdef USE_AS_MEMPCPY
2063 movl %edx, %eax
2064# else
2065 movl DEST(%esp), %eax
2066# endif
2067#endif
2068 RETURN
2069
/*
 * fwd_write_45bytes .. fwd_write_13bytes: fall-through chain of tail
 * handlers. Each label copies 8 bytes from a negative offset off %eax to
 * the same offset off %edx through %xmm0 and falls into the handler for
 * 8 fewer bytes. fwd_write_13bytes finishes the last 5 bytes inline with
 * a 4-byte move at -5 and a 1-byte move at -1 instead of falling into
 * another label.
 * All addressing is negative, so %eax and %edx are expected to point just
 * past the bytes being copied.
 * Before RETURN, unless built as USE_AS_BCOPY, %eax is set to %edx
 * (USE_AS_MEMPCPY) or to the value at DEST(%esp).
 * NOTE(review): presumably these labels are entries of
 * table_48bytes_fwd; the table is not part of this chunk.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002070 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002071L(fwd_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002072 movq -45(%eax), %xmm0
2073 movq %xmm0, -45(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002074L(fwd_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002075 movq -37(%eax), %xmm0
2076 movq %xmm0, -37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002077L(fwd_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002078 movq -29(%eax), %xmm0
2079 movq %xmm0, -29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002080L(fwd_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002081 movq -21(%eax), %xmm0
2082 movq %xmm0, -21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002083L(fwd_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002084 movq -13(%eax), %xmm0
2085 movq %xmm0, -13(%edx)
/* Remaining 5 bytes: 4 bytes at -5, then the last byte at -1. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002086 movl -5(%eax), %ecx
2087 movl %ecx, -5(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002088 movzbl -1(%eax), %ecx
2089 movb %cl, -1(%edx)
/* Select the return value in %eax according to the build flavour. */
2090#ifndef USE_AS_BCOPY
2091# ifdef USE_AS_MEMPCPY
2092 movl %edx, %eax
2093# else
2094 movl DEST(%esp), %eax
2095# endif
2096#endif
2097 RETURN
2098
/*
 * Forward tail copy, fall-through chain for lengths 41, 33, 25, 17, 9 and 1.
 * Entering at L(fwd_write_Nbytes) copies the N bytes that end at (%eax) to
 * the N bytes that end at (%edx), lowest address first: 8-byte steps through
 * %xmm0, then the final byte through %ecx/%cl.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2099 .p2align 4
2100L(fwd_write_41bytes):
2101 movq -41(%eax), %xmm0
2102 movq %xmm0, -41(%edx)
2103L(fwd_write_33bytes):
2104 movq -33(%eax), %xmm0
2105 movq %xmm0, -33(%edx)
2106L(fwd_write_25bytes):
2107 movq -25(%eax), %xmm0
2108 movq %xmm0, -25(%edx)
2109L(fwd_write_17bytes):
2110 movq -17(%eax), %xmm0
2111 movq %xmm0, -17(%edx)
2112L(fwd_write_9bytes):
2113 movq -9(%eax), %xmm0
2114 movq %xmm0, -9(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002115L(fwd_write_1bytes):
2116 movzbl -1(%eax), %ecx
2117 movb %cl, -1(%edx)
2118#ifndef USE_AS_BCOPY
2119# ifdef USE_AS_MEMPCPY
2120 movl %edx, %eax
2121# else
2122 movl DEST(%esp), %eax
2123# endif
2124#endif
2125 RETURN
2126
/*
 * Forward tail copy, fall-through chain for lengths 46, 38, 30, 22, 14 and 6.
 * Entering at L(fwd_write_Nbytes) copies the N bytes that end at (%eax) to
 * the N bytes that end at (%edx), lowest address first: 8-byte steps through
 * %xmm0, then a 4-byte move at offset -6 and a 2-byte move at offset -2,
 * both through %ecx.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002127 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002128L(fwd_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002129 movq -46(%eax), %xmm0
2130 movq %xmm0, -46(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002131L(fwd_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002132 movq -38(%eax), %xmm0
2133 movq %xmm0, -38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002134L(fwd_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002135 movq -30(%eax), %xmm0
2136 movq %xmm0, -30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002137L(fwd_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002138 movq -22(%eax), %xmm0
2139 movq %xmm0, -22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002140L(fwd_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002141 movq -14(%eax), %xmm0
2142 movq %xmm0, -14(%edx)
/* Last 6 bytes: 4 bytes at -6, then 2 bytes at -2. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002143L(fwd_write_6bytes):
2144 movl -6(%eax), %ecx
2145 movl %ecx, -6(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002146 movzwl -2(%eax), %ecx
2147 movw %cx, -2(%edx)
2148#ifndef USE_AS_BCOPY
2149# ifdef USE_AS_MEMPCPY
2150 movl %edx, %eax
2151# else
2152 movl DEST(%esp), %eax
2153# endif
2154#endif
2155 RETURN
2156
/*
 * Forward tail copy, fall-through chain for lengths 42, 34, 26, 18, 10 and 2.
 * Entering at L(fwd_write_Nbytes) copies the N bytes that end at (%eax) to
 * the N bytes that end at (%edx), lowest address first: 8-byte steps through
 * %xmm0, then the final 2 bytes through %ecx/%cx.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2157 .p2align 4
2158L(fwd_write_42bytes):
2159 movq -42(%eax), %xmm0
2160 movq %xmm0, -42(%edx)
2161L(fwd_write_34bytes):
2162 movq -34(%eax), %xmm0
2163 movq %xmm0, -34(%edx)
2164L(fwd_write_26bytes):
2165 movq -26(%eax), %xmm0
2166 movq %xmm0, -26(%edx)
2167L(fwd_write_18bytes):
2168 movq -18(%eax), %xmm0
2169 movq %xmm0, -18(%edx)
2170L(fwd_write_10bytes):
2171 movq -10(%eax), %xmm0
2172 movq %xmm0, -10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002173L(fwd_write_2bytes):
2174 movzwl -2(%eax), %ecx
2175 movw %cx, -2(%edx)
2176#ifndef USE_AS_BCOPY
2177# ifdef USE_AS_MEMPCPY
2178 movl %edx, %eax
2179# else
2180 movl DEST(%esp), %eax
2181# endif
2182#endif
2183 RETURN
2184
/*
 * Forward tail copy, fall-through chain for lengths 47, 39, 31, 23, 15 and 7.
 * Entering at L(fwd_write_Nbytes) copies the N bytes that end at (%eax) to
 * the N bytes that end at (%edx), lowest address first: 8-byte steps through
 * %xmm0, then 4 bytes at offset -7, 2 bytes at offset -3 and 1 byte at
 * offset -1.  The final byte is loaded into %eax, destroying the source
 * pointer; %eax is reloaded with the return value unless USE_AS_BCOPY is
 * defined.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002185 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002186L(fwd_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002187 movq -47(%eax), %xmm0
2188 movq %xmm0, -47(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002189L(fwd_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002190 movq -39(%eax), %xmm0
2191 movq %xmm0, -39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002192L(fwd_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002193 movq -31(%eax), %xmm0
2194 movq %xmm0, -31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002195L(fwd_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002196 movq -23(%eax), %xmm0
2197 movq %xmm0, -23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002198L(fwd_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002199 movq -15(%eax), %xmm0
2200 movq %xmm0, -15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002201L(fwd_write_7bytes):
2202 movl -7(%eax), %ecx
2203 movl %ecx, -7(%edx)
/* Last 3 bytes: both loads precede both stores. */
Jack Renc47703a2012-02-14 12:01:52 +04002204 movzwl -3(%eax), %ecx
2205 movzbl -1(%eax), %eax
2206 movw %cx, -3(%edx)
2207 movb %al, -1(%edx)
2208#ifndef USE_AS_BCOPY
2209# ifdef USE_AS_MEMPCPY
2210 movl %edx, %eax
2211# else
2212 movl DEST(%esp), %eax
2213# endif
2214#endif
2215 RETURN
2216
/*
 * Forward tail copy, fall-through chain for lengths 43, 35, 27, 19, 11 and 3.
 * Entering at L(fwd_write_Nbytes) copies the N bytes that end at (%eax) to
 * the N bytes that end at (%edx), lowest address first: 8-byte steps through
 * %xmm0, then 2 bytes at offset -3 and 1 byte at offset -1.  The final byte
 * is loaded into %eax, destroying the source pointer; %eax is reloaded with
 * the return value unless USE_AS_BCOPY is defined.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
2217 .p2align 4
2218L(fwd_write_43bytes):
2219 movq -43(%eax), %xmm0
2220 movq %xmm0, -43(%edx)
2221L(fwd_write_35bytes):
2222 movq -35(%eax), %xmm0
2223 movq %xmm0, -35(%edx)
2224L(fwd_write_27bytes):
2225 movq -27(%eax), %xmm0
2226 movq %xmm0, -27(%edx)
2227L(fwd_write_19bytes):
2228 movq -19(%eax), %xmm0
2229 movq %xmm0, -19(%edx)
2230L(fwd_write_11bytes):
2231 movq -11(%eax), %xmm0
2232 movq %xmm0, -11(%edx)
/* Last 3 bytes: both loads precede both stores. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002233L(fwd_write_3bytes):
2234 movzwl -3(%eax), %ecx
2235 movzbl -1(%eax), %eax
2236 movw %cx, -3(%edx)
2237 movb %al, -1(%edx)
2238#ifndef USE_AS_BCOPY
2239# ifdef USE_AS_MEMPCPY
2240 movl %edx, %eax
2241# else
2242 movl DEST(%esp), %eax
2243# endif
2244#endif
Jack Renc47703a2012-02-14 12:01:52 +04002245 RETURN
2246
/*
 * Aligned forward tail copy, fall-through chain for lengths 40, 24, 8 and 0.
 * Entering at L(fwd_write_Nbytes_align) copies the N bytes that end at
 * (%eax) to the N bytes that end at (%edx), lowest address first.
 * The 16-byte steps use movdqa, which faults unless its memory operand is
 * 16-byte aligned, so the 40- and 24-byte entries require (%eax)-N and
 * (%edx)-N to be 16-byte aligned -- presumably guaranteed by the code that
 * dispatches through L(table_48bytes_fwd_align); confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2247 .p2align 4
2248L(fwd_write_40bytes_align):
2249 movdqa -40(%eax), %xmm0
2250 movdqa %xmm0, -40(%edx)
2251L(fwd_write_24bytes_align):
2252 movdqa -24(%eax), %xmm0
2253 movdqa %xmm0, -24(%edx)
2254L(fwd_write_8bytes_align):
2255 movq -8(%eax), %xmm0
2256 movq %xmm0, -8(%edx)
2257L(fwd_write_0bytes_align):
2258#ifndef USE_AS_BCOPY
2259# ifdef USE_AS_MEMPCPY
2260 movl %edx, %eax
2261# else
2262 movl DEST(%esp), %eax
2263# endif
2264#endif
2265 RETURN
2266
/*
 * Aligned forward tail copy for lengths 32 and 16.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx) in 16-byte movdqa steps, lowest address first.
 * movdqa faults on a memory operand that is not 16-byte aligned, so both
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2267 .p2align 4
2268L(fwd_write_32bytes_align):
2269 movdqa -32(%eax), %xmm0
2270 movdqa %xmm0, -32(%edx)
2271L(fwd_write_16bytes_align):
2272 movdqa -16(%eax), %xmm0
2273 movdqa %xmm0, -16(%edx)
2274#ifndef USE_AS_BCOPY
2275# ifdef USE_AS_MEMPCPY
2276 movl %edx, %eax
2277# else
2278 movl DEST(%esp), %eax
2279# endif
2280#endif
2281 RETURN
2282
/*
 * Forward tail copy of exactly 5 bytes, entry used by the aligned jump
 * table.  Copies the 5 bytes ending at (%eax) to the 5 bytes ending at
 * (%edx) with two 4-byte moves that overlap by 3 bytes; both loads are
 * issued before either store.  No aligned (movdqa) access is used here.
 * The second load overwrites %eax (the source pointer); %eax is reloaded
 * with the return value afterwards unless USE_AS_BCOPY is defined.
 */
2283 .p2align 4
2284L(fwd_write_5bytes_align):
2285 movl -5(%eax), %ecx
2286 movl -4(%eax), %eax
2287 movl %ecx, -5(%edx)
2288 movl %eax, -4(%edx)
/* Return value: %edx for MEMPCPY, else the word saved at DEST(%esp). */
2289#ifndef USE_AS_BCOPY
2290# ifdef USE_AS_MEMPCPY
2291 movl %edx, %eax
2292# else
2293 movl DEST(%esp), %eax
2294# endif
2295#endif
2296 RETURN
2297
/*
 * Aligned forward tail copy, fall-through chain for lengths 45, 29 and 13.
 * Entering at L(fwd_write_Nbytes_align) copies the N bytes that end at
 * (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (45 and 29 entries), an 8-byte movq, then 4 bytes at offset
 * -5 and 1 byte at offset -1 through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 45- and 29-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2298 .p2align 4
2299L(fwd_write_45bytes_align):
2300 movdqa -45(%eax), %xmm0
2301 movdqa %xmm0, -45(%edx)
2302L(fwd_write_29bytes_align):
2303 movdqa -29(%eax), %xmm0
2304 movdqa %xmm0, -29(%edx)
2305L(fwd_write_13bytes_align):
2306 movq -13(%eax), %xmm0
2307 movq %xmm0, -13(%edx)
2308 movl -5(%eax), %ecx
2309 movl %ecx, -5(%edx)
2310 movzbl -1(%eax), %ecx
2311 movb %cl, -1(%edx)
2312#ifndef USE_AS_BCOPY
2313# ifdef USE_AS_MEMPCPY
2314 movl %edx, %eax
2315# else
2316 movl DEST(%esp), %eax
2317# endif
2318#endif
2319 RETURN
2320
/*
 * Aligned forward tail copy for lengths 37 and 21.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * 4 bytes at offset -5 and 1 byte at offset -1 through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2321 .p2align 4
2322L(fwd_write_37bytes_align):
2323 movdqa -37(%eax), %xmm0
2324 movdqa %xmm0, -37(%edx)
2325L(fwd_write_21bytes_align):
2326 movdqa -21(%eax), %xmm0
2327 movdqa %xmm0, -21(%edx)
2328 movl -5(%eax), %ecx
2329 movl %ecx, -5(%edx)
2330 movzbl -1(%eax), %ecx
2331 movb %cl, -1(%edx)
2332#ifndef USE_AS_BCOPY
2333# ifdef USE_AS_MEMPCPY
2334 movl %edx, %eax
2335# else
2336 movl DEST(%esp), %eax
2337# endif
2338#endif
2339 RETURN
2340
/*
 * Aligned forward tail copy, fall-through chain for lengths 41, 25, 9 and 1.
 * Entering at L(fwd_write_Nbytes_align) copies the N bytes that end at
 * (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (41 and 25 entries), an 8-byte movq, then the final byte
 * through %ecx/%cl.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 41- and 25-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2341 .p2align 4
2342L(fwd_write_41bytes_align):
2343 movdqa -41(%eax), %xmm0
2344 movdqa %xmm0, -41(%edx)
2345L(fwd_write_25bytes_align):
2346 movdqa -25(%eax), %xmm0
2347 movdqa %xmm0, -25(%edx)
2348L(fwd_write_9bytes_align):
2349 movq -9(%eax), %xmm0
2350 movq %xmm0, -9(%edx)
2351L(fwd_write_1bytes_align):
2352 movzbl -1(%eax), %ecx
2353 movb %cl, -1(%edx)
2354#ifndef USE_AS_BCOPY
2355# ifdef USE_AS_MEMPCPY
2356 movl %edx, %eax
2357# else
2358 movl DEST(%esp), %eax
2359# endif
2360#endif
2361 RETURN
2362
/*
 * Aligned forward tail copy for lengths 33 and 17.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * the final byte through %ecx/%cl.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2363 .p2align 4
2364L(fwd_write_33bytes_align):
2365 movdqa -33(%eax), %xmm0
2366 movdqa %xmm0, -33(%edx)
2367L(fwd_write_17bytes_align):
2368 movdqa -17(%eax), %xmm0
2369 movdqa %xmm0, -17(%edx)
2370 movzbl -1(%eax), %ecx
2371 movb %cl, -1(%edx)
2372#ifndef USE_AS_BCOPY
2373# ifdef USE_AS_MEMPCPY
2374 movl %edx, %eax
2375# else
2376 movl DEST(%esp), %eax
2377# endif
2378#endif
2379 RETURN
2380
/*
 * Aligned forward tail copy, fall-through chain for lengths 46, 30, 14
 * and 6.  Entering at L(fwd_write_Nbytes_align) copies the N bytes that end
 * at (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (46 and 30 entries), an 8-byte movq, then 4 bytes at offset
 * -6 and 2 bytes at offset -2 through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 46- and 30-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2381 .p2align 4
2382L(fwd_write_46bytes_align):
2383 movdqa -46(%eax), %xmm0
2384 movdqa %xmm0, -46(%edx)
2385L(fwd_write_30bytes_align):
2386 movdqa -30(%eax), %xmm0
2387 movdqa %xmm0, -30(%edx)
2388L(fwd_write_14bytes_align):
2389 movq -14(%eax), %xmm0
2390 movq %xmm0, -14(%edx)
2391L(fwd_write_6bytes_align):
2392 movl -6(%eax), %ecx
2393 movl %ecx, -6(%edx)
2394 movzwl -2(%eax), %ecx
2395 movw %cx, -2(%edx)
2396#ifndef USE_AS_BCOPY
2397# ifdef USE_AS_MEMPCPY
2398 movl %edx, %eax
2399# else
2400 movl DEST(%esp), %eax
2401# endif
2402#endif
2403 RETURN
2404
/*
 * Aligned forward tail copy for lengths 38 and 22.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * 4 bytes at offset -6 and 2 bytes at offset -2 through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2405 .p2align 4
2406L(fwd_write_38bytes_align):
2407 movdqa -38(%eax), %xmm0
2408 movdqa %xmm0, -38(%edx)
2409L(fwd_write_22bytes_align):
2410 movdqa -22(%eax), %xmm0
2411 movdqa %xmm0, -22(%edx)
2412 movl -6(%eax), %ecx
2413 movl %ecx, -6(%edx)
2414 movzwl -2(%eax), %ecx
2415 movw %cx, -2(%edx)
2416#ifndef USE_AS_BCOPY
2417# ifdef USE_AS_MEMPCPY
2418 movl %edx, %eax
2419# else
2420 movl DEST(%esp), %eax
2421# endif
2422#endif
2423 RETURN
2424
/*
 * Aligned forward tail copy, fall-through chain for lengths 42, 26, 10
 * and 2.  Entering at L(fwd_write_Nbytes_align) copies the N bytes that end
 * at (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (42 and 26 entries), an 8-byte movq, then the final 2 bytes
 * through %ecx/%cx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 42- and 26-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2425 .p2align 4
2426L(fwd_write_42bytes_align):
2427 movdqa -42(%eax), %xmm0
2428 movdqa %xmm0, -42(%edx)
2429L(fwd_write_26bytes_align):
2430 movdqa -26(%eax), %xmm0
2431 movdqa %xmm0, -26(%edx)
2432L(fwd_write_10bytes_align):
2433 movq -10(%eax), %xmm0
2434 movq %xmm0, -10(%edx)
2435L(fwd_write_2bytes_align):
2436 movzwl -2(%eax), %ecx
2437 movw %cx, -2(%edx)
2438#ifndef USE_AS_BCOPY
2439# ifdef USE_AS_MEMPCPY
2440 movl %edx, %eax
2441# else
2442 movl DEST(%esp), %eax
2443# endif
2444#endif
2445 RETURN
2446
/*
 * Aligned forward tail copy for lengths 34 and 18.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * the final 2 bytes through %ecx/%cx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2447 .p2align 4
2448L(fwd_write_34bytes_align):
2449 movdqa -34(%eax), %xmm0
2450 movdqa %xmm0, -34(%edx)
2451L(fwd_write_18bytes_align):
2452 movdqa -18(%eax), %xmm0
2453 movdqa %xmm0, -18(%edx)
2454 movzwl -2(%eax), %ecx
2455 movw %cx, -2(%edx)
2456#ifndef USE_AS_BCOPY
2457# ifdef USE_AS_MEMPCPY
2458 movl %edx, %eax
2459# else
2460 movl DEST(%esp), %eax
2461# endif
2462#endif
2463 RETURN
2464
/*
 * Aligned forward tail copy, fall-through chain for lengths 47, 31, 15
 * and 7.  Entering at L(fwd_write_Nbytes_align) copies the N bytes that end
 * at (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (47 and 31 entries), an 8-byte movq, then 4 bytes at offset
 * -7, 2 bytes at offset -3 and 1 byte at offset -1.  The final byte is
 * loaded into %eax, destroying the source pointer; %eax is reloaded with the
 * return value unless USE_AS_BCOPY is defined.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 47- and 31-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
2465 .p2align 4
2466L(fwd_write_47bytes_align):
2467 movdqa -47(%eax), %xmm0
2468 movdqa %xmm0, -47(%edx)
2469L(fwd_write_31bytes_align):
2470 movdqa -31(%eax), %xmm0
2471 movdqa %xmm0, -31(%edx)
2472L(fwd_write_15bytes_align):
2473 movq -15(%eax), %xmm0
2474 movq %xmm0, -15(%edx)
2475L(fwd_write_7bytes_align):
2476 movl -7(%eax), %ecx
2477 movl %ecx, -7(%edx)
2478 movzwl -3(%eax), %ecx
2479 movzbl -1(%eax), %eax
2480 movw %cx, -3(%edx)
2481 movb %al, -1(%edx)
2482#ifndef USE_AS_BCOPY
2483# ifdef USE_AS_MEMPCPY
2484 movl %edx, %eax
2485# else
2486 movl DEST(%esp), %eax
2487# endif
2488#endif
2489 RETURN
2490
/*
 * Aligned forward tail copy for lengths 39 and 23.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * 4 bytes at offset -7, 2 bytes at offset -3 and 1 byte at offset -1.  The
 * final byte is loaded into %eax, destroying the source pointer; %eax is
 * reloaded with the return value unless USE_AS_BCOPY is defined.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
2491 .p2align 4
2492L(fwd_write_39bytes_align):
2493 movdqa -39(%eax), %xmm0
2494 movdqa %xmm0, -39(%edx)
2495L(fwd_write_23bytes_align):
2496 movdqa -23(%eax), %xmm0
2497 movdqa %xmm0, -23(%edx)
2498 movl -7(%eax), %ecx
2499 movl %ecx, -7(%edx)
2500 movzwl -3(%eax), %ecx
2501 movzbl -1(%eax), %eax
2502 movw %cx, -3(%edx)
2503 movb %al, -1(%edx)
2504#ifndef USE_AS_BCOPY
2505# ifdef USE_AS_MEMPCPY
2506 movl %edx, %eax
2507# else
2508 movl DEST(%esp), %eax
2509# endif
2510#endif
2511 RETURN
2512
/*
 * Aligned forward tail copy, fall-through chain for lengths 43, 27, 11
 * and 3.  Entering at L(fwd_write_Nbytes_align) copies the N bytes that end
 * at (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (43 and 27 entries), an 8-byte movq, then 2 bytes at offset
 * -3 and 1 byte at offset -1.  The final byte is loaded into %eax,
 * destroying the source pointer; %eax is reloaded with the return value
 * unless USE_AS_BCOPY is defined.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 43- and 27-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
2513 .p2align 4
2514L(fwd_write_43bytes_align):
2515 movdqa -43(%eax), %xmm0
2516 movdqa %xmm0, -43(%edx)
2517L(fwd_write_27bytes_align):
2518 movdqa -27(%eax), %xmm0
2519 movdqa %xmm0, -27(%edx)
2520L(fwd_write_11bytes_align):
2521 movq -11(%eax), %xmm0
2522 movq %xmm0, -11(%edx)
2523L(fwd_write_3bytes_align):
2524 movzwl -3(%eax), %ecx
2525 movzbl -1(%eax), %eax
2526 movw %cx, -3(%edx)
2527 movb %al, -1(%edx)
2528#ifndef USE_AS_BCOPY
2529# ifdef USE_AS_MEMPCPY
2530 movl %edx, %eax
2531# else
2532 movl DEST(%esp), %eax
2533# endif
2534#endif
2535 RETURN
2536
/*
 * Aligned forward tail copy for lengths 35 and 19.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * 2 bytes at offset -3 and 1 byte at offset -1.  The final byte is loaded
 * into %eax, destroying the source pointer; %eax is reloaded with the return
 * value unless USE_AS_BCOPY is defined.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value: %edx when USE_AS_MEMPCPY is defined, otherwise the word
 * reloaded from DEST(%esp).
 */
2537 .p2align 4
2538L(fwd_write_35bytes_align):
2539 movdqa -35(%eax), %xmm0
2540 movdqa %xmm0, -35(%edx)
2541L(fwd_write_19bytes_align):
2542 movdqa -19(%eax), %xmm0
2543 movdqa %xmm0, -19(%edx)
2544 movzwl -3(%eax), %ecx
2545 movzbl -1(%eax), %eax
2546 movw %cx, -3(%edx)
2547 movb %al, -1(%edx)
2548#ifndef USE_AS_BCOPY
2549# ifdef USE_AS_MEMPCPY
2550 movl %edx, %eax
2551# else
2552 movl DEST(%esp), %eax
2553# endif
2554#endif
2555 RETURN
2556
/*
 * Aligned forward tail copy, fall-through chain for lengths 44, 28, 12
 * and 4.  Entering at L(fwd_write_Nbytes_align) copies the N bytes that end
 * at (%eax) to the N bytes that end at (%edx), lowest address first: 16-byte
 * movdqa steps (44 and 28 entries), an 8-byte movq, then the final 4 bytes
 * through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so the
 * 44- and 28-byte entries require (%eax)-N and (%edx)-N to be 16-byte
 * aligned -- presumably guaranteed by the dispatcher; confirm against
 * callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 */
2557 .p2align 4
2558L(fwd_write_44bytes_align):
2559 movdqa -44(%eax), %xmm0
2560 movdqa %xmm0, -44(%edx)
2561L(fwd_write_28bytes_align):
2562 movdqa -28(%eax), %xmm0
2563 movdqa %xmm0, -28(%edx)
2564L(fwd_write_12bytes_align):
2565 movq -12(%eax), %xmm0
2566 movq %xmm0, -12(%edx)
2567L(fwd_write_4bytes_align):
2568 movl -4(%eax), %ecx
2569 movl %ecx, -4(%edx)
2570#ifndef USE_AS_BCOPY
2571# ifdef USE_AS_MEMPCPY
2572 movl %edx, %eax
2573# else
2574 movl DEST(%esp), %eax
2575# endif
2576#endif
2577 RETURN
2578
/*
 * Aligned forward tail copy for lengths 36 and 20.  Entering at
 * L(fwd_write_Nbytes_align) copies the N bytes that end at (%eax) to the N
 * bytes that end at (%edx), lowest address first: 16-byte movdqa steps, then
 * the final 4 bytes through %ecx.
 * movdqa faults on a memory operand that is not 16-byte aligned, so
 * (%eax)-N and (%edx)-N must be 16-byte aligned -- presumably guaranteed by
 * the dispatcher; confirm against callers.
 * Return value (not set when USE_AS_BCOPY is defined): %edx when
 * USE_AS_MEMPCPY is defined, otherwise the word reloaded from DEST(%esp).
 * This block ends with RETURN_END rather than RETURN; both macros are
 * defined outside this part of the file.
 */
2579 .p2align 4
2580L(fwd_write_36bytes_align):
2581 movdqa -36(%eax), %xmm0
2582 movdqa %xmm0, -36(%edx)
2583L(fwd_write_20bytes_align):
2584 movdqa -20(%eax), %xmm0
2585 movdqa %xmm0, -20(%edx)
2586 movl -4(%eax), %ecx
2587 movl %ecx, -4(%edx)
2588#ifndef USE_AS_BCOPY
2589# ifdef USE_AS_MEMPCPY
2590 movl %edx, %eax
2591# else
2592 movl DEST(%esp), %eax
2593# endif
2594#endif
Bruce Beare124a5422010-10-11 12:24:41 -07002595 RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002596
/*
 * L(large_page): bulk forward copy using non-temporal stores (movntdq).
 *
 * Registers as used below: %eax = source cursor, %edx = destination cursor,
 * %ecx = byte counter.  movntdq requires a 16-byte-aligned destination, so
 * %edx is assumed to be 16-byte aligned on entry (established before this
 * label, outside this part of the file -- confirm).  Source loads use
 * movdqu and carry no alignment requirement.
 *
 * CFI_PUSH/POP are macros defined elsewhere; the code is entered with %edi
 * saved on the stack (hence DEST+4(%esp) below) and pops it before the loop.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002597 CFI_PUSH (%edi)
2598
2599 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002600L(large_page):
2601 movdqu (%eax), %xmm1
/*
 * memmove only: store %xmm0 at the address read from DEST+4(%esp).  %xmm0 is
 * loaded before this label (presumably the unaligned head of the buffer --
 * confirm against the code that jumps here).
 */
Jack Renc47703a2012-02-14 12:01:52 +04002602#ifdef USE_AS_MEMMOVE
2603 movl DEST+4(%esp), %edi
2604 movdqu %xmm0, (%edi)
2605#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -08002606 lea 16(%eax), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002607 movntdq %xmm1, (%edx)
2608 lea 16(%edx), %edx
/*
 * Counter bias: 0x10 for the 16 bytes just copied plus 0x80 so that the
 * loop's "sub $0x80 / jae" continues only while another full 128-byte
 * iteration remains after the current one.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002609 lea -0x90(%ecx), %ecx
2610 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +04002611
/* Main loop: 128 bytes per iteration, 8 unaligned loads then 8 NT stores. */
2612 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002613L(large_page_loop):
2614 movdqu (%eax), %xmm0
2615 movdqu 0x10(%eax), %xmm1
2616 movdqu 0x20(%eax), %xmm2
2617 movdqu 0x30(%eax), %xmm3
2618 movdqu 0x40(%eax), %xmm4
2619 movdqu 0x50(%eax), %xmm5
2620 movdqu 0x60(%eax), %xmm6
2621 movdqu 0x70(%eax), %xmm7
2622 lea 0x80(%eax), %eax
2623
/* Flags set by this sub are consumed by the jae below; the intervening
   movntdq and lea instructions do not modify flags. */
2624 sub $0x80, %ecx
2625 movntdq %xmm0, (%edx)
2626 movntdq %xmm1, 0x10(%edx)
2627 movntdq %xmm2, 0x20(%edx)
2628 movntdq %xmm3, 0x30(%edx)
2629 movntdq %xmm4, 0x40(%edx)
2630 movntdq %xmm5, 0x50(%edx)
2631 movntdq %xmm6, 0x60(%edx)
2632 movntdq %xmm7, 0x70(%edx)
2633 lea 0x80(%edx), %edx
2634 jae L(large_page_loop)
/*
 * Here %ecx = (bytes remaining) - 0x80.  The signed compare against -0x40
 * tests "remaining < 64"; the lea removes the bias without touching the
 * flags that jl consumes.
 */
2635 cmp $-0x40, %ecx
2636 lea 0x80(%ecx), %ecx
2637 jl L(large_page_less_64bytes)
2638
/* At least 64 bytes remain: copy one 64-byte chunk. */
2639 movdqu (%eax), %xmm0
2640 movdqu 0x10(%eax), %xmm1
2641 movdqu 0x20(%eax), %xmm2
2642 movdqu 0x30(%eax), %xmm3
2643 lea 0x40(%eax), %eax
2644
2645 movntdq %xmm0, (%edx)
2646 movntdq %xmm1, 0x10(%edx)
2647 movntdq %xmm2, 0x20(%edx)
2648 movntdq %xmm3, 0x30(%edx)
2649 lea 0x40(%edx), %edx
2650 sub $0x40, %ecx
/* Fewer than 64 bytes remain: copy one 32-byte chunk if possible. */
2651L(large_page_less_64bytes):
2652 cmp $32, %ecx
2653 jb L(large_page_less_32bytes)
2654 movdqu (%eax), %xmm0
2655 movdqu 0x10(%eax), %xmm1
2656 lea 0x20(%eax), %eax
2657 movntdq %xmm0, (%edx)
2658 movntdq %xmm1, 0x10(%edx)
2659 lea 0x20(%edx), %edx
2660 sub $0x20, %ecx
/*
 * Fewer than 32 bytes remain.  Advance both cursors past the tail so the
 * fwd_write_N entries can address it with negative offsets, fence the
 * non-temporal stores, then dispatch on the remaining count through the
 * forward jump table (4-byte entries).
 */
2661L(large_page_less_32bytes):
2662 add %ecx, %edx
2663 add %ecx, %eax
2664 sfence
2665 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2666
/*
 * Backward tail copy, fall-through chain for lengths 44, 36, 28, 20, 12, 4
 * and 0.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then the lowest 4 bytes through %ecx.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002667 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002668L(bk_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002669 movq 36(%eax), %xmm0
2670 movq %xmm0, 36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002671L(bk_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002672 movq 28(%eax), %xmm0
2673 movq %xmm0, 28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002674L(bk_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002675 movq 20(%eax), %xmm0
2676 movq %xmm0, 20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002677L(bk_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002678 movq 12(%eax), %xmm0
2679 movq %xmm0, 12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002680L(bk_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002681 movq 4(%eax), %xmm0
2682 movq %xmm0, 4(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002683L(bk_write_4bytes):
2684 movl (%eax), %ecx
2685 movl %ecx, (%edx)
/* Length 0: nothing left to copy, only the return value is produced. */
2686L(bk_write_0bytes):
2687#ifndef USE_AS_BCOPY
2688 movl DEST(%esp), %eax
2689# ifdef USE_AS_MEMPCPY
2690 movl LEN(%esp), %ecx
2691 add %ecx, %eax
2692# endif
2693#endif
2694 RETURN
2695
/*
 * Backward tail copy, fall-through chain for lengths 40, 32, 24, 16 and 8.
 * Entering at L(bk_write_Nbytes) copies the N bytes that start at (%eax) to
 * the N bytes that start at (%edx), 8 bytes per step through %xmm0, highest
 * address first.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002696 .p2align 4
2697L(bk_write_40bytes):
2698 movq 32(%eax), %xmm0
2699 movq %xmm0, 32(%edx)
2700L(bk_write_32bytes):
2701 movq 24(%eax), %xmm0
2702 movq %xmm0, 24(%edx)
2703L(bk_write_24bytes):
2704 movq 16(%eax), %xmm0
2705 movq %xmm0, 16(%edx)
2706L(bk_write_16bytes):
2707 movq 8(%eax), %xmm0
2708 movq %xmm0, 8(%edx)
2709L(bk_write_8bytes):
2710 movq (%eax), %xmm0
2711 movq %xmm0, (%edx)
2712#ifndef USE_AS_BCOPY
2713 movl DEST(%esp), %eax
2714# ifdef USE_AS_MEMPCPY
2715 movl LEN(%esp), %ecx
2716 add %ecx, %eax
2717# endif
2718#endif
2719 RETURN
2720
/*
 * Backward tail copy, fall-through chain for lengths 45, 37, 29, 21, 13, 5
 * and 1.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then 4 bytes at offset 1 and the first byte at
 * offset 0 through %ecx.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
2721 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002722L(bk_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002723 movq 37(%eax), %xmm0
2724 movq %xmm0, 37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002725L(bk_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002726 movq 29(%eax), %xmm0
2727 movq %xmm0, 29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002728L(bk_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002729 movq 21(%eax), %xmm0
2730 movq %xmm0, 21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002731L(bk_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002732 movq 13(%eax), %xmm0
2733 movq %xmm0, 13(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002734L(bk_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002735 movq 5(%eax), %xmm0
2736 movq %xmm0, 5(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002737L(bk_write_5bytes):
2738 movl 1(%eax), %ecx
2739 movl %ecx, 1(%edx)
2740L(bk_write_1bytes):
2741 movzbl (%eax), %ecx
2742 movb %cl, (%edx)
2743#ifndef USE_AS_BCOPY
2744 movl DEST(%esp), %eax
2745# ifdef USE_AS_MEMPCPY
2746 movl LEN(%esp), %ecx
2747 add %ecx, %eax
2748# endif
2749#endif
2750 RETURN
2751
/*
 * Backward tail copy, fall-through chain for lengths 41, 33, 25, 17 and 9.
 * Entering at L(bk_write_Nbytes) copies the N bytes that start at (%eax) to
 * the N bytes that start at (%edx), highest address first: 8-byte steps
 * through %xmm0 down to offset 1, then the first byte through %ecx/%cl.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002752 .p2align 4
2753L(bk_write_41bytes):
2754 movq 33(%eax), %xmm0
2755 movq %xmm0, 33(%edx)
2756L(bk_write_33bytes):
2757 movq 25(%eax), %xmm0
2758 movq %xmm0, 25(%edx)
2759L(bk_write_25bytes):
2760 movq 17(%eax), %xmm0
2761 movq %xmm0, 17(%edx)
2762L(bk_write_17bytes):
2763 movq 9(%eax), %xmm0
2764 movq %xmm0, 9(%edx)
2765L(bk_write_9bytes):
2766 movq 1(%eax), %xmm0
2767 movq %xmm0, 1(%edx)
2768 movzbl (%eax), %ecx
2769 movb %cl, (%edx)
2770#ifndef USE_AS_BCOPY
2771 movl DEST(%esp), %eax
2772# ifdef USE_AS_MEMPCPY
2773 movl LEN(%esp), %ecx
2774 add %ecx, %eax
2775# endif
2776#endif
2777 RETURN
2778
/*
 * Backward tail copy, fall-through chain for lengths 46, 38, 30, 22, 14
 * and 6.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then 4 bytes at offset 2 and the first 2 bytes at
 * offset 0 through %ecx.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
2779 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002780L(bk_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002781 movq 38(%eax), %xmm0
2782 movq %xmm0, 38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002783L(bk_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002784 movq 30(%eax), %xmm0
2785 movq %xmm0, 30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002786L(bk_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002787 movq 22(%eax), %xmm0
2788 movq %xmm0, 22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002789L(bk_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002790 movq 14(%eax), %xmm0
2791 movq %xmm0, 14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002792L(bk_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002793 movq 6(%eax), %xmm0
2794 movq %xmm0, 6(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002795L(bk_write_6bytes):
2796 movl 2(%eax), %ecx
2797 movl %ecx, 2(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002798 movzwl (%eax), %ecx
2799 movw %cx, (%edx)
2800#ifndef USE_AS_BCOPY
2801 movl DEST(%esp), %eax
2802# ifdef USE_AS_MEMPCPY
2803 movl LEN(%esp), %ecx
2804 add %ecx, %eax
2805# endif
2806#endif
2807 RETURN
2808
/*
 * Backward tail copy, fall-through chain for lengths 42, 34, 26, 18, 10
 * and 2.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then the first 2 bytes through %ecx/%cx.
 * Return value (not set when USE_AS_BCOPY is defined): the word reloaded
 * from DEST(%esp), plus LEN(%esp) when USE_AS_MEMPCPY is defined.
 */
2809 .p2align 4
2810L(bk_write_42bytes):
2811 movq 34(%eax), %xmm0
2812 movq %xmm0, 34(%edx)
2813L(bk_write_34bytes):
2814 movq 26(%eax), %xmm0
2815 movq %xmm0, 26(%edx)
2816L(bk_write_26bytes):
2817 movq 18(%eax), %xmm0
2818 movq %xmm0, 18(%edx)
2819L(bk_write_18bytes):
2820 movq 10(%eax), %xmm0
2821 movq %xmm0, 10(%edx)
2822L(bk_write_10bytes):
2823 movq 2(%eax), %xmm0
2824 movq %xmm0, 2(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002825L(bk_write_2bytes):
2826 movzwl (%eax), %ecx
2827 movw %cx, (%edx)
2828#ifndef USE_AS_BCOPY
2829 movl DEST(%esp), %eax
2830# ifdef USE_AS_MEMPCPY
2831 movl LEN(%esp), %ecx
2832 add %ecx, %eax
2833# endif
2834#endif
2835 RETURN
2836
/*
 * Backward tail copy, fall-through chain for lengths 47, 39, 31, 23, 15
 * and 7.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then 4 bytes at offset 3, 2 bytes at offset 1 and the
 * first byte at offset 0.  The first byte is loaded into %eax, destroying
 * the source pointer; %eax is reloaded with the return value unless
 * USE_AS_BCOPY is defined.
 * Return value: the word reloaded from DEST(%esp), plus LEN(%esp) when
 * USE_AS_MEMPCPY is defined.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002837 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002838L(bk_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002839 movq 39(%eax), %xmm0
2840 movq %xmm0, 39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002841L(bk_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002842 movq 31(%eax), %xmm0
2843 movq %xmm0, 31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002844L(bk_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002845 movq 23(%eax), %xmm0
2846 movq %xmm0, 23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002847L(bk_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002848 movq 15(%eax), %xmm0
2849 movq %xmm0, 15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002850L(bk_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002851 movq 7(%eax), %xmm0
2852 movq %xmm0, 7(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002853L(bk_write_7bytes):
2854 movl 3(%eax), %ecx
2855 movl %ecx, 3(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002856 movzwl 1(%eax), %ecx
2857 movw %cx, 1(%edx)
2858 movzbl (%eax), %eax
2859 movb %al, (%edx)
2860#ifndef USE_AS_BCOPY
2861 movl DEST(%esp), %eax
2862# ifdef USE_AS_MEMPCPY
2863 movl LEN(%esp), %ecx
2864 add %ecx, %eax
2865# endif
2866#endif
2867 RETURN
2868
/*
 * Backward tail copy, fall-through chain for lengths 43, 35, 27, 19, 11
 * and 3.  Entering at L(bk_write_Nbytes) copies the N bytes that start at
 * (%eax) to the N bytes that start at (%edx), highest address first: 8-byte
 * steps through %xmm0, then 2 bytes at offset 1 and the first byte at
 * offset 0.  The first byte is loaded into %eax, destroying the source
 * pointer; %eax is reloaded with the return value unless USE_AS_BCOPY is
 * defined.
 * Return value: the word reloaded from DEST(%esp), plus LEN(%esp) when
 * USE_AS_MEMPCPY is defined.
 * This block ends with RETURN_END rather than RETURN; both macros are
 * defined outside this part of the file.
 */
2869 .p2align 4
2870L(bk_write_43bytes):
2871 movq 35(%eax), %xmm0
2872 movq %xmm0, 35(%edx)
2873L(bk_write_35bytes):
2874 movq 27(%eax), %xmm0
2875 movq %xmm0, 27(%edx)
2876L(bk_write_27bytes):
2877 movq 19(%eax), %xmm0
2878 movq %xmm0, 19(%edx)
2879L(bk_write_19bytes):
2880 movq 11(%eax), %xmm0
2881 movq %xmm0, 11(%edx)
2882L(bk_write_11bytes):
2883 movq 3(%eax), %xmm0
2884 movq %xmm0, 3(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002885L(bk_write_3bytes):
2886 movzwl 1(%eax), %ecx
2887 movw %cx, 1(%edx)
2888 movzbl (%eax), %eax
2889 movb %al, (%edx)
2890#ifndef USE_AS_BCOPY
2891 movl DEST(%esp), %eax
2892# ifdef USE_AS_MEMPCPY
2893 movl LEN(%esp), %ecx
2894 add %ecx, %eax
2895# endif
2896#endif
2897 RETURN_END
2898
2899
/*
 * Jump tables live in their own read-only section (.rodata.ssse3); the
 * matching .popsection follows the last table.
 *
 * L(table_48bytes_fwd): 48 four-byte entries, 4-byte aligned.  Entry N
 * (N = 0..47) is the JMPTBL encoding of L(fwd_write_Nbytes), so the entry
 * order must stay in step with the byte count used as index.  JMPTBL is
 * defined outside this part of the file (presumably the target's offset
 * relative to the table base -- confirm).  L(large_page_less_32bytes)
 * dispatches through this table with %ecx as index and scale 4.
 */
2900 .pushsection .rodata.ssse3,"a",@progbits
Jack Renc47703a2012-02-14 12:01:52 +04002901 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08002902L(table_48bytes_fwd):
2903 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2904 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2913 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2914 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2915 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2916 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2917 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2918 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2919 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2920 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2921 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2922 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2923 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2924 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2925 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2926 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2927 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2928 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2929 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2930 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2931 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2932 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2933 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2934 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2935 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2936 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2937 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2938 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2939 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2940 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2941 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2942 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2943 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2944 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2945 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2946 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2947 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2948 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2949 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2950 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2951
/*
 * L(table_48bytes_fwd_align): 48 four-byte entries, 4-byte aligned.  Entry N
 * (N = 0..47) is the JMPTBL encoding of L(fwd_write_Nbytes_align), the
 * tail-copy variants that use aligned 16-byte moves (movdqa).  The entry
 * order must stay in step with the byte count used as index.  JMPTBL is
 * defined outside this part of the file.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002952 .p2align 2
2953L(table_48bytes_fwd_align):
2954 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2955 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2964 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2965 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2966 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2967 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2968 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2969 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2970 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2971 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2972 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2973 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2974 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2975 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2976 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2977 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2978 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2979 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2980 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2981 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2982 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2983 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2984 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2985 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2986 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2987 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2988 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2989 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2990 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2991 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2992 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2993 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2994 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2995 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2996 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2997 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2998 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
2999 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
3000 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
3001 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
3002
/*
 * L(shl_table): 16 four-byte entries, 4-byte aligned.  Entry K (K = 0..15)
 * is the JMPTBL encoding of L(shl_K).  The L(shl_K) targets and the code
 * that indexes this table are outside this part of the file; presumably K
 * is the source misalignment within a 16-byte block -- confirm there.
 */
3003 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08003004L(shl_table):
3005 .int JMPTBL (L(shl_0), L(shl_table))
3006 .int JMPTBL (L(shl_1), L(shl_table))
3007 .int JMPTBL (L(shl_2), L(shl_table))
3008 .int JMPTBL (L(shl_3), L(shl_table))
3009 .int JMPTBL (L(shl_4), L(shl_table))
3010 .int JMPTBL (L(shl_5), L(shl_table))
3011 .int JMPTBL (L(shl_6), L(shl_table))
3012 .int JMPTBL (L(shl_7), L(shl_table))
3013 .int JMPTBL (L(shl_8), L(shl_table))
3014 .int JMPTBL (L(shl_9), L(shl_table))
3015 .int JMPTBL (L(shl_10), L(shl_table))
3016 .int JMPTBL (L(shl_11), L(shl_table))
3017 .int JMPTBL (L(shl_12), L(shl_table))
3018 .int JMPTBL (L(shl_13), L(shl_table))
3019 .int JMPTBL (L(shl_14), L(shl_table))
3020 .int JMPTBL (L(shl_15), L(shl_table))
3021
/*
 * L(table_48_bytes_bwd): 48 four-byte entries, 4-byte aligned.  Entry N
 * (N = 0..47) is the JMPTBL encoding of L(bk_write_Nbytes), the backward
 * tail-copy entry points.  The entry order must stay in step with the byte
 * count used as index.  Note the label is spelled "48_bytes" here, unlike
 * the forward tables ("48bytes").  The .popsection after the table closes
 * the .rodata.ssse3 section opened before the first jump table.
 */
Jack Renc47703a2012-02-14 12:01:52 +04003022 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08003023L(table_48_bytes_bwd):
3024 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
3025 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
3034 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
3035 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
3036 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
3037 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3038 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3039 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3040 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3041 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3042 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3043 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3044 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3045 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3046 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3047 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3048 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3049 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3050 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3051 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3052 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3053 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3054 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3055 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3056 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3057 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3058 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3059 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3060 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3061 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3062 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3063 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3064 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3065 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3066 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3067 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3068 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3069 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3070 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3071 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
3072
3073 .popsection
3074
#ifdef USE_AS_MEMMOVE
/* Backward (highest address first) copy path; assembled only when
   USE_AS_MEMMOVE is defined.

   Register use, as established by the code below:
     on entry  %eax = start of source, %edx = start of destination,
               %ecx = number of bytes to copy;
     while copying
               %edi = one past the last source byte not yet copied,
               %edx = one past the last destination byte not yet copied,
               %ecx = bytes still to copy,
               %eax = scratch for the small alignment moves;
     at the table dispatch
               %eax / %edx = start of the remaining source / destination
               bytes, %ecx = remaining count, %edi restored.

   %edi is saved with PUSH on entry and restored with POP just before
   the jump-table dispatch.  PUSH, POP and CFI_PUSH are macros defined
   earlier in this file (presumably push/pop plus unwind bookkeeping --
   confirm against their definitions).  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* Advance both pointers to one past the end of their buffers.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* Is the end of the destination 4-byte aligned?  */
	testl	$0x3, %edx
	jnz	L(bk_align)

L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte block as four 8-byte moves, highest first.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind both pointers to the start of the remaining %ecx bytes,
	   moving the source pointer into %eax so that %edi can be
	   restored before leaving through the jump table.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	/* Dispatch on the remaining byte count.  The L(bk_write_<N>bytes)
	   targets are defined elsewhere in this file.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* The code below is entered only by jumps taken while %edi is
	   still on the stack; presumably this re-establishes that state
	   for the unwind information after the POP above.  */
	CFI_PUSH (%edi)

	.p2align 4
L(bk_align):
	/* With 8 bytes or fewer, skip the alignment work and go straight
	   to the tail dispatch.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Odd end address: copy one byte.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Copy two bytes; afterwards EDX is 4-byte aligned.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

	.p2align 4
L(bk_write_more64bytes):
	/* Is the end of the destination already 16-byte aligned?  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is 4-byte aligned but not 16-byte aligned: copy 4 bytes at a
   time, at most three times, until it is 16-byte aligned.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment moves may have left fewer than 64 bytes; in that
	   case skip the 64-byte loop.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

	.p2align 4
L(bk_ssse3_cpy):
	/* Copy 64 bytes per iteration, highest 16 bytes first.  Loads use
	   movdqu because the source may be unaligned; stores use movdqa
	   because EDX is 16-byte aligned here.  */
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	jmp	L(bk_write_64bytesless)

#endif
3196
3197END (MEMCPY)