blob: ac5ec2d4bd27dd214a774b2defabda5d258e869f [file] [log] [blame]
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040031#include "cache.h"
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040032
/* Symbol name under which the copy routine below is assembled.  If MEMCPY
   is already defined when this file is preprocessed, that name is used;
   otherwise the routine is emitted as `memcpy'.  */
#ifndef MEMCPY
# define MEMCPY	memcpy
#endif
36
/* L(label) pastes `.L' in front of LABEL, so L(shl_0) becomes `.Lshl_0'.
   NOTE(review): `.L' is the conventional assembler-local prefix on ELF
   targets, so these labels are presumably not emitted into the symbol
   table -- confirm for the toolchain in use.  */
#ifndef L
# define L(label)	.L##label
#endif
40
/* Fallback spellings for the CFI helper macros.  Each one maps directly
   onto the assembler's `.cfi_*' directive of the same name, and is only
   defined here when nothing has defined it before this point.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* REG is recorded as saved at OFF relative to the current CFA register.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* OFF is added to the current CFA offset.  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
60
/* ENTRY(name): open the definition of function NAME.  Gives NAME the ELF
   type `function', makes it global, aligns the entry point to 16 bytes
   (.p2align 4), emits the label itself and opens a CFI region.  The region
   opened here is closed by END(name).  */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif
69
/* END(name): close the definition of function NAME.  Ends the CFI region
   and records the symbol size as the distance from the NAME label to the
   current location.  */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif
75
/* Displacements of the three stack arguments, used as SRC(%esp) etc.
   PARMS (defined further down in this file) is the displacement of the
   first argument; each following argument is 4 bytes higher.

   With USE_AS_BCOPY the first argument is the source and the second the
   destination; otherwise the destination comes first.  The length is the
   third argument in both cases.

   The expansions are deliberately left unparenthesised: they are plain
   sums that end up as the displacement of a memory operand, and the code
   also writes forms such as DEST+4(%esp).  */
#ifdef USE_AS_BCOPY
# define SRC	PARMS
# define DEST	SRC+4
# define LEN	DEST+4
#else
# define DEST	PARMS
# define SRC	DEST+4
# define LEN	SRC+4
#endif
85
/* Unwind annotations for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 relative to the current CFA
   register.  Emits no machine instruction.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Unwind annotations for a 4-byte pop of REG: the CFA offset shrinks by 4
   and the save rule for REG is restored.  Emits no machine instruction.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl / popl of REG together with the matching unwind annotations.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
96
/* Build-mode dependent pieces.

   PARMS		displacement from %esp of the first stack argument at the
		point where the arguments are loaded: 4 (just the return
		address) in the non-PIC build, 8 in the PIC build because
		ENTRANCE has pushed EBX by then.
   ENTRANCE	prologue executed right after ENTRY.
   RETURN_END	epilogue + ret.
   RETURN	like RETURN_END, but in the PIC build it re-issues
		CFI_PUSH (%ebx) after the ret, so that the unwind
		annotations for the code that follows again describe EBX
		as saved on the stack.
   JMPTBL(I, B)	value stored in a jump table for target I in the table
		based at B: the offset I - B in the PIC build, the
		absolute address I otherwise.
   BRANCH_TO_JMPTBL_ENTRY
		indirect jump through such a table.  The PIC variant
		overwrites EBX, which is why ENTRANCE saves it.  */
#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B

/* Call the PC thunk for register X (e.g. SETUP_PIC_REG(bx) calls
   __x86.get_pc_thunk.bx).  NOTE(review): the thunk is defined outside this
   file; it is assumed to return the address of the instruction following
   the call in %eX -- confirm against its definition.  */
# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains the
   index into the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    SETUP_PIC_REG(bx);						\
    /* Get the address of the jump table.  `.' is the address	\
       of this addl itself.  */					\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx, INDEX, SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp	*%ebx
#else

# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into the
   jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    jmp	*TABLE(, INDEX, SCALE)
#endif
135
136 .section .text.ssse3,"ax",@progbits
137ENTRY (MEMCPY)
138 ENTRANCE
139 movl LEN(%esp), %ecx
140 movl SRC(%esp), %eax
141 movl DEST(%esp), %edx
142
143#ifdef USE_AS_MEMMOVE
144 cmp %eax, %edx
145 jb L(copy_forward)
146 je L(fwd_write_0bytes)
147 cmp $32, %ecx
148 jae L(memmove_bwd)
149 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400150
151 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800152L(memmove_bwd):
153 add %ecx, %eax
154 cmp %eax, %edx
155 movl SRC(%esp), %eax
156 jb L(copy_backward)
157
158L(copy_forward):
159#endif
160 cmp $48, %ecx
161 jae L(48bytesormore)
162
163L(fwd_write_less32bytes):
164#ifndef USE_AS_MEMMOVE
165 cmp %dl, %al
166 jb L(bk_write)
167#endif
168 add %ecx, %edx
169 add %ecx, %eax
170 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
171#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400172 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800173L(bk_write):
174 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
175#endif
176
Jack Renc47703a2012-02-14 12:01:52 +0400177 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800178L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400179#ifndef USE_AS_MEMMOVE
180 movlpd (%eax), %xmm0
181 movlpd 8(%eax), %xmm1
182 movlpd %xmm0, (%edx)
183 movlpd %xmm1, 8(%edx)
184#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800185 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400186#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800187 PUSH (%edi)
188 movl %edx, %edi
189 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800190 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800191 sub %edx, %edi
192 add %edi, %ecx
193 sub %edi, %eax
194
195#ifdef SHARED_CACHE_SIZE_HALF
196 cmp $SHARED_CACHE_SIZE_HALF, %ecx
197#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800198# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400199 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800200 add $_GLOBAL_OFFSET_TABLE_, %ebx
201 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
202# else
203 cmp __x86_shared_cache_size_half, %ecx
204# endif
205#endif
206
207 mov %eax, %edi
208 jae L(large_page)
209 and $0xf, %edi
210 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800211 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
212
Jack Renc47703a2012-02-14 12:01:52 +0400213 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800214L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400215#ifdef USE_AS_MEMMOVE
216 movl DEST+4(%esp), %edi
217 movdqu %xmm0, (%edi)
218#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800219 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800220 cmp $127, %ecx
221 ja L(shl_0_gobble)
222 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400223
224 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800225L(shl_0_loop):
226 movdqa (%eax, %edi), %xmm0
227 movdqa 16(%eax, %edi), %xmm1
228 sub $32, %ecx
229 movdqa %xmm0, (%edx, %edi)
230 movdqa %xmm1, 16(%edx, %edi)
231 lea 32(%edi), %edi
232 jb L(shl_0_end)
233
234 movdqa (%eax, %edi), %xmm0
235 movdqa 16(%eax, %edi), %xmm1
236 sub $32, %ecx
237 movdqa %xmm0, (%edx, %edi)
238 movdqa %xmm1, 16(%edx, %edi)
239 lea 32(%edi), %edi
240 jb L(shl_0_end)
241
242 movdqa (%eax, %edi), %xmm0
243 movdqa 16(%eax, %edi), %xmm1
244 sub $32, %ecx
245 movdqa %xmm0, (%edx, %edi)
246 movdqa %xmm1, 16(%edx, %edi)
247 lea 32(%edi), %edi
248 jb L(shl_0_end)
249
250 movdqa (%eax, %edi), %xmm0
251 movdqa 16(%eax, %edi), %xmm1
252 sub $32, %ecx
253 movdqa %xmm0, (%edx, %edi)
254 movdqa %xmm1, 16(%edx, %edi)
255 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400256
Bruce Beare8ff1a272010-03-04 11:03:37 -0800257L(shl_0_end):
258 lea 32(%ecx), %ecx
259 add %ecx, %edi
260 add %edi, %edx
261 add %edi, %eax
262 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400263 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800264
Bruce Beare124a5422010-10-11 12:24:41 -0700265 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800266
Jack Renc47703a2012-02-14 12:01:52 +0400267 .p2align 4
268L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800269#ifdef DATA_CACHE_SIZE_HALF
270 cmp $DATA_CACHE_SIZE_HALF, %ecx
271#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800272# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400273 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800274 add $_GLOBAL_OFFSET_TABLE_, %ebx
275 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
276# else
277 cmp __x86_data_cache_size_half, %ecx
278# endif
279#endif
Jack Renc47703a2012-02-14 12:01:52 +0400280 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800281 lea -128(%ecx), %ecx
282 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400283
284 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800285L(shl_0_gobble_cache_loop):
286 movdqa (%eax), %xmm0
287 movdqa 0x10(%eax), %xmm1
288 movdqa 0x20(%eax), %xmm2
289 movdqa 0x30(%eax), %xmm3
290 movdqa 0x40(%eax), %xmm4
291 movdqa 0x50(%eax), %xmm5
292 movdqa 0x60(%eax), %xmm6
293 movdqa 0x70(%eax), %xmm7
294 lea 0x80(%eax), %eax
295 sub $128, %ecx
296 movdqa %xmm0, (%edx)
297 movdqa %xmm1, 0x10(%edx)
298 movdqa %xmm2, 0x20(%edx)
299 movdqa %xmm3, 0x30(%edx)
300 movdqa %xmm4, 0x40(%edx)
301 movdqa %xmm5, 0x50(%edx)
302 movdqa %xmm6, 0x60(%edx)
303 movdqa %xmm7, 0x70(%edx)
304 lea 0x80(%edx), %edx
305
306 jae L(shl_0_gobble_cache_loop)
307 cmp $-0x40, %ecx
308 lea 0x80(%ecx), %ecx
309 jl L(shl_0_cache_less_64bytes)
310
311 movdqa (%eax), %xmm0
312 sub $0x40, %ecx
313 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800314 movdqa %xmm0, (%edx)
315 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800316 movdqa 0x20(%eax), %xmm0
317 movdqa 0x30(%eax), %xmm1
318 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800319 movdqa %xmm0, 0x20(%edx)
320 movdqa %xmm1, 0x30(%edx)
321 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400322
Bruce Beare8ff1a272010-03-04 11:03:37 -0800323L(shl_0_cache_less_64bytes):
324 cmp $0x20, %ecx
325 jb L(shl_0_cache_less_32bytes)
326 movdqa (%eax), %xmm0
327 sub $0x20, %ecx
328 movdqa 0x10(%eax), %xmm1
329 add $0x20, %eax
330 movdqa %xmm0, (%edx)
331 movdqa %xmm1, 0x10(%edx)
332 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400333
Bruce Beare8ff1a272010-03-04 11:03:37 -0800334L(shl_0_cache_less_32bytes):
335 cmp $0x10, %ecx
336 jb L(shl_0_cache_less_16bytes)
337 sub $0x10, %ecx
338 movdqa (%eax), %xmm0
339 add $0x10, %eax
340 movdqa %xmm0, (%edx)
341 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400342
Bruce Beare8ff1a272010-03-04 11:03:37 -0800343L(shl_0_cache_less_16bytes):
344 add %ecx, %edx
345 add %ecx, %eax
346 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
347
Jack Renc47703a2012-02-14 12:01:52 +0400348 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800349L(shl_0_gobble_mem_loop):
350 prefetcht0 0x1c0(%eax)
351 prefetcht0 0x280(%eax)
352 prefetcht0 0x1c0(%edx)
353
354 movdqa (%eax), %xmm0
355 movdqa 0x10(%eax), %xmm1
356 movdqa 0x20(%eax), %xmm2
357 movdqa 0x30(%eax), %xmm3
358 movdqa 0x40(%eax), %xmm4
359 movdqa 0x50(%eax), %xmm5
360 movdqa 0x60(%eax), %xmm6
361 movdqa 0x70(%eax), %xmm7
362 lea 0x80(%eax), %eax
363 sub $0x80, %ecx
364 movdqa %xmm0, (%edx)
365 movdqa %xmm1, 0x10(%edx)
366 movdqa %xmm2, 0x20(%edx)
367 movdqa %xmm3, 0x30(%edx)
368 movdqa %xmm4, 0x40(%edx)
369 movdqa %xmm5, 0x50(%edx)
370 movdqa %xmm6, 0x60(%edx)
371 movdqa %xmm7, 0x70(%edx)
372 lea 0x80(%edx), %edx
373
374 jae L(shl_0_gobble_mem_loop)
375 cmp $-0x40, %ecx
376 lea 0x80(%ecx), %ecx
377 jl L(shl_0_mem_less_64bytes)
378
379 movdqa (%eax), %xmm0
380 sub $0x40, %ecx
381 movdqa 0x10(%eax), %xmm1
382
383 movdqa %xmm0, (%edx)
384 movdqa %xmm1, 0x10(%edx)
385
386 movdqa 0x20(%eax), %xmm0
387 movdqa 0x30(%eax), %xmm1
388 add $0x40, %eax
389
390 movdqa %xmm0, 0x20(%edx)
391 movdqa %xmm1, 0x30(%edx)
392 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400393
Bruce Beare8ff1a272010-03-04 11:03:37 -0800394L(shl_0_mem_less_64bytes):
395 cmp $0x20, %ecx
396 jb L(shl_0_mem_less_32bytes)
397 movdqa (%eax), %xmm0
398 sub $0x20, %ecx
399 movdqa 0x10(%eax), %xmm1
400 add $0x20, %eax
401 movdqa %xmm0, (%edx)
402 movdqa %xmm1, 0x10(%edx)
403 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400404
Bruce Beare8ff1a272010-03-04 11:03:37 -0800405L(shl_0_mem_less_32bytes):
406 cmp $0x10, %ecx
407 jb L(shl_0_mem_less_16bytes)
408 sub $0x10, %ecx
409 movdqa (%eax), %xmm0
410 add $0x10, %eax
411 movdqa %xmm0, (%edx)
412 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400413
Bruce Beare8ff1a272010-03-04 11:03:37 -0800414L(shl_0_mem_less_16bytes):
415 add %ecx, %edx
416 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400417 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800418
Jack Renc47703a2012-02-14 12:01:52 +0400419 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800420L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400421#ifndef USE_AS_MEMMOVE
422 movaps -1(%eax), %xmm1
423#else
424 movl DEST+4(%esp), %edi
425 movaps -1(%eax), %xmm1
426 movdqu %xmm0, (%edi)
427#endif
428#ifdef DATA_CACHE_SIZE_HALF
429 cmp $DATA_CACHE_SIZE_HALF, %ecx
430#else
431# if (defined SHARED || defined __PIC__)
432 SETUP_PIC_REG(bx)
433 add $_GLOBAL_OFFSET_TABLE_, %ebx
434 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
435# else
436 cmp __x86_data_cache_size_half, %ecx
437# endif
438#endif
439 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800440
Jack Renc47703a2012-02-14 12:01:52 +0400441 lea -64(%ecx), %ecx
442
443 .p2align 4
444L(Shl1LoopStart):
445 prefetcht0 0x1c0(%eax)
446 prefetcht0 0x1c0(%edx)
447 movaps 15(%eax), %xmm2
448 movaps 31(%eax), %xmm3
449 movaps 47(%eax), %xmm4
450 movaps 63(%eax), %xmm5
451 movaps %xmm5, %xmm7
452 palignr $1, %xmm4, %xmm5
453 palignr $1, %xmm3, %xmm4
454 movaps %xmm5, 48(%edx)
455 palignr $1, %xmm2, %xmm3
456 lea 64(%eax), %eax
457 palignr $1, %xmm1, %xmm2
458 movaps %xmm4, 32(%edx)
459 movaps %xmm3, 16(%edx)
460 movaps %xmm7, %xmm1
461 movaps %xmm2, (%edx)
462 lea 64(%edx), %edx
463 sub $64, %ecx
464 ja L(Shl1LoopStart)
465
466L(Shl1LoopLeave):
467 add $32, %ecx
468 jle L(shl_end_0)
469
470 movaps 15(%eax), %xmm2
471 movaps 31(%eax), %xmm3
472 palignr $1, %xmm2, %xmm3
473 palignr $1, %xmm1, %xmm2
474 movaps %xmm2, (%edx)
475 movaps %xmm3, 16(%edx)
476 lea 32(%edx, %ecx), %edx
477 lea 32(%eax, %ecx), %eax
478 POP (%edi)
479 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
480
481 CFI_PUSH (%edi)
482
483 .p2align 4
484L(sh_1_no_prefetch):
485 lea -32(%ecx), %ecx
486 lea -1(%eax), %eax
487 xor %edi, %edi
488
489 .p2align 4
490L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800491 movdqa 16(%eax, %edi), %xmm2
492 sub $32, %ecx
493 movdqa 32(%eax, %edi), %xmm3
494 movdqa %xmm3, %xmm4
495 palignr $1, %xmm2, %xmm3
496 palignr $1, %xmm1, %xmm2
497 lea 32(%edi), %edi
498 movdqa %xmm2, -32(%edx, %edi)
499 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400500 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800501
502 movdqa 16(%eax, %edi), %xmm2
503 sub $32, %ecx
504 movdqa 32(%eax, %edi), %xmm3
505 movdqa %xmm3, %xmm1
506 palignr $1, %xmm2, %xmm3
507 palignr $1, %xmm4, %xmm2
508 lea 32(%edi), %edi
509 movdqa %xmm2, -32(%edx, %edi)
510 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400511 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800512
Jack Renc47703a2012-02-14 12:01:52 +0400513L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800514 lea 32(%ecx), %ecx
515 add %ecx, %edi
516 add %edi, %edx
517 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400518 POP (%edi)
519 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800520
Jack Renc47703a2012-02-14 12:01:52 +0400521 CFI_PUSH (%edi)
522
523 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800524L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400525#ifndef USE_AS_MEMMOVE
526 movaps -2(%eax), %xmm1
527#else
528 movl DEST+4(%esp), %edi
529 movaps -2(%eax), %xmm1
530 movdqu %xmm0, (%edi)
531#endif
532#ifdef DATA_CACHE_SIZE_HALF
533 cmp $DATA_CACHE_SIZE_HALF, %ecx
534#else
535# if (defined SHARED || defined __PIC__)
536 SETUP_PIC_REG(bx)
537 add $_GLOBAL_OFFSET_TABLE_, %ebx
538 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
539# else
540 cmp __x86_data_cache_size_half, %ecx
541# endif
542#endif
543 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800544
Jack Renc47703a2012-02-14 12:01:52 +0400545 lea -64(%ecx), %ecx
546
547 .p2align 4
548L(Shl2LoopStart):
549 prefetcht0 0x1c0(%eax)
550 prefetcht0 0x1c0(%edx)
551 movaps 14(%eax), %xmm2
552 movaps 30(%eax), %xmm3
553 movaps 46(%eax), %xmm4
554 movaps 62(%eax), %xmm5
555 movaps %xmm5, %xmm7
556 palignr $2, %xmm4, %xmm5
557 palignr $2, %xmm3, %xmm4
558 movaps %xmm5, 48(%edx)
559 palignr $2, %xmm2, %xmm3
560 lea 64(%eax), %eax
561 palignr $2, %xmm1, %xmm2
562 movaps %xmm4, 32(%edx)
563 movaps %xmm3, 16(%edx)
564 movaps %xmm7, %xmm1
565 movaps %xmm2, (%edx)
566 lea 64(%edx), %edx
567 sub $64, %ecx
568 ja L(Shl2LoopStart)
569
570L(Shl2LoopLeave):
571 add $32, %ecx
572 jle L(shl_end_0)
573
574 movaps 14(%eax), %xmm2
575 movaps 30(%eax), %xmm3
576 palignr $2, %xmm2, %xmm3
577 palignr $2, %xmm1, %xmm2
578 movaps %xmm2, (%edx)
579 movaps %xmm3, 16(%edx)
580 lea 32(%edx, %ecx), %edx
581 lea 32(%eax, %ecx), %eax
582 POP (%edi)
583 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
584
585 CFI_PUSH (%edi)
586
587 .p2align 4
588L(sh_2_no_prefetch):
589 lea -32(%ecx), %ecx
590 lea -2(%eax), %eax
591 xor %edi, %edi
592
593 .p2align 4
594L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800595 movdqa 16(%eax, %edi), %xmm2
596 sub $32, %ecx
597 movdqa 32(%eax, %edi), %xmm3
598 movdqa %xmm3, %xmm4
599 palignr $2, %xmm2, %xmm3
600 palignr $2, %xmm1, %xmm2
601 lea 32(%edi), %edi
602 movdqa %xmm2, -32(%edx, %edi)
603 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400604 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800605
606 movdqa 16(%eax, %edi), %xmm2
607 sub $32, %ecx
608 movdqa 32(%eax, %edi), %xmm3
609 movdqa %xmm3, %xmm1
610 palignr $2, %xmm2, %xmm3
611 palignr $2, %xmm4, %xmm2
612 lea 32(%edi), %edi
613 movdqa %xmm2, -32(%edx, %edi)
614 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400615 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800616
Jack Renc47703a2012-02-14 12:01:52 +0400617L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800618 lea 32(%ecx), %ecx
619 add %ecx, %edi
620 add %edi, %edx
621 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400622 POP (%edi)
623 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800624
Jack Renc47703a2012-02-14 12:01:52 +0400625 CFI_PUSH (%edi)
626
627 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800628L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400629#ifndef USE_AS_MEMMOVE
630 movaps -3(%eax), %xmm1
631#else
632 movl DEST+4(%esp), %edi
633 movaps -3(%eax), %xmm1
634 movdqu %xmm0, (%edi)
635#endif
636#ifdef DATA_CACHE_SIZE_HALF
637 cmp $DATA_CACHE_SIZE_HALF, %ecx
638#else
639# if (defined SHARED || defined __PIC__)
640 SETUP_PIC_REG(bx)
641 add $_GLOBAL_OFFSET_TABLE_, %ebx
642 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
643# else
644 cmp __x86_data_cache_size_half, %ecx
645# endif
646#endif
647 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800648
Jack Renc47703a2012-02-14 12:01:52 +0400649 lea -64(%ecx), %ecx
650
651 .p2align 4
652L(Shl3LoopStart):
653 prefetcht0 0x1c0(%eax)
654 prefetcht0 0x1c0(%edx)
655 movaps 13(%eax), %xmm2
656 movaps 29(%eax), %xmm3
657 movaps 45(%eax), %xmm4
658 movaps 61(%eax), %xmm5
659 movaps %xmm5, %xmm7
660 palignr $3, %xmm4, %xmm5
661 palignr $3, %xmm3, %xmm4
662 movaps %xmm5, 48(%edx)
663 palignr $3, %xmm2, %xmm3
664 lea 64(%eax), %eax
665 palignr $3, %xmm1, %xmm2
666 movaps %xmm4, 32(%edx)
667 movaps %xmm3, 16(%edx)
668 movaps %xmm7, %xmm1
669 movaps %xmm2, (%edx)
670 lea 64(%edx), %edx
671 sub $64, %ecx
672 ja L(Shl3LoopStart)
673
674L(Shl3LoopLeave):
675 add $32, %ecx
676 jle L(shl_end_0)
677
678 movaps 13(%eax), %xmm2
679 movaps 29(%eax), %xmm3
680 palignr $3, %xmm2, %xmm3
681 palignr $3, %xmm1, %xmm2
682 movaps %xmm2, (%edx)
683 movaps %xmm3, 16(%edx)
684 lea 32(%edx, %ecx), %edx
685 lea 32(%eax, %ecx), %eax
686 POP (%edi)
687 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
688
689 CFI_PUSH (%edi)
690
691 .p2align 4
692L(sh_3_no_prefetch):
693 lea -32(%ecx), %ecx
694 lea -3(%eax), %eax
695 xor %edi, %edi
696
697 .p2align 4
698L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800699 movdqa 16(%eax, %edi), %xmm2
700 sub $32, %ecx
701 movdqa 32(%eax, %edi), %xmm3
702 movdqa %xmm3, %xmm4
703 palignr $3, %xmm2, %xmm3
704 palignr $3, %xmm1, %xmm2
705 lea 32(%edi), %edi
706 movdqa %xmm2, -32(%edx, %edi)
707 movdqa %xmm3, -16(%edx, %edi)
708
Jack Renc47703a2012-02-14 12:01:52 +0400709 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800710
711 movdqa 16(%eax, %edi), %xmm2
712 sub $32, %ecx
713 movdqa 32(%eax, %edi), %xmm3
714 movdqa %xmm3, %xmm1
715 palignr $3, %xmm2, %xmm3
716 palignr $3, %xmm4, %xmm2
717 lea 32(%edi), %edi
718 movdqa %xmm2, -32(%edx, %edi)
719 movdqa %xmm3, -16(%edx, %edi)
720
Jack Renc47703a2012-02-14 12:01:52 +0400721 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800722
Jack Renc47703a2012-02-14 12:01:52 +0400723L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800724 lea 32(%ecx), %ecx
725 add %ecx, %edi
726 add %edi, %edx
727 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400728 POP (%edi)
729 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800730
Jack Renc47703a2012-02-14 12:01:52 +0400731 CFI_PUSH (%edi)
732
733 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800734L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400735#ifndef USE_AS_MEMMOVE
736 movaps -4(%eax), %xmm1
737#else
738 movl DEST+4(%esp), %edi
739 movaps -4(%eax), %xmm1
740 movdqu %xmm0, (%edi)
741#endif
742#ifdef DATA_CACHE_SIZE_HALF
743 cmp $DATA_CACHE_SIZE_HALF, %ecx
744#else
745# if (defined SHARED || defined __PIC__)
746 SETUP_PIC_REG(bx)
747 add $_GLOBAL_OFFSET_TABLE_, %ebx
748 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
749# else
750 cmp __x86_data_cache_size_half, %ecx
751# endif
752#endif
753 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800754
Jack Renc47703a2012-02-14 12:01:52 +0400755 lea -64(%ecx), %ecx
756
757 .p2align 4
758L(Shl4LoopStart):
759 prefetcht0 0x1c0(%eax)
760 prefetcht0 0x1c0(%edx)
761 movaps 12(%eax), %xmm2
762 movaps 28(%eax), %xmm3
763 movaps 44(%eax), %xmm4
764 movaps 60(%eax), %xmm5
765 movaps %xmm5, %xmm7
766 palignr $4, %xmm4, %xmm5
767 palignr $4, %xmm3, %xmm4
768 movaps %xmm5, 48(%edx)
769 palignr $4, %xmm2, %xmm3
770 lea 64(%eax), %eax
771 palignr $4, %xmm1, %xmm2
772 movaps %xmm4, 32(%edx)
773 movaps %xmm3, 16(%edx)
774 movaps %xmm7, %xmm1
775 movaps %xmm2, (%edx)
776 lea 64(%edx), %edx
777 sub $64, %ecx
778 ja L(Shl4LoopStart)
779
780L(Shl4LoopLeave):
781 add $32, %ecx
782 jle L(shl_end_0)
783
784 movaps 12(%eax), %xmm2
785 movaps 28(%eax), %xmm3
786 palignr $4, %xmm2, %xmm3
787 palignr $4, %xmm1, %xmm2
788 movaps %xmm2, (%edx)
789 movaps %xmm3, 16(%edx)
790 lea 32(%edx, %ecx), %edx
791 lea 32(%eax, %ecx), %eax
792 POP (%edi)
793 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
794
795 CFI_PUSH (%edi)
796
797 .p2align 4
798L(sh_4_no_prefetch):
799 lea -32(%ecx), %ecx
800 lea -4(%eax), %eax
801 xor %edi, %edi
802
803 .p2align 4
804L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800805 movdqa 16(%eax, %edi), %xmm2
806 sub $32, %ecx
807 movdqa 32(%eax, %edi), %xmm3
808 movdqa %xmm3, %xmm4
809 palignr $4, %xmm2, %xmm3
810 palignr $4, %xmm1, %xmm2
811 lea 32(%edi), %edi
812 movdqa %xmm2, -32(%edx, %edi)
813 movdqa %xmm3, -16(%edx, %edi)
814
Jack Renc47703a2012-02-14 12:01:52 +0400815 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800816
817 movdqa 16(%eax, %edi), %xmm2
818 sub $32, %ecx
819 movdqa 32(%eax, %edi), %xmm3
820 movdqa %xmm3, %xmm1
821 palignr $4, %xmm2, %xmm3
822 palignr $4, %xmm4, %xmm2
823 lea 32(%edi), %edi
824 movdqa %xmm2, -32(%edx, %edi)
825 movdqa %xmm3, -16(%edx, %edi)
826
Jack Renc47703a2012-02-14 12:01:52 +0400827 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800828
Jack Renc47703a2012-02-14 12:01:52 +0400829L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800830 lea 32(%ecx), %ecx
831 add %ecx, %edi
832 add %edi, %edx
833 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400834 POP (%edi)
835 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800836
Jack Renc47703a2012-02-14 12:01:52 +0400837 CFI_PUSH (%edi)
838
839 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800840L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400841#ifndef USE_AS_MEMMOVE
842 movaps -5(%eax), %xmm1
843#else
844 movl DEST+4(%esp), %edi
845 movaps -5(%eax), %xmm1
846 movdqu %xmm0, (%edi)
847#endif
848#ifdef DATA_CACHE_SIZE_HALF
849 cmp $DATA_CACHE_SIZE_HALF, %ecx
850#else
851# if (defined SHARED || defined __PIC__)
852 SETUP_PIC_REG(bx)
853 add $_GLOBAL_OFFSET_TABLE_, %ebx
854 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
855# else
856 cmp __x86_data_cache_size_half, %ecx
857# endif
858#endif
859 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800860
Jack Renc47703a2012-02-14 12:01:52 +0400861 lea -64(%ecx), %ecx
862
863 .p2align 4
864L(Shl5LoopStart):
865 prefetcht0 0x1c0(%eax)
866 prefetcht0 0x1c0(%edx)
867 movaps 11(%eax), %xmm2
868 movaps 27(%eax), %xmm3
869 movaps 43(%eax), %xmm4
870 movaps 59(%eax), %xmm5
871 movaps %xmm5, %xmm7
872 palignr $5, %xmm4, %xmm5
873 palignr $5, %xmm3, %xmm4
874 movaps %xmm5, 48(%edx)
875 palignr $5, %xmm2, %xmm3
876 lea 64(%eax), %eax
877 palignr $5, %xmm1, %xmm2
878 movaps %xmm4, 32(%edx)
879 movaps %xmm3, 16(%edx)
880 movaps %xmm7, %xmm1
881 movaps %xmm2, (%edx)
882 lea 64(%edx), %edx
883 sub $64, %ecx
884 ja L(Shl5LoopStart)
885
886L(Shl5LoopLeave):
887 add $32, %ecx
888 jle L(shl_end_0)
889
890 movaps 11(%eax), %xmm2
891 movaps 27(%eax), %xmm3
892 palignr $5, %xmm2, %xmm3
893 palignr $5, %xmm1, %xmm2
894 movaps %xmm2, (%edx)
895 movaps %xmm3, 16(%edx)
896 lea 32(%edx, %ecx), %edx
897 lea 32(%eax, %ecx), %eax
898 POP (%edi)
899 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
900
901 CFI_PUSH (%edi)
902
903 .p2align 4
904L(sh_5_no_prefetch):
905 lea -32(%ecx), %ecx
906 lea -5(%eax), %eax
907 xor %edi, %edi
908
909 .p2align 4
910L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800911 movdqa 16(%eax, %edi), %xmm2
912 sub $32, %ecx
913 movdqa 32(%eax, %edi), %xmm3
914 movdqa %xmm3, %xmm4
915 palignr $5, %xmm2, %xmm3
916 palignr $5, %xmm1, %xmm2
917 lea 32(%edi), %edi
918 movdqa %xmm2, -32(%edx, %edi)
919 movdqa %xmm3, -16(%edx, %edi)
920
Jack Renc47703a2012-02-14 12:01:52 +0400921 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800922
923 movdqa 16(%eax, %edi), %xmm2
924 sub $32, %ecx
925 movdqa 32(%eax, %edi), %xmm3
926 movdqa %xmm3, %xmm1
927 palignr $5, %xmm2, %xmm3
928 palignr $5, %xmm4, %xmm2
929 lea 32(%edi), %edi
930 movdqa %xmm2, -32(%edx, %edi)
931 movdqa %xmm3, -16(%edx, %edi)
932
Jack Renc47703a2012-02-14 12:01:52 +0400933 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800934
Jack Renc47703a2012-02-14 12:01:52 +0400935L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800936 lea 32(%ecx), %ecx
937 add %ecx, %edi
938 add %edi, %edx
939 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400940 POP (%edi)
941 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800942
Jack Renc47703a2012-02-14 12:01:52 +0400943 CFI_PUSH (%edi)
944
945 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800946L(shl_6):
Jack Renc47703a2012-02-14 12:01:52 +0400947#ifndef USE_AS_MEMMOVE
948 movaps -6(%eax), %xmm1
949#else
950 movl DEST+4(%esp), %edi
951 movaps -6(%eax), %xmm1
952 movdqu %xmm0, (%edi)
953#endif
954#ifdef DATA_CACHE_SIZE_HALF
955 cmp $DATA_CACHE_SIZE_HALF, %ecx
956#else
957# if (defined SHARED || defined __PIC__)
958 SETUP_PIC_REG(bx)
959 add $_GLOBAL_OFFSET_TABLE_, %ebx
960 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
961# else
962 cmp __x86_data_cache_size_half, %ecx
963# endif
964#endif
965 jb L(sh_6_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800966
Jack Renc47703a2012-02-14 12:01:52 +0400967 lea -64(%ecx), %ecx
968
969 .p2align 4
970L(Shl6LoopStart):
971 prefetcht0 0x1c0(%eax)
972 prefetcht0 0x1c0(%edx)
973 movaps 10(%eax), %xmm2
974 movaps 26(%eax), %xmm3
975 movaps 42(%eax), %xmm4
976 movaps 58(%eax), %xmm5
977 movaps %xmm5, %xmm7
978 palignr $6, %xmm4, %xmm5
979 palignr $6, %xmm3, %xmm4
980 movaps %xmm5, 48(%edx)
981 palignr $6, %xmm2, %xmm3
982 lea 64(%eax), %eax
983 palignr $6, %xmm1, %xmm2
984 movaps %xmm4, 32(%edx)
985 movaps %xmm3, 16(%edx)
986 movaps %xmm7, %xmm1
987 movaps %xmm2, (%edx)
988 lea 64(%edx), %edx
989 sub $64, %ecx
990 ja L(Shl6LoopStart)
991
992L(Shl6LoopLeave):
993 add $32, %ecx
994 jle L(shl_end_0)
995
996 movaps 10(%eax), %xmm2
997 movaps 26(%eax), %xmm3
998 palignr $6, %xmm2, %xmm3
999 palignr $6, %xmm1, %xmm2
1000 movaps %xmm2, (%edx)
1001 movaps %xmm3, 16(%edx)
1002 lea 32(%edx, %ecx), %edx
1003 lea 32(%eax, %ecx), %eax
1004 POP (%edi)
1005 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1006
1007 CFI_PUSH (%edi)
1008
1009 .p2align 4
1010L(sh_6_no_prefetch):
1011 lea -32(%ecx), %ecx
1012 lea -6(%eax), %eax
1013 xor %edi, %edi
1014
1015 .p2align 4
1016L(sh_6_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001017 movdqa 16(%eax, %edi), %xmm2
1018 sub $32, %ecx
1019 movdqa 32(%eax, %edi), %xmm3
1020 movdqa %xmm3, %xmm4
1021 palignr $6, %xmm2, %xmm3
1022 palignr $6, %xmm1, %xmm2
1023 lea 32(%edi), %edi
1024 movdqa %xmm2, -32(%edx, %edi)
1025 movdqa %xmm3, -16(%edx, %edi)
1026
Jack Renc47703a2012-02-14 12:01:52 +04001027 jb L(sh_6_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001028
1029 movdqa 16(%eax, %edi), %xmm2
1030 sub $32, %ecx
1031 movdqa 32(%eax, %edi), %xmm3
1032 movdqa %xmm3, %xmm1
1033 palignr $6, %xmm2, %xmm3
1034 palignr $6, %xmm4, %xmm2
1035 lea 32(%edi), %edi
1036 movdqa %xmm2, -32(%edx, %edi)
1037 movdqa %xmm3, -16(%edx, %edi)
1038
Jack Renc47703a2012-02-14 12:01:52 +04001039 jae L(sh_6_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001040
Jack Renc47703a2012-02-14 12:01:52 +04001041L(sh_6_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001042 lea 32(%ecx), %ecx
1043 add %ecx, %edi
1044 add %edi, %edx
1045 lea 6(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001046 POP (%edi)
1047 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001048
Jack Renc47703a2012-02-14 12:01:52 +04001049 CFI_PUSH (%edi)
1050
/*
 * L(shl_7): forward copy loop for a source cursor that is 7 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at eax - 7 + 16*k)
 * and all stores are aligned stores through edx, so eax - 7 and edx must be
 * 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $7" shifts the pair {new chunk : carry} right by 7 bytes, giving
 * the 16 source bytes that belong at the current destination position.
 * Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
 * threshold use the 32-byte loop at L(sh_7_no_prefetch); larger counts use
 * the 64-byte loop with prefetcht0.  Each path finishes by jumping through
 * L(table_48bytes_fwd) with the leftover count in ecx.
 */
	.p2align 4
L(shl_7):
#ifndef USE_AS_MEMMOVE
	movaps	-7(%eax), %xmm1		/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-7(%eax), %xmm1		/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_7_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl7LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	movaps	41(%eax), %xmm4
	movaps	57(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$7, %xmm4, %xmm5
	palignr	$7, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$7, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl7LoopStart)	/* loop while the biased count stays > 0 */

L(Shl7LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	9(%eax), %xmm2
	movaps	25(%eax), %xmm3
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_7_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-7(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_7_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_7_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$7, %xmm2, %xmm3
	palignr	$7, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_7_no_prefetch_loop)

L(sh_7_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	7(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_8): forward copy loop for a source cursor that is 8 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at eax - 8 + 16*k)
 * and all stores are aligned stores through edx, so eax - 8 and edx must be
 * 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $8" shifts the pair {new chunk : carry} right by 8 bytes, giving
 * the 16 source bytes that belong at the current destination position.
 * Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
 * threshold use the 32-byte loop at L(sh_8_no_prefetch); larger counts use
 * the 64-byte loop with prefetcht0.  Each path finishes by jumping through
 * L(table_48bytes_fwd) with the leftover count in ecx.
 */
	.p2align 4
L(shl_8):
#ifndef USE_AS_MEMMOVE
	movaps	-8(%eax), %xmm1		/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-8(%eax), %xmm1		/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_8_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl8LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	movaps	40(%eax), %xmm4
	movaps	56(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$8, %xmm4, %xmm5
	palignr	$8, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$8, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl8LoopStart)	/* loop while the biased count stays > 0 */

	/* Label name kept as in the original; the sibling shift blocks call
	   the equivalent label ShlNLoopLeave.  */
L(LoopLeave8):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	8(%eax), %xmm2
	movaps	24(%eax), %xmm3
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_8_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-8(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_8_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_8_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$8, %xmm2, %xmm3
	palignr	$8, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_8_no_prefetch_loop)

L(sh_8_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	8(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_9): forward copy loop for a source cursor that is 9 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at eax - 9 + 16*k)
 * and all stores are aligned stores through edx, so eax - 9 and edx must be
 * 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $9" shifts the pair {new chunk : carry} right by 9 bytes, giving
 * the 16 source bytes that belong at the current destination position.
 * Counts below the DATA_CACHE_SIZE_HALF / __x86_data_cache_size_half
 * threshold use the 32-byte loop at L(sh_9_no_prefetch); larger counts use
 * the 64-byte loop with prefetcht0.  Each path finishes by jumping through
 * L(table_48bytes_fwd) with the leftover count in ecx.
 */
	.p2align 4
L(shl_9):
#ifndef USE_AS_MEMMOVE
	movaps	-9(%eax), %xmm1		/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-9(%eax), %xmm1		/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_9_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl9LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	movaps	39(%eax), %xmm4
	movaps	55(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$9, %xmm4, %xmm5
	palignr	$9, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$9, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$9, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl9LoopStart)	/* loop while the biased count stays > 0 */

L(Shl9LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	7(%eax), %xmm2
	movaps	23(%eax), %xmm3
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_9_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-9(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_9_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_9_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$9, %xmm2, %xmm3
	palignr	$9, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_9_no_prefetch_loop)

L(sh_9_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	9(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_10): forward copy loop for a source cursor that is 10 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 10 + 16*k) and all stores are aligned stores through edx, so
 * eax - 10 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $10" shifts the pair {new chunk : carry} right by 10 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_10_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_10):
#ifndef USE_AS_MEMMOVE
	movaps	-10(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-10(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_10_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl10LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	movaps	38(%eax), %xmm4
	movaps	54(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$10, %xmm4, %xmm5
	palignr	$10, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$10, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl10LoopStart)	/* loop while the biased count stays > 0 */

L(Shl10LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	6(%eax), %xmm2
	movaps	22(%eax), %xmm3
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_10_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-10(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_10_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_10_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$10, %xmm2, %xmm3
	palignr	$10, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_10_no_prefetch_loop)

L(sh_10_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	10(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_11): forward copy loop for a source cursor that is 11 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 11 + 16*k) and all stores are aligned stores through edx, so
 * eax - 11 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $11" shifts the pair {new chunk : carry} right by 11 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_11_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_11):
#ifndef USE_AS_MEMMOVE
	movaps	-11(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-11(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_11_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl11LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	movaps	37(%eax), %xmm4
	movaps	53(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$11, %xmm4, %xmm5
	palignr	$11, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$11, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl11LoopStart)	/* loop while the biased count stays > 0 */

L(Shl11LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	5(%eax), %xmm2
	movaps	21(%eax), %xmm3
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_11_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-11(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_11_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_11_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$11, %xmm2, %xmm3
	palignr	$11, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_11_no_prefetch_loop)

L(sh_11_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	11(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_12): forward copy loop for a source cursor that is 12 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 12 + 16*k) and all stores are aligned stores through edx, so
 * eax - 12 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $12" shifts the pair {new chunk : carry} right by 12 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_12_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_12):
#ifndef USE_AS_MEMMOVE
	movaps	-12(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-12(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_12_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl12LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	movaps	36(%eax), %xmm4
	movaps	52(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	palignr	$12, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$12, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl12LoopStart)	/* loop while the biased count stays > 0 */

L(Shl12LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	4(%eax), %xmm2
	movaps	20(%eax), %xmm3
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_12_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-12(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_12_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_12_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$12, %xmm2, %xmm3
	palignr	$12, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_12_no_prefetch_loop)

L(sh_12_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	12(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_13): forward copy loop for a source cursor that is 13 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 13 + 16*k) and all stores are aligned stores through edx, so
 * eax - 13 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $13" shifts the pair {new chunk : carry} right by 13 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_13_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_13):
#ifndef USE_AS_MEMMOVE
	movaps	-13(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-13(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_13_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl13LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	movaps	35(%eax), %xmm4
	movaps	51(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$13, %xmm4, %xmm5
	palignr	$13, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$13, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl13LoopStart)	/* loop while the biased count stays > 0 */

L(Shl13LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	3(%eax), %xmm2
	movaps	19(%eax), %xmm3
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_13_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-13(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_13_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_13_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$13, %xmm2, %xmm3
	palignr	$13, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_13_no_prefetch_loop)

L(sh_13_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	13(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_14): forward copy loop for a source cursor that is 14 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 14 + 16*k) and all stores are aligned stores through edx, so
 * eax - 14 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $14" shifts the pair {new chunk : carry} right by 14 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_14_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_14):
#ifndef USE_AS_MEMMOVE
	movaps	-14(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-14(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_14_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl14LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	movaps	34(%eax), %xmm4
	movaps	50(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$14, %xmm4, %xmm5
	palignr	$14, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$14, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl14LoopStart)	/* loop while the biased count stays > 0 */

L(Shl14LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	2(%eax), %xmm2
	movaps	18(%eax), %xmm3
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_14_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-14(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_14_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_14_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$14, %xmm2, %xmm3
	palignr	$14, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_14_no_prefetch_loop)

L(sh_14_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	14(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_15): forward copy loop for a source cursor that is 15 bytes past a
 * 16-byte boundary.  All loads are aligned (movaps/movdqa at
 * eax - 15 + 16*k) and all stores are aligned stores through edx, so
 * eax - 15 and edx must be 16-byte aligned on entry.
 *
 *   eax   source cursor            edx   destination cursor
 *   ecx   bytes left to copy       edi   byte offset in the no-prefetch loop
 *   xmm1  previous aligned source chunk (carry into the next palignr)
 *
 * "palignr $15" shifts the pair {new chunk : carry} right by 15 bytes,
 * giving the 16 source bytes that belong at the current destination
 * position.  Counts below the DATA_CACHE_SIZE_HALF /
 * __x86_data_cache_size_half threshold use the 32-byte loop at
 * L(sh_15_no_prefetch); larger counts use the 64-byte loop with prefetcht0.
 * Each path finishes by jumping through L(table_48bytes_fwd) with the
 * leftover count in ecx.
 */
	.p2align 4
L(shl_15):
#ifndef USE_AS_MEMMOVE
	movaps	-15(%eax), %xmm1	/* prime the carry chunk */
#else
	/* NOTE(review): xmm0 is loaded before this block and is stored here,
	   unaligned, to the address reloaded from DEST+4(%esp); presumably
	   the head of the destination - confirm against the entry code.  */
	movl	DEST+4(%esp), %edi
	movaps	-15(%eax), %xmm1	/* prime the carry chunk */
	movdqu	%xmm0, (%edi)
#endif
#ifdef DATA_CACHE_SIZE_HALF
	cmp	$DATA_CACHE_SIZE_HALF, %ecx
#else
# if (defined SHARED || defined __PIC__)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
# else
	cmp	__x86_data_cache_size_half, %ecx
# endif
#endif
	jb	L(sh_15_no_prefetch)	/* unsigned: count below threshold */

	lea	-64(%ecx), %ecx		/* bias by one 64-byte iteration */

	/* 64 bytes per iteration; xmm7 keeps the unshifted last chunk so it
	   can become the carry (xmm1) for the next iteration.  */
	.p2align 4
L(Shl15LoopStart):
	prefetcht0 0x1c0(%eax)
	prefetcht0 0x1c0(%edx)
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	movaps	33(%eax), %xmm4
	movaps	49(%eax), %xmm5
	movaps	%xmm5, %xmm7
	palignr	$15, %xmm4, %xmm5
	palignr	$15, %xmm3, %xmm4
	movaps	%xmm5, 48(%edx)
	palignr	$15, %xmm2, %xmm3
	lea	64(%eax), %eax
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm4, 32(%edx)
	movaps	%xmm3, 16(%edx)
	movaps	%xmm7, %xmm1
	movaps	%xmm2, (%edx)
	lea	64(%edx), %edx
	sub	$64, %ecx
	ja	L(Shl15LoopStart)	/* loop while the biased count stays > 0 */

L(Shl15LoopLeave):
	add	$32, %ecx
	jle	L(shl_end_0)		/* at most 32 bytes left: tail only */

	/* One more 32-byte step, then move both cursors past the tail so the
	   jump-table code can address it with negative offsets.  */
	movaps	1(%eax), %xmm2
	movaps	17(%eax), %xmm3
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2

	movaps	%xmm2, (%edx)
	movaps	%xmm3, 16(%edx)
	lea	32(%edx, %ecx), %edx
	lea	32(%eax, %ecx), %eax
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): presumably BRANCH_TO_JMPTBL_ENTRY does not fall
	   through, so this only re-states the unwind info (edi still pushed)
	   for the code below - confirm against the macro definitions.  */
	CFI_PUSH (%edi)

	.p2align 4
L(sh_15_no_prefetch):
	lea	-32(%ecx), %ecx		/* bias by one 32-byte step */
	lea	-15(%eax), %eax		/* eax = aligned source base */
	xor	%edi, %edi		/* edi = running byte offset */

	/* Unrolled twice: the carry alternates between xmm1 and xmm4.  lea
	   and movdqa leave EFLAGS untouched, so jb/jae test the borrow of
	   the preceding sub.  */
	.p2align 4
L(sh_15_no_prefetch_loop):
	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm4
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm1, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jb	L(sh_15_end_no_prefetch_loop)

	movdqa	16(%eax, %edi), %xmm2
	sub	$32, %ecx
	movdqa	32(%eax, %edi), %xmm3
	movdqa	%xmm3, %xmm1
	palignr	$15, %xmm2, %xmm3
	palignr	$15, %xmm4, %xmm2
	lea	32(%edi), %edi
	movdqa	%xmm2, -32(%edx, %edi)
	movdqa	%xmm3, -16(%edx, %edi)
	jae	L(sh_15_no_prefetch_loop)

L(sh_15_end_no_prefetch_loop):
	lea	32(%ecx), %ecx		/* undo the bias: ecx = leftover bytes */
	add	%ecx, %edi
	add	%edi, %edx		/* edx = one past the last byte to write */
	lea	15(%edi, %eax), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

	/* NOTE(review): see the CFI_PUSH note above; same assumption.  */
	CFI_PUSH (%edi)

/*
 * L(shl_end_0): shared exit taken by the "add $32, %ecx; jle" test in the
 * shift-loop leave code when no further 32-byte step is needed.  ecx
 * arrives still biased by -32; the bias is removed, both cursors are
 * advanced by the leftover count so they point one past the tail, and the
 * tail is copied by the L(table_48bytes_fwd) target selected by ecx.
 */
	.p2align 4
L(shl_end_0):
	lea	32(%ecx), %ecx		/* ecx = leftover byte count */
	lea	(%edx, %ecx), %edx	/* edx = one past the last byte to write */
	lea	(%eax, %ecx), %eax	/* eax = one past the last byte to read */
	POP (%edi)
	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)

/*
 * Tail writers for leftover counts 44, 36, 28, 20, 12 and 4 (presumably
 * the L(table_48bytes_fwd) targets for those counts - the table is defined
 * elsewhere in the file).  eax and edx point one past the last source and
 * destination byte, so every access uses a negative offset.  Each label
 * copies 8 bytes and falls through to the next; the final 4 bytes go
 * through ecx.
 *
 * Return value (skipped for USE_AS_BCOPY): edx, the cursor past the last
 * byte written, for USE_AS_MEMPCPY; otherwise the value reloaded from
 * DEST(%esp).
 */
	.p2align 4
L(fwd_write_44bytes):
	movq	-44(%eax), %xmm0
	movq	%xmm0, -44(%edx)
L(fwd_write_36bytes):
	movq	-36(%eax), %xmm0
	movq	%xmm0, -36(%edx)
L(fwd_write_28bytes):
	movq	-28(%eax), %xmm0
	movq	%xmm0, -28(%edx)
L(fwd_write_20bytes):
	movq	-20(%eax), %xmm0
	movq	%xmm0, -20(%edx)
L(fwd_write_12bytes):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

/*
 * Tail writers for leftover counts 40, 32, 24, 16, 8 and 0 (presumably the
 * L(table_48bytes_fwd) targets for those counts - the table is defined
 * elsewhere in the file).  eax and edx point one past the last source and
 * destination byte, so every access uses a negative offset.  Each label
 * copies 8 bytes and falls through to the next; L(fwd_write_0bytes) copies
 * nothing and only produces the return value.
 *
 * Return value (skipped for USE_AS_BCOPY): edx, the cursor past the last
 * byte written, for USE_AS_MEMPCPY; otherwise the value reloaded from
 * DEST(%esp).
 */
	.p2align 4
L(fwd_write_40bytes):
	movq	-40(%eax), %xmm0
	movq	%xmm0, -40(%edx)
L(fwd_write_32bytes):
	movq	-32(%eax), %xmm0
	movq	%xmm0, -32(%edx)
L(fwd_write_24bytes):
	movq	-24(%eax), %xmm0
	movq	%xmm0, -24(%edx)
L(fwd_write_16bytes):
	movq	-16(%eax), %xmm0
	movq	%xmm0, -16(%edx)
L(fwd_write_8bytes):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

/*
 * Tail writer for a leftover count of 5 (presumably the
 * L(table_48bytes_fwd) target for that count).  eax and edx point one past
 * the last source and destination byte.  The 5 bytes are moved as two
 * overlapping 4-byte words (offsets -5 and -4); both words are loaded
 * before either is stored.  The second load overwrites eax, which is safe
 * because the source pointer is not needed afterwards.
 *
 * Return value (skipped for USE_AS_BCOPY): edx, the cursor past the last
 * byte written, for USE_AS_MEMPCPY; otherwise the value reloaded from
 * DEST(%esp).
 */
	.p2align 4
L(fwd_write_5bytes):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

/*
 * Tail writers for leftover counts 45, 37, 29, 21 and 13 (presumably the
 * L(table_48bytes_fwd) targets for those counts - the table is defined
 * elsewhere in the file).  eax and edx point one past the last source and
 * destination byte, so every access uses a negative offset.  Each label
 * copies 8 bytes and falls through; the last 5 bytes are finished with a
 * 4-byte move at -5 and a 1-byte move at -1.
 *
 * Return value (skipped for USE_AS_BCOPY): edx, the cursor past the last
 * byte written, for USE_AS_MEMPCPY; otherwise the value reloaded from
 * DEST(%esp).
 */
	.p2align 4
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN

/*
 * Forward tail copy for 41/33/25/17/9/1 remaining bytes (targets of
 * L(table_48bytes_fwd)).  Labels fall through, copying 8 bytes per step
 * via %xmm0 at negative offsets from the end pointers %eax (source) and
 * %edx (destination); the final single byte is moved through %cl.
 */
2100 .p2align 4
2101L(fwd_write_41bytes):
2102 movq -41(%eax), %xmm0
2103 movq %xmm0, -41(%edx)
2104L(fwd_write_33bytes):
2105 movq -33(%eax), %xmm0
2106 movq %xmm0, -33(%edx)
2107L(fwd_write_25bytes):
2108 movq -25(%eax), %xmm0
2109 movq %xmm0, -25(%edx)
2110L(fwd_write_17bytes):
2111 movq -17(%eax), %xmm0
2112 movq %xmm0, -17(%edx)
2113L(fwd_write_9bytes):
2114 movq -9(%eax), %xmm0
2115 movq %xmm0, -9(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002116L(fwd_write_1bytes):
2117 movzbl -1(%eax), %ecx
2118 movb %cl, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2119#ifndef USE_AS_BCOPY
2120# ifdef USE_AS_MEMPCPY
2121 movl %edx, %eax
2122# else
2123 movl DEST(%esp), %eax
2124# endif
2125#endif
2126 RETURN
2127
/*
 * Forward tail copy for 46/38/30/22/14/6 remaining bytes (targets of
 * L(table_48bytes_fwd)).  Labels fall through, copying 8 bytes per step
 * via %xmm0 at negative offsets from the end pointers %eax (source) and
 * %edx (destination).  The last 6 bytes use one 4-byte and one 2-byte
 * move.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002128 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002129L(fwd_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002130 movq -46(%eax), %xmm0
2131 movq %xmm0, -46(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002132L(fwd_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002133 movq -38(%eax), %xmm0
2134 movq %xmm0, -38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002135L(fwd_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002136 movq -30(%eax), %xmm0
2137 movq %xmm0, -30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002138L(fwd_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002139 movq -22(%eax), %xmm0
2140 movq %xmm0, -22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002141L(fwd_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002142 movq -14(%eax), %xmm0
2143 movq %xmm0, -14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002144L(fwd_write_6bytes):
2145 movl -6(%eax), %ecx
2146 movl %ecx, -6(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002147 movzwl -2(%eax), %ecx
2148 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2149#ifndef USE_AS_BCOPY
2150# ifdef USE_AS_MEMPCPY
2151 movl %edx, %eax
2152# else
2153 movl DEST(%esp), %eax
2154# endif
2155#endif
2156 RETURN
2157
/*
 * Forward tail copy for 42/34/26/18/10/2 remaining bytes (targets of
 * L(table_48bytes_fwd)).  Labels fall through, copying 8 bytes per step
 * via %xmm0 at negative offsets from the end pointers %eax (source) and
 * %edx (destination); the final 2 bytes are moved through %cx.
 */
2158 .p2align 4
2159L(fwd_write_42bytes):
2160 movq -42(%eax), %xmm0
2161 movq %xmm0, -42(%edx)
2162L(fwd_write_34bytes):
2163 movq -34(%eax), %xmm0
2164 movq %xmm0, -34(%edx)
2165L(fwd_write_26bytes):
2166 movq -26(%eax), %xmm0
2167 movq %xmm0, -26(%edx)
2168L(fwd_write_18bytes):
2169 movq -18(%eax), %xmm0
2170 movq %xmm0, -18(%edx)
2171L(fwd_write_10bytes):
2172 movq -10(%eax), %xmm0
2173 movq %xmm0, -10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002174L(fwd_write_2bytes):
2175 movzwl -2(%eax), %ecx
2176 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2177#ifndef USE_AS_BCOPY
2178# ifdef USE_AS_MEMPCPY
2179 movl %edx, %eax
2180# else
2181 movl DEST(%esp), %eax
2182# endif
2183#endif
2184 RETURN
2185
/*
 * Forward tail copy for 47/39/31/23/15/7 remaining bytes (targets of
 * L(table_48bytes_fwd)).  Labels fall through, copying 8 bytes per step
 * via %xmm0 at negative offsets from the end pointers %eax (source) and
 * %edx (destination).  The last 7 bytes use 4-, 2- and 1-byte moves.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002186 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002187L(fwd_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002188 movq -47(%eax), %xmm0
2189 movq %xmm0, -47(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002190L(fwd_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002191 movq -39(%eax), %xmm0
2192 movq %xmm0, -39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002193L(fwd_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002194 movq -31(%eax), %xmm0
2195 movq %xmm0, -31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002196L(fwd_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002197 movq -23(%eax), %xmm0
2198 movq %xmm0, -23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002199L(fwd_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002200 movq -15(%eax), %xmm0
2201 movq %xmm0, -15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002202L(fwd_write_7bytes):
2203 movl -7(%eax), %ecx
2204 movl %ecx, -7(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002205 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2206 movzbl -1(%eax), %eax
2207 movw %cx, -3(%edx)
2208 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2209#ifndef USE_AS_BCOPY
2210# ifdef USE_AS_MEMPCPY
2211 movl %edx, %eax
2212# else
2213 movl DEST(%esp), %eax
2214# endif
2215#endif
2216 RETURN
2217
/*
 * Forward tail copy for 43/35/27/19/11/3 remaining bytes (targets of
 * L(table_48bytes_fwd)).  Labels fall through, copying 8 bytes per step
 * via %xmm0 at negative offsets from the end pointers %eax (source) and
 * %edx (destination).  The last 3 bytes use a 2-byte and a 1-byte move.
 */
2218 .p2align 4
2219L(fwd_write_43bytes):
2220 movq -43(%eax), %xmm0
2221 movq %xmm0, -43(%edx)
2222L(fwd_write_35bytes):
2223 movq -35(%eax), %xmm0
2224 movq %xmm0, -35(%edx)
2225L(fwd_write_27bytes):
2226 movq -27(%eax), %xmm0
2227 movq %xmm0, -27(%edx)
2228L(fwd_write_19bytes):
2229 movq -19(%eax), %xmm0
2230 movq %xmm0, -19(%edx)
2231L(fwd_write_11bytes):
2232 movq -11(%eax), %xmm0
2233 movq %xmm0, -11(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002234L(fwd_write_3bytes):
2235 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2236 movzbl -1(%eax), %eax
2237 movw %cx, -3(%edx)
2238 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2239#ifndef USE_AS_BCOPY
2240# ifdef USE_AS_MEMPCPY
2241 movl %edx, %eax
2242# else
2243 movl DEST(%esp), %eax
2244# endif
2245#endif
Jack Renc47703a2012-02-14 12:01:52 +04002246 RETURN
2247
/*
 * Aligned forward tail copy for 40/24/8/0 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa, which faults
 * unless the effective address is 16-byte aligned; the final 8 bytes use
 * movq.  %eax (source) and %edx (destination) point just past the last
 * byte to copy.
 * NOTE(review): the code that guarantees the alignment of both pointers
 * is outside this chunk -- confirm against the dispatcher.
 */
2248 .p2align 4
2249L(fwd_write_40bytes_align):
2250 movdqa -40(%eax), %xmm0
2251 movdqa %xmm0, -40(%edx)
2252L(fwd_write_24bytes_align):
2253 movdqa -24(%eax), %xmm0
2254 movdqa %xmm0, -24(%edx)
2255L(fwd_write_8bytes_align):
2256 movq -8(%eax), %xmm0
2257 movq %xmm0, -8(%edx)
2258L(fwd_write_0bytes_align):
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2259#ifndef USE_AS_BCOPY
2260# ifdef USE_AS_MEMPCPY
2261 movl %edx, %eax
2262# else
2263 movl DEST(%esp), %eax
2264# endif
2265#endif
2266 RETURN
2267
/*
 * Aligned forward tail copy for 32/16 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Both steps use movdqa, so %eax-N and
 * %edx-N must be 16-byte aligned.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2268 .p2align 4
2269L(fwd_write_32bytes_align):
2270 movdqa -32(%eax), %xmm0
2271 movdqa %xmm0, -32(%edx)
2272L(fwd_write_16bytes_align):
2273 movdqa -16(%eax), %xmm0
2274 movdqa %xmm0, -16(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2275#ifndef USE_AS_BCOPY
2276# ifdef USE_AS_MEMPCPY
2277 movl %edx, %eax
2278# else
2279 movl DEST(%esp), %eax
2280# endif
2281#endif
2282 RETURN
2283
/*
 * Tail copy for exactly 5 bytes (entry 5 of L(table_48bytes_fwd_align)).
 * Uses two overlapping 4-byte moves; no aligned SSE access is involved.
 */
2284 .p2align 4
2285L(fwd_write_5bytes_align):
2286 movl -5(%eax), %ecx
/* %eax is reused as scratch here; the source pointer is not needed again. */
2287 movl -4(%eax), %eax
2288 movl %ecx, -5(%edx)
2289 movl %eax, -4(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2290#ifndef USE_AS_BCOPY
2291# ifdef USE_AS_MEMPCPY
2292 movl %edx, %eax
2293# else
2294 movl DEST(%esp), %eax
2295# endif
2296#endif
2297 RETURN
2298
/*
 * Aligned forward tail copy for 45/29/13 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (effective
 * address must be 16-byte aligned), followed by 8-, 4- and 1-byte moves
 * for the last 13 bytes.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2299 .p2align 4
2300L(fwd_write_45bytes_align):
2301 movdqa -45(%eax), %xmm0
2302 movdqa %xmm0, -45(%edx)
2303L(fwd_write_29bytes_align):
2304 movdqa -29(%eax), %xmm0
2305 movdqa %xmm0, -29(%edx)
2306L(fwd_write_13bytes_align):
2307 movq -13(%eax), %xmm0
2308 movq %xmm0, -13(%edx)
2309 movl -5(%eax), %ecx
2310 movl %ecx, -5(%edx)
2311 movzbl -1(%eax), %ecx
2312 movb %cl, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2313#ifndef USE_AS_BCOPY
2314# ifdef USE_AS_MEMPCPY
2315 movl %edx, %eax
2316# else
2317 movl DEST(%esp), %eax
2318# endif
2319#endif
2320 RETURN
2321
/*
 * Aligned forward tail copy for 37/21 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by a 4-byte and a 1-byte move.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2322 .p2align 4
2323L(fwd_write_37bytes_align):
2324 movdqa -37(%eax), %xmm0
2325 movdqa %xmm0, -37(%edx)
2326L(fwd_write_21bytes_align):
2327 movdqa -21(%eax), %xmm0
2328 movdqa %xmm0, -21(%edx)
2329 movl -5(%eax), %ecx
2330 movl %ecx, -5(%edx)
2331 movzbl -1(%eax), %ecx
2332 movb %cl, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2333#ifndef USE_AS_BCOPY
2334# ifdef USE_AS_MEMPCPY
2335 movl %edx, %eax
2336# else
2337 movl DEST(%esp), %eax
2338# endif
2339#endif
2340 RETURN
2341
/*
 * Aligned forward tail copy for 41/25/9/1 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then an 8-byte movq and a final single byte.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2342 .p2align 4
2343L(fwd_write_41bytes_align):
2344 movdqa -41(%eax), %xmm0
2345 movdqa %xmm0, -41(%edx)
2346L(fwd_write_25bytes_align):
2347 movdqa -25(%eax), %xmm0
2348 movdqa %xmm0, -25(%edx)
2349L(fwd_write_9bytes_align):
2350 movq -9(%eax), %xmm0
2351 movq %xmm0, -9(%edx)
2352L(fwd_write_1bytes_align):
2353 movzbl -1(%eax), %ecx
2354 movb %cl, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2355#ifndef USE_AS_BCOPY
2356# ifdef USE_AS_MEMPCPY
2357 movl %edx, %eax
2358# else
2359 movl DEST(%esp), %eax
2360# endif
2361#endif
2362 RETURN
2363
/*
 * Aligned forward tail copy for 33/17 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by the final single byte.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2364 .p2align 4
2365L(fwd_write_33bytes_align):
2366 movdqa -33(%eax), %xmm0
2367 movdqa %xmm0, -33(%edx)
2368L(fwd_write_17bytes_align):
2369 movdqa -17(%eax), %xmm0
2370 movdqa %xmm0, -17(%edx)
2371 movzbl -1(%eax), %ecx
2372 movb %cl, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2373#ifndef USE_AS_BCOPY
2374# ifdef USE_AS_MEMPCPY
2375 movl %edx, %eax
2376# else
2377 movl DEST(%esp), %eax
2378# endif
2379#endif
2380 RETURN
2381
/*
 * Aligned forward tail copy for 46/30/14/6 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then 8-, 4- and 2-byte moves finish the tail.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2382 .p2align 4
2383L(fwd_write_46bytes_align):
2384 movdqa -46(%eax), %xmm0
2385 movdqa %xmm0, -46(%edx)
2386L(fwd_write_30bytes_align):
2387 movdqa -30(%eax), %xmm0
2388 movdqa %xmm0, -30(%edx)
2389L(fwd_write_14bytes_align):
2390 movq -14(%eax), %xmm0
2391 movq %xmm0, -14(%edx)
2392L(fwd_write_6bytes_align):
2393 movl -6(%eax), %ecx
2394 movl %ecx, -6(%edx)
2395 movzwl -2(%eax), %ecx
2396 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2397#ifndef USE_AS_BCOPY
2398# ifdef USE_AS_MEMPCPY
2399 movl %edx, %eax
2400# else
2401 movl DEST(%esp), %eax
2402# endif
2403#endif
2404 RETURN
2405
/*
 * Aligned forward tail copy for 38/22 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by a 4-byte and a 2-byte move.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2406 .p2align 4
2407L(fwd_write_38bytes_align):
2408 movdqa -38(%eax), %xmm0
2409 movdqa %xmm0, -38(%edx)
2410L(fwd_write_22bytes_align):
2411 movdqa -22(%eax), %xmm0
2412 movdqa %xmm0, -22(%edx)
2413 movl -6(%eax), %ecx
2414 movl %ecx, -6(%edx)
2415 movzwl -2(%eax), %ecx
2416 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2417#ifndef USE_AS_BCOPY
2418# ifdef USE_AS_MEMPCPY
2419 movl %edx, %eax
2420# else
2421 movl DEST(%esp), %eax
2422# endif
2423#endif
2424 RETURN
2425
/*
 * Aligned forward tail copy for 42/26/10/2 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then an 8-byte movq and a final 2-byte move.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2426 .p2align 4
2427L(fwd_write_42bytes_align):
2428 movdqa -42(%eax), %xmm0
2429 movdqa %xmm0, -42(%edx)
2430L(fwd_write_26bytes_align):
2431 movdqa -26(%eax), %xmm0
2432 movdqa %xmm0, -26(%edx)
2433L(fwd_write_10bytes_align):
2434 movq -10(%eax), %xmm0
2435 movq %xmm0, -10(%edx)
2436L(fwd_write_2bytes_align):
2437 movzwl -2(%eax), %ecx
2438 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2439#ifndef USE_AS_BCOPY
2440# ifdef USE_AS_MEMPCPY
2441 movl %edx, %eax
2442# else
2443 movl DEST(%esp), %eax
2444# endif
2445#endif
2446 RETURN
2447
/*
 * Aligned forward tail copy for 34/18 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by the final 2 bytes.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2448 .p2align 4
2449L(fwd_write_34bytes_align):
2450 movdqa -34(%eax), %xmm0
2451 movdqa %xmm0, -34(%edx)
2452L(fwd_write_18bytes_align):
2453 movdqa -18(%eax), %xmm0
2454 movdqa %xmm0, -18(%edx)
2455 movzwl -2(%eax), %ecx
2456 movw %cx, -2(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2457#ifndef USE_AS_BCOPY
2458# ifdef USE_AS_MEMPCPY
2459 movl %edx, %eax
2460# else
2461 movl DEST(%esp), %eax
2462# endif
2463#endif
2464 RETURN
2465
/*
 * Aligned forward tail copy for 47/31/15/7 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then 8-, 4-, 2- and 1-byte moves finish the tail.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2466 .p2align 4
2467L(fwd_write_47bytes_align):
2468 movdqa -47(%eax), %xmm0
2469 movdqa %xmm0, -47(%edx)
2470L(fwd_write_31bytes_align):
2471 movdqa -31(%eax), %xmm0
2472 movdqa %xmm0, -31(%edx)
2473L(fwd_write_15bytes_align):
2474 movq -15(%eax), %xmm0
2475 movq %xmm0, -15(%edx)
2476L(fwd_write_7bytes_align):
2477 movl -7(%eax), %ecx
2478 movl %ecx, -7(%edx)
2479 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2480 movzbl -1(%eax), %eax
2481 movw %cx, -3(%edx)
2482 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2483#ifndef USE_AS_BCOPY
2484# ifdef USE_AS_MEMPCPY
2485 movl %edx, %eax
2486# else
2487 movl DEST(%esp), %eax
2488# endif
2489#endif
2490 RETURN
2491
/*
 * Aligned forward tail copy for 39/23 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by 4-, 2- and 1-byte moves.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2492 .p2align 4
2493L(fwd_write_39bytes_align):
2494 movdqa -39(%eax), %xmm0
2495 movdqa %xmm0, -39(%edx)
2496L(fwd_write_23bytes_align):
2497 movdqa -23(%eax), %xmm0
2498 movdqa %xmm0, -23(%edx)
2499 movl -7(%eax), %ecx
2500 movl %ecx, -7(%edx)
2501 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2502 movzbl -1(%eax), %eax
2503 movw %cx, -3(%edx)
2504 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2505#ifndef USE_AS_BCOPY
2506# ifdef USE_AS_MEMPCPY
2507 movl %edx, %eax
2508# else
2509 movl DEST(%esp), %eax
2510# endif
2511#endif
2512 RETURN
2513
/*
 * Aligned forward tail copy for 43/27/11/3 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then an 8-byte movq and 2-/1-byte moves.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2514 .p2align 4
2515L(fwd_write_43bytes_align):
2516 movdqa -43(%eax), %xmm0
2517 movdqa %xmm0, -43(%edx)
2518L(fwd_write_27bytes_align):
2519 movdqa -27(%eax), %xmm0
2520 movdqa %xmm0, -27(%edx)
2521L(fwd_write_11bytes_align):
2522 movq -11(%eax), %xmm0
2523 movq %xmm0, -11(%edx)
2524L(fwd_write_3bytes_align):
2525 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2526 movzbl -1(%eax), %eax
2527 movw %cx, -3(%edx)
2528 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2529#ifndef USE_AS_BCOPY
2530# ifdef USE_AS_MEMPCPY
2531 movl %edx, %eax
2532# else
2533 movl DEST(%esp), %eax
2534# endif
2535#endif
2536 RETURN
2537
/*
 * Aligned forward tail copy for 35/19 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by a 2-byte and a 1-byte move.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2538 .p2align 4
2539L(fwd_write_35bytes_align):
2540 movdqa -35(%eax), %xmm0
2541 movdqa %xmm0, -35(%edx)
2542L(fwd_write_19bytes_align):
2543 movdqa -19(%eax), %xmm0
2544 movdqa %xmm0, -19(%edx)
2545 movzwl -3(%eax), %ecx
/* Last source read: %eax is overwritten with the final byte. */
2546 movzbl -1(%eax), %eax
2547 movw %cx, -3(%edx)
2548 movb %al, -1(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2549#ifndef USE_AS_BCOPY
2550# ifdef USE_AS_MEMPCPY
2551 movl %edx, %eax
2552# else
2553 movl DEST(%esp), %eax
2554# endif
2555#endif
2556 RETURN
2557
/*
 * Aligned forward tail copy for 44/28/12/4 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  16-byte steps use movdqa (16-byte-aligned
 * addresses required), then an 8-byte movq and a final 4-byte move.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2558 .p2align 4
2559L(fwd_write_44bytes_align):
2560 movdqa -44(%eax), %xmm0
2561 movdqa %xmm0, -44(%edx)
2562L(fwd_write_28bytes_align):
2563 movdqa -28(%eax), %xmm0
2564 movdqa %xmm0, -28(%edx)
2565L(fwd_write_12bytes_align):
2566 movq -12(%eax), %xmm0
2567 movq %xmm0, -12(%edx)
2568L(fwd_write_4bytes_align):
2569 movl -4(%eax), %ecx
2570 movl %ecx, -4(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2571#ifndef USE_AS_BCOPY
2572# ifdef USE_AS_MEMPCPY
2573 movl %edx, %eax
2574# else
2575 movl DEST(%esp), %eax
2576# endif
2577#endif
2578 RETURN
2579
/*
 * Aligned forward tail copy for 36/20 remaining bytes (targets of
 * L(table_48bytes_fwd_align)).  Two movdqa steps (16-byte-aligned
 * addresses required) followed by the final 4 bytes.  This is the last
 * handler of the group and ends with RETURN_END instead of RETURN.
 * NOTE(review): alignment is established outside this chunk -- confirm.
 */
2580 .p2align 4
2581L(fwd_write_36bytes_align):
2582 movdqa -36(%eax), %xmm0
2583 movdqa %xmm0, -36(%edx)
2584L(fwd_write_20bytes_align):
2585 movdqa -20(%eax), %xmm0
2586 movdqa %xmm0, -20(%edx)
2587 movl -4(%eax), %ecx
2588 movl %ecx, -4(%edx)
/* Return %edx for USE_AS_MEMPCPY, else the value at DEST(%esp); none for bcopy. */
2589#ifndef USE_AS_BCOPY
2590# ifdef USE_AS_MEMPCPY
2591 movl %edx, %eax
2592# else
2593 movl DEST(%esp), %eax
2594# endif
2595#endif
Bruce Beare124a5422010-10-11 12:24:41 -07002596 RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002597
/*
 * Large-copy path: streams data with unaligned loads (movdqu) and
 * non-temporal stores (movntdq).  On entry %eax = source, %edx =
 * destination, %ecx = byte count, and %edi is on the stack (see the
 * CFI_PUSH below and the POP before the loop).
 * NOTE(review): movntdq requires a 16-byte-aligned destination; the
 * alignment of %edx is presumably established before entry -- confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002598 CFI_PUSH (%edi)
2599
2600 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002601L(large_page):
2602 movdqu (%eax), %xmm1
/*
 * memmove only: store %xmm0 at the address read from DEST+4(%esp).
 * NOTE(review): %xmm0 is loaded before this chunk, and the +4 presumably
 * accounts for the pushed %edi -- confirm against the caller.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002603#ifdef USE_AS_MEMMOVE
2604 movl DEST+4(%esp), %edi
2605 movdqu %xmm0, (%edi)
2606#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -08002607 lea 16(%eax), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08002608 movntdq %xmm1, (%edx)
2609 lea 16(%edx), %edx
/*
 * Bias the count: 0x10 for the 16 bytes just copied plus 0x80 so that the
 * borrow from `sub $0x80` in the loop signals that fewer than 0x80 bytes
 * remain.
 */
Bruce Beare8ff1a272010-03-04 11:03:37 -08002610 lea -0x90(%ecx), %ecx
2611 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +04002612
/* Main loop: 0x80 bytes per iteration through %xmm0-%xmm7. */
2613 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002614L(large_page_loop):
2615 movdqu (%eax), %xmm0
2616 movdqu 0x10(%eax), %xmm1
2617 movdqu 0x20(%eax), %xmm2
2618 movdqu 0x30(%eax), %xmm3
2619 movdqu 0x40(%eax), %xmm4
2620 movdqu 0x50(%eax), %xmm5
2621 movdqu 0x60(%eax), %xmm6
2622 movdqu 0x70(%eax), %xmm7
2623 lea 0x80(%eax), %eax
2624
/* The flags set here are consumed by the jae below; the stores and lea in between leave them untouched. */
2625 sub $0x80, %ecx
2626 movntdq %xmm0, (%edx)
2627 movntdq %xmm1, 0x10(%edx)
2628 movntdq %xmm2, 0x20(%edx)
2629 movntdq %xmm3, 0x30(%edx)
2630 movntdq %xmm4, 0x40(%edx)
2631 movntdq %xmm5, 0x50(%edx)
2632 movntdq %xmm6, 0x60(%edx)
2633 movntdq %xmm7, 0x70(%edx)
2634 lea 0x80(%edx), %edx
2635 jae L(large_page_loop)
/*
 * %ecx is now (remaining - 0x80).  Compare against -0x40 to test whether
 * at least 0x40 bytes remain; the lea removes the bias without changing
 * the flags used by the jl.
 */
2636 cmp $-0x40, %ecx
2637 lea 0x80(%ecx), %ecx
2638 jl L(large_page_less_64bytes)
2639
/* At least 64 bytes remain: copy one 64-byte block. */
2640 movdqu (%eax), %xmm0
2641 movdqu 0x10(%eax), %xmm1
2642 movdqu 0x20(%eax), %xmm2
2643 movdqu 0x30(%eax), %xmm3
2644 lea 0x40(%eax), %eax
2645
2646 movntdq %xmm0, (%edx)
2647 movntdq %xmm1, 0x10(%edx)
2648 movntdq %xmm2, 0x20(%edx)
2649 movntdq %xmm3, 0x30(%edx)
2650 lea 0x40(%edx), %edx
2651 sub $0x40, %ecx
2652L(large_page_less_64bytes):
/* Copy one more 32-byte block if at least 32 bytes remain. */
2653 cmp $32, %ecx
2654 jb L(large_page_less_32bytes)
2655 movdqu (%eax), %xmm0
2656 movdqu 0x10(%eax), %xmm1
2657 lea 0x20(%eax), %eax
2658 movntdq %xmm0, (%edx)
2659 movntdq %xmm1, 0x10(%edx)
2660 lea 0x20(%edx), %edx
2661 sub $0x20, %ecx
2662L(large_page_less_32bytes):
/*
 * Advance both pointers past the remaining bytes (the fwd_write_* tail
 * handlers address them with negative offsets), order the non-temporal
 * stores with sfence, then dispatch on the remaining count in %ecx.
 */
2663 add %ecx, %edx
2664 add %ecx, %eax
2665 sfence
2666 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2667
/*
 * Backward tail copy for 44/36/28/20/12/4/0 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) are
 * addressed with non-negative displacements, so they point at the first
 * of the remaining bytes.  Each label falls through to the next, copying
 * 8 bytes per step from the highest offset downwards, then the final 4.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002668 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002669L(bk_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002670 movq 36(%eax), %xmm0
2671 movq %xmm0, 36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002672L(bk_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002673 movq 28(%eax), %xmm0
2674 movq %xmm0, 28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002675L(bk_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002676 movq 20(%eax), %xmm0
2677 movq %xmm0, 20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002678L(bk_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002679 movq 12(%eax), %xmm0
2680 movq %xmm0, 12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002681L(bk_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002682 movq 4(%eax), %xmm0
2683 movq %xmm0, 4(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002684L(bk_write_4bytes):
2685 movl (%eax), %ecx
2686 movl %ecx, (%edx)
2687L(bk_write_0bytes):
/*
 * Return value: the value at DEST(%esp), plus the value at LEN(%esp) when
 * USE_AS_MEMPCPY is defined.  Nothing is loaded for USE_AS_BCOPY.
 */
2688#ifndef USE_AS_BCOPY
2689 movl DEST(%esp), %eax
2690# ifdef USE_AS_MEMPCPY
2691 movl LEN(%esp), %ecx
2692 add %ecx, %eax
2693# endif
2694#endif
2695 RETURN
2696
/*
 * Backward tail copy for 40/32/24/16/8 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset down to offset 0.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002697 .p2align 4
2698L(bk_write_40bytes):
2699 movq 32(%eax), %xmm0
2700 movq %xmm0, 32(%edx)
2701L(bk_write_32bytes):
2702 movq 24(%eax), %xmm0
2703 movq %xmm0, 24(%edx)
2704L(bk_write_24bytes):
2705 movq 16(%eax), %xmm0
2706 movq %xmm0, 16(%edx)
2707L(bk_write_16bytes):
2708 movq 8(%eax), %xmm0
2709 movq %xmm0, 8(%edx)
2710L(bk_write_8bytes):
2711 movq (%eax), %xmm0
2712 movq %xmm0, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2713#ifndef USE_AS_BCOPY
2714 movl DEST(%esp), %eax
2715# ifdef USE_AS_MEMPCPY
2716 movl LEN(%esp), %ecx
2717 add %ecx, %eax
2718# endif
2719#endif
2720 RETURN
2721
/*
 * Backward tail copy for 45/37/29/21/13/5/1 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset downwards, then 4 bytes at offset 1 and
 * the single byte at offset 0.
 */
2722 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002723L(bk_write_45bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002724 movq 37(%eax), %xmm0
2725 movq %xmm0, 37(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002726L(bk_write_37bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002727 movq 29(%eax), %xmm0
2728 movq %xmm0, 29(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002729L(bk_write_29bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002730 movq 21(%eax), %xmm0
2731 movq %xmm0, 21(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002732L(bk_write_21bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002733 movq 13(%eax), %xmm0
2734 movq %xmm0, 13(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002735L(bk_write_13bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002736 movq 5(%eax), %xmm0
2737 movq %xmm0, 5(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002738L(bk_write_5bytes):
2739 movl 1(%eax), %ecx
2740 movl %ecx, 1(%edx)
2741L(bk_write_1bytes):
2742 movzbl (%eax), %ecx
2743 movb %cl, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2744#ifndef USE_AS_BCOPY
2745 movl DEST(%esp), %eax
2746# ifdef USE_AS_MEMPCPY
2747 movl LEN(%esp), %ecx
2748 add %ecx, %eax
2749# endif
2750#endif
2751 RETURN
2752
/*
 * Backward tail copy for 41/33/25/17/9 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset down to offset 1, then the byte at
 * offset 0.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002753 .p2align 4
2754L(bk_write_41bytes):
2755 movq 33(%eax), %xmm0
2756 movq %xmm0, 33(%edx)
2757L(bk_write_33bytes):
2758 movq 25(%eax), %xmm0
2759 movq %xmm0, 25(%edx)
2760L(bk_write_25bytes):
2761 movq 17(%eax), %xmm0
2762 movq %xmm0, 17(%edx)
2763L(bk_write_17bytes):
2764 movq 9(%eax), %xmm0
2765 movq %xmm0, 9(%edx)
2766L(bk_write_9bytes):
2767 movq 1(%eax), %xmm0
2768 movq %xmm0, 1(%edx)
2769 movzbl (%eax), %ecx
2770 movb %cl, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2771#ifndef USE_AS_BCOPY
2772 movl DEST(%esp), %eax
2773# ifdef USE_AS_MEMPCPY
2774 movl LEN(%esp), %ecx
2775 add %ecx, %eax
2776# endif
2777#endif
2778 RETURN
2779
/*
 * Backward tail copy for 46/38/30/22/14/6 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset downwards, then 4 bytes at offset 2 and
 * 2 bytes at offset 0.
 */
2780 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002781L(bk_write_46bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002782 movq 38(%eax), %xmm0
2783 movq %xmm0, 38(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002784L(bk_write_38bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002785 movq 30(%eax), %xmm0
2786 movq %xmm0, 30(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002787L(bk_write_30bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002788 movq 22(%eax), %xmm0
2789 movq %xmm0, 22(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002790L(bk_write_22bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002791 movq 14(%eax), %xmm0
2792 movq %xmm0, 14(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002793L(bk_write_14bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002794 movq 6(%eax), %xmm0
2795 movq %xmm0, 6(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002796L(bk_write_6bytes):
2797 movl 2(%eax), %ecx
2798 movl %ecx, 2(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002799 movzwl (%eax), %ecx
2800 movw %cx, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2801#ifndef USE_AS_BCOPY
2802 movl DEST(%esp), %eax
2803# ifdef USE_AS_MEMPCPY
2804 movl LEN(%esp), %ecx
2805 add %ecx, %eax
2806# endif
2807#endif
2808 RETURN
2809
/*
 * Backward tail copy for 42/34/26/18/10/2 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset down to offset 2, then the 2 bytes at
 * offset 0.
 */
2810 .p2align 4
2811L(bk_write_42bytes):
2812 movq 34(%eax), %xmm0
2813 movq %xmm0, 34(%edx)
2814L(bk_write_34bytes):
2815 movq 26(%eax), %xmm0
2816 movq %xmm0, 26(%edx)
2817L(bk_write_26bytes):
2818 movq 18(%eax), %xmm0
2819 movq %xmm0, 18(%edx)
2820L(bk_write_18bytes):
2821 movq 10(%eax), %xmm0
2822 movq %xmm0, 10(%edx)
2823L(bk_write_10bytes):
2824 movq 2(%eax), %xmm0
2825 movq %xmm0, 2(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002826L(bk_write_2bytes):
2827 movzwl (%eax), %ecx
2828 movw %cx, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2829#ifndef USE_AS_BCOPY
2830 movl DEST(%esp), %eax
2831# ifdef USE_AS_MEMPCPY
2832 movl LEN(%esp), %ecx
2833 add %ecx, %eax
2834# endif
2835#endif
2836 RETURN
2837
/*
 * Backward tail copy for 47/39/31/23/15/7 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset downwards, then 4 bytes at offset 3,
 * 2 bytes at offset 1 and the byte at offset 0.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002838 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002839L(bk_write_47bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002840 movq 39(%eax), %xmm0
2841 movq %xmm0, 39(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002842L(bk_write_39bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002843 movq 31(%eax), %xmm0
2844 movq %xmm0, 31(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002845L(bk_write_31bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002846 movq 23(%eax), %xmm0
2847 movq %xmm0, 23(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002848L(bk_write_23bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002849 movq 15(%eax), %xmm0
2850 movq %xmm0, 15(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002851L(bk_write_15bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002852 movq 7(%eax), %xmm0
2853 movq %xmm0, 7(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002854L(bk_write_7bytes):
2855 movl 3(%eax), %ecx
2856 movl %ecx, 3(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002857 movzwl 1(%eax), %ecx
2858 movw %cx, 1(%edx)
/* Last source read: %eax is overwritten with the first byte. */
2859 movzbl (%eax), %eax
2860 movb %al, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2861#ifndef USE_AS_BCOPY
2862 movl DEST(%esp), %eax
2863# ifdef USE_AS_MEMPCPY
2864 movl LEN(%esp), %ecx
2865 add %ecx, %eax
2866# endif
2867#endif
2868 RETURN
2869
/*
 * Backward tail copy for 43/35/27/19/11/3 remaining bytes (targets of
 * L(table_48_bytes_bwd)).  %eax (source) and %edx (destination) point at
 * the first remaining byte; labels fall through, copying 8 bytes per
 * step from the highest offset down to offset 3, then 2 bytes at
 * offset 1 and the byte at offset 0.  This is the last handler of the
 * group and ends with RETURN_END instead of RETURN.
 */
2870 .p2align 4
2871L(bk_write_43bytes):
2872 movq 35(%eax), %xmm0
2873 movq %xmm0, 35(%edx)
2874L(bk_write_35bytes):
2875 movq 27(%eax), %xmm0
2876 movq %xmm0, 27(%edx)
2877L(bk_write_27bytes):
2878 movq 19(%eax), %xmm0
2879 movq %xmm0, 19(%edx)
2880L(bk_write_19bytes):
2881 movq 11(%eax), %xmm0
2882 movq %xmm0, 11(%edx)
2883L(bk_write_11bytes):
2884 movq 3(%eax), %xmm0
2885 movq %xmm0, 3(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002886L(bk_write_3bytes):
2887 movzwl 1(%eax), %ecx
2888 movw %cx, 1(%edx)
/* Last source read: %eax is overwritten with the first byte. */
2889 movzbl (%eax), %eax
2890 movb %al, (%edx)
/* Return the value at DEST(%esp) (+ LEN(%esp) for USE_AS_MEMPCPY); none for bcopy. */
2891#ifndef USE_AS_BCOPY
2892 movl DEST(%esp), %eax
2893# ifdef USE_AS_MEMPCPY
2894 movl LEN(%esp), %ecx
2895 add %ecx, %eax
2896# endif
2897#endif
2898 RETURN_END
2899
2900
2901 .pushsection .rodata.ssse3,"a",@progbits
/*
 * Jump table for the unaligned forward tail handlers: entry N (one .int
 * each, N = 0..47) refers to L(fwd_write_Nbytes).  Each entry is built by
 * the JMPTBL macro from the handler label and the table base.
 * NOTE(review): JMPTBL is defined outside this chunk; it presumably
 * yields a table-relative or absolute address depending on PIC -- confirm.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002902 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08002903L(table_48bytes_fwd):
2904 .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
2905 .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
2906 .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
2907 .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
2908 .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
2909 .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
2910 .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
2911 .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
2912 .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
2913 .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
2914 .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
2915 .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
2916 .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
2917 .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
2918 .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
2919 .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
2920 .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
2921 .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
2922 .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
2923 .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
2924 .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
2925 .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
2926 .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
2927 .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
2928 .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
2929 .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
2930 .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
2931 .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
2932 .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
2933 .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
2934 .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
2935 .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
2936 .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
2937 .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
2938 .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
2939 .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
2940 .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
2941 .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
2942 .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
2943 .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
2944 .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
2945 .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
2946 .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
2947 .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
2948 .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
2949 .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
2950 .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
2951 .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2952
/*
 * Jump table for the aligned forward tail handlers: entry N (one .int
 * each, N = 0..47) refers to L(fwd_write_Nbytes_align), whose 16-byte
 * steps use movdqa.
 * NOTE(review): the code that dispatches through this table is outside
 * this chunk -- confirm that it only does so for 16-byte-aligned pointers.
 */
Jack Renc47703a2012-02-14 12:01:52 +04002953 .p2align 2
2954L(table_48bytes_fwd_align):
2955 .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
2956 .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
2957 .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
2958 .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
2959 .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
2960 .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
2961 .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
2962 .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
2963 .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
2964 .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
2965 .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
2966 .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
2967 .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
2968 .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
2969 .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
2970 .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
2971 .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
2972 .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
2973 .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
2974 .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
2975 .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
2976 .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
2977 .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
2978 .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
2979 .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
2980 .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
2981 .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
2982 .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
2983 .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
2984 .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
2985 .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
2986 .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
2987 .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
2988 .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
2989 .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
2990 .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
2991 .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
2992 .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
2993 .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
2994 .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
2995 .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
2996 .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
2997 .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
2998 .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
2999 .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
3000 .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
3001 .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
3002 .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
3003
/*
 * Jump table with 16 entries referring to L(shl_0) .. L(shl_15).
 * NOTE(review): the shl_N handlers and the dispatch code are outside this
 * chunk; the index is presumably the source misalignment (0..15) relative
 * to the 16-byte-aligned destination -- confirm.
 */
3004 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08003005L(shl_table):
3006 .int JMPTBL (L(shl_0), L(shl_table))
3007 .int JMPTBL (L(shl_1), L(shl_table))
3008 .int JMPTBL (L(shl_2), L(shl_table))
3009 .int JMPTBL (L(shl_3), L(shl_table))
3010 .int JMPTBL (L(shl_4), L(shl_table))
3011 .int JMPTBL (L(shl_5), L(shl_table))
3012 .int JMPTBL (L(shl_6), L(shl_table))
3013 .int JMPTBL (L(shl_7), L(shl_table))
3014 .int JMPTBL (L(shl_8), L(shl_table))
3015 .int JMPTBL (L(shl_9), L(shl_table))
3016 .int JMPTBL (L(shl_10), L(shl_table))
3017 .int JMPTBL (L(shl_11), L(shl_table))
3018 .int JMPTBL (L(shl_12), L(shl_table))
3019 .int JMPTBL (L(shl_13), L(shl_table))
3020 .int JMPTBL (L(shl_14), L(shl_table))
3021 .int JMPTBL (L(shl_15), L(shl_table))
3022
/*
 * Jump table for the backward tail handlers: entry N (one .int each,
 * N = 0..47) refers to L(bk_write_Nbytes).
 * NOTE(review): the code that dispatches through this table is outside
 * this chunk.  Note the name is spelled table_48_bytes_bwd, unlike the
 * table_48bytes_fwd* tables.
 */
Jack Renc47703a2012-02-14 12:01:52 +04003023 .p2align 2
Bruce Beare8ff1a272010-03-04 11:03:37 -08003024L(table_48_bytes_bwd):
3025 .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
3026 .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
3027 .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
3028 .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
3029 .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
3030 .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
3031 .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
3032 .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
3033 .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
3034 .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
3035 .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
3036 .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
3037 .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
3038 .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
3039 .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
3040 .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
3041 .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
3042 .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
3043 .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
3044 .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
3045 .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
3046 .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
3047 .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
3048 .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
3049 .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
3050 .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
3051 .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
3052 .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
3053 .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
3054 .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
3055 .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
3056 .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
3057 .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
3058 .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
3059 .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
3060 .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
3061 .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
3062 .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
3063 .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
3064 .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
3065 .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
3066 .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
3067 .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
3068 .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
3069 .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
3070 .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
3071 .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
3072 .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
3073
3074 .popsection
3075
3076#ifdef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +04003077 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08003078L(copy_backward):
/* Entry of the backward (high address to low address) copy path, built
   only when USE_AS_MEMMOVE is defined.
   Register roles as used by the code below:
     EAX - copied into EDI here; EDI is the address loads are made from.
     EDX - address stores are made to.
     ECX - number of bytes still to copy.
   NOTE(review): the values are set up before this section; confirm
   against the function entry that EAX/EDX/ECX are source, destination
   and length on arrival.  */
/* Save EDI; it is restored by the POP in L(bk_write_less32bytes).  */
Jack Renc47703a2012-02-14 12:01:52 +04003079 PUSH (%edi)
3080 movl %eax, %edi
/* Advance both pointers by ECX so that they point one past the last
   byte; all copying below works downwards from there.  */
Bruce Beare8ff1a272010-03-04 11:03:37 -08003081 lea (%ecx,%edx,1),%edx
Jack Renc47703a2012-02-14 12:01:52 +04003082 lea (%ecx,%edi,1),%edi
/* If the end of the store region is not 4-byte aligned, fix that up
   first in L(bk_align).  */
Bruce Beare8ff1a272010-03-04 11:03:37 -08003083 testl $0x3, %edx
3084 jnz L(bk_align)
3085
3086L(bk_aligned_4):
/* Reached with EDX (end of the store region) 4-byte aligned.
   Dispatch on the remaining byte count in ECX (unsigned compares):
     ECX >= 64        -> L(bk_write_more64bytes)
     32 <= ECX < 64   -> fall through into L(bk_write_more32bytes)
     ECX < 32         -> L(bk_write_less32bytes)  */
3087 cmp $64, %ecx
3088 jae L(bk_write_more64bytes)
3089
3090L(bk_write_64bytesless):
/* Also entered from the end of the L(bk_ssse3_cpy) loop, with
   ECX < 64.  */
3091 cmp $32, %ecx
3092 jb L(bk_write_less32bytes)
3093
3094L(bk_write_more32bytes):
3095 /* Copy one 32-byte chunk, as four 8-byte moves through XMM0, from
   the 32 bytes just below EDI to the 32 bytes just below EDX, highest
   quadword first.  This is straight-line code, not a loop: execution
   falls through into L(bk_write_less32bytes) afterwards.  */
3096 sub $32, %ecx
Jack Renc47703a2012-02-14 12:01:52 +04003097 movq -8(%edi), %xmm0
3098 movq %xmm0, -8(%edx)
3099 movq -16(%edi), %xmm0
3100 movq %xmm0, -16(%edx)
3101 movq -24(%edi), %xmm0
3102 movq %xmm0, -24(%edx)
3103 movq -32(%edi), %xmm0
3104 movq %xmm0, -32(%edx)
/* Step both end pointers down past the chunk just copied.  */
Bruce Beare8ff1a272010-03-04 11:03:37 -08003105 sub $32, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003106 sub $32, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003107
3108L(bk_write_less32bytes):
/* Finish the copy through the jump table.  ECX holds the number of
   bytes still to copy; EDI and EDX point one past the last uncopied
   byte of the load and store regions.  */
/* Move the load pointer back into EAX, then pull both pointers back by
   ECX so that EAX and EDX address the lowest uncopied byte.
   NOTE(review): this is presumably the register layout the
   L(bk_write_<N>bytes) handlers expect; they are outside this
   section.  */
Jack Renc47703a2012-02-14 12:01:52 +04003109 movl %edi, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003110 sub %ecx, %edx
3111 sub %ecx, %eax
/* Restore the EDI saved by the PUSH at L(copy_backward).  */
Jack Renc47703a2012-02-14 12:01:52 +04003112 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -08003113L(bk_write_less32bytes_2):
/* Dispatch on ECX through the 48-entry table (scale 4 = size of one
   .int entry).  BRANCH_TO_JMPTBL_ENTRY is defined elsewhere;
   presumably control does not come back here.  */
3114 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
3115
/* The code that follows is reached by jumps from before the POP above,
   i.e. with EDI still saved on the stack.  NOTE(review): CFI_PUSH is
   defined elsewhere; presumably it emits only unwind information
   (no instruction) to describe that state -- confirm.  */
Jack Renc47703a2012-02-14 12:01:52 +04003116 CFI_PUSH (%edi)
3117
3118 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08003119L(bk_align):
/* Bring EDX (end of the store region) to 4-byte alignment by copying
   one byte and/or one 16-bit word from the top of the buffers, then
   continue at L(bk_aligned_4).  EAX is used as scratch here; it is
   reloaded from EDI in L(bk_write_less32bytes).  */
/* With 8 bytes or fewer left, skip the alignment and let the jump
   table finish the copy.  */
3120 cmp $8, %ecx
3121 jbe L(bk_write_less32bytes)
3122 testl $1, %edx
3123 /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
Jack Renc47703a2012-02-14 12:01:52 +04003124 then (EDX & 2) must be != 0. */
Bruce Beare8ff1a272010-03-04 11:03:37 -08003125 jz L(bk_got2)
/* EDX is odd: copy a single byte.  */
Jack Renc47703a2012-02-14 12:01:52 +04003126 sub $1, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003127 sub $1, %ecx
3128 sub $1, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003129 movzbl (%edi), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003130 movb %al, (%edx)
3131
/* EDX is now even; if bit 1 is clear as well it is 4-byte aligned.  */
3132 testl $2, %edx
3133 jz L(bk_aligned_4)
3134
3135L(bk_got2):
/* EDX is even with bit 1 set: copy one 16-bit word, after which EDX is
   4-byte aligned.  */
Jack Renc47703a2012-02-14 12:01:52 +04003136 sub $2, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003137 sub $2, %ecx
3138 sub $2, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003139 movzwl (%edi), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003140 movw %ax, (%edx)
3141 jmp L(bk_aligned_4)
3142
Jack Renc47703a2012-02-14 12:01:52 +04003143 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08003144L(bk_write_more64bytes):
/* Entered from L(bk_aligned_4) with ECX >= 64.  Bring EDX (end of the
   store region) to 16-byte alignment so that the loop at
   L(bk_ssse3_cpy) can use aligned stores.  EAX is used as scratch.  */
3145 /* Check the alignment of EDX, which points one past the last byte
   to store. */
3146 testl $15, %edx
3147 jz L(bk_ssse3_cpy_pre)
3148
3149/* EDX is aligned 4 bytes, but not 16 bytes. */
3150L(bk_ssse3_align):
/* Copy 4 bytes at a time, downwards, until EDX is 16-byte aligned.
   Starting from a 4-byte-aligned EDX this takes at most three steps,
   which is why the third step is not followed by another test.  */
Jack Renc47703a2012-02-14 12:01:52 +04003151 sub $4, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003152 sub $4, %ecx
3153 sub $4, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003154 movl (%edi), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003155 movl %eax, (%edx)
3156
3157 testl $15, %edx
3158 jz L(bk_ssse3_cpy_pre)
3159
Jack Renc47703a2012-02-14 12:01:52 +04003160 sub $4, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003161 sub $4, %ecx
3162 sub $4, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003163 movl (%edi), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003164 movl %eax, (%edx)
3165
3166 testl $15, %edx
3167 jz L(bk_ssse3_cpy_pre)
3168
Jack Renc47703a2012-02-14 12:01:52 +04003169 sub $4, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003170 sub $4, %ecx
3171 sub $4, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003172 movl (%edi), %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -08003173 movl %eax, (%edx)
3175L(bk_ssse3_cpy_pre):
/* EDX is 16-byte aligned here.  The alignment steps above may have
   consumed up to 12 bytes, so re-check the count: with fewer than 64
   bytes left (but still at least 52, since ECX was >= 64 before
   aligning) copy a single 32-byte chunk instead of entering the
   loop.  */
3176 cmp $64, %ecx
3177 jb L(bk_write_more32bytes)
3178
Jack Renc47703a2012-02-14 12:01:52 +04003179 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08003180L(bk_ssse3_cpy):
/* Main backward loop: 64 bytes per iteration, highest 16-byte block
   first.  Loads from EDI use movdqu (no alignment requirement); stores
   to EDX use movdqa, which requires the 16-byte alignment of EDX
   established before the loop and kept by stepping 64 bytes at a
   time.  */
Jack Renc47703a2012-02-14 12:01:52 +04003181 sub $64, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -08003182 sub $64, %ecx
3183 sub $64, %edx
Jack Renc47703a2012-02-14 12:01:52 +04003184 movdqu 0x30(%edi), %xmm3
Bruce Beare8ff1a272010-03-04 11:03:37 -08003185 movdqa %xmm3, 0x30(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04003186 movdqu 0x20(%edi), %xmm2
Bruce Beare8ff1a272010-03-04 11:03:37 -08003187 movdqa %xmm2, 0x20(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04003188 movdqu 0x10(%edi), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -08003189 movdqa %xmm1, 0x10(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04003190 movdqu (%edi), %xmm0
Bruce Beare8ff1a272010-03-04 11:03:37 -08003191 movdqa %xmm0, (%edx)
/* Loop while at least 64 bytes remain, then hand the remainder
   (ECX < 64) to the small-size path.  */
3192 cmp $64, %ecx
3193 jae L(bk_ssse3_cpy)
3194 jmp L(bk_write_64bytesless)
3195
3196#endif
3197
3198END (MEMCPY)