blob: 1080a38490a32ca254603fcf83c3b924fe644948 [file] [log] [blame]
/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
Liubov Dmitrieva0a490662012-01-17 12:55:46 +040031#include "cache.h"
32#undef __i686
33
/* Symbol under which the routine below is assembled.  It can be defined
   before this point to emit the same code under another name; when it
   is not, the routine is named memcpy.  */
#ifndef MEMCPY
# define MEMCPY		memcpy
#endif
37
/* L (name) spells a label that is private to this file: it pastes the
   ".L" prefix onto NAME, which the GNU assembler treats as a local
   (non-exported) label.  */
#ifndef L
# define L(label)	.L##label
#endif
41
/* Thin wrappers around the assembler's call-frame-information (.cfi_*)
   directives.  Each wrapper is defined only when nothing earlier has
   already provided a definition of the same name.  */

/* Open the call-frame description of a function.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

/* Close the call-frame description opened by cfi_startproc.  */
#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

/* REG's previous value is saved at offset OFF from the current CFA
   register.  */
#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* REG is back to the rule it had at the start of the function.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

/* The CFA offset changes by OFF bytes (positive after a push, negative
   after a pop).  */
#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif
61
/* ENTRY (name) opens a global function: it types NAME as a function
   symbol, exports it, aligns it on a 16-byte boundary (.p2align 4),
   emits the label and starts its call-frame information.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END (name) closes a function opened with ENTRY: it ends the call-frame
   information and records the symbol's size as the distance from NAME
   to the current location.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
76
/* Offsets from %esp of the three stack arguments, for use as
   SRC(%esp), DEST(%esp) and LEN(%esp).  PARMS is the offset of the first
   argument.  With USE_AS_BCOPY the source is the first argument and the
   destination the second; otherwise the destination comes first.  The
   length is the third argument in both layouts.  */
#ifdef USE_AS_BCOPY
# define SRC		PARMS
# define DEST		SRC+4
# define LEN		DEST+4
#else
# define DEST		PARMS
# define SRC		DEST+4
# define LEN		SRC+4
#endif
86
/* Call-frame bookkeeping that accompanies a 4-byte push of REG: the CFA
   offset grows by 4 and REG is recorded as saved at offset 0 from the
   current CFA register.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* Call-frame bookkeeping that accompanies a 4-byte pop of REG: the CFA
   offset shrinks by 4 and REG is marked as restored.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push / pop REG and keep the call-frame information in step.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)
97
/* Pieces that differ between position-independent and absolute builds.

   PARMS        Offset from %esp of the first stack argument once
		ENTRANCE has run: 4 for the return address, plus 4 more
		when %ebx has been pushed.
   ENTRANCE     Function prologue.
   RETURN_END   Function epilogue followed by ret.
   RETURN       RETURN_END, then (PIC only) a CFI_PUSH (%ebx) so the
		call-frame information for the code assembled after this
		return again describes %ebx as saved on the stack.
   JMPTBL(I, B) One jump-table entry for target I in the table based at
		B: the offset I - B for PIC, the address I otherwise.  */
#if (defined SHARED || defined __PIC__)
# define PARMS		8		/* Preserve EBX.  */
# define ENTRANCE	PUSH (%ebx);
# define RETURN_END	POP (%ebx); ret
# define RETURN		RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B)	I - B
/* NOTE(review): presumably undefined so that a predefined __i686 macro
   cannot be expanded inside the thunk name below -- confirm.  */
# undef __i686

/* Call the get-PC thunk for register X (e.g. "bx" for %ebx).  */
# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.
   Clobbers %ebx.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    /* We first load PC into EBX.  */				\
    SETUP_PIC_REG(bx);						\
    /* Get the address of the jump table.  */			\
    addl	$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    addl	(%ebx, INDEX, SCALE), %ebx;			\
    /* We loaded the jump table.  Go.  */			\
    jmp		*%ebx
#else

# define PARMS		4
# define ENTRANCE
# define RETURN_END	ret
# define RETURN		RETURN_END
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
    jmp		*TABLE(, INDEX, SCALE)
#endif
137
138 .section .text.ssse3,"ax",@progbits
139ENTRY (MEMCPY)
140 ENTRANCE
141 movl LEN(%esp), %ecx
142 movl SRC(%esp), %eax
143 movl DEST(%esp), %edx
144
145#ifdef USE_AS_MEMMOVE
146 cmp %eax, %edx
147 jb L(copy_forward)
148 je L(fwd_write_0bytes)
149 cmp $32, %ecx
150 jae L(memmove_bwd)
151 jmp L(bk_write_less32bytes_2)
Jack Renc47703a2012-02-14 12:01:52 +0400152
153 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800154L(memmove_bwd):
155 add %ecx, %eax
156 cmp %eax, %edx
157 movl SRC(%esp), %eax
158 jb L(copy_backward)
159
160L(copy_forward):
161#endif
162 cmp $48, %ecx
163 jae L(48bytesormore)
164
165L(fwd_write_less32bytes):
166#ifndef USE_AS_MEMMOVE
167 cmp %dl, %al
168 jb L(bk_write)
169#endif
170 add %ecx, %edx
171 add %ecx, %eax
172 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
173#ifndef USE_AS_MEMMOVE
Jack Renc47703a2012-02-14 12:01:52 +0400174 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800175L(bk_write):
176 BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
177#endif
178
Jack Renc47703a2012-02-14 12:01:52 +0400179 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800180L(48bytesormore):
Jack Renc47703a2012-02-14 12:01:52 +0400181#ifndef USE_AS_MEMMOVE
182 movlpd (%eax), %xmm0
183 movlpd 8(%eax), %xmm1
184 movlpd %xmm0, (%edx)
185 movlpd %xmm1, 8(%edx)
186#else
Bruce Beare8ff1a272010-03-04 11:03:37 -0800187 movdqu (%eax), %xmm0
Jack Renc47703a2012-02-14 12:01:52 +0400188#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800189 PUSH (%edi)
190 movl %edx, %edi
191 and $-16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800192 add $16, %edx
Bruce Beare8ff1a272010-03-04 11:03:37 -0800193 sub %edx, %edi
194 add %edi, %ecx
195 sub %edi, %eax
196
197#ifdef SHARED_CACHE_SIZE_HALF
198 cmp $SHARED_CACHE_SIZE_HALF, %ecx
199#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800200# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400201 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800202 add $_GLOBAL_OFFSET_TABLE_, %ebx
203 cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
204# else
205 cmp __x86_shared_cache_size_half, %ecx
206# endif
207#endif
208
209 mov %eax, %edi
210 jae L(large_page)
211 and $0xf, %edi
212 jz L(shl_0)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800213 BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
214
Jack Renc47703a2012-02-14 12:01:52 +0400215 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800216L(shl_0):
Jack Renc47703a2012-02-14 12:01:52 +0400217#ifdef USE_AS_MEMMOVE
218 movl DEST+4(%esp), %edi
219 movdqu %xmm0, (%edi)
220#endif
Bruce Beare8ff1a272010-03-04 11:03:37 -0800221 xor %edi, %edi
Bruce Beare8ff1a272010-03-04 11:03:37 -0800222 cmp $127, %ecx
223 ja L(shl_0_gobble)
224 lea -32(%ecx), %ecx
Jack Renc47703a2012-02-14 12:01:52 +0400225
226 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800227L(shl_0_loop):
228 movdqa (%eax, %edi), %xmm0
229 movdqa 16(%eax, %edi), %xmm1
230 sub $32, %ecx
231 movdqa %xmm0, (%edx, %edi)
232 movdqa %xmm1, 16(%edx, %edi)
233 lea 32(%edi), %edi
234 jb L(shl_0_end)
235
236 movdqa (%eax, %edi), %xmm0
237 movdqa 16(%eax, %edi), %xmm1
238 sub $32, %ecx
239 movdqa %xmm0, (%edx, %edi)
240 movdqa %xmm1, 16(%edx, %edi)
241 lea 32(%edi), %edi
242 jb L(shl_0_end)
243
244 movdqa (%eax, %edi), %xmm0
245 movdqa 16(%eax, %edi), %xmm1
246 sub $32, %ecx
247 movdqa %xmm0, (%edx, %edi)
248 movdqa %xmm1, 16(%edx, %edi)
249 lea 32(%edi), %edi
250 jb L(shl_0_end)
251
252 movdqa (%eax, %edi), %xmm0
253 movdqa 16(%eax, %edi), %xmm1
254 sub $32, %ecx
255 movdqa %xmm0, (%edx, %edi)
256 movdqa %xmm1, 16(%edx, %edi)
257 lea 32(%edi), %edi
Jack Renc47703a2012-02-14 12:01:52 +0400258
Bruce Beare8ff1a272010-03-04 11:03:37 -0800259L(shl_0_end):
260 lea 32(%ecx), %ecx
261 add %ecx, %edi
262 add %edi, %edx
263 add %edi, %eax
264 POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +0400265 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800266
Bruce Beare124a5422010-10-11 12:24:41 -0700267 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800268
Jack Renc47703a2012-02-14 12:01:52 +0400269 .p2align 4
270L(shl_0_gobble):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800271#ifdef DATA_CACHE_SIZE_HALF
272 cmp $DATA_CACHE_SIZE_HALF, %ecx
273#else
Nick Kralevich5982e332011-11-11 15:47:24 -0800274# if (defined SHARED || defined __PIC__)
Jack Renc47703a2012-02-14 12:01:52 +0400275 SETUP_PIC_REG(bx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800276 add $_GLOBAL_OFFSET_TABLE_, %ebx
277 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
278# else
279 cmp __x86_data_cache_size_half, %ecx
280# endif
281#endif
Jack Renc47703a2012-02-14 12:01:52 +0400282 POP (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800283 lea -128(%ecx), %ecx
284 jae L(shl_0_gobble_mem_loop)
Jack Renc47703a2012-02-14 12:01:52 +0400285
286 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800287L(shl_0_gobble_cache_loop):
288 movdqa (%eax), %xmm0
289 movdqa 0x10(%eax), %xmm1
290 movdqa 0x20(%eax), %xmm2
291 movdqa 0x30(%eax), %xmm3
292 movdqa 0x40(%eax), %xmm4
293 movdqa 0x50(%eax), %xmm5
294 movdqa 0x60(%eax), %xmm6
295 movdqa 0x70(%eax), %xmm7
296 lea 0x80(%eax), %eax
297 sub $128, %ecx
298 movdqa %xmm0, (%edx)
299 movdqa %xmm1, 0x10(%edx)
300 movdqa %xmm2, 0x20(%edx)
301 movdqa %xmm3, 0x30(%edx)
302 movdqa %xmm4, 0x40(%edx)
303 movdqa %xmm5, 0x50(%edx)
304 movdqa %xmm6, 0x60(%edx)
305 movdqa %xmm7, 0x70(%edx)
306 lea 0x80(%edx), %edx
307
308 jae L(shl_0_gobble_cache_loop)
309 cmp $-0x40, %ecx
310 lea 0x80(%ecx), %ecx
311 jl L(shl_0_cache_less_64bytes)
312
313 movdqa (%eax), %xmm0
314 sub $0x40, %ecx
315 movdqa 0x10(%eax), %xmm1
Bruce Beare8ff1a272010-03-04 11:03:37 -0800316 movdqa %xmm0, (%edx)
317 movdqa %xmm1, 0x10(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800318 movdqa 0x20(%eax), %xmm0
319 movdqa 0x30(%eax), %xmm1
320 add $0x40, %eax
Bruce Beare8ff1a272010-03-04 11:03:37 -0800321 movdqa %xmm0, 0x20(%edx)
322 movdqa %xmm1, 0x30(%edx)
323 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400324
Bruce Beare8ff1a272010-03-04 11:03:37 -0800325L(shl_0_cache_less_64bytes):
326 cmp $0x20, %ecx
327 jb L(shl_0_cache_less_32bytes)
328 movdqa (%eax), %xmm0
329 sub $0x20, %ecx
330 movdqa 0x10(%eax), %xmm1
331 add $0x20, %eax
332 movdqa %xmm0, (%edx)
333 movdqa %xmm1, 0x10(%edx)
334 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400335
Bruce Beare8ff1a272010-03-04 11:03:37 -0800336L(shl_0_cache_less_32bytes):
337 cmp $0x10, %ecx
338 jb L(shl_0_cache_less_16bytes)
339 sub $0x10, %ecx
340 movdqa (%eax), %xmm0
341 add $0x10, %eax
342 movdqa %xmm0, (%edx)
343 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400344
Bruce Beare8ff1a272010-03-04 11:03:37 -0800345L(shl_0_cache_less_16bytes):
346 add %ecx, %edx
347 add %ecx, %eax
348 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
349
Jack Renc47703a2012-02-14 12:01:52 +0400350 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800351L(shl_0_gobble_mem_loop):
352 prefetcht0 0x1c0(%eax)
353 prefetcht0 0x280(%eax)
354 prefetcht0 0x1c0(%edx)
355
356 movdqa (%eax), %xmm0
357 movdqa 0x10(%eax), %xmm1
358 movdqa 0x20(%eax), %xmm2
359 movdqa 0x30(%eax), %xmm3
360 movdqa 0x40(%eax), %xmm4
361 movdqa 0x50(%eax), %xmm5
362 movdqa 0x60(%eax), %xmm6
363 movdqa 0x70(%eax), %xmm7
364 lea 0x80(%eax), %eax
365 sub $0x80, %ecx
366 movdqa %xmm0, (%edx)
367 movdqa %xmm1, 0x10(%edx)
368 movdqa %xmm2, 0x20(%edx)
369 movdqa %xmm3, 0x30(%edx)
370 movdqa %xmm4, 0x40(%edx)
371 movdqa %xmm5, 0x50(%edx)
372 movdqa %xmm6, 0x60(%edx)
373 movdqa %xmm7, 0x70(%edx)
374 lea 0x80(%edx), %edx
375
376 jae L(shl_0_gobble_mem_loop)
377 cmp $-0x40, %ecx
378 lea 0x80(%ecx), %ecx
379 jl L(shl_0_mem_less_64bytes)
380
381 movdqa (%eax), %xmm0
382 sub $0x40, %ecx
383 movdqa 0x10(%eax), %xmm1
384
385 movdqa %xmm0, (%edx)
386 movdqa %xmm1, 0x10(%edx)
387
388 movdqa 0x20(%eax), %xmm0
389 movdqa 0x30(%eax), %xmm1
390 add $0x40, %eax
391
392 movdqa %xmm0, 0x20(%edx)
393 movdqa %xmm1, 0x30(%edx)
394 add $0x40, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400395
Bruce Beare8ff1a272010-03-04 11:03:37 -0800396L(shl_0_mem_less_64bytes):
397 cmp $0x20, %ecx
398 jb L(shl_0_mem_less_32bytes)
399 movdqa (%eax), %xmm0
400 sub $0x20, %ecx
401 movdqa 0x10(%eax), %xmm1
402 add $0x20, %eax
403 movdqa %xmm0, (%edx)
404 movdqa %xmm1, 0x10(%edx)
405 add $0x20, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400406
Bruce Beare8ff1a272010-03-04 11:03:37 -0800407L(shl_0_mem_less_32bytes):
408 cmp $0x10, %ecx
409 jb L(shl_0_mem_less_16bytes)
410 sub $0x10, %ecx
411 movdqa (%eax), %xmm0
412 add $0x10, %eax
413 movdqa %xmm0, (%edx)
414 add $0x10, %edx
Jack Renc47703a2012-02-14 12:01:52 +0400415
Bruce Beare8ff1a272010-03-04 11:03:37 -0800416L(shl_0_mem_less_16bytes):
417 add %ecx, %edx
418 add %ecx, %eax
Jack Renc47703a2012-02-14 12:01:52 +0400419 BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800420
Jack Renc47703a2012-02-14 12:01:52 +0400421 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800422L(shl_1):
Jack Renc47703a2012-02-14 12:01:52 +0400423#ifndef USE_AS_MEMMOVE
424 movaps -1(%eax), %xmm1
425#else
426 movl DEST+4(%esp), %edi
427 movaps -1(%eax), %xmm1
428 movdqu %xmm0, (%edi)
429#endif
430#ifdef DATA_CACHE_SIZE_HALF
431 cmp $DATA_CACHE_SIZE_HALF, %ecx
432#else
433# if (defined SHARED || defined __PIC__)
434 SETUP_PIC_REG(bx)
435 add $_GLOBAL_OFFSET_TABLE_, %ebx
436 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
437# else
438 cmp __x86_data_cache_size_half, %ecx
439# endif
440#endif
441 jb L(sh_1_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800442
Jack Renc47703a2012-02-14 12:01:52 +0400443 lea -64(%ecx), %ecx
444
445 .p2align 4
446L(Shl1LoopStart):
447 prefetcht0 0x1c0(%eax)
448 prefetcht0 0x1c0(%edx)
449 movaps 15(%eax), %xmm2
450 movaps 31(%eax), %xmm3
451 movaps 47(%eax), %xmm4
452 movaps 63(%eax), %xmm5
453 movaps %xmm5, %xmm7
454 palignr $1, %xmm4, %xmm5
455 palignr $1, %xmm3, %xmm4
456 movaps %xmm5, 48(%edx)
457 palignr $1, %xmm2, %xmm3
458 lea 64(%eax), %eax
459 palignr $1, %xmm1, %xmm2
460 movaps %xmm4, 32(%edx)
461 movaps %xmm3, 16(%edx)
462 movaps %xmm7, %xmm1
463 movaps %xmm2, (%edx)
464 lea 64(%edx), %edx
465 sub $64, %ecx
466 ja L(Shl1LoopStart)
467
468L(Shl1LoopLeave):
469 add $32, %ecx
470 jle L(shl_end_0)
471
472 movaps 15(%eax), %xmm2
473 movaps 31(%eax), %xmm3
474 palignr $1, %xmm2, %xmm3
475 palignr $1, %xmm1, %xmm2
476 movaps %xmm2, (%edx)
477 movaps %xmm3, 16(%edx)
478 lea 32(%edx, %ecx), %edx
479 lea 32(%eax, %ecx), %eax
480 POP (%edi)
481 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
482
483 CFI_PUSH (%edi)
484
485 .p2align 4
486L(sh_1_no_prefetch):
487 lea -32(%ecx), %ecx
488 lea -1(%eax), %eax
489 xor %edi, %edi
490
491 .p2align 4
492L(sh_1_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800493 movdqa 16(%eax, %edi), %xmm2
494 sub $32, %ecx
495 movdqa 32(%eax, %edi), %xmm3
496 movdqa %xmm3, %xmm4
497 palignr $1, %xmm2, %xmm3
498 palignr $1, %xmm1, %xmm2
499 lea 32(%edi), %edi
500 movdqa %xmm2, -32(%edx, %edi)
501 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400502 jb L(sh_1_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800503
504 movdqa 16(%eax, %edi), %xmm2
505 sub $32, %ecx
506 movdqa 32(%eax, %edi), %xmm3
507 movdqa %xmm3, %xmm1
508 palignr $1, %xmm2, %xmm3
509 palignr $1, %xmm4, %xmm2
510 lea 32(%edi), %edi
511 movdqa %xmm2, -32(%edx, %edi)
512 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400513 jae L(sh_1_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800514
Jack Renc47703a2012-02-14 12:01:52 +0400515L(sh_1_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800516 lea 32(%ecx), %ecx
517 add %ecx, %edi
518 add %edi, %edx
519 lea 1(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400520 POP (%edi)
521 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800522
Jack Renc47703a2012-02-14 12:01:52 +0400523 CFI_PUSH (%edi)
524
525 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800526L(shl_2):
Jack Renc47703a2012-02-14 12:01:52 +0400527#ifndef USE_AS_MEMMOVE
528 movaps -2(%eax), %xmm1
529#else
530 movl DEST+4(%esp), %edi
531 movaps -2(%eax), %xmm1
532 movdqu %xmm0, (%edi)
533#endif
534#ifdef DATA_CACHE_SIZE_HALF
535 cmp $DATA_CACHE_SIZE_HALF, %ecx
536#else
537# if (defined SHARED || defined __PIC__)
538 SETUP_PIC_REG(bx)
539 add $_GLOBAL_OFFSET_TABLE_, %ebx
540 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
541# else
542 cmp __x86_data_cache_size_half, %ecx
543# endif
544#endif
545 jb L(sh_2_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800546
Jack Renc47703a2012-02-14 12:01:52 +0400547 lea -64(%ecx), %ecx
548
549 .p2align 4
550L(Shl2LoopStart):
551 prefetcht0 0x1c0(%eax)
552 prefetcht0 0x1c0(%edx)
553 movaps 14(%eax), %xmm2
554 movaps 30(%eax), %xmm3
555 movaps 46(%eax), %xmm4
556 movaps 62(%eax), %xmm5
557 movaps %xmm5, %xmm7
558 palignr $2, %xmm4, %xmm5
559 palignr $2, %xmm3, %xmm4
560 movaps %xmm5, 48(%edx)
561 palignr $2, %xmm2, %xmm3
562 lea 64(%eax), %eax
563 palignr $2, %xmm1, %xmm2
564 movaps %xmm4, 32(%edx)
565 movaps %xmm3, 16(%edx)
566 movaps %xmm7, %xmm1
567 movaps %xmm2, (%edx)
568 lea 64(%edx), %edx
569 sub $64, %ecx
570 ja L(Shl2LoopStart)
571
572L(Shl2LoopLeave):
573 add $32, %ecx
574 jle L(shl_end_0)
575
576 movaps 14(%eax), %xmm2
577 movaps 30(%eax), %xmm3
578 palignr $2, %xmm2, %xmm3
579 palignr $2, %xmm1, %xmm2
580 movaps %xmm2, (%edx)
581 movaps %xmm3, 16(%edx)
582 lea 32(%edx, %ecx), %edx
583 lea 32(%eax, %ecx), %eax
584 POP (%edi)
585 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
586
587 CFI_PUSH (%edi)
588
589 .p2align 4
590L(sh_2_no_prefetch):
591 lea -32(%ecx), %ecx
592 lea -2(%eax), %eax
593 xor %edi, %edi
594
595 .p2align 4
596L(sh_2_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800597 movdqa 16(%eax, %edi), %xmm2
598 sub $32, %ecx
599 movdqa 32(%eax, %edi), %xmm3
600 movdqa %xmm3, %xmm4
601 palignr $2, %xmm2, %xmm3
602 palignr $2, %xmm1, %xmm2
603 lea 32(%edi), %edi
604 movdqa %xmm2, -32(%edx, %edi)
605 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400606 jb L(sh_2_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800607
608 movdqa 16(%eax, %edi), %xmm2
609 sub $32, %ecx
610 movdqa 32(%eax, %edi), %xmm3
611 movdqa %xmm3, %xmm1
612 palignr $2, %xmm2, %xmm3
613 palignr $2, %xmm4, %xmm2
614 lea 32(%edi), %edi
615 movdqa %xmm2, -32(%edx, %edi)
616 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +0400617 jae L(sh_2_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800618
Jack Renc47703a2012-02-14 12:01:52 +0400619L(sh_2_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800620 lea 32(%ecx), %ecx
621 add %ecx, %edi
622 add %edi, %edx
623 lea 2(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400624 POP (%edi)
625 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800626
Jack Renc47703a2012-02-14 12:01:52 +0400627 CFI_PUSH (%edi)
628
629 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800630L(shl_3):
Jack Renc47703a2012-02-14 12:01:52 +0400631#ifndef USE_AS_MEMMOVE
632 movaps -3(%eax), %xmm1
633#else
634 movl DEST+4(%esp), %edi
635 movaps -3(%eax), %xmm1
636 movdqu %xmm0, (%edi)
637#endif
638#ifdef DATA_CACHE_SIZE_HALF
639 cmp $DATA_CACHE_SIZE_HALF, %ecx
640#else
641# if (defined SHARED || defined __PIC__)
642 SETUP_PIC_REG(bx)
643 add $_GLOBAL_OFFSET_TABLE_, %ebx
644 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
645# else
646 cmp __x86_data_cache_size_half, %ecx
647# endif
648#endif
649 jb L(sh_3_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800650
Jack Renc47703a2012-02-14 12:01:52 +0400651 lea -64(%ecx), %ecx
652
653 .p2align 4
654L(Shl3LoopStart):
655 prefetcht0 0x1c0(%eax)
656 prefetcht0 0x1c0(%edx)
657 movaps 13(%eax), %xmm2
658 movaps 29(%eax), %xmm3
659 movaps 45(%eax), %xmm4
660 movaps 61(%eax), %xmm5
661 movaps %xmm5, %xmm7
662 palignr $3, %xmm4, %xmm5
663 palignr $3, %xmm3, %xmm4
664 movaps %xmm5, 48(%edx)
665 palignr $3, %xmm2, %xmm3
666 lea 64(%eax), %eax
667 palignr $3, %xmm1, %xmm2
668 movaps %xmm4, 32(%edx)
669 movaps %xmm3, 16(%edx)
670 movaps %xmm7, %xmm1
671 movaps %xmm2, (%edx)
672 lea 64(%edx), %edx
673 sub $64, %ecx
674 ja L(Shl3LoopStart)
675
676L(Shl3LoopLeave):
677 add $32, %ecx
678 jle L(shl_end_0)
679
680 movaps 13(%eax), %xmm2
681 movaps 29(%eax), %xmm3
682 palignr $3, %xmm2, %xmm3
683 palignr $3, %xmm1, %xmm2
684 movaps %xmm2, (%edx)
685 movaps %xmm3, 16(%edx)
686 lea 32(%edx, %ecx), %edx
687 lea 32(%eax, %ecx), %eax
688 POP (%edi)
689 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
690
691 CFI_PUSH (%edi)
692
693 .p2align 4
694L(sh_3_no_prefetch):
695 lea -32(%ecx), %ecx
696 lea -3(%eax), %eax
697 xor %edi, %edi
698
699 .p2align 4
700L(sh_3_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800701 movdqa 16(%eax, %edi), %xmm2
702 sub $32, %ecx
703 movdqa 32(%eax, %edi), %xmm3
704 movdqa %xmm3, %xmm4
705 palignr $3, %xmm2, %xmm3
706 palignr $3, %xmm1, %xmm2
707 lea 32(%edi), %edi
708 movdqa %xmm2, -32(%edx, %edi)
709 movdqa %xmm3, -16(%edx, %edi)
710
Jack Renc47703a2012-02-14 12:01:52 +0400711 jb L(sh_3_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800712
713 movdqa 16(%eax, %edi), %xmm2
714 sub $32, %ecx
715 movdqa 32(%eax, %edi), %xmm3
716 movdqa %xmm3, %xmm1
717 palignr $3, %xmm2, %xmm3
718 palignr $3, %xmm4, %xmm2
719 lea 32(%edi), %edi
720 movdqa %xmm2, -32(%edx, %edi)
721 movdqa %xmm3, -16(%edx, %edi)
722
Jack Renc47703a2012-02-14 12:01:52 +0400723 jae L(sh_3_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800724
Jack Renc47703a2012-02-14 12:01:52 +0400725L(sh_3_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800726 lea 32(%ecx), %ecx
727 add %ecx, %edi
728 add %edi, %edx
729 lea 3(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400730 POP (%edi)
731 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800732
Jack Renc47703a2012-02-14 12:01:52 +0400733 CFI_PUSH (%edi)
734
735 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800736L(shl_4):
Jack Renc47703a2012-02-14 12:01:52 +0400737#ifndef USE_AS_MEMMOVE
738 movaps -4(%eax), %xmm1
739#else
740 movl DEST+4(%esp), %edi
741 movaps -4(%eax), %xmm1
742 movdqu %xmm0, (%edi)
743#endif
744#ifdef DATA_CACHE_SIZE_HALF
745 cmp $DATA_CACHE_SIZE_HALF, %ecx
746#else
747# if (defined SHARED || defined __PIC__)
748 SETUP_PIC_REG(bx)
749 add $_GLOBAL_OFFSET_TABLE_, %ebx
750 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
751# else
752 cmp __x86_data_cache_size_half, %ecx
753# endif
754#endif
755 jb L(sh_4_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800756
Jack Renc47703a2012-02-14 12:01:52 +0400757 lea -64(%ecx), %ecx
758
759 .p2align 4
760L(Shl4LoopStart):
761 prefetcht0 0x1c0(%eax)
762 prefetcht0 0x1c0(%edx)
763 movaps 12(%eax), %xmm2
764 movaps 28(%eax), %xmm3
765 movaps 44(%eax), %xmm4
766 movaps 60(%eax), %xmm5
767 movaps %xmm5, %xmm7
768 palignr $4, %xmm4, %xmm5
769 palignr $4, %xmm3, %xmm4
770 movaps %xmm5, 48(%edx)
771 palignr $4, %xmm2, %xmm3
772 lea 64(%eax), %eax
773 palignr $4, %xmm1, %xmm2
774 movaps %xmm4, 32(%edx)
775 movaps %xmm3, 16(%edx)
776 movaps %xmm7, %xmm1
777 movaps %xmm2, (%edx)
778 lea 64(%edx), %edx
779 sub $64, %ecx
780 ja L(Shl4LoopStart)
781
782L(Shl4LoopLeave):
783 add $32, %ecx
784 jle L(shl_end_0)
785
786 movaps 12(%eax), %xmm2
787 movaps 28(%eax), %xmm3
788 palignr $4, %xmm2, %xmm3
789 palignr $4, %xmm1, %xmm2
790 movaps %xmm2, (%edx)
791 movaps %xmm3, 16(%edx)
792 lea 32(%edx, %ecx), %edx
793 lea 32(%eax, %ecx), %eax
794 POP (%edi)
795 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
796
797 CFI_PUSH (%edi)
798
799 .p2align 4
800L(sh_4_no_prefetch):
801 lea -32(%ecx), %ecx
802 lea -4(%eax), %eax
803 xor %edi, %edi
804
805 .p2align 4
806L(sh_4_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800807 movdqa 16(%eax, %edi), %xmm2
808 sub $32, %ecx
809 movdqa 32(%eax, %edi), %xmm3
810 movdqa %xmm3, %xmm4
811 palignr $4, %xmm2, %xmm3
812 palignr $4, %xmm1, %xmm2
813 lea 32(%edi), %edi
814 movdqa %xmm2, -32(%edx, %edi)
815 movdqa %xmm3, -16(%edx, %edi)
816
Jack Renc47703a2012-02-14 12:01:52 +0400817 jb L(sh_4_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800818
819 movdqa 16(%eax, %edi), %xmm2
820 sub $32, %ecx
821 movdqa 32(%eax, %edi), %xmm3
822 movdqa %xmm3, %xmm1
823 palignr $4, %xmm2, %xmm3
824 palignr $4, %xmm4, %xmm2
825 lea 32(%edi), %edi
826 movdqa %xmm2, -32(%edx, %edi)
827 movdqa %xmm3, -16(%edx, %edi)
828
Jack Renc47703a2012-02-14 12:01:52 +0400829 jae L(sh_4_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800830
Jack Renc47703a2012-02-14 12:01:52 +0400831L(sh_4_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800832 lea 32(%ecx), %ecx
833 add %ecx, %edi
834 add %edi, %edx
835 lea 4(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400836 POP (%edi)
837 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800838
Jack Renc47703a2012-02-14 12:01:52 +0400839 CFI_PUSH (%edi)
840
841 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800842L(shl_5):
Jack Renc47703a2012-02-14 12:01:52 +0400843#ifndef USE_AS_MEMMOVE
844 movaps -5(%eax), %xmm1
845#else
846 movl DEST+4(%esp), %edi
847 movaps -5(%eax), %xmm1
848 movdqu %xmm0, (%edi)
849#endif
850#ifdef DATA_CACHE_SIZE_HALF
851 cmp $DATA_CACHE_SIZE_HALF, %ecx
852#else
853# if (defined SHARED || defined __PIC__)
854 SETUP_PIC_REG(bx)
855 add $_GLOBAL_OFFSET_TABLE_, %ebx
856 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
857# else
858 cmp __x86_data_cache_size_half, %ecx
859# endif
860#endif
861 jb L(sh_5_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800862
Jack Renc47703a2012-02-14 12:01:52 +0400863 lea -64(%ecx), %ecx
864
865 .p2align 4
866L(Shl5LoopStart):
867 prefetcht0 0x1c0(%eax)
868 prefetcht0 0x1c0(%edx)
869 movaps 11(%eax), %xmm2
870 movaps 27(%eax), %xmm3
871 movaps 43(%eax), %xmm4
872 movaps 59(%eax), %xmm5
873 movaps %xmm5, %xmm7
874 palignr $5, %xmm4, %xmm5
875 palignr $5, %xmm3, %xmm4
876 movaps %xmm5, 48(%edx)
877 palignr $5, %xmm2, %xmm3
878 lea 64(%eax), %eax
879 palignr $5, %xmm1, %xmm2
880 movaps %xmm4, 32(%edx)
881 movaps %xmm3, 16(%edx)
882 movaps %xmm7, %xmm1
883 movaps %xmm2, (%edx)
884 lea 64(%edx), %edx
885 sub $64, %ecx
886 ja L(Shl5LoopStart)
887
888L(Shl5LoopLeave):
889 add $32, %ecx
890 jle L(shl_end_0)
891
892 movaps 11(%eax), %xmm2
893 movaps 27(%eax), %xmm3
894 palignr $5, %xmm2, %xmm3
895 palignr $5, %xmm1, %xmm2
896 movaps %xmm2, (%edx)
897 movaps %xmm3, 16(%edx)
898 lea 32(%edx, %ecx), %edx
899 lea 32(%eax, %ecx), %eax
900 POP (%edi)
901 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
902
903 CFI_PUSH (%edi)
904
905 .p2align 4
906L(sh_5_no_prefetch):
907 lea -32(%ecx), %ecx
908 lea -5(%eax), %eax
909 xor %edi, %edi
910
911 .p2align 4
912L(sh_5_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800913 movdqa 16(%eax, %edi), %xmm2
914 sub $32, %ecx
915 movdqa 32(%eax, %edi), %xmm3
916 movdqa %xmm3, %xmm4
917 palignr $5, %xmm2, %xmm3
918 palignr $5, %xmm1, %xmm2
919 lea 32(%edi), %edi
920 movdqa %xmm2, -32(%edx, %edi)
921 movdqa %xmm3, -16(%edx, %edi)
922
Jack Renc47703a2012-02-14 12:01:52 +0400923 jb L(sh_5_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800924
925 movdqa 16(%eax, %edi), %xmm2
926 sub $32, %ecx
927 movdqa 32(%eax, %edi), %xmm3
928 movdqa %xmm3, %xmm1
929 palignr $5, %xmm2, %xmm3
930 palignr $5, %xmm4, %xmm2
931 lea 32(%edi), %edi
932 movdqa %xmm2, -32(%edx, %edi)
933 movdqa %xmm3, -16(%edx, %edi)
934
Jack Renc47703a2012-02-14 12:01:52 +0400935 jae L(sh_5_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800936
Jack Renc47703a2012-02-14 12:01:52 +0400937L(sh_5_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -0800938 lea 32(%ecx), %ecx
939 add %ecx, %edi
940 add %edi, %edx
941 lea 5(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +0400942 POP (%edi)
943 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800944
Jack Renc47703a2012-02-14 12:01:52 +0400945 CFI_PUSH (%edi)
946
947 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -0800948L(shl_6):
Jack Renc47703a2012-02-14 12:01:52 +0400949#ifndef USE_AS_MEMMOVE
950 movaps -6(%eax), %xmm1
951#else
952 movl DEST+4(%esp), %edi
953 movaps -6(%eax), %xmm1
954 movdqu %xmm0, (%edi)
955#endif
956#ifdef DATA_CACHE_SIZE_HALF
957 cmp $DATA_CACHE_SIZE_HALF, %ecx
958#else
959# if (defined SHARED || defined __PIC__)
960 SETUP_PIC_REG(bx)
961 add $_GLOBAL_OFFSET_TABLE_, %ebx
962 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
963# else
964 cmp __x86_data_cache_size_half, %ecx
965# endif
966#endif
967 jb L(sh_6_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -0800968
Jack Renc47703a2012-02-14 12:01:52 +0400969 lea -64(%ecx), %ecx
970
971 .p2align 4
972L(Shl6LoopStart):
973 prefetcht0 0x1c0(%eax)
974 prefetcht0 0x1c0(%edx)
975 movaps 10(%eax), %xmm2
976 movaps 26(%eax), %xmm3
977 movaps 42(%eax), %xmm4
978 movaps 58(%eax), %xmm5
979 movaps %xmm5, %xmm7
980 palignr $6, %xmm4, %xmm5
981 palignr $6, %xmm3, %xmm4
982 movaps %xmm5, 48(%edx)
983 palignr $6, %xmm2, %xmm3
984 lea 64(%eax), %eax
985 palignr $6, %xmm1, %xmm2
986 movaps %xmm4, 32(%edx)
987 movaps %xmm3, 16(%edx)
988 movaps %xmm7, %xmm1
989 movaps %xmm2, (%edx)
990 lea 64(%edx), %edx
991 sub $64, %ecx
992 ja L(Shl6LoopStart)
993
994L(Shl6LoopLeave):
995 add $32, %ecx
996 jle L(shl_end_0)
997
998 movaps 10(%eax), %xmm2
999 movaps 26(%eax), %xmm3
1000 palignr $6, %xmm2, %xmm3
1001 palignr $6, %xmm1, %xmm2
1002 movaps %xmm2, (%edx)
1003 movaps %xmm3, 16(%edx)
1004 lea 32(%edx, %ecx), %edx
1005 lea 32(%eax, %ecx), %eax
1006 POP (%edi)
1007 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1008
1009 CFI_PUSH (%edi)
1010
1011 .p2align 4
1012L(sh_6_no_prefetch):
1013 lea -32(%ecx), %ecx
1014 lea -6(%eax), %eax
1015 xor %edi, %edi
1016
1017 .p2align 4
1018L(sh_6_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001019 movdqa 16(%eax, %edi), %xmm2
1020 sub $32, %ecx
1021 movdqa 32(%eax, %edi), %xmm3
1022 movdqa %xmm3, %xmm4
1023 palignr $6, %xmm2, %xmm3
1024 palignr $6, %xmm1, %xmm2
1025 lea 32(%edi), %edi
1026 movdqa %xmm2, -32(%edx, %edi)
1027 movdqa %xmm3, -16(%edx, %edi)
1028
Jack Renc47703a2012-02-14 12:01:52 +04001029 jb L(sh_6_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001030
1031 movdqa 16(%eax, %edi), %xmm2
1032 sub $32, %ecx
1033 movdqa 32(%eax, %edi), %xmm3
1034 movdqa %xmm3, %xmm1
1035 palignr $6, %xmm2, %xmm3
1036 palignr $6, %xmm4, %xmm2
1037 lea 32(%edi), %edi
1038 movdqa %xmm2, -32(%edx, %edi)
1039 movdqa %xmm3, -16(%edx, %edi)
1040
Jack Renc47703a2012-02-14 12:01:52 +04001041 jae L(sh_6_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001042
Jack Renc47703a2012-02-14 12:01:52 +04001043L(sh_6_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001044 lea 32(%ecx), %ecx
1045 add %ecx, %edi
1046 add %edi, %edx
1047 lea 6(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001048 POP (%edi)
1049 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001050
Jack Renc47703a2012-02-14 12:01:52 +04001051 CFI_PUSH (%edi)
1052
1053 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001054L(shl_7):
Jack Renc47703a2012-02-14 12:01:52 +04001055#ifndef USE_AS_MEMMOVE
1056 movaps -7(%eax), %xmm1
1057#else
1058 movl DEST+4(%esp), %edi
1059 movaps -7(%eax), %xmm1
1060 movdqu %xmm0, (%edi)
1061#endif
1062#ifdef DATA_CACHE_SIZE_HALF
1063 cmp $DATA_CACHE_SIZE_HALF, %ecx
1064#else
1065# if (defined SHARED || defined __PIC__)
1066 SETUP_PIC_REG(bx)
1067 add $_GLOBAL_OFFSET_TABLE_, %ebx
1068 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1069# else
1070 cmp __x86_data_cache_size_half, %ecx
1071# endif
1072#endif
1073 jb L(sh_7_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001074
Jack Renc47703a2012-02-14 12:01:52 +04001075 lea -64(%ecx), %ecx
1076
1077 .p2align 4
1078L(Shl7LoopStart):
1079 prefetcht0 0x1c0(%eax)
1080 prefetcht0 0x1c0(%edx)
1081 movaps 9(%eax), %xmm2
1082 movaps 25(%eax), %xmm3
1083 movaps 41(%eax), %xmm4
1084 movaps 57(%eax), %xmm5
1085 movaps %xmm5, %xmm7
1086 palignr $7, %xmm4, %xmm5
1087 palignr $7, %xmm3, %xmm4
1088 movaps %xmm5, 48(%edx)
1089 palignr $7, %xmm2, %xmm3
1090 lea 64(%eax), %eax
1091 palignr $7, %xmm1, %xmm2
1092 movaps %xmm4, 32(%edx)
1093 movaps %xmm3, 16(%edx)
1094 movaps %xmm7, %xmm1
1095 movaps %xmm2, (%edx)
1096 lea 64(%edx), %edx
1097 sub $64, %ecx
1098 ja L(Shl7LoopStart)
1099
1100L(Shl7LoopLeave):
1101 add $32, %ecx
1102 jle L(shl_end_0)
1103
1104 movaps 9(%eax), %xmm2
1105 movaps 25(%eax), %xmm3
1106 palignr $7, %xmm2, %xmm3
1107 palignr $7, %xmm1, %xmm2
1108 movaps %xmm2, (%edx)
1109 movaps %xmm3, 16(%edx)
1110 lea 32(%edx, %ecx), %edx
1111 lea 32(%eax, %ecx), %eax
1112 POP (%edi)
1113 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1114
1115 CFI_PUSH (%edi)
1116
1117 .p2align 4
1118L(sh_7_no_prefetch):
1119 lea -32(%ecx), %ecx
1120 lea -7(%eax), %eax
1121 xor %edi, %edi
1122
1123 .p2align 4
1124L(sh_7_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001125 movdqa 16(%eax, %edi), %xmm2
1126 sub $32, %ecx
1127 movdqa 32(%eax, %edi), %xmm3
1128 movdqa %xmm3, %xmm4
1129 palignr $7, %xmm2, %xmm3
1130 palignr $7, %xmm1, %xmm2
1131 lea 32(%edi), %edi
1132 movdqa %xmm2, -32(%edx, %edi)
1133 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001134 jb L(sh_7_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001135
1136 movdqa 16(%eax, %edi), %xmm2
1137 sub $32, %ecx
1138 movdqa 32(%eax, %edi), %xmm3
1139 movdqa %xmm3, %xmm1
1140 palignr $7, %xmm2, %xmm3
1141 palignr $7, %xmm4, %xmm2
1142 lea 32(%edi), %edi
1143 movdqa %xmm2, -32(%edx, %edi)
1144 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001145 jae L(sh_7_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001146
Jack Renc47703a2012-02-14 12:01:52 +04001147L(sh_7_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001148 lea 32(%ecx), %ecx
1149 add %ecx, %edi
1150 add %edi, %edx
1151 lea 7(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001152 POP (%edi)
1153 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001154
Jack Renc47703a2012-02-14 12:01:52 +04001155 CFI_PUSH (%edi)
1156
1157 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001158L(shl_8):
Jack Renc47703a2012-02-14 12:01:52 +04001159#ifndef USE_AS_MEMMOVE
1160 movaps -8(%eax), %xmm1
1161#else
1162 movl DEST+4(%esp), %edi
1163 movaps -8(%eax), %xmm1
1164 movdqu %xmm0, (%edi)
1165#endif
1166#ifdef DATA_CACHE_SIZE_HALF
1167 cmp $DATA_CACHE_SIZE_HALF, %ecx
1168#else
1169# if (defined SHARED || defined __PIC__)
1170 SETUP_PIC_REG(bx)
1171 add $_GLOBAL_OFFSET_TABLE_, %ebx
1172 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1173# else
1174 cmp __x86_data_cache_size_half, %ecx
1175# endif
1176#endif
1177 jb L(sh_8_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001178
Jack Renc47703a2012-02-14 12:01:52 +04001179 lea -64(%ecx), %ecx
1180
1181 .p2align 4
1182L(Shl8LoopStart):
1183 prefetcht0 0x1c0(%eax)
1184 prefetcht0 0x1c0(%edx)
1185 movaps 8(%eax), %xmm2
1186 movaps 24(%eax), %xmm3
1187 movaps 40(%eax), %xmm4
1188 movaps 56(%eax), %xmm5
1189 movaps %xmm5, %xmm7
1190 palignr $8, %xmm4, %xmm5
1191 palignr $8, %xmm3, %xmm4
1192 movaps %xmm5, 48(%edx)
1193 palignr $8, %xmm2, %xmm3
1194 lea 64(%eax), %eax
1195 palignr $8, %xmm1, %xmm2
1196 movaps %xmm4, 32(%edx)
1197 movaps %xmm3, 16(%edx)
1198 movaps %xmm7, %xmm1
1199 movaps %xmm2, (%edx)
1200 lea 64(%edx), %edx
1201 sub $64, %ecx
1202 ja L(Shl8LoopStart)
1203
1204L(LoopLeave8):
1205 add $32, %ecx
1206 jle L(shl_end_0)
1207
1208 movaps 8(%eax), %xmm2
1209 movaps 24(%eax), %xmm3
1210 palignr $8, %xmm2, %xmm3
1211 palignr $8, %xmm1, %xmm2
1212 movaps %xmm2, (%edx)
1213 movaps %xmm3, 16(%edx)
1214 lea 32(%edx, %ecx), %edx
1215 lea 32(%eax, %ecx), %eax
1216 POP (%edi)
1217 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1218
1219 CFI_PUSH (%edi)
1220
1221 .p2align 4
1222L(sh_8_no_prefetch):
1223 lea -32(%ecx), %ecx
1224 lea -8(%eax), %eax
1225 xor %edi, %edi
1226
1227 .p2align 4
1228L(sh_8_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001229 movdqa 16(%eax, %edi), %xmm2
1230 sub $32, %ecx
1231 movdqa 32(%eax, %edi), %xmm3
1232 movdqa %xmm3, %xmm4
1233 palignr $8, %xmm2, %xmm3
1234 palignr $8, %xmm1, %xmm2
1235 lea 32(%edi), %edi
1236 movdqa %xmm2, -32(%edx, %edi)
1237 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001238 jb L(sh_8_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001239
1240 movdqa 16(%eax, %edi), %xmm2
1241 sub $32, %ecx
1242 movdqa 32(%eax, %edi), %xmm3
1243 movdqa %xmm3, %xmm1
1244 palignr $8, %xmm2, %xmm3
1245 palignr $8, %xmm4, %xmm2
1246 lea 32(%edi), %edi
1247 movdqa %xmm2, -32(%edx, %edi)
1248 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001249 jae L(sh_8_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001250
Jack Renc47703a2012-02-14 12:01:52 +04001251L(sh_8_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001252 lea 32(%ecx), %ecx
1253 add %ecx, %edi
1254 add %edi, %edx
1255 lea 8(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001256 POP (%edi)
1257 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001258
Jack Renc47703a2012-02-14 12:01:52 +04001259 CFI_PUSH (%edi)
1260
1261 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001262L(shl_9):
Jack Renc47703a2012-02-14 12:01:52 +04001263#ifndef USE_AS_MEMMOVE
1264 movaps -9(%eax), %xmm1
1265#else
1266 movl DEST+4(%esp), %edi
1267 movaps -9(%eax), %xmm1
1268 movdqu %xmm0, (%edi)
1269#endif
1270#ifdef DATA_CACHE_SIZE_HALF
1271 cmp $DATA_CACHE_SIZE_HALF, %ecx
1272#else
1273# if (defined SHARED || defined __PIC__)
1274 SETUP_PIC_REG(bx)
1275 add $_GLOBAL_OFFSET_TABLE_, %ebx
1276 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1277# else
1278 cmp __x86_data_cache_size_half, %ecx
1279# endif
1280#endif
1281 jb L(sh_9_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001282
Jack Renc47703a2012-02-14 12:01:52 +04001283 lea -64(%ecx), %ecx
1284
1285 .p2align 4
1286L(Shl9LoopStart):
1287 prefetcht0 0x1c0(%eax)
1288 prefetcht0 0x1c0(%edx)
1289 movaps 7(%eax), %xmm2
1290 movaps 23(%eax), %xmm3
1291 movaps 39(%eax), %xmm4
1292 movaps 55(%eax), %xmm5
1293 movaps %xmm5, %xmm7
1294 palignr $9, %xmm4, %xmm5
1295 palignr $9, %xmm3, %xmm4
1296 movaps %xmm5, 48(%edx)
1297 palignr $9, %xmm2, %xmm3
1298 lea 64(%eax), %eax
1299 palignr $9, %xmm1, %xmm2
1300 movaps %xmm4, 32(%edx)
1301 movaps %xmm3, 16(%edx)
1302 movaps %xmm7, %xmm1
1303 movaps %xmm2, (%edx)
1304 lea 64(%edx), %edx
1305 sub $64, %ecx
1306 ja L(Shl9LoopStart)
1307
1308L(Shl9LoopLeave):
1309 add $32, %ecx
1310 jle L(shl_end_0)
1311
1312 movaps 7(%eax), %xmm2
1313 movaps 23(%eax), %xmm3
1314 palignr $9, %xmm2, %xmm3
1315 palignr $9, %xmm1, %xmm2
1316
1317 movaps %xmm2, (%edx)
1318 movaps %xmm3, 16(%edx)
1319 lea 32(%edx, %ecx), %edx
1320 lea 32(%eax, %ecx), %eax
1321 POP (%edi)
1322 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1323
1324 CFI_PUSH (%edi)
1325
1326 .p2align 4
1327L(sh_9_no_prefetch):
1328 lea -32(%ecx), %ecx
1329 lea -9(%eax), %eax
1330 xor %edi, %edi
1331
1332 .p2align 4
1333L(sh_9_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001334 movdqa 16(%eax, %edi), %xmm2
1335 sub $32, %ecx
1336 movdqa 32(%eax, %edi), %xmm3
1337 movdqa %xmm3, %xmm4
1338 palignr $9, %xmm2, %xmm3
1339 palignr $9, %xmm1, %xmm2
1340 lea 32(%edi), %edi
1341 movdqa %xmm2, -32(%edx, %edi)
1342 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001343 jb L(sh_9_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001344
1345 movdqa 16(%eax, %edi), %xmm2
1346 sub $32, %ecx
1347 movdqa 32(%eax, %edi), %xmm3
1348 movdqa %xmm3, %xmm1
1349 palignr $9, %xmm2, %xmm3
1350 palignr $9, %xmm4, %xmm2
1351 lea 32(%edi), %edi
1352 movdqa %xmm2, -32(%edx, %edi)
1353 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001354 jae L(sh_9_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001355
Jack Renc47703a2012-02-14 12:01:52 +04001356L(sh_9_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001357 lea 32(%ecx), %ecx
1358 add %ecx, %edi
1359 add %edi, %edx
1360 lea 9(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001361 POP (%edi)
1362 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001363
Jack Renc47703a2012-02-14 12:01:52 +04001364 CFI_PUSH (%edi)
1365
1366 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001367L(shl_10):
Jack Renc47703a2012-02-14 12:01:52 +04001368#ifndef USE_AS_MEMMOVE
1369 movaps -10(%eax), %xmm1
1370#else
1371 movl DEST+4(%esp), %edi
1372 movaps -10(%eax), %xmm1
1373 movdqu %xmm0, (%edi)
1374#endif
1375#ifdef DATA_CACHE_SIZE_HALF
1376 cmp $DATA_CACHE_SIZE_HALF, %ecx
1377#else
1378# if (defined SHARED || defined __PIC__)
1379 SETUP_PIC_REG(bx)
1380 add $_GLOBAL_OFFSET_TABLE_, %ebx
1381 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1382# else
1383 cmp __x86_data_cache_size_half, %ecx
1384# endif
1385#endif
1386 jb L(sh_10_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001387
Jack Renc47703a2012-02-14 12:01:52 +04001388 lea -64(%ecx), %ecx
1389
1390 .p2align 4
1391L(Shl10LoopStart):
1392 prefetcht0 0x1c0(%eax)
1393 prefetcht0 0x1c0(%edx)
1394 movaps 6(%eax), %xmm2
1395 movaps 22(%eax), %xmm3
1396 movaps 38(%eax), %xmm4
1397 movaps 54(%eax), %xmm5
1398 movaps %xmm5, %xmm7
1399 palignr $10, %xmm4, %xmm5
1400 palignr $10, %xmm3, %xmm4
1401 movaps %xmm5, 48(%edx)
1402 palignr $10, %xmm2, %xmm3
1403 lea 64(%eax), %eax
1404 palignr $10, %xmm1, %xmm2
1405 movaps %xmm4, 32(%edx)
1406 movaps %xmm3, 16(%edx)
1407 movaps %xmm7, %xmm1
1408 movaps %xmm2, (%edx)
1409 lea 64(%edx), %edx
1410 sub $64, %ecx
1411 ja L(Shl10LoopStart)
1412
1413L(Shl10LoopLeave):
1414 add $32, %ecx
1415 jle L(shl_end_0)
1416
1417 movaps 6(%eax), %xmm2
1418 movaps 22(%eax), %xmm3
1419 palignr $10, %xmm2, %xmm3
1420 palignr $10, %xmm1, %xmm2
1421
1422 movaps %xmm2, (%edx)
1423 movaps %xmm3, 16(%edx)
1424 lea 32(%edx, %ecx), %edx
1425 lea 32(%eax, %ecx), %eax
1426 POP (%edi)
1427 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1428
1429 CFI_PUSH (%edi)
1430
1431 .p2align 4
1432L(sh_10_no_prefetch):
1433 lea -32(%ecx), %ecx
1434 lea -10(%eax), %eax
1435 xor %edi, %edi
1436
1437 .p2align 4
1438L(sh_10_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001439 movdqa 16(%eax, %edi), %xmm2
1440 sub $32, %ecx
1441 movdqa 32(%eax, %edi), %xmm3
1442 movdqa %xmm3, %xmm4
1443 palignr $10, %xmm2, %xmm3
1444 palignr $10, %xmm1, %xmm2
1445 lea 32(%edi), %edi
1446 movdqa %xmm2, -32(%edx, %edi)
1447 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001448 jb L(sh_10_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001449
1450 movdqa 16(%eax, %edi), %xmm2
1451 sub $32, %ecx
1452 movdqa 32(%eax, %edi), %xmm3
1453 movdqa %xmm3, %xmm1
1454 palignr $10, %xmm2, %xmm3
1455 palignr $10, %xmm4, %xmm2
1456 lea 32(%edi), %edi
1457 movdqa %xmm2, -32(%edx, %edi)
1458 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001459 jae L(sh_10_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001460
Jack Renc47703a2012-02-14 12:01:52 +04001461L(sh_10_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001462 lea 32(%ecx), %ecx
1463 add %ecx, %edi
1464 add %edi, %edx
1465 lea 10(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001466 POP (%edi)
1467 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001468
Jack Renc47703a2012-02-14 12:01:52 +04001469 CFI_PUSH (%edi)
1470
1471 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001472L(shl_11):
Jack Renc47703a2012-02-14 12:01:52 +04001473#ifndef USE_AS_MEMMOVE
1474 movaps -11(%eax), %xmm1
1475#else
1476 movl DEST+4(%esp), %edi
1477 movaps -11(%eax), %xmm1
1478 movdqu %xmm0, (%edi)
1479#endif
1480#ifdef DATA_CACHE_SIZE_HALF
1481 cmp $DATA_CACHE_SIZE_HALF, %ecx
1482#else
1483# if (defined SHARED || defined __PIC__)
1484 SETUP_PIC_REG(bx)
1485 add $_GLOBAL_OFFSET_TABLE_, %ebx
1486 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1487# else
1488 cmp __x86_data_cache_size_half, %ecx
1489# endif
1490#endif
1491 jb L(sh_11_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001492
Jack Renc47703a2012-02-14 12:01:52 +04001493 lea -64(%ecx), %ecx
1494
1495 .p2align 4
1496L(Shl11LoopStart):
1497 prefetcht0 0x1c0(%eax)
1498 prefetcht0 0x1c0(%edx)
1499 movaps 5(%eax), %xmm2
1500 movaps 21(%eax), %xmm3
1501 movaps 37(%eax), %xmm4
1502 movaps 53(%eax), %xmm5
1503 movaps %xmm5, %xmm7
1504 palignr $11, %xmm4, %xmm5
1505 palignr $11, %xmm3, %xmm4
1506 movaps %xmm5, 48(%edx)
1507 palignr $11, %xmm2, %xmm3
1508 lea 64(%eax), %eax
1509 palignr $11, %xmm1, %xmm2
1510 movaps %xmm4, 32(%edx)
1511 movaps %xmm3, 16(%edx)
1512 movaps %xmm7, %xmm1
1513 movaps %xmm2, (%edx)
1514 lea 64(%edx), %edx
1515 sub $64, %ecx
1516 ja L(Shl11LoopStart)
1517
1518L(Shl11LoopLeave):
1519 add $32, %ecx
1520 jle L(shl_end_0)
1521
1522 movaps 5(%eax), %xmm2
1523 movaps 21(%eax), %xmm3
1524 palignr $11, %xmm2, %xmm3
1525 palignr $11, %xmm1, %xmm2
1526
1527 movaps %xmm2, (%edx)
1528 movaps %xmm3, 16(%edx)
1529 lea 32(%edx, %ecx), %edx
1530 lea 32(%eax, %ecx), %eax
1531 POP (%edi)
1532 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1533
1534 CFI_PUSH (%edi)
1535
1536 .p2align 4
1537L(sh_11_no_prefetch):
1538 lea -32(%ecx), %ecx
1539 lea -11(%eax), %eax
1540 xor %edi, %edi
1541
1542 .p2align 4
1543L(sh_11_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001544 movdqa 16(%eax, %edi), %xmm2
1545 sub $32, %ecx
1546 movdqa 32(%eax, %edi), %xmm3
1547 movdqa %xmm3, %xmm4
1548 palignr $11, %xmm2, %xmm3
1549 palignr $11, %xmm1, %xmm2
1550 lea 32(%edi), %edi
1551 movdqa %xmm2, -32(%edx, %edi)
1552 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001553 jb L(sh_11_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001554
1555 movdqa 16(%eax, %edi), %xmm2
1556 sub $32, %ecx
1557 movdqa 32(%eax, %edi), %xmm3
1558 movdqa %xmm3, %xmm1
1559 palignr $11, %xmm2, %xmm3
1560 palignr $11, %xmm4, %xmm2
1561 lea 32(%edi), %edi
1562 movdqa %xmm2, -32(%edx, %edi)
1563 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001564 jae L(sh_11_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001565
Jack Renc47703a2012-02-14 12:01:52 +04001566L(sh_11_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001567 lea 32(%ecx), %ecx
1568 add %ecx, %edi
1569 add %edi, %edx
1570 lea 11(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001571 POP (%edi)
1572 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001573
Jack Renc47703a2012-02-14 12:01:52 +04001574 CFI_PUSH (%edi)
1575
1576 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001577L(shl_12):
Jack Renc47703a2012-02-14 12:01:52 +04001578#ifndef USE_AS_MEMMOVE
1579 movaps -12(%eax), %xmm1
1580#else
1581 movl DEST+4(%esp), %edi
1582 movaps -12(%eax), %xmm1
1583 movdqu %xmm0, (%edi)
1584#endif
1585#ifdef DATA_CACHE_SIZE_HALF
1586 cmp $DATA_CACHE_SIZE_HALF, %ecx
1587#else
1588# if (defined SHARED || defined __PIC__)
1589 SETUP_PIC_REG(bx)
1590 add $_GLOBAL_OFFSET_TABLE_, %ebx
1591 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1592# else
1593 cmp __x86_data_cache_size_half, %ecx
1594# endif
1595#endif
1596 jb L(sh_12_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001597
Jack Renc47703a2012-02-14 12:01:52 +04001598 lea -64(%ecx), %ecx
1599
1600 .p2align 4
1601L(Shl12LoopStart):
1602 prefetcht0 0x1c0(%eax)
1603 prefetcht0 0x1c0(%edx)
1604 movaps 4(%eax), %xmm2
1605 movaps 20(%eax), %xmm3
1606 movaps 36(%eax), %xmm4
1607 movaps 52(%eax), %xmm5
1608 movaps %xmm5, %xmm7
1609 palignr $12, %xmm4, %xmm5
1610 palignr $12, %xmm3, %xmm4
1611 movaps %xmm5, 48(%edx)
1612 palignr $12, %xmm2, %xmm3
1613 lea 64(%eax), %eax
1614 palignr $12, %xmm1, %xmm2
1615 movaps %xmm4, 32(%edx)
1616 movaps %xmm3, 16(%edx)
1617 movaps %xmm7, %xmm1
1618 movaps %xmm2, (%edx)
1619 lea 64(%edx), %edx
1620 sub $64, %ecx
1621 ja L(Shl12LoopStart)
1622
1623L(Shl12LoopLeave):
1624 add $32, %ecx
1625 jle L(shl_end_0)
1626
1627 movaps 4(%eax), %xmm2
1628 movaps 20(%eax), %xmm3
1629 palignr $12, %xmm2, %xmm3
1630 palignr $12, %xmm1, %xmm2
1631
1632 movaps %xmm2, (%edx)
1633 movaps %xmm3, 16(%edx)
1634 lea 32(%edx, %ecx), %edx
1635 lea 32(%eax, %ecx), %eax
1636 POP (%edi)
1637 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1638
1639 CFI_PUSH (%edi)
1640
1641 .p2align 4
1642L(sh_12_no_prefetch):
1643 lea -32(%ecx), %ecx
1644 lea -12(%eax), %eax
1645 xor %edi, %edi
1646
1647 .p2align 4
1648L(sh_12_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001649 movdqa 16(%eax, %edi), %xmm2
1650 sub $32, %ecx
1651 movdqa 32(%eax, %edi), %xmm3
1652 movdqa %xmm3, %xmm4
1653 palignr $12, %xmm2, %xmm3
1654 palignr $12, %xmm1, %xmm2
1655 lea 32(%edi), %edi
1656 movdqa %xmm2, -32(%edx, %edi)
1657 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001658 jb L(sh_12_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001659
1660 movdqa 16(%eax, %edi), %xmm2
1661 sub $32, %ecx
1662 movdqa 32(%eax, %edi), %xmm3
1663 movdqa %xmm3, %xmm1
1664 palignr $12, %xmm2, %xmm3
1665 palignr $12, %xmm4, %xmm2
1666 lea 32(%edi), %edi
1667 movdqa %xmm2, -32(%edx, %edi)
1668 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001669 jae L(sh_12_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001670
Jack Renc47703a2012-02-14 12:01:52 +04001671L(sh_12_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001672 lea 32(%ecx), %ecx
1673 add %ecx, %edi
1674 add %edi, %edx
1675 lea 12(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001676 POP (%edi)
1677 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001678
Jack Renc47703a2012-02-14 12:01:52 +04001679 CFI_PUSH (%edi)
1680
1681 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001682L(shl_13):
Jack Renc47703a2012-02-14 12:01:52 +04001683#ifndef USE_AS_MEMMOVE
1684 movaps -13(%eax), %xmm1
1685#else
1686 movl DEST+4(%esp), %edi
1687 movaps -13(%eax), %xmm1
1688 movdqu %xmm0, (%edi)
1689#endif
1690#ifdef DATA_CACHE_SIZE_HALF
1691 cmp $DATA_CACHE_SIZE_HALF, %ecx
1692#else
1693# if (defined SHARED || defined __PIC__)
1694 SETUP_PIC_REG(bx)
1695 add $_GLOBAL_OFFSET_TABLE_, %ebx
1696 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1697# else
1698 cmp __x86_data_cache_size_half, %ecx
1699# endif
1700#endif
1701 jb L(sh_13_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001702
Jack Renc47703a2012-02-14 12:01:52 +04001703 lea -64(%ecx), %ecx
1704
1705 .p2align 4
1706L(Shl13LoopStart):
1707 prefetcht0 0x1c0(%eax)
1708 prefetcht0 0x1c0(%edx)
1709 movaps 3(%eax), %xmm2
1710 movaps 19(%eax), %xmm3
1711 movaps 35(%eax), %xmm4
1712 movaps 51(%eax), %xmm5
1713 movaps %xmm5, %xmm7
1714 palignr $13, %xmm4, %xmm5
1715 palignr $13, %xmm3, %xmm4
1716 movaps %xmm5, 48(%edx)
1717 palignr $13, %xmm2, %xmm3
1718 lea 64(%eax), %eax
1719 palignr $13, %xmm1, %xmm2
1720 movaps %xmm4, 32(%edx)
1721 movaps %xmm3, 16(%edx)
1722 movaps %xmm7, %xmm1
1723 movaps %xmm2, (%edx)
1724 lea 64(%edx), %edx
1725 sub $64, %ecx
1726 ja L(Shl13LoopStart)
1727
1728L(Shl13LoopLeave):
1729 add $32, %ecx
1730 jle L(shl_end_0)
1731
1732 movaps 3(%eax), %xmm2
1733 movaps 19(%eax), %xmm3
1734 palignr $13, %xmm2, %xmm3
1735 palignr $13, %xmm1, %xmm2
1736
1737 movaps %xmm2, (%edx)
1738 movaps %xmm3, 16(%edx)
1739 lea 32(%edx, %ecx), %edx
1740 lea 32(%eax, %ecx), %eax
1741 POP (%edi)
1742 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1743
1744 CFI_PUSH (%edi)
1745
1746 .p2align 4
1747L(sh_13_no_prefetch):
1748 lea -32(%ecx), %ecx
1749 lea -13(%eax), %eax
1750 xor %edi, %edi
1751
1752 .p2align 4
1753L(sh_13_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001754 movdqa 16(%eax, %edi), %xmm2
1755 sub $32, %ecx
1756 movdqa 32(%eax, %edi), %xmm3
1757 movdqa %xmm3, %xmm4
1758 palignr $13, %xmm2, %xmm3
1759 palignr $13, %xmm1, %xmm2
1760 lea 32(%edi), %edi
1761 movdqa %xmm2, -32(%edx, %edi)
1762 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001763 jb L(sh_13_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001764
1765 movdqa 16(%eax, %edi), %xmm2
1766 sub $32, %ecx
1767 movdqa 32(%eax, %edi), %xmm3
1768 movdqa %xmm3, %xmm1
1769 palignr $13, %xmm2, %xmm3
1770 palignr $13, %xmm4, %xmm2
1771 lea 32(%edi), %edi
1772 movdqa %xmm2, -32(%edx, %edi)
1773 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001774 jae L(sh_13_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001775
Jack Renc47703a2012-02-14 12:01:52 +04001776L(sh_13_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001777 lea 32(%ecx), %ecx
1778 add %ecx, %edi
1779 add %edi, %edx
1780 lea 13(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001781 POP (%edi)
1782 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001783
Jack Renc47703a2012-02-14 12:01:52 +04001784 CFI_PUSH (%edi)
1785
1786 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001787L(shl_14):
Jack Renc47703a2012-02-14 12:01:52 +04001788#ifndef USE_AS_MEMMOVE
1789 movaps -14(%eax), %xmm1
1790#else
1791 movl DEST+4(%esp), %edi
1792 movaps -14(%eax), %xmm1
1793 movdqu %xmm0, (%edi)
1794#endif
1795#ifdef DATA_CACHE_SIZE_HALF
1796 cmp $DATA_CACHE_SIZE_HALF, %ecx
1797#else
1798# if (defined SHARED || defined __PIC__)
1799 SETUP_PIC_REG(bx)
1800 add $_GLOBAL_OFFSET_TABLE_, %ebx
1801 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1802# else
1803 cmp __x86_data_cache_size_half, %ecx
1804# endif
1805#endif
1806 jb L(sh_14_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001807
Jack Renc47703a2012-02-14 12:01:52 +04001808 lea -64(%ecx), %ecx
1809
1810 .p2align 4
1811L(Shl14LoopStart):
1812 prefetcht0 0x1c0(%eax)
1813 prefetcht0 0x1c0(%edx)
1814 movaps 2(%eax), %xmm2
1815 movaps 18(%eax), %xmm3
1816 movaps 34(%eax), %xmm4
1817 movaps 50(%eax), %xmm5
1818 movaps %xmm5, %xmm7
1819 palignr $14, %xmm4, %xmm5
1820 palignr $14, %xmm3, %xmm4
1821 movaps %xmm5, 48(%edx)
1822 palignr $14, %xmm2, %xmm3
1823 lea 64(%eax), %eax
1824 palignr $14, %xmm1, %xmm2
1825 movaps %xmm4, 32(%edx)
1826 movaps %xmm3, 16(%edx)
1827 movaps %xmm7, %xmm1
1828 movaps %xmm2, (%edx)
1829 lea 64(%edx), %edx
1830 sub $64, %ecx
1831 ja L(Shl14LoopStart)
1832
1833L(Shl14LoopLeave):
1834 add $32, %ecx
1835 jle L(shl_end_0)
1836
1837 movaps 2(%eax), %xmm2
1838 movaps 18(%eax), %xmm3
1839 palignr $14, %xmm2, %xmm3
1840 palignr $14, %xmm1, %xmm2
1841
1842 movaps %xmm2, (%edx)
1843 movaps %xmm3, 16(%edx)
1844 lea 32(%edx, %ecx), %edx
1845 lea 32(%eax, %ecx), %eax
1846 POP (%edi)
1847 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1848
1849 CFI_PUSH (%edi)
1850
1851 .p2align 4
1852L(sh_14_no_prefetch):
1853 lea -32(%ecx), %ecx
1854 lea -14(%eax), %eax
1855 xor %edi, %edi
1856
1857 .p2align 4
1858L(sh_14_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001859 movdqa 16(%eax, %edi), %xmm2
1860 sub $32, %ecx
1861 movdqa 32(%eax, %edi), %xmm3
1862 movdqa %xmm3, %xmm4
1863 palignr $14, %xmm2, %xmm3
1864 palignr $14, %xmm1, %xmm2
1865 lea 32(%edi), %edi
1866 movdqa %xmm2, -32(%edx, %edi)
1867 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001868 jb L(sh_14_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001869
1870 movdqa 16(%eax, %edi), %xmm2
1871 sub $32, %ecx
1872 movdqa 32(%eax, %edi), %xmm3
1873 movdqa %xmm3, %xmm1
1874 palignr $14, %xmm2, %xmm3
1875 palignr $14, %xmm4, %xmm2
1876 lea 32(%edi), %edi
1877 movdqa %xmm2, -32(%edx, %edi)
1878 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001879 jae L(sh_14_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001880
Jack Renc47703a2012-02-14 12:01:52 +04001881L(sh_14_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001882 lea 32(%ecx), %ecx
1883 add %ecx, %edi
1884 add %edi, %edx
1885 lea 14(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001886 POP (%edi)
1887 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001888
Jack Renc47703a2012-02-14 12:01:52 +04001889 CFI_PUSH (%edi)
1890
1891 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08001892L(shl_15):
Jack Renc47703a2012-02-14 12:01:52 +04001893#ifndef USE_AS_MEMMOVE
1894 movaps -15(%eax), %xmm1
1895#else
1896 movl DEST+4(%esp), %edi
1897 movaps -15(%eax), %xmm1
1898 movdqu %xmm0, (%edi)
1899#endif
1900#ifdef DATA_CACHE_SIZE_HALF
1901 cmp $DATA_CACHE_SIZE_HALF, %ecx
1902#else
1903# if (defined SHARED || defined __PIC__)
1904 SETUP_PIC_REG(bx)
1905 add $_GLOBAL_OFFSET_TABLE_, %ebx
1906 cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
1907# else
1908 cmp __x86_data_cache_size_half, %ecx
1909# endif
1910#endif
1911 jb L(sh_15_no_prefetch)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001912
Jack Renc47703a2012-02-14 12:01:52 +04001913 lea -64(%ecx), %ecx
1914
1915 .p2align 4
1916L(Shl15LoopStart):
1917 prefetcht0 0x1c0(%eax)
1918 prefetcht0 0x1c0(%edx)
1919 movaps 1(%eax), %xmm2
1920 movaps 17(%eax), %xmm3
1921 movaps 33(%eax), %xmm4
1922 movaps 49(%eax), %xmm5
1923 movaps %xmm5, %xmm7
1924 palignr $15, %xmm4, %xmm5
1925 palignr $15, %xmm3, %xmm4
1926 movaps %xmm5, 48(%edx)
1927 palignr $15, %xmm2, %xmm3
1928 lea 64(%eax), %eax
1929 palignr $15, %xmm1, %xmm2
1930 movaps %xmm4, 32(%edx)
1931 movaps %xmm3, 16(%edx)
1932 movaps %xmm7, %xmm1
1933 movaps %xmm2, (%edx)
1934 lea 64(%edx), %edx
1935 sub $64, %ecx
1936 ja L(Shl15LoopStart)
1937
1938L(Shl15LoopLeave):
1939 add $32, %ecx
1940 jle L(shl_end_0)
1941
1942 movaps 1(%eax), %xmm2
1943 movaps 17(%eax), %xmm3
1944 palignr $15, %xmm2, %xmm3
1945 palignr $15, %xmm1, %xmm2
1946
1947 movaps %xmm2, (%edx)
1948 movaps %xmm3, 16(%edx)
1949 lea 32(%edx, %ecx), %edx
1950 lea 32(%eax, %ecx), %eax
1951 POP (%edi)
1952 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
1953
1954 CFI_PUSH (%edi)
1955
1956 .p2align 4
1957L(sh_15_no_prefetch):
1958 lea -32(%ecx), %ecx
1959 lea -15(%eax), %eax
1960 xor %edi, %edi
1961
1962 .p2align 4
1963L(sh_15_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001964 movdqa 16(%eax, %edi), %xmm2
1965 sub $32, %ecx
1966 movdqa 32(%eax, %edi), %xmm3
1967 movdqa %xmm3, %xmm4
1968 palignr $15, %xmm2, %xmm3
1969 palignr $15, %xmm1, %xmm2
1970 lea 32(%edi), %edi
1971 movdqa %xmm2, -32(%edx, %edi)
1972 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001973 jb L(sh_15_end_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001974
1975 movdqa 16(%eax, %edi), %xmm2
1976 sub $32, %ecx
1977 movdqa 32(%eax, %edi), %xmm3
1978 movdqa %xmm3, %xmm1
1979 palignr $15, %xmm2, %xmm3
1980 palignr $15, %xmm4, %xmm2
1981 lea 32(%edi), %edi
1982 movdqa %xmm2, -32(%edx, %edi)
1983 movdqa %xmm3, -16(%edx, %edi)
Jack Renc47703a2012-02-14 12:01:52 +04001984 jae L(sh_15_no_prefetch_loop)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001985
Jack Renc47703a2012-02-14 12:01:52 +04001986L(sh_15_end_no_prefetch_loop):
Bruce Beare8ff1a272010-03-04 11:03:37 -08001987 lea 32(%ecx), %ecx
1988 add %ecx, %edi
1989 add %edi, %edx
1990 lea 15(%edi, %eax), %eax
Jack Renc47703a2012-02-14 12:01:52 +04001991 POP (%edi)
1992 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001993
Jack Renc47703a2012-02-14 12:01:52 +04001994 CFI_PUSH (%edi)
Bruce Beare8ff1a272010-03-04 11:03:37 -08001995
Jack Renc47703a2012-02-14 12:01:52 +04001996 .p2align 4
1997L(shl_end_0):
1998 lea 32(%ecx), %ecx
1999 lea (%edx, %ecx), %edx
2000 lea (%eax, %ecx), %eax
2001 POP (%edi)
2002 BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
2003
2004 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002005L(fwd_write_44bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002006 movq -44(%eax), %xmm0
2007 movq %xmm0, -44(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002008L(fwd_write_36bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002009 movq -36(%eax), %xmm0
2010 movq %xmm0, -36(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002011L(fwd_write_28bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002012 movq -28(%eax), %xmm0
2013 movq %xmm0, -28(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002014L(fwd_write_20bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002015 movq -20(%eax), %xmm0
2016 movq %xmm0, -20(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002017L(fwd_write_12bytes):
Jack Renc47703a2012-02-14 12:01:52 +04002018 movq -12(%eax), %xmm0
2019 movq %xmm0, -12(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002020L(fwd_write_4bytes):
2021 movl -4(%eax), %ecx
2022 movl %ecx, -4(%edx)
Jack Renc47703a2012-02-14 12:01:52 +04002023#ifndef USE_AS_BCOPY
2024# ifdef USE_AS_MEMPCPY
2025 movl %edx, %eax
2026# else
2027 movl DEST(%esp), %eax
2028# endif
2029#endif
2030 RETURN
2031
2032 .p2align 4
2033L(fwd_write_40bytes):
2034 movq -40(%eax), %xmm0
2035 movq %xmm0, -40(%edx)
2036L(fwd_write_32bytes):
2037 movq -32(%eax), %xmm0
2038 movq %xmm0, -32(%edx)
2039L(fwd_write_24bytes):
2040 movq -24(%eax), %xmm0
2041 movq %xmm0, -24(%edx)
2042L(fwd_write_16bytes):
2043 movq -16(%eax), %xmm0
2044 movq %xmm0, -16(%edx)
2045L(fwd_write_8bytes):
2046 movq -8(%eax), %xmm0
2047 movq %xmm0, -8(%edx)
Bruce Beare8ff1a272010-03-04 11:03:37 -08002048L(fwd_write_0bytes):
2049#ifndef USE_AS_BCOPY
2050# ifdef USE_AS_MEMPCPY
2051 movl %edx, %eax
2052# else
2053 movl DEST(%esp), %eax
2054# endif
2055#endif
2056 RETURN
2057
Jack Renc47703a2012-02-14 12:01:52 +04002058 .p2align 4
Bruce Beare8ff1a272010-03-04 11:03:37 -08002059L(fwd_write_5bytes):
2060 movl -5(%eax), %ecx
2061 movl -4(%eax), %eax
2062 movl %ecx, -5(%edx)
2063 movl %eax, -4(%edx)
2064#ifndef USE_AS_BCOPY
2065# ifdef USE_AS_MEMPCPY
2066 movl %edx, %eax
2067# else
2068 movl DEST(%esp), %eax
2069# endif
2070#endif
2071 RETURN
2072
/* Forward tail: copies the last 45/37/29/21/13 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through down to 13, which
   finishes with 8 + 4 + 1 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_45bytes):
	movq	-45(%eax), %xmm0
	movq	%xmm0, -45(%edx)
L(fwd_write_37bytes):
	movq	-37(%eax), %xmm0
	movq	%xmm0, -37(%edx)
L(fwd_write_29bytes):
	movq	-29(%eax), %xmm0
	movq	%xmm0, -29(%edx)
L(fwd_write_21bytes):
	movq	-21(%eax), %xmm0
	movq	%xmm0, -21(%edx)
L(fwd_write_13bytes):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2101
/* Forward tail: copies the last 41/33/25/17/9/1 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through to a final single
   byte.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_41bytes):
	movq	-41(%eax), %xmm0
	movq	%xmm0, -41(%edx)
L(fwd_write_33bytes):
	movq	-33(%eax), %xmm0
	movq	%xmm0, -33(%edx)
L(fwd_write_25bytes):
	movq	-25(%eax), %xmm0
	movq	%xmm0, -25(%edx)
L(fwd_write_17bytes):
	movq	-17(%eax), %xmm0
	movq	%xmm0, -17(%edx)
L(fwd_write_9bytes):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2129
/* Forward tail: copies the last 46/38/30/22/14/6 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through to 6, which
   finishes with 4 + 2 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_46bytes):
	movq	-46(%eax), %xmm0
	movq	%xmm0, -46(%edx)
L(fwd_write_38bytes):
	movq	-38(%eax), %xmm0
	movq	%xmm0, -38(%edx)
L(fwd_write_30bytes):
	movq	-30(%eax), %xmm0
	movq	%xmm0, -30(%edx)
L(fwd_write_22bytes):
	movq	-22(%eax), %xmm0
	movq	%xmm0, -22(%edx)
L(fwd_write_14bytes):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2159
/* Forward tail: copies the last 42/34/26/18/10/2 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through to a final
   2-byte move.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_42bytes):
	movq	-42(%eax), %xmm0
	movq	%xmm0, -42(%edx)
L(fwd_write_34bytes):
	movq	-34(%eax), %xmm0
	movq	%xmm0, -34(%edx)
L(fwd_write_26bytes):
	movq	-26(%eax), %xmm0
	movq	%xmm0, -26(%edx)
L(fwd_write_18bytes):
	movq	-18(%eax), %xmm0
	movq	%xmm0, -18(%edx)
L(fwd_write_10bytes):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2187
/* Forward tail: copies the last 47/39/31/23/15/7 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through to 7, which
   finishes with 4 + 2 + 1 bytes.  The last byte is loaded into %eax,
   overwriting the source pointer after its final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_47bytes):
	movq	-47(%eax), %xmm0
	movq	%xmm0, -47(%edx)
L(fwd_write_39bytes):
	movq	-39(%eax), %xmm0
	movq	%xmm0, -39(%edx)
L(fwd_write_31bytes):
	movq	-31(%eax), %xmm0
	movq	%xmm0, -31(%edx)
L(fwd_write_23bytes):
	movq	-23(%eax), %xmm0
	movq	%xmm0, -23(%edx)
L(fwd_write_15bytes):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2219
/* Forward tail: copies the last 43/35/27/19/11/3 bytes.
   Labels are targets of L(table_48bytes_fwd).  %eax = end of source and
   %edx = end of destination.  8-byte steps fall through to 3, which
   finishes with 2 + 1 bytes.  The last byte is loaded into %eax,
   overwriting the source pointer after its final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_43bytes):
	movq	-43(%eax), %xmm0
	movq	%xmm0, -43(%edx)
L(fwd_write_35bytes):
	movq	-35(%eax), %xmm0
	movq	%xmm0, -35(%edx)
L(fwd_write_27bytes):
	movq	-27(%eax), %xmm0
	movq	%xmm0, -27(%edx)
L(fwd_write_19bytes):
	movq	-19(%eax), %xmm0
	movq	%xmm0, -19(%edx)
L(fwd_write_11bytes):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2249
/* Aligned forward tail: copies the last 40/24/8/0 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination (only negative displacements are used).
   The 16-byte steps use movdqa, which requires 16-byte aligned addresses,
   so (%eax - N) and (%edx - N) must be aligned when entering at size N;
   the code that guarantees this is outside this view.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_40bytes_align):
	movdqa	-40(%eax), %xmm0
	movdqa	%xmm0, -40(%edx)
L(fwd_write_24bytes_align):
	movdqa	-24(%eax), %xmm0
	movdqa	%xmm0, -24(%edx)
L(fwd_write_8bytes_align):
	movq	-8(%eax), %xmm0
	movq	%xmm0, -8(%edx)
L(fwd_write_0bytes_align):
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2269
/* Aligned forward tail: copies the last 32/16 bytes in 16-byte steps.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  movdqa requires 16-byte aligned
   addresses, so (%eax - N) and (%edx - N) must be aligned when entering at
   size N; the code that guarantees this is outside this view.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_32bytes_align):
	movdqa	-32(%eax), %xmm0
	movdqa	%xmm0, -32(%edx)
L(fwd_write_16bytes_align):
	movdqa	-16(%eax), %xmm0
	movdqa	%xmm0, -16(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2285
/* Aligned-table forward tail: copies the last 5 bytes with two overlapping
   4-byte moves (bytes -5..-2 and -4..-1).
   Target of L(table_48bytes_fwd_align).  %eax = end of source, %edx = end
   of destination.  Both loads are done before either store; the second
   load overwrites %eax, which is no longer needed as a pointer afterwards.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_5bytes_align):
	movl	-5(%eax), %ecx
	movl	-4(%eax), %eax
	movl	%ecx, -5(%edx)
	movl	%eax, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2300
/* Aligned forward tail: copies the last 45/29/13 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 13, which finishes
   with 8 + 4 + 1 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_45bytes_align):
	movdqa	-45(%eax), %xmm0
	movdqa	%xmm0, -45(%edx)
L(fwd_write_29bytes_align):
	movdqa	-29(%eax), %xmm0
	movdqa	%xmm0, -29(%edx)
L(fwd_write_13bytes_align):
	movq	-13(%eax), %xmm0
	movq	%xmm0, -13(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2323
/* Aligned forward tail: copies the last 37/21 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by 4 + 1 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_37bytes_align):
	movdqa	-37(%eax), %xmm0
	movdqa	%xmm0, -37(%edx)
L(fwd_write_21bytes_align):
	movdqa	-21(%eax), %xmm0
	movdqa	%xmm0, -21(%edx)
	movl	-5(%eax), %ecx
	movl	%ecx, -5(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2343
/* Aligned forward tail: copies the last 41/25/9/1 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to an 8-byte move and
   a final single byte.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_41bytes_align):
	movdqa	-41(%eax), %xmm0
	movdqa	%xmm0, -41(%edx)
L(fwd_write_25bytes_align):
	movdqa	-25(%eax), %xmm0
	movdqa	%xmm0, -25(%edx)
L(fwd_write_9bytes_align):
	movq	-9(%eax), %xmm0
	movq	%xmm0, -9(%edx)
L(fwd_write_1bytes_align):
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2365
/* Aligned forward tail: copies the last 33/17 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by one final byte.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_33bytes_align):
	movdqa	-33(%eax), %xmm0
	movdqa	%xmm0, -33(%edx)
L(fwd_write_17bytes_align):
	movdqa	-17(%eax), %xmm0
	movdqa	%xmm0, -17(%edx)
	movzbl	-1(%eax), %ecx
	movb	%cl, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2383
/* Aligned forward tail: copies the last 46/30/14/6 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 8 + 4 + 2 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_46bytes_align):
	movdqa	-46(%eax), %xmm0
	movdqa	%xmm0, -46(%edx)
L(fwd_write_30bytes_align):
	movdqa	-30(%eax), %xmm0
	movdqa	%xmm0, -30(%edx)
L(fwd_write_14bytes_align):
	movq	-14(%eax), %xmm0
	movq	%xmm0, -14(%edx)
L(fwd_write_6bytes_align):
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2407
/* Aligned forward tail: copies the last 38/22 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by 4 + 2 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_38bytes_align):
	movdqa	-38(%eax), %xmm0
	movdqa	%xmm0, -38(%edx)
L(fwd_write_22bytes_align):
	movdqa	-22(%eax), %xmm0
	movdqa	%xmm0, -22(%edx)
	movl	-6(%eax), %ecx
	movl	%ecx, -6(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2427
/* Aligned forward tail: copies the last 42/26/10/2 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 8 + 2 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_42bytes_align):
	movdqa	-42(%eax), %xmm0
	movdqa	%xmm0, -42(%edx)
L(fwd_write_26bytes_align):
	movdqa	-26(%eax), %xmm0
	movdqa	%xmm0, -26(%edx)
L(fwd_write_10bytes_align):
	movq	-10(%eax), %xmm0
	movq	%xmm0, -10(%edx)
L(fwd_write_2bytes_align):
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2449
/* Aligned forward tail: copies the last 34/18 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by a 2-byte move.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_34bytes_align):
	movdqa	-34(%eax), %xmm0
	movdqa	%xmm0, -34(%edx)
L(fwd_write_18bytes_align):
	movdqa	-18(%eax), %xmm0
	movdqa	%xmm0, -18(%edx)
	movzwl	-2(%eax), %ecx
	movw	%cx, -2(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2467
/* Aligned forward tail: copies the last 47/31/15/7 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 8 + 4 + 2 + 1
   bytes.  The last byte is loaded into %eax, overwriting the source
   pointer after its final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_47bytes_align):
	movdqa	-47(%eax), %xmm0
	movdqa	%xmm0, -47(%edx)
L(fwd_write_31bytes_align):
	movdqa	-31(%eax), %xmm0
	movdqa	%xmm0, -31(%edx)
L(fwd_write_15bytes_align):
	movq	-15(%eax), %xmm0
	movq	%xmm0, -15(%edx)
L(fwd_write_7bytes_align):
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2493
/* Aligned forward tail: copies the last 39/23 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by 4 + 2 + 1 bytes.
   The last byte is loaded into %eax, overwriting the source pointer after
   its final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_39bytes_align):
	movdqa	-39(%eax), %xmm0
	movdqa	%xmm0, -39(%edx)
L(fwd_write_23bytes_align):
	movdqa	-23(%eax), %xmm0
	movdqa	%xmm0, -23(%edx)
	movl	-7(%eax), %ecx
	movl	%ecx, -7(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2515
/* Aligned forward tail: copies the last 43/27/11/3 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 8 + 2 + 1 bytes.
   The last byte is loaded into %eax, overwriting the source pointer after
   its final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_43bytes_align):
	movdqa	-43(%eax), %xmm0
	movdqa	%xmm0, -43(%edx)
L(fwd_write_27bytes_align):
	movdqa	-27(%eax), %xmm0
	movdqa	%xmm0, -27(%edx)
L(fwd_write_11bytes_align):
	movq	-11(%eax), %xmm0
	movq	%xmm0, -11(%edx)
L(fwd_write_3bytes_align):
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2539
/* Aligned forward tail: copies the last 35/19 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by 2 + 1 bytes.  The
   last byte is loaded into %eax, overwriting the source pointer after its
   final use.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_35bytes_align):
	movdqa	-35(%eax), %xmm0
	movdqa	%xmm0, -35(%edx)
L(fwd_write_19bytes_align):
	movdqa	-19(%eax), %xmm0
	movdqa	%xmm0, -19(%edx)
	movzwl	-3(%eax), %ecx
	movzbl	-1(%eax), %eax
	movw	%cx, -3(%edx)
	movb	%al, -1(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2559
/* Aligned forward tail: copies the last 44/28/12/4 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) fall through to 8 + 4 bytes.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.  */
	.p2align 4
L(fwd_write_44bytes_align):
	movdqa	-44(%eax), %xmm0
	movdqa	%xmm0, -44(%edx)
L(fwd_write_28bytes_align):
	movdqa	-28(%eax), %xmm0
	movdqa	%xmm0, -28(%edx)
L(fwd_write_12bytes_align):
	movq	-12(%eax), %xmm0
	movq	%xmm0, -12(%edx)
L(fwd_write_4bytes_align):
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN
2581
/* Aligned forward tail: copies the last 36/20 bytes.
   Labels are targets of L(table_48bytes_fwd_align).  %eax = end of source
   and %edx = end of destination.  16-byte movdqa steps (which require
   (%eax - N) and (%edx - N) to be 16-byte aligned when entering at size N;
   guaranteed by code outside this view) are followed by a 4-byte move.
   Result in %eax: nothing for the bcopy build, %edx for mempcpy, otherwise
   the value of the DEST stack slot.
   This is the last forward tail, so it ends with RETURN_END rather than
   RETURN (both macros are defined outside this view).  */
	.p2align 4
L(fwd_write_36bytes_align):
	movdqa	-36(%eax), %xmm0
	movdqa	%xmm0, -36(%edx)
L(fwd_write_20bytes_align):
	movdqa	-20(%eax), %xmm0
	movdqa	%xmm0, -20(%edx)
	movl	-4(%eax), %ecx
	movl	%ecx, -4(%edx)
#ifndef USE_AS_BCOPY
# ifdef USE_AS_MEMPCPY
	movl	%edx, %eax
# else
	movl	DEST(%esp), %eax
# endif
#endif
	RETURN_END
Bruce Beare8ff1a272010-03-04 11:03:37 -08002599
/* Entry to the non-temporal copy path.
   In:  %eax = source, %edx = destination, %ecx = byte count; %edi is
        still pushed on the stack (hence CFI_PUSH before the label, the
        DEST+4 displacement, and the POP at the end).
   For the memmove build, %xmm0 is stored to the address in the DEST slot;
   %xmm0 is loaded by code outside this view (presumably the first 16
   source bytes - confirm against the caller).
   The first 16 bytes at (%eax) are copied with a non-temporal store, both
   pointers advance by 16, and %ecx is reduced by 0x90: 0x10 for the bytes
   just copied plus 0x80 of bias expected by L(large_page_loop).
   movntdq requires a 16-byte aligned destination, so %edx must be aligned
   here; that is established outside this view.
   Falls through into L(large_page_loop).  */
	CFI_PUSH (%edi)

	.p2align 4
L(large_page):
	movdqu	(%eax), %xmm1
#ifdef USE_AS_MEMMOVE
	movl	DEST+4(%esp), %edi
	movdqu	%xmm0, (%edi)
#endif
	lea	16(%eax), %eax
	movntdq	%xmm1, (%edx)
	lea	16(%edx), %edx
	lea	-0x90(%ecx), %ecx
	POP (%edi)
Jack Renc47703a2012-02-14 12:01:52 +04002614
/* Main non-temporal copy loop: 128 bytes per iteration.
   In:  %eax = source, %edx = destination (16-byte aligned, as movntdq
        requires), %ecx = remaining bytes minus 0x80.
   All eight loads are issued before the stores.  The flags tested by jae
   come from the sub; the movntdq and lea instructions in between do not
   modify flags.  The loop repeats while at least 0x80 bytes remain after
   the current iteration.  */
	.p2align 4
L(large_page_loop):
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	movdqu	0x40(%eax), %xmm4
	movdqu	0x50(%eax), %xmm5
	movdqu	0x60(%eax), %xmm6
	movdqu	0x70(%eax), %xmm7
	lea	0x80(%eax), %eax

	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	movntdq	%xmm4, 0x40(%edx)
	movntdq	%xmm5, 0x50(%edx)
	movntdq	%xmm6, 0x60(%edx)
	movntdq	%xmm7, 0x70(%edx)
	lea	0x80(%edx), %edx
	jae	L(large_page_loop)
	/* Here %ecx = remaining - 0x80, in [-0x80, -1].  Compare first, then
	   remove the bias with lea (flag-neutral) so jl still sees the cmp.  */
	cmp	$-0x40, %ecx
	lea	0x80(%ecx), %ecx
	jl	L(large_page_less_64bytes)

	/* At least 64 bytes remain: copy one 64-byte block.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	movdqu	0x20(%eax), %xmm2
	movdqu	0x30(%eax), %xmm3
	lea	0x40(%eax), %eax

	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	movntdq	%xmm2, 0x20(%edx)
	movntdq	%xmm3, 0x30(%edx)
	lea	0x40(%edx), %edx
	sub	$0x40, %ecx
L(large_page_less_64bytes):
	cmp	$32, %ecx
	jb	L(large_page_less_32bytes)
	/* At least 32 bytes remain: copy one 32-byte block.  */
	movdqu	(%eax), %xmm0
	movdqu	0x10(%eax), %xmm1
	lea	0x20(%eax), %eax
	movntdq	%xmm0, (%edx)
	movntdq	%xmm1, 0x10(%edx)
	lea	0x20(%edx), %edx
	sub	$0x20, %ecx
L(large_page_less_32bytes):
	/* Fewer than 32 bytes remain.  Advance both pointers to the end of
	   the buffers, order the non-temporal stores with sfence, and let the
	   forward tail table (indexed by %ecx) copy the rest.  */
	add	%ecx, %edx
	add	%ecx, %eax
	sfence
	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
2669
/* Backward tail: copies the remaining 44/36/28/20/12/4/0 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes (only non-negative displacements are used).  The highest-addressed
   8 bytes are copied first and each step falls through to the next
   smaller size.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_44bytes):
	movq	36(%eax), %xmm0
	movq	%xmm0, 36(%edx)
L(bk_write_36bytes):
	movq	28(%eax), %xmm0
	movq	%xmm0, 28(%edx)
L(bk_write_28bytes):
	movq	20(%eax), %xmm0
	movq	%xmm0, 20(%edx)
L(bk_write_20bytes):
	movq	12(%eax), %xmm0
	movq	%xmm0, 12(%edx)
L(bk_write_12bytes):
	movq	4(%eax), %xmm0
	movq	%xmm0, 4(%edx)
L(bk_write_4bytes):
	movl	(%eax), %ecx
	movl	%ecx, (%edx)
L(bk_write_0bytes):
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2698
/* Backward tail: copies the remaining 40/32/24/16/8 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  The highest-addressed 8 bytes are copied first and each step
   falls through to the next smaller size.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_40bytes):
	movq	32(%eax), %xmm0
	movq	%xmm0, 32(%edx)
L(bk_write_32bytes):
	movq	24(%eax), %xmm0
	movq	%xmm0, 24(%edx)
L(bk_write_24bytes):
	movq	16(%eax), %xmm0
	movq	%xmm0, 16(%edx)
L(bk_write_16bytes):
	movq	8(%eax), %xmm0
	movq	%xmm0, 8(%edx)
L(bk_write_8bytes):
	movq	(%eax), %xmm0
	movq	%xmm0, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2723
/* Backward tail: copies the remaining 45/37/29/21/13/5/1 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to a 4-byte move at offset 1 and a
   final byte at offset 0.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_45bytes):
	movq	37(%eax), %xmm0
	movq	%xmm0, 37(%edx)
L(bk_write_37bytes):
	movq	29(%eax), %xmm0
	movq	%xmm0, 29(%edx)
L(bk_write_29bytes):
	movq	21(%eax), %xmm0
	movq	%xmm0, 21(%edx)
L(bk_write_21bytes):
	movq	13(%eax), %xmm0
	movq	%xmm0, 13(%edx)
L(bk_write_13bytes):
	movq	5(%eax), %xmm0
	movq	%xmm0, 5(%edx)
L(bk_write_5bytes):
	movl	1(%eax), %ecx
	movl	%ecx, 1(%edx)
L(bk_write_1bytes):
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2754
/* Backward tail: copies the remaining 41/33/25/17/9 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to a final byte at offset 0.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_41bytes):
	movq	33(%eax), %xmm0
	movq	%xmm0, 33(%edx)
L(bk_write_33bytes):
	movq	25(%eax), %xmm0
	movq	%xmm0, 25(%edx)
L(bk_write_25bytes):
	movq	17(%eax), %xmm0
	movq	%xmm0, 17(%edx)
L(bk_write_17bytes):
	movq	9(%eax), %xmm0
	movq	%xmm0, 9(%edx)
L(bk_write_9bytes):
	movq	1(%eax), %xmm0
	movq	%xmm0, 1(%edx)
	movzbl	(%eax), %ecx
	movb	%cl, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2781
/* Backward tail: copies the remaining 46/38/30/22/14/6 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to 6, which finishes with 4 bytes at
   offset 2 and 2 bytes at offset 0.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_46bytes):
	movq	38(%eax), %xmm0
	movq	%xmm0, 38(%edx)
L(bk_write_38bytes):
	movq	30(%eax), %xmm0
	movq	%xmm0, 30(%edx)
L(bk_write_30bytes):
	movq	22(%eax), %xmm0
	movq	%xmm0, 22(%edx)
L(bk_write_22bytes):
	movq	14(%eax), %xmm0
	movq	%xmm0, 14(%edx)
L(bk_write_14bytes):
	movq	6(%eax), %xmm0
	movq	%xmm0, 6(%edx)
L(bk_write_6bytes):
	movl	2(%eax), %ecx
	movl	%ecx, 2(%edx)
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2811
/* Backward tail: copies the remaining 42/34/26/18/10/2 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to a final 2-byte move at offset 0.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_42bytes):
	movq	34(%eax), %xmm0
	movq	%xmm0, 34(%edx)
L(bk_write_34bytes):
	movq	26(%eax), %xmm0
	movq	%xmm0, 26(%edx)
L(bk_write_26bytes):
	movq	18(%eax), %xmm0
	movq	%xmm0, 18(%edx)
L(bk_write_18bytes):
	movq	10(%eax), %xmm0
	movq	%xmm0, 10(%edx)
L(bk_write_10bytes):
	movq	2(%eax), %xmm0
	movq	%xmm0, 2(%edx)
L(bk_write_2bytes):
	movzwl	(%eax), %ecx
	movw	%cx, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2839
/* Backward tail: copies the remaining 47/39/31/23/15/7 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to 7, which finishes with 4 bytes at
   offset 3, 2 bytes at offset 1 and 1 byte at offset 0.  The last byte is
   loaded into %eax, overwriting the source pointer after its final use.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.  */
	.p2align 4
L(bk_write_47bytes):
	movq	39(%eax), %xmm0
	movq	%xmm0, 39(%edx)
L(bk_write_39bytes):
	movq	31(%eax), %xmm0
	movq	%xmm0, 31(%edx)
L(bk_write_31bytes):
	movq	23(%eax), %xmm0
	movq	%xmm0, 23(%edx)
L(bk_write_23bytes):
	movq	15(%eax), %xmm0
	movq	%xmm0, 15(%edx)
L(bk_write_15bytes):
	movq	7(%eax), %xmm0
	movq	%xmm0, 7(%edx)
L(bk_write_7bytes):
	movl	3(%eax), %ecx
	movl	%ecx, 3(%edx)
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN
2871
/* Backward tail: copies the remaining 43/35/27/19/11/3 bytes.
   Labels are targets of L(table_48_bytes_bwd).  %eax = start of the
   remaining source bytes and %edx = start of the remaining destination
   bytes.  8-byte steps fall through to 3, which finishes with 2 bytes at
   offset 1 and 1 byte at offset 0.  The last byte is loaded into %eax,
   overwriting the source pointer after its final use.
   Result in %eax: nothing for the bcopy build, the value of the DEST stack
   slot by default, and DEST plus the LEN stack slot for mempcpy.
   This is the last backward tail, so it ends with RETURN_END rather than
   RETURN (both macros are defined outside this view).  */
	.p2align 4
L(bk_write_43bytes):
	movq	35(%eax), %xmm0
	movq	%xmm0, 35(%edx)
L(bk_write_35bytes):
	movq	27(%eax), %xmm0
	movq	%xmm0, 27(%edx)
L(bk_write_27bytes):
	movq	19(%eax), %xmm0
	movq	%xmm0, 19(%edx)
L(bk_write_19bytes):
	movq	11(%eax), %xmm0
	movq	%xmm0, 11(%edx)
L(bk_write_11bytes):
	movq	3(%eax), %xmm0
	movq	%xmm0, 3(%edx)
L(bk_write_3bytes):
	movzwl	1(%eax), %ecx
	movw	%cx, 1(%edx)
	movzbl	(%eax), %eax
	movb	%al, (%edx)
#ifndef USE_AS_BCOPY
	movl	DEST(%esp), %eax
# ifdef USE_AS_MEMPCPY
	movl	LEN(%esp), %ecx
	add	%ecx, %eax
# endif
#endif
	RETURN_END
2901
2902
/* Jump tables live in a read-only data section; the matching .popsection
   follows the last table.
   L(table_48bytes_fwd): 48 entries of 4 bytes, entry N being the
   L(fwd_write_<N>bytes) handler for N remaining bytes (0..47).  Entries
   are built with the JMPTBL macro (defined outside this view) from the
   handler label and the table label.  */
	.pushsection .rodata.ssse3,"a",@progbits
	.p2align 2
L(table_48bytes_fwd):
	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
2954
/* L(table_48bytes_fwd_align): 48 entries of 4 bytes, entry N being the
   L(fwd_write_<N>bytes_align) handler for N remaining bytes (0..47).
   Those handlers use movdqa for their 16-byte steps.  Entries are built
   with the JMPTBL macro (defined outside this view) from the handler
   label and the table label.  */
	.p2align 2
L(table_48bytes_fwd_align):
	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
3005
/* L(shl_table): 16 entries of 4 bytes, entry N being the L(shl_<N>)
   handler (0..15).  The handlers and the code that indexes this table are
   outside this view.  Entries are built with the JMPTBL macro (defined
   outside this view) from the handler label and the table label.  */
	.p2align 2
L(shl_table):
	.int	JMPTBL (L(shl_0), L(shl_table))
	.int	JMPTBL (L(shl_1), L(shl_table))
	.int	JMPTBL (L(shl_2), L(shl_table))
	.int	JMPTBL (L(shl_3), L(shl_table))
	.int	JMPTBL (L(shl_4), L(shl_table))
	.int	JMPTBL (L(shl_5), L(shl_table))
	.int	JMPTBL (L(shl_6), L(shl_table))
	.int	JMPTBL (L(shl_7), L(shl_table))
	.int	JMPTBL (L(shl_8), L(shl_table))
	.int	JMPTBL (L(shl_9), L(shl_table))
	.int	JMPTBL (L(shl_10), L(shl_table))
	.int	JMPTBL (L(shl_11), L(shl_table))
	.int	JMPTBL (L(shl_12), L(shl_table))
	.int	JMPTBL (L(shl_13), L(shl_table))
	.int	JMPTBL (L(shl_14), L(shl_table))
	.int	JMPTBL (L(shl_15), L(shl_table))
3024
/* L(table_48_bytes_bwd): 48 entries of 4 bytes, entry N being the
   L(bk_write_<N>bytes) handler for N remaining bytes (0..47) of a
   backward copy.  Entries are built with the JMPTBL macro (defined
   outside this view) from the handler label and the table label.
   The .popsection at the end closes the .rodata.ssse3 section opened
   before the first jump table.  */
	.p2align 2
L(table_48_bytes_bwd):
	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))

	.popsection
3077
#ifdef USE_AS_MEMMOVE
/* Backward copy path, assembled only when USE_AS_MEMMOVE is defined.
   Bytes are moved starting at the highest addresses and working down.

   Registers, as used by the code below:
     at L(copy_backward):
	EAX = source start, EDX = destination start, ECX = byte count.
     afterwards:
	EDI = one past the last source byte still to be read,
	EDX = one past the last destination byte still to be written,
	ECX = number of bytes still to copy,
	EAX = scratch for the 1-, 2- and 4-byte alignment moves.

   EDI is saved with PUSH on entry and restored with POP just before
   the final jump-table dispatch.  PUSH, POP, CFI_PUSH and
   BRANCH_TO_JMPTBL_ENTRY are macros defined outside this part of the
   file.  */
	.p2align 4
L(copy_backward):
	PUSH (%edi)
	movl	%eax, %edi
	/* Advance both pointers to the end of their buffers.  */
	lea	(%ecx,%edx,1),%edx
	lea	(%ecx,%edi,1),%edi
	/* Is the destination end 4-byte aligned?  */
	testl	$0x3, %edx
	jnz	L(bk_align)

/* From here on the destination end (EDX) is 4-byte aligned.  */
L(bk_aligned_4):
	cmp	$64, %ecx
	jae	L(bk_write_more64bytes)

/* Fewer than 64 bytes remain.  */
L(bk_write_64bytesless):
	cmp	$32, %ecx
	jb	L(bk_write_less32bytes)

L(bk_write_more32bytes):
	/* Copy one 32-byte chunk, highest addresses first, as four
	   8-byte load/store pairs through XMM0.  */
	sub	$32, %ecx
	movq	-8(%edi), %xmm0
	movq	%xmm0, -8(%edx)
	movq	-16(%edi), %xmm0
	movq	%xmm0, -16(%edx)
	movq	-24(%edi), %xmm0
	movq	%xmm0, -24(%edx)
	movq	-32(%edi), %xmm0
	movq	%xmm0, -32(%edx)
	sub	$32, %edx
	sub	$32, %edi

L(bk_write_less32bytes):
	/* Rewind both pointers by the remaining count so that EAX
	   (source) and EDX (destination) address the first of the ECX
	   bytes left, then restore the caller's EDI.  */
	movl	%edi, %eax
	sub	%ecx, %edx
	sub	%ecx, %eax
	POP (%edi)
L(bk_write_less32bytes_2):
	/* Finish through L(table_48_bytes_bwd), indexed by the
	   remaining byte count in ECX with 4-byte entries.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)

	/* NOTE(review): the code below still has EDI pushed, whereas
	   the POP above precedes this point in the file; presumably
	   CFI_PUSH re-declares the saved EDI for unwind info only --
	   confirm against the macro definition.  */
	CFI_PUSH (%edi)

/* Destination end is not 4-byte aligned: copy 1 and/or 2 bytes so that
   it becomes aligned, then rejoin L(bk_aligned_4).  */
	.p2align 4
L(bk_align):
	/* With 8 bytes or fewer, skip alignment and go straight to
	   the jump-table tail.  */
	cmp	$8, %ecx
	jbe	L(bk_write_less32bytes)
	testl	$1, %edx
	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
	   then (EDX & 2) must be != 0.  */
	jz	L(bk_got2)
	/* Odd destination end: move a single byte.  */
	sub	$1, %edi
	sub	$1, %ecx
	sub	$1, %edx
	movzbl	(%edi), %eax
	movb	%al, (%edx)

	/* Now even; aligned unless bit 1 is still set.  */
	testl	$2, %edx
	jz	L(bk_aligned_4)

L(bk_got2):
	/* Destination end is 2 mod 4: move one 16-bit word.  */
	sub	$2, %edi
	sub	$2, %ecx
	sub	$2, %edx
	movzwl	(%edi), %eax
	movw	%ax, (%edx)
	jmp	L(bk_aligned_4)

/* At least 64 bytes remain and EDX is 4-byte aligned.  Bring EDX to a
   16-byte boundary so the main loop can use aligned stores.  */
	.p2align 4
L(bk_write_more64bytes):
	/* Check alignment of last byte.  */
	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

/* EDX is aligned 4 bytes, but not 16 bytes.  EDX mod 16 is therefore
   4, 8 or 12, so at most three 4-byte moves are needed; the third one
   needs no alignment test afterwards.  */
L(bk_ssse3_align):
	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

	testl	$15, %edx
	jz	L(bk_ssse3_cpy_pre)

	sub	$4, %edi
	sub	$4, %ecx
	sub	$4, %edx
	movl	(%edi), %eax
	movl	%eax, (%edx)

L(bk_ssse3_cpy_pre):
	/* The alignment moves may have taken the count below 64 (it is
	   still at least 52); in that case use the 32-byte path.  */
	cmp	$64, %ecx
	jb	L(bk_write_more32bytes)

/* Main loop: 64 bytes per iteration, highest 16-byte lane first.
   Loads are unaligned (movdqu) because only the destination has been
   aligned; stores are aligned (movdqa) and rely on EDX being a
   multiple of 16, which subtracting 64 preserves.  */
	.p2align 4
L(bk_ssse3_cpy):
	sub	$64, %edi
	sub	$64, %ecx
	sub	$64, %edx
	movdqu	0x30(%edi), %xmm3
	movdqa	%xmm3, 0x30(%edx)
	movdqu	0x20(%edi), %xmm2
	movdqa	%xmm2, 0x20(%edx)
	movdqu	0x10(%edi), %xmm1
	movdqa	%xmm1, 0x10(%edx)
	movdqu	(%edi), %xmm0
	movdqa	%xmm0, (%edx)
	cmp	$64, %ecx
	jae	L(bk_ssse3_cpy)
	/* Fewer than 64 bytes left.  */
	jmp	L(bk_write_64bytesless)

#endif
3199
3200END (MEMCPY)