/*
Copyright (c) 2010, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assembler helper macros.  Each of the guarded ones is only defined when
   the including environment has not already provided it.  */

/* L(label): spell LABEL as an assembler-local symbol (".L" prefix).  */
#ifndef L
# define L(label)	.L##label
#endif

/* ALIGN(n): align the location counter to a 2^n byte boundary.  */
#ifndef ALIGN
# define ALIGN(n)	.p2align n
#endif

/* Thin wrappers around the GAS call-frame-information directives.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

/* The register is passed as a bare operand, exactly as cfi_rel_offset
   does, rather than wrapped in parentheses.  */
#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name): declare NAME as a global function symbol, align it to
   16 bytes, emit its label and open its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region and record the symbol size.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif

/* CFI bookkeeping for one 4-byte push of REG: the CFA moves 4 bytes
   further from %esp and REG is now saved at offset 0 from %esp.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* CFI bookkeeping for the matching pop of REG.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* pushl/popl combined with the matching unwind annotations.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Offsets of the stack arguments relative to %esp, expressed in terms of
   PARMS (defined further down in this file), and the instruction used to
   set up the return value.

   USE_AS_BZERO:  arguments are (DEST, LEN); SETRTNVAL expands to nothing.
   otherwise:     arguments are (DEST, CHR, LEN); SETRTNVAL reloads DEST
                  from the stack into EAX.  */
#ifdef USE_AS_BZERO
# define DEST		PARMS
# define LEN		DEST+4
# define SETRTNVAL
#else
# define DEST		PARMS
# define CHR		DEST+4
# define LEN		CHR+4
# define SETRTNVAL	movl DEST(%esp), %eax
#endif

/* Prologue/epilogue and jump-table helpers.

   ENTRANCE      code emitted right after ENTRY.
   RETURN_END    return sequence for the last return in the function.
   RETURN        return sequence for any earlier return; in the SHARED
                 case it re-asserts the "EBX is pushed" CFI state for the
                 code that follows it in the text.
   PARMS         offset from %esp to the first stack argument.
   JMPTBL(I, B)  value stored in a jump table for target I in table B.

   BRANCH_TO_JMPTBL_ENTRY(TABLE) jumps through entry ECX of TABLE and, as a
   side effect, adds ECX to EDX.  */
#ifdef SHARED
# define ENTRANCE		PUSH (%ebx);
# define RETURN_END		POP (%ebx); ret
# define RETURN			RETURN_END; CFI_PUSH (%ebx)
# define PARMS			8		/* Preserve EBX.  */
# define JMPTBL(I, B)		I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    /* We first load PC into EBX.  */				\
    call	__i686.get_pc_thunk.bx;				\
    /* Get the address of the jump table.  */			\
    add		$(TABLE - .), %ebx;				\
    /* Get the entry and convert the relative offset to the	\
       absolute address.  */					\
    add		(%ebx,%ecx,4), %ebx;				\
    add		%ecx, %edx;					\
    /* We loaded the jump table and adjusted EDX. Go.  */	\
    jmp		*%ebx

/* PC thunk: returns with EBX holding its own return address, i.e. the
   address of the instruction following the call.  */
	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
	.globl	__i686.get_pc_thunk.bx
	.hidden	__i686.get_pc_thunk.bx
	ALIGN (4)
	.type	__i686.get_pc_thunk.bx,@function
__i686.get_pc_thunk.bx:
	movl	(%esp), %ebx
	ret
#else
# define ENTRANCE
# define RETURN_END		ret
# define RETURN			RETURN_END
# define PARMS			4
# define JMPTBL(I, B)		I

/* Branch to an entry in a jump table.  TABLE is a jump table with
   absolute offsets.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
    add		%ecx, %edx;					\
    jmp		*TABLE(,%ecx,4)
#endif

/* sse2_memset5_atom: function entry and fill-pattern setup.

   Register roles once this prologue has run:
     ECX  number of bytes to write (LEN argument)
     EAX  fill pattern: zero for USE_AS_BZERO, otherwise the low byte of
          the CHR argument replicated into all four bytes
     EDX  destination pointer (DEST argument)

   Lengths below 32 are dispatched straight through
   L(table_less_32bytes); everything else continues at
   L(32bytesormore).  */
	.section .text.sse2,"ax",@progbits
	ALIGN (4)
ENTRY (sse2_memset5_atom)
	ENTRANCE

	movl	LEN(%esp), %ecx
#ifdef USE_AS_BZERO
	xor	%eax, %eax
#else
	movzbl	CHR(%esp), %eax
	movb	%al, %ah
	/* Fill the whole EAX with pattern.  */
	movl	%eax, %edx
	shl	$16, %eax
	or	%edx, %eax
#endif
	movl	DEST(%esp), %edx
	cmp	$32, %ecx
	jae	L(32bytesormore)

L(write_less32bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))

/* Jump table for lengths 0..31: entry N leads to L(write_Nbytes).  It is
   indexed by ECX through BRANCH_TO_JMPTBL_ENTRY, and JMPTBL selects
   whether an entry is stored as an address or as an offset from the
   table.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_less_32bytes):
	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
	.popsection

/* Tails for lengths 0..31, reached through L(table_less_32bytes).

   BRANCH_TO_JMPTBL_ENTRY has already added ECX to EDX, so EDX points just
   past the last byte to write and every store uses a negative offset from
   it.  The labels are grouped by N mod 4: each group is a fall-through
   chain of 4-byte stores of EAX, finished by the 0..3 remaining bytes.  */
	ALIGN (4)
L(write_28bytes):
	movl	%eax, -28(%edx)
L(write_24bytes):
	movl	%eax, -24(%edx)
L(write_20bytes):
	movl	%eax, -20(%edx)
L(write_16bytes):
	movl	%eax, -16(%edx)
L(write_12bytes):
	movl	%eax, -12(%edx)
L(write_8bytes):
	movl	%eax, -8(%edx)
L(write_4bytes):
	movl	%eax, -4(%edx)
L(write_0bytes):
	SETRTNVAL
	RETURN

	/* N mod 4 == 1: finish with one byte.  */
	ALIGN (4)
L(write_29bytes):
	movl	%eax, -29(%edx)
L(write_25bytes):
	movl	%eax, -25(%edx)
L(write_21bytes):
	movl	%eax, -21(%edx)
L(write_17bytes):
	movl	%eax, -17(%edx)
L(write_13bytes):
	movl	%eax, -13(%edx)
L(write_9bytes):
	movl	%eax, -9(%edx)
L(write_5bytes):
	movl	%eax, -5(%edx)
L(write_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 4 == 2: finish with one 16-bit store.  */
	ALIGN (4)
L(write_30bytes):
	movl	%eax, -30(%edx)
L(write_26bytes):
	movl	%eax, -26(%edx)
L(write_22bytes):
	movl	%eax, -22(%edx)
L(write_18bytes):
	movl	%eax, -18(%edx)
L(write_14bytes):
	movl	%eax, -14(%edx)
L(write_10bytes):
	movl	%eax, -10(%edx)
L(write_6bytes):
	movl	%eax, -6(%edx)
L(write_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 4 == 3: finish with a 16-bit store plus one byte.  */
	ALIGN (4)
L(write_31bytes):
	movl	%eax, -31(%edx)
L(write_27bytes):
	movl	%eax, -27(%edx)
L(write_23bytes):
	movl	%eax, -23(%edx)
L(write_19bytes):
	movl	%eax, -19(%edx)
L(write_15bytes):
	movl	%eax, -15(%edx)
L(write_11bytes):
	movl	%eax, -11(%edx)
L(write_7bytes):
	movl	%eax, -7(%edx)
L(write_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

/* ECX >= 32.  EDX has no particular alignment yet; it is tested against
   a 16-byte boundary below.  */
	ALIGN (4)
L(32bytesormore):
	/* Fill xmm0 with the pattern.  */
#ifdef USE_AS_BZERO
	pxor	%xmm0, %xmm0
#else
	movd	%eax, %xmm0
	punpcklbw %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0
#endif
	testl	$0xf, %edx
	jz	L(aligned_16)
/* ECX >= 32 and EDX is not 16 byte aligned.  Write 16 bytes with an
   unaligned store, round EDX up to the next 16-byte boundary and reduce
   ECX by the number of bytes that were skipped.  EAX serves as scratch
   for the adjustment and is reloaded from the low dword of xmm0
   afterwards.  */
L(not_aligned_16):
	movdqu	%xmm0, (%edx)
	movl	%edx, %eax
	and	$-16, %edx
	add	$16, %edx
	sub	%edx, %eax		/* EAX = old EDX - new EDX (negative).  */
	add	%eax, %ecx
	movd	%xmm0, %eax

	ALIGN (4)
/* EDX is 16 byte aligned from here on.  */
L(aligned_16):
	cmp	$128, %ecx
	jae	L(128bytesormore)

L(aligned_16_less128bytes):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* ECX >= 128 and EDX is 16 byte aligned.  xmm0 holds the fill pattern.

   EBX is loaded with the shared-cache-size threshold.  Except in the
   SHARED build without SHARED_CACHE_SIZE, EBX is pushed here first and
   popped again on each of the two paths below.  */
	ALIGN (4)
L(128bytesormore):
#ifdef SHARED_CACHE_SIZE
	PUSH (%ebx)
	mov	$SHARED_CACHE_SIZE, %ebx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
	PUSH (%ebx)
	mov	__x86_shared_cache_size, %ebx
# endif
#endif
	/* Lengths of at least the shared-cache size take the path that ends
	   with non-temporal stores.  */
	cmp	%ebx, %ecx
	jae	L(128bytesormore_nt_start)


#ifdef DATA_CACHE_SIZE
	POP (%ebx)
	cmp	$DATA_CACHE_SIZE, %ecx
#else
# ifdef SHARED
	call	__i686.get_pc_thunk.bx
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
	POP (%ebx)
	cmp	__x86_data_cache_size, %ecx
# endif
#endif

	/* Lengths of at least the data-cache size use the prefetching
	   loop.  */
	jae	L(128bytes_L2_normal)
	/* Bias ECX by -128 so that the borrow of the "sub" at the top of
	   each 128-byte step signals that fewer than 128 bytes remain after
	   that step.  The bias is undone at L(128bytesless_normal).  */
	subl	$128, %ecx
L(128bytesormore_normal):
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx		/* lea: keeps the flags of "sub".  */
	jb	L(128bytesless_normal)


	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	lea	128(%edx), %edx
	jae	L(128bytesormore_normal)

L(128bytesless_normal):
	lea	128(%ecx), %ecx		/* Undo the -128 bias.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

	/* Prefetching loop: 128 bytes per iteration while ECX >= 128.  */
	ALIGN (4)
L(128bytes_L2_normal):
	prefetcht0	0x380(%edx)
	prefetcht0	0x3c0(%edx)
	sub	$128, %ecx
	movdqa	%xmm0, (%edx)
	movaps	%xmm0, 0x10(%edx)
	movaps	%xmm0, 0x20(%edx)
	movaps	%xmm0, 0x30(%edx)
	movaps	%xmm0, 0x40(%edx)
	movaps	%xmm0, 0x50(%edx)
	movaps	%xmm0, 0x60(%edx)
	movaps	%xmm0, 0x70(%edx)
	add	$128, %edx
	cmp	$128, %ecx
	jae	L(128bytes_L2_normal)

L(128bytesless_L2_normal):
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

#if defined DATA_CACHE_SIZE || !defined SHARED
	/* L(128bytesormore_nt_start) is only reached by the jump taken
	   before the POP (%ebx) above, so EBX is still on the stack here
	   although the unwind state at this point in the text already
	   reflects that pop.  Re-assert the pushed state so the CFA offset
	   is right inside the loops below and balanced again after the
	   POP at L(shared_cache_loop_end).  */
	CFI_PUSH (%ebx)
#endif
L(128bytesormore_nt_start):
	/* ECX = bytes left for after the first EBX bytes.  */
	sub	%ebx, %ecx
	ALIGN (4)
	/* Ordinary aligned stores in 128-byte steps, counted down in EBX.  */
L(128bytesormore_shared_cache_loop):
	prefetcht0	0x3c0(%edx)
	prefetcht0	0x380(%edx)
	sub	$0x80, %ebx
	movdqa	%xmm0, (%edx)
	movdqa	%xmm0, 0x10(%edx)
	movdqa	%xmm0, 0x20(%edx)
	movdqa	%xmm0, 0x30(%edx)
	movdqa	%xmm0, 0x40(%edx)
	movdqa	%xmm0, 0x50(%edx)
	movdqa	%xmm0, 0x60(%edx)
	movdqa	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ebx
	jae	L(128bytesormore_shared_cache_loop)
	cmp	$0x80, %ecx
	jb	L(shared_cache_loop_end)
	ALIGN (4)
	/* Non-temporal stores in 128-byte steps while ECX >= 128, followed
	   by a store fence.  */
L(128bytesormore_nt):
	sub	$0x80, %ecx
	movntdq	%xmm0, (%edx)
	movntdq	%xmm0, 0x10(%edx)
	movntdq	%xmm0, 0x20(%edx)
	movntdq	%xmm0, 0x30(%edx)
	movntdq	%xmm0, 0x40(%edx)
	movntdq	%xmm0, 0x50(%edx)
	movntdq	%xmm0, 0x60(%edx)
	movntdq	%xmm0, 0x70(%edx)
	add	$0x80, %edx
	cmp	$0x80, %ecx
	jae	L(128bytesormore_nt)
	sfence
L(shared_cache_loop_end):
#if defined DATA_CACHE_SIZE || !defined SHARED
	POP (%ebx)
#endif
	/* Fewer than 128 bytes remain in ECX.  */
	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))

/* Jump table for the final 0..127 bytes of a 16-byte-aligned fill: entry
   N leads to L(aligned_16_Nbytes).  It is indexed by ECX through
   BRANCH_TO_JMPTBL_ENTRY, and JMPTBL selects whether an entry is stored
   as an address or as an offset from the table.  */
	.pushsection .rodata.sse2,"a",@progbits
	ALIGN (2)
L(table_16_128bytes):
	.int	JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes))
	.int	JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes))
	.popsection

/* Tails for the final 0..127 bytes, reached through L(table_16_128bytes).

   BRANCH_TO_JMPTBL_ENTRY has already added ECX to EDX, so EDX points just
   past the last byte to write and every store uses a negative offset from
   it.  The labels are grouped by N mod 16: each group is a fall-through
   chain of 16-byte movdqa stores of xmm0, finished by the 0..15 remaining
   bytes using movq (8), movl (4), movw (2) and movb (1) stores.  */
	ALIGN (4)
L(aligned_16_112bytes):
	movdqa	%xmm0, -112(%edx)
L(aligned_16_96bytes):
	movdqa	%xmm0, -96(%edx)
L(aligned_16_80bytes):
	movdqa	%xmm0, -80(%edx)
L(aligned_16_64bytes):
	movdqa	%xmm0, -64(%edx)
L(aligned_16_48bytes):
	movdqa	%xmm0, -48(%edx)
L(aligned_16_32bytes):
	movdqa	%xmm0, -32(%edx)
L(aligned_16_16bytes):
	movdqa	%xmm0, -16(%edx)
L(aligned_16_0bytes):
	SETRTNVAL
	RETURN

	/* N mod 16 == 1.  */
	ALIGN (4)
L(aligned_16_113bytes):
	movdqa	%xmm0, -113(%edx)
L(aligned_16_97bytes):
	movdqa	%xmm0, -97(%edx)
L(aligned_16_81bytes):
	movdqa	%xmm0, -81(%edx)
L(aligned_16_65bytes):
	movdqa	%xmm0, -65(%edx)
L(aligned_16_49bytes):
	movdqa	%xmm0, -49(%edx)
L(aligned_16_33bytes):
	movdqa	%xmm0, -33(%edx)
L(aligned_16_17bytes):
	movdqa	%xmm0, -17(%edx)
L(aligned_16_1bytes):
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 2.  */
	ALIGN (4)
L(aligned_16_114bytes):
	movdqa	%xmm0, -114(%edx)
L(aligned_16_98bytes):
	movdqa	%xmm0, -98(%edx)
L(aligned_16_82bytes):
	movdqa	%xmm0, -82(%edx)
L(aligned_16_66bytes):
	movdqa	%xmm0, -66(%edx)
L(aligned_16_50bytes):
	movdqa	%xmm0, -50(%edx)
L(aligned_16_34bytes):
	movdqa	%xmm0, -34(%edx)
L(aligned_16_18bytes):
	movdqa	%xmm0, -18(%edx)
L(aligned_16_2bytes):
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 3.  */
	ALIGN (4)
L(aligned_16_115bytes):
	movdqa	%xmm0, -115(%edx)
L(aligned_16_99bytes):
	movdqa	%xmm0, -99(%edx)
L(aligned_16_83bytes):
	movdqa	%xmm0, -83(%edx)
L(aligned_16_67bytes):
	movdqa	%xmm0, -67(%edx)
L(aligned_16_51bytes):
	movdqa	%xmm0, -51(%edx)
L(aligned_16_35bytes):
	movdqa	%xmm0, -35(%edx)
L(aligned_16_19bytes):
	movdqa	%xmm0, -19(%edx)
L(aligned_16_3bytes):
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 4.  */
	ALIGN (4)
L(aligned_16_116bytes):
	movdqa	%xmm0, -116(%edx)
L(aligned_16_100bytes):
	movdqa	%xmm0, -100(%edx)
L(aligned_16_84bytes):
	movdqa	%xmm0, -84(%edx)
L(aligned_16_68bytes):
	movdqa	%xmm0, -68(%edx)
L(aligned_16_52bytes):
	movdqa	%xmm0, -52(%edx)
L(aligned_16_36bytes):
	movdqa	%xmm0, -36(%edx)
L(aligned_16_20bytes):
	movdqa	%xmm0, -20(%edx)
L(aligned_16_4bytes):
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 5.  */
	ALIGN (4)
L(aligned_16_117bytes):
	movdqa	%xmm0, -117(%edx)
L(aligned_16_101bytes):
	movdqa	%xmm0, -101(%edx)
L(aligned_16_85bytes):
	movdqa	%xmm0, -85(%edx)
L(aligned_16_69bytes):
	movdqa	%xmm0, -69(%edx)
L(aligned_16_53bytes):
	movdqa	%xmm0, -53(%edx)
L(aligned_16_37bytes):
	movdqa	%xmm0, -37(%edx)
L(aligned_16_21bytes):
	movdqa	%xmm0, -21(%edx)
L(aligned_16_5bytes):
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 6.  */
	ALIGN (4)
L(aligned_16_118bytes):
	movdqa	%xmm0, -118(%edx)
L(aligned_16_102bytes):
	movdqa	%xmm0, -102(%edx)
L(aligned_16_86bytes):
	movdqa	%xmm0, -86(%edx)
L(aligned_16_70bytes):
	movdqa	%xmm0, -70(%edx)
L(aligned_16_54bytes):
	movdqa	%xmm0, -54(%edx)
L(aligned_16_38bytes):
	movdqa	%xmm0, -38(%edx)
L(aligned_16_22bytes):
	movdqa	%xmm0, -22(%edx)
L(aligned_16_6bytes):
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 7.  */
	ALIGN (4)
L(aligned_16_119bytes):
	movdqa	%xmm0, -119(%edx)
L(aligned_16_103bytes):
	movdqa	%xmm0, -103(%edx)
L(aligned_16_87bytes):
	movdqa	%xmm0, -87(%edx)
L(aligned_16_71bytes):
	movdqa	%xmm0, -71(%edx)
L(aligned_16_55bytes):
	movdqa	%xmm0, -55(%edx)
L(aligned_16_39bytes):
	movdqa	%xmm0, -39(%edx)
L(aligned_16_23bytes):
	movdqa	%xmm0, -23(%edx)
L(aligned_16_7bytes):
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 8.  */
	ALIGN (4)
L(aligned_16_120bytes):
	movdqa	%xmm0, -120(%edx)
L(aligned_16_104bytes):
	movdqa	%xmm0, -104(%edx)
L(aligned_16_88bytes):
	movdqa	%xmm0, -88(%edx)
L(aligned_16_72bytes):
	movdqa	%xmm0, -72(%edx)
L(aligned_16_56bytes):
	movdqa	%xmm0, -56(%edx)
L(aligned_16_40bytes):
	movdqa	%xmm0, -40(%edx)
L(aligned_16_24bytes):
	movdqa	%xmm0, -24(%edx)
L(aligned_16_8bytes):
	movq	%xmm0, -8(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 9.  */
	ALIGN (4)
L(aligned_16_121bytes):
	movdqa	%xmm0, -121(%edx)
L(aligned_16_105bytes):
	movdqa	%xmm0, -105(%edx)
L(aligned_16_89bytes):
	movdqa	%xmm0, -89(%edx)
L(aligned_16_73bytes):
	movdqa	%xmm0, -73(%edx)
L(aligned_16_57bytes):
	movdqa	%xmm0, -57(%edx)
L(aligned_16_41bytes):
	movdqa	%xmm0, -41(%edx)
L(aligned_16_25bytes):
	movdqa	%xmm0, -25(%edx)
L(aligned_16_9bytes):
	movq	%xmm0, -9(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 10.  */
	ALIGN (4)
L(aligned_16_122bytes):
	movdqa	%xmm0, -122(%edx)
L(aligned_16_106bytes):
	movdqa	%xmm0, -106(%edx)
L(aligned_16_90bytes):
	movdqa	%xmm0, -90(%edx)
L(aligned_16_74bytes):
	movdqa	%xmm0, -74(%edx)
L(aligned_16_58bytes):
	movdqa	%xmm0, -58(%edx)
L(aligned_16_42bytes):
	movdqa	%xmm0, -42(%edx)
L(aligned_16_26bytes):
	movdqa	%xmm0, -26(%edx)
L(aligned_16_10bytes):
	movq	%xmm0, -10(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 11.  */
	ALIGN (4)
L(aligned_16_123bytes):
	movdqa	%xmm0, -123(%edx)
L(aligned_16_107bytes):
	movdqa	%xmm0, -107(%edx)
L(aligned_16_91bytes):
	movdqa	%xmm0, -91(%edx)
L(aligned_16_75bytes):
	movdqa	%xmm0, -75(%edx)
L(aligned_16_59bytes):
	movdqa	%xmm0, -59(%edx)
L(aligned_16_43bytes):
	movdqa	%xmm0, -43(%edx)
L(aligned_16_27bytes):
	movdqa	%xmm0, -27(%edx)
L(aligned_16_11bytes):
	movq	%xmm0, -11(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 12.  */
	ALIGN (4)
L(aligned_16_124bytes):
	movdqa	%xmm0, -124(%edx)
L(aligned_16_108bytes):
	movdqa	%xmm0, -108(%edx)
L(aligned_16_92bytes):
	movdqa	%xmm0, -92(%edx)
L(aligned_16_76bytes):
	movdqa	%xmm0, -76(%edx)
L(aligned_16_60bytes):
	movdqa	%xmm0, -60(%edx)
L(aligned_16_44bytes):
	movdqa	%xmm0, -44(%edx)
L(aligned_16_28bytes):
	movdqa	%xmm0, -28(%edx)
L(aligned_16_12bytes):
	movq	%xmm0, -12(%edx)
	movl	%eax, -4(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 13.  */
	ALIGN (4)
L(aligned_16_125bytes):
	movdqa	%xmm0, -125(%edx)
L(aligned_16_109bytes):
	movdqa	%xmm0, -109(%edx)
L(aligned_16_93bytes):
	movdqa	%xmm0, -93(%edx)
L(aligned_16_77bytes):
	movdqa	%xmm0, -77(%edx)
L(aligned_16_61bytes):
	movdqa	%xmm0, -61(%edx)
L(aligned_16_45bytes):
	movdqa	%xmm0, -45(%edx)
L(aligned_16_29bytes):
	movdqa	%xmm0, -29(%edx)
L(aligned_16_13bytes):
	movq	%xmm0, -13(%edx)
	movl	%eax, -5(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 14.  */
	ALIGN (4)
L(aligned_16_126bytes):
	movdqa	%xmm0, -126(%edx)
L(aligned_16_110bytes):
	movdqa	%xmm0, -110(%edx)
L(aligned_16_94bytes):
	movdqa	%xmm0, -94(%edx)
L(aligned_16_78bytes):
	movdqa	%xmm0, -78(%edx)
L(aligned_16_62bytes):
	movdqa	%xmm0, -62(%edx)
L(aligned_16_46bytes):
	movdqa	%xmm0, -46(%edx)
L(aligned_16_30bytes):
	movdqa	%xmm0, -30(%edx)
L(aligned_16_14bytes):
	movq	%xmm0, -14(%edx)
	movl	%eax, -6(%edx)
	movw	%ax, -2(%edx)
	SETRTNVAL
	RETURN

	/* N mod 16 == 15.  This is the last return in the function, so it
	   uses RETURN_END.  */
	ALIGN (4)
L(aligned_16_127bytes):
	movdqa	%xmm0, -127(%edx)
L(aligned_16_111bytes):
	movdqa	%xmm0, -111(%edx)
L(aligned_16_95bytes):
	movdqa	%xmm0, -95(%edx)
L(aligned_16_79bytes):
	movdqa	%xmm0, -79(%edx)
L(aligned_16_63bytes):
	movdqa	%xmm0, -63(%edx)
L(aligned_16_47bytes):
	movdqa	%xmm0, -47(%edx)
L(aligned_16_31bytes):
	movdqa	%xmm0, -31(%edx)
L(aligned_16_15bytes):
	movq	%xmm0, -15(%edx)
	movl	%eax, -7(%edx)
	movw	%ax, -3(%edx)
	movb	%al, -1(%edx)
	SETRTNVAL
	RETURN_END

END (sse2_memset5_atom)