/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* L(label) turns a bare name into a ".L"-prefixed assembler symbol, the
   prefix used for every branch target inside this file.  */
#ifndef L
# define L(label)	.L##label
#endif

/* Thin wrappers around the assembler's CFI directives.  Each one is only
   defined here when the including file has not already provided it.  */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef cfi_remember_state
# define cfi_remember_state	.cfi_remember_state
#endif

#ifndef cfi_restore_state
# define cfi_restore_state	.cfi_restore_state
#endif

/* ENTRY(name) opens a global function symbol: marks it as a function,
   exports it, aligns it to 16 bytes (.p2align 4), emits the label and
   starts a CFI region.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

/* END(name) closes the CFI region opened by ENTRY and records the
   symbol size as the distance from the label to here.  */
#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* Name of the exported function; can be overridden before this point to
   build the same code under another symbol.  */
#ifndef MEMCMP
# define MEMCMP	memcmp
#endif

/* CFI bookkeeping for a 4-byte push / pop of REG.  */
#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

/* Push / pop REG together with the matching CFI annotation.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Stack offsets of the three arguments as seen at function entry
   (4(%esp), 8(%esp) and 12(%esp)).  */
#define PARMS	4
#define BLK1	PARMS
#define BLK2	BLK1 + 4
#define LEN	BLK2 + 4

/* Return from a path that runs with EBX pushed: restore EBX and return.
   The trailing CFI_PUSH re-establishes the "EBX is on the stack" unwind
   state for whatever code follows the ret in the file.  */
#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)


#if (defined SHARED || defined __PIC__)
/* Position-independent build: a table entry is the distance from the
   table base B to the target label I.  */
# define JMPTBL(I, B)	I - B

/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
   jump table with relative offsets.  INDEX is a register that contains
   the index into the jump table.  SCALE is the scale of INDEX.
   Clobbers EBX.  */

# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
/* We first load PC into EBX.  */ \
	call	__x86.get_pc_thunk.bx; \
/* Get the address of the jump table.  */ \
	addl	$(TABLE - .), %ebx; \
/* Get the entry and convert the relative offset to the \
   absolute address.  */ \
	addl	(%ebx,INDEX,SCALE), %ebx; \
/* EBX now holds the target address; the callers have already \
   adjusted EAX/EDX.  Go.  */ \
	jmp	*%ebx
#else
/* Non-PIC build: a table entry is simply the address of label I.  */
# define JMPTBL(I, B)	I

/* Branch to an entry in a jump table.  TABLE is a jump table holding
   absolute addresses.  INDEX is a register that contains the index into
   the jump table.  SCALE is the scale of INDEX.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
	jmp	*TABLE(,INDEX,SCALE)
#endif


/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/

/* MEMCMP (blk1, blk2, len) -- entry point and size dispatch.
   The three arguments are read from the stack.  Register roles for the
   rest of the function:
     EAX  = pointer into blk1
     EDX  = pointer into blk2
     ECX  = length in bytes
     XMM0 = all zeroes (second operand of every ptest below)
   EBX is used as scratch and is saved with PUSH before it is touched.  */
	.section .text.sse4.2,"ax",@progbits
ENTRY (MEMCMP)
	movl	BLK1(%esp), %eax
	movl	BLK2(%esp), %edx
	movl	LEN(%esp), %ecx

#ifdef USE_AS_WMEMCMP
/* LEN counts 4-byte elements here; convert it to a byte count.  */
	shl	$2, %ecx
	test	%ecx, %ecx
	jz	L(return0)
#else
/* Lengths 0 and 1 are handled separately, without saving EBX.  */
	cmp	$1, %ecx
	jbe	L(less1bytes)
#endif

	pxor	%xmm0, %xmm0
	cmp	$64, %ecx
	ja	L(64bytesormore)
	cmp	$8, %ecx

/* The jb below still tests the "cmp $8" above: pushl does not modify
   the flags.  The memcmp L(less8bytes) uses %bl as scratch and pops EBX
   on exit, so EBX is saved before branching; the wmemcmp L(less8bytes)
   never touches EBX, so there the branch comes first.  */
#ifndef USE_AS_WMEMCMP
	PUSH (%ebx)
	jb	L(less8bytes)
#else
	jb	L(less8bytes)
	PUSH (%ebx)
#endif

/* 8..64 bytes: point EAX/EDX one past the end of each block and jump to
   the handler for exactly ECX bytes; the handlers address the data with
   negative displacements.  */
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

#ifndef USE_AS_WMEMCMP
/* memcmp, 2 <= ECX <= 7, EBX already pushed.  EAX/EDX still point at
   the start of the blocks.  Compare one byte at a time through %bl;
   bytes 0 and 1 are always present, after that the length is checked
   before each further byte.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %bl
	cmpb	(%edx), %bl
	jne	L(nonzero)

	mov	1(%eax), %bl
	cmpb	1(%edx), %bl
	jne	L(nonzero)

	cmp	$2, %ecx
	jz	L(0bytes)

	mov	2(%eax), %bl
	cmpb	2(%edx), %bl
	jne	L(nonzero)

	cmp	$3, %ecx
	jz	L(0bytes)

	mov	3(%eax), %bl
	cmpb	3(%edx), %bl
	jne	L(nonzero)

	cmp	$4, %ecx
	jz	L(0bytes)

	mov	4(%eax), %bl
	cmpb	4(%edx), %bl
	jne	L(nonzero)

	cmp	$5, %ecx
	jz	L(0bytes)

	mov	5(%eax), %bl
	cmpb	5(%edx), %bl
	jne	L(nonzero)

	cmp	$6, %ecx
	jz	L(0bytes)

/* Length is 7: byte 6 is the last one.  Equal means the blocks match,
   otherwise fall through with the flags of this compare.  */
	mov	6(%eax), %bl
	cmpb	6(%edx), %bl
	je	L(0bytes)

/* A byte differed.  The flags still describe "blk1 byte - blk2 byte"
   (popl and mov leave them alone): return 1 if the blk1 byte is above
   (unsigned), -1 otherwise.  */
L(nonzero):
	POP (%ebx)
	mov	$1, %eax
	ja	L(above)
	neg	%eax
L(above):
	ret
/* Code following this ret runs with EBX pushed again.  */
	CFI_PUSH (%ebx)
#endif

/* Nothing left to compare and no difference found: restore EBX and
   return 0.  Also the jump-table target for a remaining length of 0.  */
	.p2align 4
L(0bytes):
	POP (%ebx)
	xor	%eax, %eax
	ret

#ifdef USE_AS_WMEMCMP

/* for wmemcmp, case N == 1 */

/* Reached with a byte count below 8, i.e. exactly one 4-byte element.
   EBX has not been pushed on this path, so the returns here are plain
   rets.  The single element is compared as a SIGNED 32-bit value.  */
	.p2align 4
L(less8bytes):
	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	je	L(return0)
	mov	$1, %eax		/* mov keeps the flags of the cmp.  */
	jg	L(find_diff_bigger)
	neg	%eax
	ret

/* blk1 element is greater: EAX already holds 1.  */
	.p2align 4
L(find_diff_bigger):
	ret

/* Equal (or zero length): return 0.  */
	.p2align 4
L(return0):
	xor	%eax, %eax
	ret
#endif

#ifndef USE_AS_WMEMCMP
/* memcmp with length 0 or 1; EBX has not been pushed.  The flags are
   still those of the "cmp $1, %ecx" at entry: below means length 0.
   For length 1 the result is the difference of the two bytes, each
   zero-extended.  */
	.p2align 4
L(less1bytes):
	jb	L(0bytesend)
	movzbl	(%eax), %eax
	movzbl	(%edx), %edx
	sub	%edx, %eax
	ret

/* Length 0: return 0.  */
	.p2align 4
L(0bytesend):
	xor	%eax, %eax
	ret
#endif
/* More than 64 bytes.  Saves EBX, then compares 64 bytes per iteration
   as four unaligned 16-byte chunks.
     ECX = 64 (stride)
     EBX = bytes remaining after the current 64-byte group
   Each chunk pair is XORed into XMM2 and tested with ptest against the
   all-zero XMM0: carry is set only when XMM2 is entirely zero, so jnc
   means the chunks differ.  */
	.p2align 4
L(64bytesormore):
	PUSH (%ebx)
	mov	%ecx, %ebx
	mov	$64, %ecx
	sub	$64, %ebx
L(64bytesormore_loop):
	movdqu	(%eax), %xmm1
	movdqu	(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_16diff)

	movdqu	16(%eax), %xmm1
	movdqu	16(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_32diff)

	movdqu	32(%eax), %xmm1
	movdqu	32(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_48diff)

	movdqu	48(%eax), %xmm1
	movdqu	48(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(find_64diff)
/* Group matched: advance and loop while at least 64 bytes remain
   (the subtraction does not borrow).  */
	add	%ecx, %eax
	add	%ecx, %edx
	sub	%ecx, %ebx
	jae	L(64bytesormore_loop)
/* EBX is now (remaining - 64), so ECX = 64 + EBX is the 0..63 bytes
   left.  Move both pointers to the end of the blocks and dispatch to
   the handler for that tail length.  */
	add	%ebx, %ecx
	add	%ecx, %edx
	add	%ecx, %eax
	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)

#ifdef USE_AS_WMEMCMP

/* Label needs only for table_64bytes filling */
/* It fills the wmemcmp table slots whose byte count is not a multiple
   of 4; the wmemcmp byte count is always an element count times 4.  */
L(unreal_case):
/* no code here */

#endif
/* Entered from the 64-byte loop with ECX = 64 when chunk 0, 1, 2 or 3
   of the current group differed.  The stacked subtractions leave ECX =
   16, 32, 48 or 64: the offset just past the differing chunk.  After
   the adds EAX/EDX point there, and execution falls through into
   L(16bytes), which scans the 16 bytes before the pointers.  */
	.p2align 4
L(find_16diff):
	sub	$16, %ecx
L(find_32diff):
	sub	$16, %ecx
L(find_48diff):
	sub	$16, %ecx
L(find_64diff):
	add	%ecx, %edx
	add	%ecx, %eax

#ifndef USE_AS_WMEMCMP
/* Tail handlers for 16, 12, 8 and 4 remaining bytes (each falls through
   to the next).  EAX/EDX point one past the end of the blocks.  Both
   dwords are loaded into registers (ECX from blk1, EBX from blk2)
   because L(find_diff) inspects them byte by byte.  */
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
/* Preload the "equal" result; mov does not disturb the flags that the
   jne below tests.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#else
/* wmemcmp variant: L(find_diff) only needs the flags of the signed
   element compare, so the blk2 element is compared straight from
   memory.  */
	.p2align 4
L(16bytes):
	mov	-16(%eax), %ecx
	cmp	-16(%edx), %ecx
	jne	L(find_diff)
L(12bytes):
	mov	-12(%eax), %ecx
	cmp	-12(%edx), %ecx
	jne	L(find_diff)
L(8bytes):
	mov	-8(%eax), %ecx
	cmp	-8(%edx), %ecx
	jne	L(find_diff)
L(4bytes):
	mov	-4(%eax), %ecx
	cmp	-4(%edx), %ecx
/* Preload the "equal" result; mov does not disturb the flags.  */
	mov	$0, %eax
	jne	L(find_diff)
	RETURN
#endif

#ifndef USE_AS_WMEMCMP
/* memcmp tail handlers for lengths congruent to 1, 2 and 3 modulo 16
   (plus the short 13/9/5, 14/10/6/2 and 15/11/7/3/1 entries).
   EAX/EDX point one past the end of the blocks; every label compares
   the data that starts its own number of bytes before the end and then
   falls through to the next, shorter, label.
   16-byte steps use pxor/ptest against the zero XMM0 (jnc = differ);
   EBX then carries the negative offset of the differing chunk to
   L(less16bytes).  4-byte steps load ECX (blk1) and EBX (blk2) for
   L(find_diff).  The final 1-3 bytes jump to L(end) with the flags of
   the deciding byte compare.  */
	.p2align 4
L(49bytes):
	movdqu	-49(%eax), %xmm1
	movdqu	-49(%edx), %xmm2
	mov	$-49, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(33bytes):
	movdqu	-33(%eax), %xmm1
	movdqu	-33(%edx), %xmm2
	mov	$-33, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(17bytes):
	mov	-17(%eax), %ecx
	mov	-17(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(13bytes):
	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(9bytes):
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(5bytes):
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
/* Last byte.  */
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(end)
	RETURN

	.p2align 4
L(50bytes):
	mov	$-50, %ebx
	movdqu	-50(%eax), %xmm1
	movdqu	-50(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(34bytes):
	mov	$-34, %ebx
	movdqu	-34(%eax), %xmm1
	movdqu	-34(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(18bytes):
	mov	-18(%eax), %ecx
	mov	-18(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(14bytes):
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(10bytes):
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(6bytes):
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
/* Last two bytes: lower address (low byte) first, then the next.  */
L(2bytes):
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(51bytes):
	mov	$-51, %ebx
	movdqu	-51(%eax), %xmm1
	movdqu	-51(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(35bytes):
	mov	$-35, %ebx
	movdqu	-35(%eax), %xmm1
	movdqu	-35(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(19bytes):
	movl	-19(%eax), %ecx
	movl	-19(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(15bytes):
	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(11bytes):
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
L(7bytes):
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
/* Last three bytes: the low byte of the word first; if those match the
   16-bit compare is decided by the second byte.  */
L(3bytes):
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
L(1bytes):
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
/* Tail handlers for 52, 36 and 20 remaining bytes (fall-through chain,
   shared by memcmp and wmemcmp).  EAX/EDX point one past the end.
   Each 16-byte chunk is tested with pxor/ptest against the zero XMM0;
   on a mismatch EBX holds the chunk's negative offset for
   L(less16bytes).  The last 4 bytes are compared as one dword.  */
	.p2align 4
L(52bytes):
	movdqu	-52(%eax), %xmm1
	movdqu	-52(%edx), %xmm2
	mov	$-52, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(36bytes):
	movdqu	-36(%eax), %xmm1
	movdqu	-36(%edx), %xmm2
	mov	$-36, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(20bytes):
	movdqu	-20(%eax), %xmm1
	movdqu	-20(%edx), %xmm2
	mov	$-20, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
/* memcmp tail handlers for 53/37/21, 54/38/22 and 55/39/23 remaining
   bytes.  EAX/EDX point one past the end; labels within a chain fall
   through.  16-byte chunks: pxor/ptest against the zero XMM0, with EBX =
   negative chunk offset for L(less16bytes).  Then one dword (ECX from
   blk1, EBX from blk2, resolved by L(find_diff)) and the final 1-3
   bytes, which jump to L(end) with the flags of the deciding compare.  */
	.p2align 4
L(53bytes):
	movdqu	-53(%eax), %xmm1
	movdqu	-53(%edx), %xmm2
	mov	$-53, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(37bytes):
	mov	$-37, %ebx
	movdqu	-37(%eax), %xmm1
	movdqu	-37(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(21bytes):
	mov	$-21, %ebx
	movdqu	-21(%eax), %xmm1
	movdqu	-21(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(54bytes):
	movdqu	-54(%eax), %xmm1
	movdqu	-54(%edx), %xmm2
	mov	$-54, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(38bytes):
	mov	$-38, %ebx
	movdqu	-38(%eax), %xmm1
	movdqu	-38(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(22bytes):
	mov	$-22, %ebx
	movdqu	-22(%eax), %xmm1
	movdqu	-22(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(55bytes):
	movdqu	-55(%eax), %xmm1
	movdqu	-55(%edx), %xmm2
	mov	$-55, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(39bytes):
	mov	$-39, %ebx
	movdqu	-39(%eax), %xmm1
	movdqu	-39(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(23bytes):
	mov	$-23, %ebx
	movdqu	-23(%eax), %xmm1
	movdqu	-23(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
/* Tail handlers for 56, 40 and 24 remaining bytes (fall-through chain,
   shared by memcmp and wmemcmp).  EAX/EDX point one past the end.
   16-byte chunks are tested with pxor/ptest against the zero XMM0, EBX
   carrying the negative chunk offset to L(less16bytes); the last 8
   bytes are compared as two dwords.  */
	.p2align 4
L(56bytes):
	movdqu	-56(%eax), %xmm1
	movdqu	-56(%edx), %xmm2
	mov	$-56, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(40bytes):
	mov	$-40, %ebx
	movdqu	-40(%eax), %xmm1
	movdqu	-40(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(24bytes):
	mov	$-24, %ebx
	movdqu	-24(%eax), %xmm1
	movdqu	-24(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
/* memcmp tail handlers for 57/41/25, 58/42/26 and 59/43/27 remaining
   bytes.  EAX/EDX point one past the end; labels within a chain fall
   through.  16-byte chunks: pxor/ptest against the zero XMM0, with EBX =
   negative chunk offset for L(less16bytes).  Then two dwords (ECX from
   blk1, EBX from blk2, resolved by L(find_diff)) and the final 1-3
   bytes, which jump to L(end) with the flags of the deciding compare.  */
	.p2align 4
L(57bytes):
	movdqu	-57(%eax), %xmm1
	movdqu	-57(%edx), %xmm2
	mov	$-57, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(41bytes):
	mov	$-41, %ebx
	movdqu	-41(%eax), %xmm1
	movdqu	-41(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(25bytes):
	mov	$-25, %ebx
	movdqu	-25(%eax), %xmm1
	movdqu	-25(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(58bytes):
	movdqu	-58(%eax), %xmm1
	movdqu	-58(%edx), %xmm2
	mov	$-58, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(42bytes):
	mov	$-42, %ebx
	movdqu	-42(%eax), %xmm1
	movdqu	-42(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(26bytes):
	mov	$-26, %ebx
	movdqu	-26(%eax), %xmm1
	movdqu	-26(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(59bytes):
	movdqu	-59(%eax), %xmm1
	movdqu	-59(%edx), %xmm2
	mov	$-59, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(43bytes):
	mov	$-43, %ebx
	movdqu	-43(%eax), %xmm1
	movdqu	-43(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(27bytes):
	mov	$-27, %ebx
	movdqu	-27(%eax), %xmm1
	movdqu	-27(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif
/* Tail handlers for 60, 44 and 28 remaining bytes (fall-through chain,
   shared by memcmp and wmemcmp).  EAX/EDX point one past the end.
   16-byte chunks are tested with pxor/ptest against the zero XMM0, EBX
   carrying the negative chunk offset to L(less16bytes); the last 12
   bytes are compared as three dwords.  */
	.p2align 4
L(60bytes):
	movdqu	-60(%eax), %xmm1
	movdqu	-60(%edx), %xmm2
	mov	$-60, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(44bytes):
	mov	$-44, %ebx
	movdqu	-44(%eax), %xmm1
	movdqu	-44(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(28bytes):
	mov	$-28, %ebx
	movdqu	-28(%eax), %xmm1
	movdqu	-28(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-12(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-12(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
/* memcmp tail handlers for 61/45/29, 62/46/30 and 63/47/31 remaining
   bytes.  EAX/EDX point one past the end; labels within a chain fall
   through.  16-byte chunks: pxor/ptest against the zero XMM0, with EBX =
   negative chunk offset for L(less16bytes).  Then three dwords (ECX
   from blk1, EBX from blk2, resolved by L(find_diff)) and the final 1-3
   bytes, which jump to L(end) with the flags of the deciding compare.  */
	.p2align 4
L(61bytes):
	movdqu	-61(%eax), %xmm1
	movdqu	-61(%edx), %xmm2
	mov	$-61, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(45bytes):
	mov	$-45, %ebx
	movdqu	-45(%eax), %xmm1
	movdqu	-45(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(29bytes):
	mov	$-29, %ebx
	movdqu	-29(%eax), %xmm1
	movdqu	-29(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-13(%eax), %ecx
	mov	-13(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-9(%eax), %ecx
	mov	-9(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	-5(%eax), %ecx
	mov	-5(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzbl	-1(%eax), %ecx
	cmp	-1(%edx), %cl
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(62bytes):
	movdqu	-62(%eax), %xmm1
	movdqu	-62(%edx), %xmm2
	mov	$-62, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(46bytes):
	mov	$-46, %ebx
	movdqu	-46(%eax), %xmm1
	movdqu	-46(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(30bytes):
	mov	$-30, %ebx
	movdqu	-30(%eax), %xmm1
	movdqu	-30(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
	mov	-14(%eax), %ecx
	mov	-14(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-10(%eax), %ecx
	mov	-10(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	mov	-6(%eax), %ecx
	mov	-6(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-2(%eax), %ecx
	movzwl	-2(%edx), %ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bh, %ch
	mov	$0, %eax
	jne	L(end)
	RETURN

	.p2align 4
L(63bytes):
	movdqu	-63(%eax), %xmm1
	movdqu	-63(%edx), %xmm2
	mov	$-63, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(47bytes):
	mov	$-47, %ebx
	movdqu	-47(%eax), %xmm1
	movdqu	-47(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(31bytes):
	mov	$-31, %ebx
	movdqu	-31(%eax), %xmm1
	movdqu	-31(%edx), %xmm2
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	movl	-15(%eax), %ecx
	movl	-15(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-11(%eax), %ecx
	movl	-11(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movl	-7(%eax), %ecx
	movl	-7(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)
	movzwl	-3(%eax), %ecx
	movzwl	-3(%edx), %ebx
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	movzbl	-1(%eax), %eax
	cmpb	-1(%edx), %al
	mov	$0, %eax
	jne	L(end)
	RETURN
#endif

/* Tail handlers for 64, 48 and 32 remaining bytes (fall-through chain,
   shared by memcmp and wmemcmp).  EAX/EDX point one past the end.
   16-byte chunks are tested with pxor/ptest against the zero XMM0, EBX
   carrying the negative chunk offset to L(less16bytes); the last 16
   bytes are compared as four dwords.  */
	.p2align 4
L(64bytes):
	movdqu	-64(%eax), %xmm1
	movdqu	-64(%edx), %xmm2
	mov	$-64, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(48bytes):
	movdqu	-48(%eax), %xmm1
	movdqu	-48(%edx), %xmm2
	mov	$-48, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)
L(32bytes):
	movdqu	-32(%eax), %xmm1
	movdqu	-32(%edx), %xmm2
	mov	$-32, %ebx
	pxor	%xmm1, %xmm2
	ptest	%xmm2, %xmm0
	jnc	L(less16bytes)

	mov	-16(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-16(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-16(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-12(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-12(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-12(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-8(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-8(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-8(%edx), %ecx
#endif
	jne	L(find_diff)

	mov	-4(%eax), %ecx
#ifndef USE_AS_WMEMCMP
	mov	-4(%edx), %ebx
	cmp	%ebx, %ecx
#else
	cmp	-4(%edx), %ecx
#endif
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN

#ifndef USE_AS_WMEMCMP
/* A 16-byte chunk differed.  EBX holds its (negative) offset from the
   end pointers; adding it makes EAX/EDX point at the start of that
   chunk.  The chunk is then walked one dword at a time, ECX from blk1
   and EBX from blk2, and the first unequal pair is handed to
   L(find_diff).  */
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	mov	(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	mov	4(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	mov	8(%edx), %ebx
	cmp	%ebx, %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	mov	12(%edx), %ebx
	cmp	%ebx, %ecx
	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN
#else
/* wmemcmp variant: same walk, but each blk2 element is compared
   straight from memory because L(find_diff) only needs the flags of the
   signed compare.  */
	.p2align 4
L(less16bytes):
	add	%ebx, %eax
	add	%ebx, %edx

	mov	(%eax), %ecx
	cmp	(%edx), %ecx
	jne	L(find_diff)

	mov	4(%eax), %ecx
	cmp	4(%edx), %ecx
	jne	L(find_diff)

	mov	8(%eax), %ecx
	cmp	8(%edx), %ecx
	jne	L(find_diff)

	mov	12(%eax), %ecx
	cmp	12(%edx), %ecx

	mov	$0, %eax		/* flags are preserved by mov */
	jne	L(find_diff)
	RETURN
#endif

/* Turn a detected mismatch into the return value.  Runs with EBX
   pushed; restores it before returning 1 or -1.  */
	.p2align 4
L(find_diff):
#ifndef USE_AS_WMEMCMP
/* memcmp: ECX = dword from blk1, EBX = dword from blk2, known to
   differ.  Locate the differing byte at the lowest address: compare the
   low byte, then the low word (decided by its second byte once the low
   bytes are equal), then repeat on the upper halves.  */
	cmpb	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
	jne	L(end)
	shr	$16,%ecx
	shr	$16,%ebx
	cmp	%bl, %cl
	jne	L(end)
	cmp	%bx, %cx
/* Flags hold "blk1 byte - blk2 byte" (popl and mov leave them alone):
   above (unsigned) gives 1, otherwise -1.  */
L(end):
	POP (%ebx)
	mov	$1, %eax
	ja	L(bigger)
	neg	%eax
L(bigger):
	ret
#else
/* wmemcmp: the flags of the caller's element compare are still valid;
   greater (signed) gives 1, otherwise -1.  */
	POP (%ebx)
	mov	$1, %eax
	jg	L(bigger)
	neg	%eax
	ret

	.p2align 4
L(bigger):
	ret
#endif
END (MEMCMP)

/* Jump table indexed by the number of bytes still to compare (0..64).
   Entry N is the handler L(Nbytes).  Entries are built with JMPTBL, so
   they are table-relative offsets in PIC builds and plain addresses
   otherwise.  In the wmemcmp build only multiples of 4 can occur; every
   other slot is filled with L(unreal_case).  */
	.section .rodata.sse4.2,"a",@progbits
	.p2align 2
	.type	L(table_64bytes), @object
#ifndef USE_AS_WMEMCMP
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(1bytes), L(table_64bytes))
	.int	JMPTBL (L(2bytes), L(table_64bytes))
	.int	JMPTBL (L(3bytes), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(5bytes), L(table_64bytes))
	.int	JMPTBL (L(6bytes), L(table_64bytes))
	.int	JMPTBL (L(7bytes), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(9bytes), L(table_64bytes))
	.int	JMPTBL (L(10bytes), L(table_64bytes))
	.int	JMPTBL (L(11bytes), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(13bytes), L(table_64bytes))
	.int	JMPTBL (L(14bytes), L(table_64bytes))
	.int	JMPTBL (L(15bytes), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(17bytes), L(table_64bytes))
	.int	JMPTBL (L(18bytes), L(table_64bytes))
	.int	JMPTBL (L(19bytes), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(21bytes), L(table_64bytes))
	.int	JMPTBL (L(22bytes), L(table_64bytes))
	.int	JMPTBL (L(23bytes), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(25bytes), L(table_64bytes))
	.int	JMPTBL (L(26bytes), L(table_64bytes))
	.int	JMPTBL (L(27bytes), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(29bytes), L(table_64bytes))
	.int	JMPTBL (L(30bytes), L(table_64bytes))
	.int	JMPTBL (L(31bytes), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(33bytes), L(table_64bytes))
	.int	JMPTBL (L(34bytes), L(table_64bytes))
	.int	JMPTBL (L(35bytes), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(37bytes), L(table_64bytes))
	.int	JMPTBL (L(38bytes), L(table_64bytes))
	.int	JMPTBL (L(39bytes), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(41bytes), L(table_64bytes))
	.int	JMPTBL (L(42bytes), L(table_64bytes))
	.int	JMPTBL (L(43bytes), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(45bytes), L(table_64bytes))
	.int	JMPTBL (L(46bytes), L(table_64bytes))
	.int	JMPTBL (L(47bytes), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(49bytes), L(table_64bytes))
	.int	JMPTBL (L(50bytes), L(table_64bytes))
	.int	JMPTBL (L(51bytes), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(53bytes), L(table_64bytes))
	.int	JMPTBL (L(54bytes), L(table_64bytes))
	.int	JMPTBL (L(55bytes), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(57bytes), L(table_64bytes))
	.int	JMPTBL (L(58bytes), L(table_64bytes))
	.int	JMPTBL (L(59bytes), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(61bytes), L(table_64bytes))
	.int	JMPTBL (L(62bytes), L(table_64bytes))
	.int	JMPTBL (L(63bytes), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
#else
L(table_64bytes):
	.int	JMPTBL (L(0bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(4bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(8bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(12bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(16bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(20bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(24bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(28bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(32bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(36bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(40bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(44bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(48bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(52bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(56bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(60bytes), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(unreal_case), L(table_64bytes))
	.int	JMPTBL (L(64bytes), L(table_64bytes))
#endif