/*
2Copyright (c) 2014, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
25ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28*/
29
/* L(label) turns a label name into an assembler-local symbol by pasting
   the ".L" prefix in front of it.  Like every macro in this group it is
   only defined when no earlier definition exists.  */
30#ifndef L
31# define L(label) .L##label
32#endif
33
/* Thin wrappers around the assembler's .cfi_* directives.  Each one expands
   to the directive of the same name and forwards its arguments.  */
34#ifndef cfi_startproc
35# define cfi_startproc .cfi_startproc
36#endif
37
38#ifndef cfi_endproc
39# define cfi_endproc .cfi_endproc
40#endif
41
42#ifndef cfi_rel_offset
43# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
44#endif
45
46#ifndef cfi_restore
47# define cfi_restore(reg) .cfi_restore reg
48#endif
49
50#ifndef cfi_adjust_cfa_offset
51# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
52#endif
53
54#ifndef cfi_remember_state
55# define cfi_remember_state .cfi_remember_state
56#endif
57
58#ifndef cfi_restore_state
59# define cfi_restore_state .cfi_restore_state
60#endif
61
/* ENTRY(name) opens a function: it marks NAME as a global symbol of type
   function, aligns it with .p2align 4 (a 16-byte boundary), emits the
   label itself and starts a CFI region.  */
62#ifndef ENTRY
63# define ENTRY(name) \
64 .type name, @function; \
65 .globl name; \
66 .p2align 4; \
67name: \
68 cfi_startproc
69#endif
70
/* END(name) closes the CFI region opened by ENTRY and sets the symbol size
   of NAME to the distance between its label and the current location.  */
71#ifndef END
72# define END(name) \
73 cfi_endproc; \
74 .size name, .-name
75#endif
76
/* Name of the symbol defined by this file.  It defaults to memcmp; a
   definition made before this point takes precedence.  */
77#ifndef MEMCMP
78# define MEMCMP memcmp
79#endif
80
/* Unwind bookkeeping for a 4-byte push of REG: the CFA offset grows by 4
   and REG is recorded as saved at offset 0 from the stack pointer.  */
81#define CFI_PUSH(REG) \
82 cfi_adjust_cfa_offset (4); \
83 cfi_rel_offset (REG, 0)
84
/* Inverse of CFI_PUSH: the CFA offset shrinks by 4 and REG is marked as
   holding its original value again.  */
85#define CFI_POP(REG) \
86 cfi_adjust_cfa_offset (-4); \
87 cfi_restore (REG)
88
/* pushl / popl paired with the matching unwind annotation.  */
89#define PUSH(REG) pushl REG; CFI_PUSH (REG)
90#define POP(REG) popl REG; CFI_POP (REG)
91
/* Offsets of the three arguments relative to %esp, as used by the loads at
   function entry (before anything has been pushed): BLK1 is 4, BLK2 is 8
   and LEN is 12.  The expansions are not parenthesised, so they are only
   meant to be used directly as a displacement, e.g. BLK2(%esp).
   NOTE(review): the skipped 4 bytes are presumably the return address.  */
92#define PARMS 4
93#define BLK1 PARMS
94#define BLK2 BLK1 + 4
95#define LEN BLK2 + 4
/* RETURN restores %ebx and returns.  The CFI_PUSH after the ret emits no
   instruction; it puts the unwind state back to "ebx is pushed" for the
   code that follows the ret in the file.  */
96#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
97
98
/* Two flavours of jump-table support.  When SHARED or __PIC__ is defined a
   table entry is the distance from the table start (B) to the target (I)
   and the absolute target is computed at run time.  Otherwise an entry is
   the target address itself.  */
99#if (defined SHARED || defined __PIC__)
100# define JMPTBL(I, B) I - B
101
102/* Load an entry in a jump table into EBX and branch to it. TABLE is a
 jump table with relative offsets. INDEX is a register containing the
 index into the jump table. SCALE is the scale of INDEX. */
105
106# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
107/* We first load PC into EBX. */ \
108 call __x86.get_pc_thunk.bx; \
109/* Get the address of the jump table. */ \
110 addl $(TABLE - .), %ebx; \
111/* Get the entry and convert the relative offset to the \
112 absolute address. */ \
113 addl (%ebx,INDEX,SCALE), %ebx; \
114/* EBX now holds the absolute address of the selected target. Go. */ \
115 jmp *%ebx
116#else
117# define JMPTBL(I, B) I
118
119/* Branch through an entry in a jump table. TABLE is a jump table
 holding absolute addresses. INDEX is a register containing the
 index into the jump table. SCALE is the scale of INDEX. */
122# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
123 jmp *TABLE(,INDEX,SCALE)
124#endif
125
126
127/* Warning!
128 wmemcmp has to use SIGNED comparison for elements.
129 memcmp has to use UNSIGNED comparison for elements.
130*/
131
/* Entry point and size dispatch.
   The three arguments are read from the stack into
     %eax = first block, %edx = second block, %ecx = length.
   With USE_AS_WMEMCMP the length is multiplied by 4, so from here on %ecx
   is always a byte count.  %xmm0 is cleared once and serves as the
   all-zero operand of every ptest in this file.  */
132 .section .text.sse4.2,"ax",@progbits
133ENTRY (MEMCMP)
134 movl BLK1(%esp), %eax
135 movl BLK2(%esp), %edx
136 movl LEN(%esp), %ecx
137
138#ifdef USE_AS_WMEMCMP
/* Scale the element count to bytes; nothing to compare when it is zero.  */
139 shl $2, %ecx
140 test %ecx, %ecx
141 jz L(return0)
142#else
/* Lengths 0 and 1 are handled at L(less1bytes), which reuses the flags
   produced by this cmp.  */
143 cmp $1, %ecx
144 jbe L(less1bytes)
145#endif
146
147 pxor %xmm0, %xmm0
/* More than 64 bytes: go to the 64-bytes-per-iteration loop.  */
148 cmp $64, %ecx
149 ja L(64bytesormore)
/* The flags of this cmp are consumed by the jb below; the pushl inside
   PUSH does not modify them.  */
150 cmp $8, %ecx
151
152#ifndef USE_AS_WMEMCMP
/* memcmp: L(less8bytes) uses %bl as scratch and pops %ebx on exit, so the
   push has to happen before the branch.  */
153 PUSH (%ebx)
154 jb L(less8bytes)
155#else
/* wmemcmp: L(less8bytes) neither uses nor pops %ebx, so the branch is taken
   before the push.  */
156 jb L(less8bytes)
157 PUSH (%ebx)
158#endif
159
/* 8 to 64 bytes: point both registers one past the end of their block and
   dispatch on the byte count.  The table targets address the data with
   negative displacements.  */
160 add %ecx, %edx
161 add %ecx, %eax
162 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
163
164#ifndef USE_AS_WMEMCMP
/* memcmp with a length of 2 to 7 (0, 1 and everything from 8 upwards were
   dispatched elsewhere).  %eax/%edx still point at the start of the blocks
   and %ebx has been pushed.  Bytes are compared one at a time through %bl;
   the first two are compared unconditionally, after that the length is
   checked before every further byte.  */
165 .p2align 4
166L(less8bytes):
167 mov (%eax), %bl
168 cmpb (%edx), %bl
169 jne L(nonzero)
170
171 mov 1(%eax), %bl
172 cmpb 1(%edx), %bl
173 jne L(nonzero)
174
175 cmp $2, %ecx
176 jz L(0bytes)
177
178 mov 2(%eax), %bl
179 cmpb 2(%edx), %bl
180 jne L(nonzero)
181
182 cmp $3, %ecx
183 jz L(0bytes)
184
185 mov 3(%eax), %bl
186 cmpb 3(%edx), %bl
187 jne L(nonzero)
188
189 cmp $4, %ecx
190 jz L(0bytes)
191
192 mov 4(%eax), %bl
193 cmpb 4(%edx), %bl
194 jne L(nonzero)
195
196 cmp $5, %ecx
197 jz L(0bytes)
198
199 mov 5(%eax), %bl
200 cmpb 5(%edx), %bl
201 jne L(nonzero)
202
203 cmp $6, %ecx
204 jz L(0bytes)
205
/* Seventh and last possible byte: equal means the blocks match, otherwise
   fall into L(nonzero).  */
206 mov 6(%eax), %bl
207 cmpb 6(%edx), %bl
208 je L(0bytes)
209
/* A byte differed.  The flags of the failing cmpb are still live because
   neither the pop nor the mov changes them.  "Above" means the byte of the
   first block is the larger one as an unsigned value: return 1, else -1.  */
210L(nonzero):
211 POP (%ebx)
212 mov $1, %eax
213 ja L(above)
214 neg %eax
215L(above):
216 ret
/* No instruction is emitted here: the unwind state is switched back to
   "ebx is pushed" for the code placed after this ret.  */
217 CFI_PUSH (%ebx)
218#endif
219
/* The blocks compared equal (this is also jump-table entry 0): restore
   %ebx and return 0.  */
220 .p2align 4
221L(0bytes):
222 POP (%ebx)
223 xor %eax, %eax
224 ret
225
226#ifdef USE_AS_WMEMCMP
227
228/* for wmemcmp, case N == 1 */
229
/* Compares the single 4-byte element as a signed value and returns 0, 1
   or -1.  %ebx was not pushed on this path, so there is nothing to pop.  */
230 .p2align 4
231L(less8bytes):
232 mov (%eax), %ecx
233 cmp (%edx), %ecx
234 je L(return0)
/* mov leaves the flags of the cmp intact for the jg.  */
235 mov $1, %eax
236 jg L(find_diff_bigger)
237 neg %eax
238 ret
239
/* Element of the first block is greater: %eax already holds 1.  */
240 .p2align 4
241L(find_diff_bigger):
242 ret
243
/* Equal element, or a zero element count at function entry.  */
244 .p2align 4
245L(return0):
246 xor %eax, %eax
247 ret
248#endif
249
250#ifndef USE_AS_WMEMCMP
/* memcmp with a length of 0 or 1.  Entered with the flags of the
   "cmp $1, %ecx" at function entry still valid: below means the length is
   0.  For a length of 1 the result is the difference of the two bytes,
   both zero-extended.  %ebx has not been pushed on this path.  */
251 .p2align 4
252L(less1bytes):
253 jb L(0bytesend)
254 movzbl (%eax), %eax
255 movzbl (%edx), %edx
256 sub %edx, %eax
257 ret
258
/* Zero length: return 0.  */
259 .p2align 4
260L(0bytesend):
261 xor %eax, %eax
262 ret
263#endif
/* More than 64 bytes.  Register roles inside the loop:
     %ecx = 64, the stride
     %ebx = bytes that remain after the 64 currently being compared
     %eax, %edx = start of the current 64-byte group in each block.
   Every 16-byte chunk is loaded unaligned from both blocks and xor-ed.
   %xmm0 is zero, so the ptest sets CF exactly when the xor result is all
   zero; jnc therefore means "this chunk differs".  */
264 .p2align 4
265L(64bytesormore):
266 PUSH (%ebx)
267 mov %ecx, %ebx
268 mov $64, %ecx
269 sub $64, %ebx
270L(64bytesormore_loop):
271 movdqu (%eax), %xmm1
272 movdqu (%edx), %xmm2
273 pxor %xmm1, %xmm2
274 ptest %xmm2, %xmm0
275 jnc L(find_16diff)
276
277 movdqu 16(%eax), %xmm1
278 movdqu 16(%edx), %xmm2
279 pxor %xmm1, %xmm2
280 ptest %xmm2, %xmm0
281 jnc L(find_32diff)
282
283 movdqu 32(%eax), %xmm1
284 movdqu 32(%edx), %xmm2
285 pxor %xmm1, %xmm2
286 ptest %xmm2, %xmm0
287 jnc L(find_48diff)
288
289 movdqu 48(%eax), %xmm1
290 movdqu 48(%edx), %xmm2
291 pxor %xmm1, %xmm2
292 ptest %xmm2, %xmm0
293 jnc L(find_64diff)
/* All 64 bytes matched: advance, and keep looping while at least 64 more
   bytes remain (the sub does not borrow).  */
294 add %ecx, %eax
295 add %ecx, %edx
296 sub %ecx, %ebx
297 jae L(64bytesormore_loop)
/* The sub borrowed, so %ebx is the leftover count minus 64.  Adding it to
   %ecx yields the leftover count (0 to 63).  Move both registers one past
   the end of the blocks and let the jump table finish the tail.  */
298 add %ebx, %ecx
299 add %ecx, %edx
300 add %ecx, %eax
301 BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
302
303#ifdef USE_AS_WMEMCMP
304
305/* Label is needed only to fill the unused table_64bytes slots */
306L(unreal_case):
307/* no code here */
308
309#endif
/* Entered from the 64-byte loop with %ecx equal to 64 and %eax/%edx at the
   start of the current 64-byte group.  The labels fall through into each
   other, so %ecx ends up as 16, 32, 48 or 64: the end offset of the
   16-byte chunk that differed.  Both registers are then moved to the end
   of that chunk and execution continues at L(16bytes), which follows this
   code, to locate the differing dword.  */
310 .p2align 4
311L(find_16diff):
312 sub $16, %ecx
313L(find_32diff):
314 sub $16, %ecx
315L(find_48diff):
316 sub $16, %ecx
317L(find_64diff):
318 add %ecx, %edx
319 add %ecx, %eax
320
/* Compare the last 16, 12, 8 or 4 bytes one dword at a time; %eax/%edx
   point one past the end of the data.  Each label falls through to the
   next shorter one.  The "mov $0, %eax" sits between the last cmp and its
   jne because mov does not touch the flags.  */
321#ifndef USE_AS_WMEMCMP
/* memcmp variant: both dwords are loaded into registers because
   L(find_diff) inspects %ecx and %ebx byte by byte.  */
322 .p2align 4
323L(16bytes):
324 mov -16(%eax), %ecx
325 mov -16(%edx), %ebx
326 cmp %ebx, %ecx
327 jne L(find_diff)
328L(12bytes):
329 mov -12(%eax), %ecx
330 mov -12(%edx), %ebx
331 cmp %ebx, %ecx
332 jne L(find_diff)
333L(8bytes):
334 mov -8(%eax), %ecx
335 mov -8(%edx), %ebx
336 cmp %ebx, %ecx
337 jne L(find_diff)
338L(4bytes):
339 mov -4(%eax), %ecx
340 mov -4(%edx), %ebx
341 cmp %ebx, %ecx
342 mov $0, %eax
343 jne L(find_diff)
344 RETURN
345#else
/* wmemcmp variant: the second operand is compared straight from memory;
   the wmemcmp L(find_diff) only needs the flags of the cmp.  */
346 .p2align 4
347L(16bytes):
348 mov -16(%eax), %ecx
349 cmp -16(%edx), %ecx
350 jne L(find_diff)
351L(12bytes):
352 mov -12(%eax), %ecx
353 cmp -12(%edx), %ecx
354 jne L(find_diff)
355L(8bytes):
356 mov -8(%eax), %ecx
357 cmp -8(%edx), %ecx
358 jne L(find_diff)
359L(4bytes):
360 mov -4(%eax), %ecx
361 cmp -4(%edx), %ecx
362 mov $0, %eax
363 jne L(find_diff)
364 RETURN
365#endif
366
367#ifndef USE_AS_WMEMCMP
/* memcmp-only table targets for 49, 33, 17, 13, 9 and 5 remaining bytes.
   %eax/%edx point one past the end of the blocks, so -N(%reg) is the
   first of the last N bytes.  Each label deals with its leading chunk and
   falls through to the next shorter label.  For the 16-byte SSE compares
   %ebx carries the chunk offset so that L(less16bytes) can rebase both
   pointers onto the chunk that differed.  */
368 .p2align 4
369L(49bytes):
370 movdqu -49(%eax), %xmm1
371 movdqu -49(%edx), %xmm2
372 mov $-49, %ebx
373 pxor %xmm1, %xmm2
374 ptest %xmm2, %xmm0
375 jnc L(less16bytes)
376L(33bytes):
377 movdqu -33(%eax), %xmm1
378 movdqu -33(%edx), %xmm2
379 mov $-33, %ebx
380 pxor %xmm1, %xmm2
381 ptest %xmm2, %xmm0
382 jnc L(less16bytes)
383L(17bytes):
384 mov -17(%eax), %ecx
385 mov -17(%edx), %ebx
386 cmp %ebx, %ecx
387 jne L(find_diff)
388L(13bytes):
389 mov -13(%eax), %ecx
390 mov -13(%edx), %ebx
391 cmp %ebx, %ecx
392 jne L(find_diff)
393L(9bytes):
394 mov -9(%eax), %ecx
395 mov -9(%edx), %ebx
396 cmp %ebx, %ecx
397 jne L(find_diff)
398L(5bytes):
399 mov -5(%eax), %ecx
400 mov -5(%edx), %ebx
401 cmp %ebx, %ecx
402 jne L(find_diff)
/* Final odd byte.  The mov zeroes the return value without disturbing the
   flags of the byte compare that the jne tests.  */
403 movzbl -1(%eax), %ecx
404 cmp -1(%edx), %cl
405 mov $0, %eax
406 jne L(end)
407 RETURN
408
/* memcmp-only table targets for 50, 34, 18, 14, 10, 6 and 2 remaining
   bytes, chained by fall-through.  %eax/%edx point one past the end of the
   blocks.  %ebx is preloaded with the chunk offset for L(less16bytes)
   ahead of each 16-byte SSE compare.  */
409 .p2align 4
410L(50bytes):
411 mov $-50, %ebx
412 movdqu -50(%eax), %xmm1
413 movdqu -50(%edx), %xmm2
414 pxor %xmm1, %xmm2
415 ptest %xmm2, %xmm0
416 jnc L(less16bytes)
417L(34bytes):
418 mov $-34, %ebx
419 movdqu -34(%eax), %xmm1
420 movdqu -34(%edx), %xmm2
421 pxor %xmm1, %xmm2
422 ptest %xmm2, %xmm0
423 jnc L(less16bytes)
424L(18bytes):
425 mov -18(%eax), %ecx
426 mov -18(%edx), %ebx
427 cmp %ebx, %ecx
428 jne L(find_diff)
429L(14bytes):
430 mov -14(%eax), %ecx
431 mov -14(%edx), %ebx
432 cmp %ebx, %ecx
433 jne L(find_diff)
434L(10bytes):
435 mov -10(%eax), %ecx
436 mov -10(%edx), %ebx
437 cmp %ebx, %ecx
438 jne L(find_diff)
439L(6bytes):
440 mov -6(%eax), %ecx
441 mov -6(%edx), %ebx
442 cmp %ebx, %ecx
443 jne L(find_diff)
/* Last two bytes: loaded as one 16-bit word each, then compared low byte
   (lower address) first and high byte second.  */
444L(2bytes):
445 movzwl -2(%eax), %ecx
446 movzwl -2(%edx), %ebx
447 cmp %bl, %cl
448 jne L(end)
449 cmp %bh, %ch
450 mov $0, %eax
451 jne L(end)
452 RETURN
453
/* memcmp-only table targets for 51, 35, 19, 15, 11, 7, 3 and 1 remaining
   bytes, chained by fall-through.  %eax/%edx point one past the end of the
   blocks.  %ebx is preloaded with the chunk offset for L(less16bytes)
   ahead of each 16-byte SSE compare.  */
454 .p2align 4
455L(51bytes):
456 mov $-51, %ebx
457 movdqu -51(%eax), %xmm1
458 movdqu -51(%edx), %xmm2
459 pxor %xmm1, %xmm2
460 ptest %xmm2, %xmm0
461 jnc L(less16bytes)
462L(35bytes):
463 mov $-35, %ebx
464 movdqu -35(%eax), %xmm1
465 movdqu -35(%edx), %xmm2
466 pxor %xmm1, %xmm2
467 ptest %xmm2, %xmm0
468 jnc L(less16bytes)
469L(19bytes):
470 movl -19(%eax), %ecx
471 movl -19(%edx), %ebx
472 cmp %ebx, %ecx
473 jne L(find_diff)
474L(15bytes):
475 movl -15(%eax), %ecx
476 movl -15(%edx), %ebx
477 cmp %ebx, %ecx
478 jne L(find_diff)
479L(11bytes):
480 movl -11(%eax), %ecx
481 movl -11(%edx), %ebx
482 cmp %ebx, %ecx
483 jne L(find_diff)
484L(7bytes):
485 movl -7(%eax), %ecx
486 movl -7(%edx), %ebx
487 cmp %ebx, %ecx
488 jne L(find_diff)
/* Last three bytes: the first two as a 16-bit word (low byte first, then
   the whole word, which at that point is decided by its high byte), then
   the final byte at L(1bytes).  */
489L(3bytes):
490 movzwl -3(%eax), %ecx
491 movzwl -3(%edx), %ebx
492 cmpb %bl, %cl
493 jne L(end)
494 cmp %bx, %cx
495 jne L(end)
/* The pointer in %eax is overwritten with the last byte of the first
   block; after the compare %eax is zeroed by a flag-preserving mov.  */
496L(1bytes):
497 movzbl -1(%eax), %eax
498 cmpb -1(%edx), %al
499 mov $0, %eax
500 jne L(end)
501 RETURN
502#endif
/* Table targets for 52, 36 and 20 remaining bytes, assembled for both
   memcmp and wmemcmp.  Three fall-through 16-byte SSE compares (with %ebx
   holding the chunk offset for L(less16bytes)) are followed by one dword
   compare of the last 4 bytes.  */
503 .p2align 4
504L(52bytes):
505 movdqu -52(%eax), %xmm1
506 movdqu -52(%edx), %xmm2
507 mov $-52, %ebx
508 pxor %xmm1, %xmm2
509 ptest %xmm2, %xmm0
510 jnc L(less16bytes)
511L(36bytes):
512 movdqu -36(%eax), %xmm1
513 movdqu -36(%edx), %xmm2
514 mov $-36, %ebx
515 pxor %xmm1, %xmm2
516 ptest %xmm2, %xmm0
517 jnc L(less16bytes)
518L(20bytes):
519 movdqu -20(%eax), %xmm1
520 movdqu -20(%edx), %xmm2
521 mov $-20, %ebx
522 pxor %xmm1, %xmm2
523 ptest %xmm2, %xmm0
524 jnc L(less16bytes)
525 mov -4(%eax), %ecx
/* memcmp needs both dwords in registers for L(find_diff); wmemcmp only
   needs the flags and compares against memory.  */
526#ifndef USE_AS_WMEMCMP
527 mov -4(%edx), %ebx
528 cmp %ebx, %ecx
529#else
530 cmp -4(%edx), %ecx
531#endif
532 mov $0, %eax
533 jne L(find_diff)
534 RETURN
535
536#ifndef USE_AS_WMEMCMP
/* memcmp-only table targets for 53, 37 and 21 remaining bytes: three
   fall-through 16-byte SSE compares, then one dword at -5 and the final
   byte at -1.  %ebx holds the chunk offset used by L(less16bytes).  */
537 .p2align 4
538L(53bytes):
539 movdqu -53(%eax), %xmm1
540 movdqu -53(%edx), %xmm2
541 mov $-53, %ebx
542 pxor %xmm1, %xmm2
543 ptest %xmm2, %xmm0
544 jnc L(less16bytes)
545L(37bytes):
546 mov $-37, %ebx
547 movdqu -37(%eax), %xmm1
548 movdqu -37(%edx), %xmm2
549 pxor %xmm1, %xmm2
550 ptest %xmm2, %xmm0
551 jnc L(less16bytes)
552L(21bytes):
553 mov $-21, %ebx
554 movdqu -21(%eax), %xmm1
555 movdqu -21(%edx), %xmm2
556 pxor %xmm1, %xmm2
557 ptest %xmm2, %xmm0
558 jnc L(less16bytes)
559 mov -5(%eax), %ecx
560 mov -5(%edx), %ebx
561 cmp %ebx, %ecx
562 jne L(find_diff)
/* Final byte; the mov keeps the flags of the cmp for the jne.  */
563 movzbl -1(%eax), %ecx
564 cmp -1(%edx), %cl
565 mov $0, %eax
566 jne L(end)
567 RETURN
568
/* memcmp-only table targets for 54, 38 and 22 remaining bytes: three
   fall-through 16-byte SSE compares, then one dword at -6 and the final
   two bytes.  %ebx holds the chunk offset used by L(less16bytes).  */
569 .p2align 4
570L(54bytes):
571 movdqu -54(%eax), %xmm1
572 movdqu -54(%edx), %xmm2
573 mov $-54, %ebx
574 pxor %xmm1, %xmm2
575 ptest %xmm2, %xmm0
576 jnc L(less16bytes)
577L(38bytes):
578 mov $-38, %ebx
579 movdqu -38(%eax), %xmm1
580 movdqu -38(%edx), %xmm2
581 pxor %xmm1, %xmm2
582 ptest %xmm2, %xmm0
583 jnc L(less16bytes)
584L(22bytes):
585 mov $-22, %ebx
586 movdqu -22(%eax), %xmm1
587 movdqu -22(%edx), %xmm2
588 pxor %xmm1, %xmm2
589 ptest %xmm2, %xmm0
590 jnc L(less16bytes)
591
592 mov -6(%eax), %ecx
593 mov -6(%edx), %ebx
594 cmp %ebx, %ecx
595 jne L(find_diff)
/* Final two bytes, lower address (low register byte) first.  */
596 movzwl -2(%eax), %ecx
597 movzwl -2(%edx), %ebx
598 cmp %bl, %cl
599 jne L(end)
600 cmp %bh, %ch
601 mov $0, %eax
602 jne L(end)
603 RETURN
604
/* memcmp-only table targets for 55, 39 and 23 remaining bytes: three
   fall-through 16-byte SSE compares, then one dword at -7, a 16-bit word
   at -3 and the final byte at -1.  %ebx holds the chunk offset used by
   L(less16bytes).  */
605 .p2align 4
606L(55bytes):
607 movdqu -55(%eax), %xmm1
608 movdqu -55(%edx), %xmm2
609 mov $-55, %ebx
610 pxor %xmm1, %xmm2
611 ptest %xmm2, %xmm0
612 jnc L(less16bytes)
613L(39bytes):
614 mov $-39, %ebx
615 movdqu -39(%eax), %xmm1
616 movdqu -39(%edx), %xmm2
617 pxor %xmm1, %xmm2
618 ptest %xmm2, %xmm0
619 jnc L(less16bytes)
620L(23bytes):
621 mov $-23, %ebx
622 movdqu -23(%eax), %xmm1
623 movdqu -23(%edx), %xmm2
624 pxor %xmm1, %xmm2
625 ptest %xmm2, %xmm0
626 jnc L(less16bytes)
627 movl -7(%eax), %ecx
628 movl -7(%edx), %ebx
629 cmp %ebx, %ecx
630 jne L(find_diff)
/* Two bytes as a word: low byte first, then the whole word, which is then
   decided by its high byte.  */
631 movzwl -3(%eax), %ecx
632 movzwl -3(%edx), %ebx
633 cmpb %bl, %cl
634 jne L(end)
635 cmp %bx, %cx
636 jne L(end)
/* Final byte; %eax stops being a pointer here.  */
637 movzbl -1(%eax), %eax
638 cmpb -1(%edx), %al
639 mov $0, %eax
640 jne L(end)
641 RETURN
642#endif
/* Table targets for 56, 40 and 24 remaining bytes, assembled for both
   memcmp and wmemcmp: three fall-through 16-byte SSE compares followed by
   dword compares of the last 8 bytes.  %ebx holds the chunk offset used by
   L(less16bytes).  In the dword compares memcmp loads both operands into
   registers for L(find_diff), while wmemcmp compares against memory.  */
643 .p2align 4
644L(56bytes):
645 movdqu -56(%eax), %xmm1
646 movdqu -56(%edx), %xmm2
647 mov $-56, %ebx
648 pxor %xmm1, %xmm2
649 ptest %xmm2, %xmm0
650 jnc L(less16bytes)
651L(40bytes):
652 mov $-40, %ebx
653 movdqu -40(%eax), %xmm1
654 movdqu -40(%edx), %xmm2
655 pxor %xmm1, %xmm2
656 ptest %xmm2, %xmm0
657 jnc L(less16bytes)
658L(24bytes):
659 mov $-24, %ebx
660 movdqu -24(%eax), %xmm1
661 movdqu -24(%edx), %xmm2
662 pxor %xmm1, %xmm2
663 ptest %xmm2, %xmm0
664 jnc L(less16bytes)
665
666 mov -8(%eax), %ecx
667#ifndef USE_AS_WMEMCMP
668 mov -8(%edx), %ebx
669 cmp %ebx, %ecx
670#else
671 cmp -8(%edx), %ecx
672#endif
673 jne L(find_diff)
674
675 mov -4(%eax), %ecx
676#ifndef USE_AS_WMEMCMP
677 mov -4(%edx), %ebx
678 cmp %ebx, %ecx
679#else
680 cmp -4(%edx), %ecx
681#endif
/* Zero the result without touching the flags tested by the jne.  */
682 mov $0, %eax
683 jne L(find_diff)
684 RETURN
685
686#ifndef USE_AS_WMEMCMP
/* memcmp-only table targets for 57, 41 and 25 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -9 and -5 and the
   final byte at -1.  %ebx holds the chunk offset used by
   L(less16bytes).  */
687 .p2align 4
688L(57bytes):
689 movdqu -57(%eax), %xmm1
690 movdqu -57(%edx), %xmm2
691 mov $-57, %ebx
692 pxor %xmm1, %xmm2
693 ptest %xmm2, %xmm0
694 jnc L(less16bytes)
695L(41bytes):
696 mov $-41, %ebx
697 movdqu -41(%eax), %xmm1
698 movdqu -41(%edx), %xmm2
699 pxor %xmm1, %xmm2
700 ptest %xmm2, %xmm0
701 jnc L(less16bytes)
702L(25bytes):
703 mov $-25, %ebx
704 movdqu -25(%eax), %xmm1
705 movdqu -25(%edx), %xmm2
706 pxor %xmm1, %xmm2
707 ptest %xmm2, %xmm0
708 jnc L(less16bytes)
709 mov -9(%eax), %ecx
710 mov -9(%edx), %ebx
711 cmp %ebx, %ecx
712 jne L(find_diff)
713 mov -5(%eax), %ecx
714 mov -5(%edx), %ebx
715 cmp %ebx, %ecx
716 jne L(find_diff)
/* Final byte; the mov keeps the flags of the cmp for the jne.  */
717 movzbl -1(%eax), %ecx
718 cmp -1(%edx), %cl
719 mov $0, %eax
720 jne L(end)
721 RETURN
722
/* memcmp-only table targets for 58, 42 and 26 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -10 and -6 and the
   final two bytes.  %ebx holds the chunk offset used by
   L(less16bytes).  */
723 .p2align 4
724L(58bytes):
725 movdqu -58(%eax), %xmm1
726 movdqu -58(%edx), %xmm2
727 mov $-58, %ebx
728 pxor %xmm1, %xmm2
729 ptest %xmm2, %xmm0
730 jnc L(less16bytes)
731L(42bytes):
732 mov $-42, %ebx
733 movdqu -42(%eax), %xmm1
734 movdqu -42(%edx), %xmm2
735 pxor %xmm1, %xmm2
736 ptest %xmm2, %xmm0
737 jnc L(less16bytes)
738L(26bytes):
739 mov $-26, %ebx
740 movdqu -26(%eax), %xmm1
741 movdqu -26(%edx), %xmm2
742 pxor %xmm1, %xmm2
743 ptest %xmm2, %xmm0
744 jnc L(less16bytes)
745
746 mov -10(%eax), %ecx
747 mov -10(%edx), %ebx
748 cmp %ebx, %ecx
749 jne L(find_diff)
750
751 mov -6(%eax), %ecx
752 mov -6(%edx), %ebx
753 cmp %ebx, %ecx
754 jne L(find_diff)
755
/* Final two bytes, lower address (low register byte) first.  */
756 movzwl -2(%eax), %ecx
757 movzwl -2(%edx), %ebx
758 cmp %bl, %cl
759 jne L(end)
760 cmp %bh, %ch
761 mov $0, %eax
762 jne L(end)
763 RETURN
764
/* memcmp-only table targets for 59, 43 and 27 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -11 and -7, a 16-bit
   word at -3 and the final byte at -1.  %ebx holds the chunk offset used
   by L(less16bytes).  */
765 .p2align 4
766L(59bytes):
767 movdqu -59(%eax), %xmm1
768 movdqu -59(%edx), %xmm2
769 mov $-59, %ebx
770 pxor %xmm1, %xmm2
771 ptest %xmm2, %xmm0
772 jnc L(less16bytes)
773L(43bytes):
774 mov $-43, %ebx
775 movdqu -43(%eax), %xmm1
776 movdqu -43(%edx), %xmm2
777 pxor %xmm1, %xmm2
778 ptest %xmm2, %xmm0
779 jnc L(less16bytes)
780L(27bytes):
781 mov $-27, %ebx
782 movdqu -27(%eax), %xmm1
783 movdqu -27(%edx), %xmm2
784 pxor %xmm1, %xmm2
785 ptest %xmm2, %xmm0
786 jnc L(less16bytes)
787 movl -11(%eax), %ecx
788 movl -11(%edx), %ebx
789 cmp %ebx, %ecx
790 jne L(find_diff)
791 movl -7(%eax), %ecx
792 movl -7(%edx), %ebx
793 cmp %ebx, %ecx
794 jne L(find_diff)
/* Two bytes as a word: low byte first, then the whole word, which is then
   decided by its high byte.  */
795 movzwl -3(%eax), %ecx
796 movzwl -3(%edx), %ebx
797 cmpb %bl, %cl
798 jne L(end)
799 cmp %bx, %cx
800 jne L(end)
/* Final byte; %eax stops being a pointer here.  */
801 movzbl -1(%eax), %eax
802 cmpb -1(%edx), %al
803 mov $0, %eax
804 jne L(end)
805 RETURN
806#endif
/* Table targets for 60, 44 and 28 remaining bytes, assembled for both
   memcmp and wmemcmp: three fall-through 16-byte SSE compares followed by
   dword compares of the last 12 bytes.  %ebx holds the chunk offset used
   by L(less16bytes).  In the dword compares memcmp loads both operands
   into registers for L(find_diff), while wmemcmp compares against
   memory.  */
807 .p2align 4
808L(60bytes):
809 movdqu -60(%eax), %xmm1
810 movdqu -60(%edx), %xmm2
811 mov $-60, %ebx
812 pxor %xmm1, %xmm2
813 ptest %xmm2, %xmm0
814 jnc L(less16bytes)
815L(44bytes):
816 mov $-44, %ebx
817 movdqu -44(%eax), %xmm1
818 movdqu -44(%edx), %xmm2
819 pxor %xmm1, %xmm2
820 ptest %xmm2, %xmm0
821 jnc L(less16bytes)
822L(28bytes):
823 mov $-28, %ebx
824 movdqu -28(%eax), %xmm1
825 movdqu -28(%edx), %xmm2
826 pxor %xmm1, %xmm2
827 ptest %xmm2, %xmm0
828 jnc L(less16bytes)
829
830 mov -12(%eax), %ecx
831#ifndef USE_AS_WMEMCMP
832 mov -12(%edx), %ebx
833 cmp %ebx, %ecx
834#else
835 cmp -12(%edx), %ecx
836#endif
837 jne L(find_diff)
838
839 mov -8(%eax), %ecx
840#ifndef USE_AS_WMEMCMP
841 mov -8(%edx), %ebx
842 cmp %ebx, %ecx
843#else
844 cmp -8(%edx), %ecx
845#endif
846 jne L(find_diff)
847
848 mov -4(%eax), %ecx
849#ifndef USE_AS_WMEMCMP
850 mov -4(%edx), %ebx
851 cmp %ebx, %ecx
852#else
853 cmp -4(%edx), %ecx
854#endif
/* Zero the result without touching the flags tested by the jne.  */
855 mov $0, %eax
856 jne L(find_diff)
857 RETURN
858
859#ifndef USE_AS_WMEMCMP
/* memcmp-only table targets for 61, 45 and 29 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -13, -9 and -5 and the
   final byte at -1.  %ebx holds the chunk offset used by
   L(less16bytes).  */
860 .p2align 4
861L(61bytes):
862 movdqu -61(%eax), %xmm1
863 movdqu -61(%edx), %xmm2
864 mov $-61, %ebx
865 pxor %xmm1, %xmm2
866 ptest %xmm2, %xmm0
867 jnc L(less16bytes)
868L(45bytes):
869 mov $-45, %ebx
870 movdqu -45(%eax), %xmm1
871 movdqu -45(%edx), %xmm2
872 pxor %xmm1, %xmm2
873 ptest %xmm2, %xmm0
874 jnc L(less16bytes)
875L(29bytes):
876 mov $-29, %ebx
877 movdqu -29(%eax), %xmm1
878 movdqu -29(%edx), %xmm2
879 pxor %xmm1, %xmm2
880 ptest %xmm2, %xmm0
881 jnc L(less16bytes)
882
883 mov -13(%eax), %ecx
884 mov -13(%edx), %ebx
885 cmp %ebx, %ecx
886 jne L(find_diff)
887
888 mov -9(%eax), %ecx
889 mov -9(%edx), %ebx
890 cmp %ebx, %ecx
891 jne L(find_diff)
892
893 mov -5(%eax), %ecx
894 mov -5(%edx), %ebx
895 cmp %ebx, %ecx
896 jne L(find_diff)
/* Final byte; the mov keeps the flags of the cmp for the jne.  */
897 movzbl -1(%eax), %ecx
898 cmp -1(%edx), %cl
899 mov $0, %eax
900 jne L(end)
901 RETURN
902
/* memcmp-only table targets for 62, 46 and 30 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -14, -10 and -6 and
   the final two bytes.  %ebx holds the chunk offset used by
   L(less16bytes).  */
903 .p2align 4
904L(62bytes):
905 movdqu -62(%eax), %xmm1
906 movdqu -62(%edx), %xmm2
907 mov $-62, %ebx
908 pxor %xmm1, %xmm2
909 ptest %xmm2, %xmm0
910 jnc L(less16bytes)
911L(46bytes):
912 mov $-46, %ebx
913 movdqu -46(%eax), %xmm1
914 movdqu -46(%edx), %xmm2
915 pxor %xmm1, %xmm2
916 ptest %xmm2, %xmm0
917 jnc L(less16bytes)
918L(30bytes):
919 mov $-30, %ebx
920 movdqu -30(%eax), %xmm1
921 movdqu -30(%edx), %xmm2
922 pxor %xmm1, %xmm2
923 ptest %xmm2, %xmm0
924 jnc L(less16bytes)
925 mov -14(%eax), %ecx
926 mov -14(%edx), %ebx
927 cmp %ebx, %ecx
928 jne L(find_diff)
929 mov -10(%eax), %ecx
930 mov -10(%edx), %ebx
931 cmp %ebx, %ecx
932 jne L(find_diff)
933 mov -6(%eax), %ecx
934 mov -6(%edx), %ebx
935 cmp %ebx, %ecx
936 jne L(find_diff)
/* Final two bytes, lower address (low register byte) first.  */
937 movzwl -2(%eax), %ecx
938 movzwl -2(%edx), %ebx
939 cmp %bl, %cl
940 jne L(end)
941 cmp %bh, %ch
942 mov $0, %eax
943 jne L(end)
944 RETURN
945
/* memcmp-only table targets for 63, 47 and 31 remaining bytes: three
   fall-through 16-byte SSE compares, then dwords at -15, -11 and -7, a
   16-bit word at -3 and the final byte at -1.  %ebx holds the chunk offset
   used by L(less16bytes).  */
946 .p2align 4
947L(63bytes):
948 movdqu -63(%eax), %xmm1
949 movdqu -63(%edx), %xmm2
950 mov $-63, %ebx
951 pxor %xmm1, %xmm2
952 ptest %xmm2, %xmm0
953 jnc L(less16bytes)
954L(47bytes):
955 mov $-47, %ebx
956 movdqu -47(%eax), %xmm1
957 movdqu -47(%edx), %xmm2
958 pxor %xmm1, %xmm2
959 ptest %xmm2, %xmm0
960 jnc L(less16bytes)
961L(31bytes):
962 mov $-31, %ebx
963 movdqu -31(%eax), %xmm1
964 movdqu -31(%edx), %xmm2
965 pxor %xmm1, %xmm2
966 ptest %xmm2, %xmm0
967 jnc L(less16bytes)
968
969 movl -15(%eax), %ecx
970 movl -15(%edx), %ebx
971 cmp %ebx, %ecx
972 jne L(find_diff)
973 movl -11(%eax), %ecx
974 movl -11(%edx), %ebx
975 cmp %ebx, %ecx
976 jne L(find_diff)
977 movl -7(%eax), %ecx
978 movl -7(%edx), %ebx
979 cmp %ebx, %ecx
980 jne L(find_diff)
/* Two bytes as a word: low byte first, then the whole word, which is then
   decided by its high byte.  */
981 movzwl -3(%eax), %ecx
982 movzwl -3(%edx), %ebx
983 cmpb %bl, %cl
984 jne L(end)
985 cmp %bx, %cx
986 jne L(end)
/* Final byte; %eax stops being a pointer here.  */
987 movzbl -1(%eax), %eax
988 cmpb -1(%edx), %al
989 mov $0, %eax
990 jne L(end)
991 RETURN
992#endif
993
/* Table targets for 64, 48 and 32 remaining bytes, assembled for both
   memcmp and wmemcmp: three fall-through 16-byte SSE compares followed by
   dword compares of the last 16 bytes.  %ebx holds the chunk offset used
   by L(less16bytes).  In the dword compares memcmp loads both operands
   into registers for L(find_diff), while wmemcmp compares against
   memory.  */
994 .p2align 4
995L(64bytes):
996 movdqu -64(%eax), %xmm1
997 movdqu -64(%edx), %xmm2
998 mov $-64, %ebx
999 pxor %xmm1, %xmm2
1000 ptest %xmm2, %xmm0
1001 jnc L(less16bytes)
1002L(48bytes):
1003 movdqu -48(%eax), %xmm1
1004 movdqu -48(%edx), %xmm2
1005 mov $-48, %ebx
1006 pxor %xmm1, %xmm2
1007 ptest %xmm2, %xmm0
1008 jnc L(less16bytes)
1009L(32bytes):
1010 movdqu -32(%eax), %xmm1
1011 movdqu -32(%edx), %xmm2
1012 mov $-32, %ebx
1013 pxor %xmm1, %xmm2
1014 ptest %xmm2, %xmm0
1015 jnc L(less16bytes)
1016
1017 mov -16(%eax), %ecx
1018#ifndef USE_AS_WMEMCMP
1019 mov -16(%edx), %ebx
1020 cmp %ebx, %ecx
1021#else
1022 cmp -16(%edx), %ecx
1023#endif
1024 jne L(find_diff)
1025
1026 mov -12(%eax), %ecx
1027#ifndef USE_AS_WMEMCMP
1028 mov -12(%edx), %ebx
1029 cmp %ebx, %ecx
1030#else
1031 cmp -12(%edx), %ecx
1032#endif
1033 jne L(find_diff)
1034
1035 mov -8(%eax), %ecx
1036#ifndef USE_AS_WMEMCMP
1037 mov -8(%edx), %ebx
1038 cmp %ebx, %ecx
1039#else
1040 cmp -8(%edx), %ecx
1041#endif
1042 jne L(find_diff)
1043
1044 mov -4(%eax), %ecx
1045#ifndef USE_AS_WMEMCMP
1046 mov -4(%edx), %ebx
1047 cmp %ebx, %ecx
1048#else
1049 cmp -4(%edx), %ecx
1050#endif
/* Zero the result without touching the flags tested by the jne.  */
1051 mov $0, %eax
1052 jne L(find_diff)
1053 RETURN
1054
/* Entered after a 16-byte SSE compare reported a difference, with %ebx
   holding the (negative) offset of that chunk relative to %eax/%edx.  Both
   pointers are rebased onto the chunk, which is then walked one dword at a
   time to find the first dword that differs.
   NOTE(review): every jump here follows a ptest that saw a difference, so
   one of the four compares is expected to fail; the trailing RETURN looks
   like a defensive fall-through.  */
1055#ifndef USE_AS_WMEMCMP
/* memcmp variant: %ebx is reused for the dword of the second block once
   the pointers have been rebased.  */
1056 .p2align 4
1057L(less16bytes):
1058 add %ebx, %eax
1059 add %ebx, %edx
1060
1061 mov (%eax), %ecx
1062 mov (%edx), %ebx
1063 cmp %ebx, %ecx
1064 jne L(find_diff)
1065
1066 mov 4(%eax), %ecx
1067 mov 4(%edx), %ebx
1068 cmp %ebx, %ecx
1069 jne L(find_diff)
1070
1071 mov 8(%eax), %ecx
1072 mov 8(%edx), %ebx
1073 cmp %ebx, %ecx
1074 jne L(find_diff)
1075
1076 mov 12(%eax), %ecx
1077 mov 12(%edx), %ebx
1078 cmp %ebx, %ecx
1079 mov $0, %eax
1080 jne L(find_diff)
1081 RETURN
1082#else
/* wmemcmp variant: compares against memory; only the flags are passed on
   to L(find_diff).  */
1083 .p2align 4
1084L(less16bytes):
1085 add %ebx, %eax
1086 add %ebx, %edx
1087
1088 mov (%eax), %ecx
1089 cmp (%edx), %ecx
1090 jne L(find_diff)
1091
1092 mov 4(%eax), %ecx
1093 cmp 4(%edx), %ecx
1094 jne L(find_diff)
1095
1096 mov 8(%eax), %ecx
1097 cmp 8(%edx), %ecx
1098 jne L(find_diff)
1099
1100 mov 12(%eax), %ecx
1101 cmp 12(%edx), %ecx
1102
1103 mov $0, %eax
1104 jne L(find_diff)
1105 RETURN
1106#endif
1107
/* Turn a detected difference into the return value (1 or -1), restore
   %ebx and return.  */
1108 .p2align 4
1109L(find_diff):
1110#ifndef USE_AS_WMEMCMP
/* memcmp: %ecx holds a dword of the first block and %ebx the matching
   dword of the second block.  The bytes are examined from the lowest
   register byte upwards, i.e. in memory order on little-endian x86:
   byte 0, then the low word (decided by byte 1 once byte 0 is equal),
   then the same two steps on the upper half after shifting it down.  */
1111 cmpb %bl, %cl
1112 jne L(end)
1113 cmp %bx, %cx
1114 jne L(end)
1115 shr $16,%ecx
1116 shr $16,%ebx
1117 cmp %bl, %cl
1118 jne L(end)
1119 cmp %bx, %cx
/* L(end) is also jumped to directly by the byte and word tail compares.
   The flags of the deciding cmp survive the pop and the mov; "above" is
   the unsigned test, so the first block being larger yields 1 and
   anything else yields -1.  */
1120L(end):
1121 POP (%ebx)
1122 mov $1, %eax
1123 ja L(bigger)
1124 neg %eax
1125L(bigger):
1126 ret
1127#else
/* wmemcmp: the flags come from the cmp executed just before the jump
   here; jg is the signed test, so a greater element in the first block
   yields 1 and anything else yields -1.  */
1128 POP (%ebx)
1129 mov $1, %eax
1130 jg L(bigger)
1131 neg %eax
1132 ret
1133
1134 .p2align 4
1135L(bigger):
1136 ret
1137#endif
1138END (MEMCMP)
1139
/* Jump table used by BRANCH_TO_JMPTBL_ENTRY.  It holds one .int entry per
   remaining byte count from 0 to 64 and is indexed by %ecx with a scale
   of 4.  JMPTBL decides whether an entry is an absolute address or an
   offset from the start of the table.  */
1140 .section .rodata.sse4.2,"a",@progbits
1141 .p2align 2
1142 .type L(table_64bytes), @object
1143#ifndef USE_AS_WMEMCMP
/* memcmp: every byte count has its own target.  */
1144L(table_64bytes):
1145 .int JMPTBL (L(0bytes), L(table_64bytes))
1146 .int JMPTBL (L(1bytes), L(table_64bytes))
1147 .int JMPTBL (L(2bytes), L(table_64bytes))
1148 .int JMPTBL (L(3bytes), L(table_64bytes))
1149 .int JMPTBL (L(4bytes), L(table_64bytes))
1150 .int JMPTBL (L(5bytes), L(table_64bytes))
1151 .int JMPTBL (L(6bytes), L(table_64bytes))
1152 .int JMPTBL (L(7bytes), L(table_64bytes))
1153 .int JMPTBL (L(8bytes), L(table_64bytes))
1154 .int JMPTBL (L(9bytes), L(table_64bytes))
1155 .int JMPTBL (L(10bytes), L(table_64bytes))
1156 .int JMPTBL (L(11bytes), L(table_64bytes))
1157 .int JMPTBL (L(12bytes), L(table_64bytes))
1158 .int JMPTBL (L(13bytes), L(table_64bytes))
1159 .int JMPTBL (L(14bytes), L(table_64bytes))
1160 .int JMPTBL (L(15bytes), L(table_64bytes))
1161 .int JMPTBL (L(16bytes), L(table_64bytes))
1162 .int JMPTBL (L(17bytes), L(table_64bytes))
1163 .int JMPTBL (L(18bytes), L(table_64bytes))
1164 .int JMPTBL (L(19bytes), L(table_64bytes))
1165 .int JMPTBL (L(20bytes), L(table_64bytes))
1166 .int JMPTBL (L(21bytes), L(table_64bytes))
1167 .int JMPTBL (L(22bytes), L(table_64bytes))
1168 .int JMPTBL (L(23bytes), L(table_64bytes))
1169 .int JMPTBL (L(24bytes), L(table_64bytes))
1170 .int JMPTBL (L(25bytes), L(table_64bytes))
1171 .int JMPTBL (L(26bytes), L(table_64bytes))
1172 .int JMPTBL (L(27bytes), L(table_64bytes))
1173 .int JMPTBL (L(28bytes), L(table_64bytes))
1174 .int JMPTBL (L(29bytes), L(table_64bytes))
1175 .int JMPTBL (L(30bytes), L(table_64bytes))
1176 .int JMPTBL (L(31bytes), L(table_64bytes))
1177 .int JMPTBL (L(32bytes), L(table_64bytes))
1178 .int JMPTBL (L(33bytes), L(table_64bytes))
1179 .int JMPTBL (L(34bytes), L(table_64bytes))
1180 .int JMPTBL (L(35bytes), L(table_64bytes))
1181 .int JMPTBL (L(36bytes), L(table_64bytes))
1182 .int JMPTBL (L(37bytes), L(table_64bytes))
1183 .int JMPTBL (L(38bytes), L(table_64bytes))
1184 .int JMPTBL (L(39bytes), L(table_64bytes))
1185 .int JMPTBL (L(40bytes), L(table_64bytes))
1186 .int JMPTBL (L(41bytes), L(table_64bytes))
1187 .int JMPTBL (L(42bytes), L(table_64bytes))
1188 .int JMPTBL (L(43bytes), L(table_64bytes))
1189 .int JMPTBL (L(44bytes), L(table_64bytes))
1190 .int JMPTBL (L(45bytes), L(table_64bytes))
1191 .int JMPTBL (L(46bytes), L(table_64bytes))
1192 .int JMPTBL (L(47bytes), L(table_64bytes))
1193 .int JMPTBL (L(48bytes), L(table_64bytes))
1194 .int JMPTBL (L(49bytes), L(table_64bytes))
1195 .int JMPTBL (L(50bytes), L(table_64bytes))
1196 .int JMPTBL (L(51bytes), L(table_64bytes))
1197 .int JMPTBL (L(52bytes), L(table_64bytes))
1198 .int JMPTBL (L(53bytes), L(table_64bytes))
1199 .int JMPTBL (L(54bytes), L(table_64bytes))
1200 .int JMPTBL (L(55bytes), L(table_64bytes))
1201 .int JMPTBL (L(56bytes), L(table_64bytes))
1202 .int JMPTBL (L(57bytes), L(table_64bytes))
1203 .int JMPTBL (L(58bytes), L(table_64bytes))
1204 .int JMPTBL (L(59bytes), L(table_64bytes))
1205 .int JMPTBL (L(60bytes), L(table_64bytes))
1206 .int JMPTBL (L(61bytes), L(table_64bytes))
1207 .int JMPTBL (L(62bytes), L(table_64bytes))
1208 .int JMPTBL (L(63bytes), L(table_64bytes))
1209 .int JMPTBL (L(64bytes), L(table_64bytes))
1210#else
/* wmemcmp: the byte count is always a multiple of 4, so only those slots
   have real targets.  All other slots are filled with L(unreal_case),
   which has no code of its own, to keep the table indexable by byte
   count.  */
1211L(table_64bytes):
1212 .int JMPTBL (L(0bytes), L(table_64bytes))
1213 .int JMPTBL (L(unreal_case), L(table_64bytes))
1214 .int JMPTBL (L(unreal_case), L(table_64bytes))
1215 .int JMPTBL (L(unreal_case), L(table_64bytes))
1216 .int JMPTBL (L(4bytes), L(table_64bytes))
1217 .int JMPTBL (L(unreal_case), L(table_64bytes))
1218 .int JMPTBL (L(unreal_case), L(table_64bytes))
1219 .int JMPTBL (L(unreal_case), L(table_64bytes))
1220 .int JMPTBL (L(8bytes), L(table_64bytes))
1221 .int JMPTBL (L(unreal_case), L(table_64bytes))
1222 .int JMPTBL (L(unreal_case), L(table_64bytes))
1223 .int JMPTBL (L(unreal_case), L(table_64bytes))
1224 .int JMPTBL (L(12bytes), L(table_64bytes))
1225 .int JMPTBL (L(unreal_case), L(table_64bytes))
1226 .int JMPTBL (L(unreal_case), L(table_64bytes))
1227 .int JMPTBL (L(unreal_case), L(table_64bytes))
1228 .int JMPTBL (L(16bytes), L(table_64bytes))
1229 .int JMPTBL (L(unreal_case), L(table_64bytes))
1230 .int JMPTBL (L(unreal_case), L(table_64bytes))
1231 .int JMPTBL (L(unreal_case), L(table_64bytes))
1232 .int JMPTBL (L(20bytes), L(table_64bytes))
1233 .int JMPTBL (L(unreal_case), L(table_64bytes))
1234 .int JMPTBL (L(unreal_case), L(table_64bytes))
1235 .int JMPTBL (L(unreal_case), L(table_64bytes))
1236 .int JMPTBL (L(24bytes), L(table_64bytes))
1237 .int JMPTBL (L(unreal_case), L(table_64bytes))
1238 .int JMPTBL (L(unreal_case), L(table_64bytes))
1239 .int JMPTBL (L(unreal_case), L(table_64bytes))
1240 .int JMPTBL (L(28bytes), L(table_64bytes))
1241 .int JMPTBL (L(unreal_case), L(table_64bytes))
1242 .int JMPTBL (L(unreal_case), L(table_64bytes))
1243 .int JMPTBL (L(unreal_case), L(table_64bytes))
1244 .int JMPTBL (L(32bytes), L(table_64bytes))
1245 .int JMPTBL (L(unreal_case), L(table_64bytes))
1246 .int JMPTBL (L(unreal_case), L(table_64bytes))
1247 .int JMPTBL (L(unreal_case), L(table_64bytes))
1248 .int JMPTBL (L(36bytes), L(table_64bytes))
1249 .int JMPTBL (L(unreal_case), L(table_64bytes))
1250 .int JMPTBL (L(unreal_case), L(table_64bytes))
1251 .int JMPTBL (L(unreal_case), L(table_64bytes))
1252 .int JMPTBL (L(40bytes), L(table_64bytes))
1253 .int JMPTBL (L(unreal_case), L(table_64bytes))
1254 .int JMPTBL (L(unreal_case), L(table_64bytes))
1255 .int JMPTBL (L(unreal_case), L(table_64bytes))
1256 .int JMPTBL (L(44bytes), L(table_64bytes))
1257 .int JMPTBL (L(unreal_case), L(table_64bytes))
1258 .int JMPTBL (L(unreal_case), L(table_64bytes))
1259 .int JMPTBL (L(unreal_case), L(table_64bytes))
1260 .int JMPTBL (L(48bytes), L(table_64bytes))
1261 .int JMPTBL (L(unreal_case), L(table_64bytes))
1262 .int JMPTBL (L(unreal_case), L(table_64bytes))
1263 .int JMPTBL (L(unreal_case), L(table_64bytes))
1264 .int JMPTBL (L(52bytes), L(table_64bytes))
1265 .int JMPTBL (L(unreal_case), L(table_64bytes))
1266 .int JMPTBL (L(unreal_case), L(table_64bytes))
1267 .int JMPTBL (L(unreal_case), L(table_64bytes))
1268 .int JMPTBL (L(56bytes), L(table_64bytes))
1269 .int JMPTBL (L(unreal_case), L(table_64bytes))
1270 .int JMPTBL (L(unreal_case), L(table_64bytes))
1271 .int JMPTBL (L(unreal_case), L(table_64bytes))
1272 .int JMPTBL (L(60bytes), L(table_64bytes))
1273 .int JMPTBL (L(unreal_case), L(table_64bytes))
1274 .int JMPTBL (L(unreal_case), L(table_64bytes))
1275 .int JMPTBL (L(unreal_case), L(table_64bytes))
1276 .int JMPTBL (L(64bytes), L(table_64bytes))
1277#endif