blob: 1aa1a1a40c9e38b91272df2231bb8b7c9a698153 [file] [log] [blame]
Liubov Dmitrieva 0a49066 2012-01-17 12:55:46 +0400
1/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* L(label): spell a label with the ".L" prefix (assembler-local name). */
31#ifndef L
32# define L(label) .L##label
33#endif
34
/* Thin wrappers around the assembler's .cfi_* unwind directives.  Each one
   is only defined here if the including environment has not already
   provided it. */
35#ifndef cfi_startproc
36# define cfi_startproc .cfi_startproc
37#endif
38
39#ifndef cfi_endproc
40# define cfi_endproc .cfi_endproc
41#endif
42
43#ifndef cfi_rel_offset
44# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
45#endif
46
47#ifndef cfi_restore
48# define cfi_restore(reg) .cfi_restore reg
49#endif
50
51#ifndef cfi_adjust_cfa_offset
52# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
53#endif
54
/* ENTRY(name): mark name as a global function symbol, align it with
   .p2align 4, emit the label and open its CFI region. */
55#ifndef ENTRY
56# define ENTRY(name) \
57 .type name, @function; \
58 .globl name; \
59 .p2align 4; \
60name: \
61 cfi_startproc
62#endif
63
/* END(name): close the CFI region and record the symbol size as the
   distance from name to the current location. */
64#ifndef END
65# define END(name) \
66 cfi_endproc; \
67 .size name, .-name
68#endif
69
/* CFI_PUSH(REG): unwind annotations matching a 4-byte push of REG (CFA
   offset grows by 4, REG recorded at relative offset 0).  Emits no
   instruction. */
70#define CFI_PUSH(REG) \
71 cfi_adjust_cfa_offset (4); \
72 cfi_rel_offset (REG, 0)
73
/* CFI_POP(REG): unwind annotations matching a 4-byte pop of REG (CFA offset
   shrinks by 4, REG marked as restored).  Emits no instruction. */
74#define CFI_POP(REG) \
75 cfi_adjust_cfa_offset (-4); \
76 cfi_restore (REG)
77
/* PUSH/POP: the actual pushl/popl followed by the matching annotations. */
78#define PUSH(REG) pushl REG; CFI_PUSH (REG)
79#define POP(REG) popl REG; CFI_POP (REG)
80
/* Offsets of the three arguments relative to %esp at function entry.
   PARMS is 4; presumably this skips the return address at 0(%esp) --
   confirm against the calling convention in use.  Note that STR2 and LEN
   expand to unparenthesised sums, so they are only safe where a bare
   "a+b" displacement is, as in STR2(%esp). */
81#define PARMS 4
82#define STR1 PARMS
83#define STR2 STR1+4
84#define LEN STR2+4
85
86 .text
/*
 * memrchr -- find the last occurrence of a byte in a buffer (x86-32, SSE2).
 *
 * In (read from the stack at entry):
 *   STR1(%esp)  start address of the buffer
 *   STR2(%esp)  byte to look for; only its low byte takes part in the
 *               comparison (it is broadcast to all 16 lanes of %xmm1)
 *   LEN(%esp)   number of bytes in the buffer
 * Out:
 *   %eax        address of the highest-addressed matching byte, or 0 when
 *               there is no match or the length is 0
 * Writes:       %eax, %ecx, %edx, %xmm0-%xmm4 and the flags.  %edi is used
 *               only on the short-buffer path and is pushed/popped there.
 *
 * Outline:
 *   - length <= 16 is handled separately at L(length_less16).
 *   - Otherwise the last 16 bytes are tested with one unaligned load, then
 *     the scan moves downwards over 16-byte-aligned chunks: two unrolled
 *     64-byte steps, followed by a loop over 64-byte-aligned blocks.
 *   - L(exit_loop) handles the final 64-byte block, which may start below
 *     the beginning of the buffer; matches found there are bounds-checked.
 *
 * Throughout, a pcmpeqb/pmovmskb pair yields a 16-bit mask in which bit i
 * is set when byte i of the chunk matches, so the highest set bit marks the
 * last match inside that chunk.
 */
87ENTRY (memrchr)
88 mov STR1(%esp), %ecx
89 movd STR2(%esp), %xmm1
90 mov LEN(%esp), %edx
91
/* Zero length: nothing to search. */
92 test %edx, %edx
93 jz L(return_null)
/* %edx = length - 16; lengths of 16 or less take the short path. */
94 sub $16, %edx
95 jbe L(length_less16)
96
/* Broadcast the search byte to all 16 bytes of %xmm1 (two punpcklbw plus
   pshufd $0), interleaved with pointing %ecx at the last 16 bytes. */
97 punpcklbw %xmm1, %xmm1
98 add %edx, %ecx
99 punpcklbw %xmm1, %xmm1
100
/* Test the final 16 bytes with an unaligned load. */
101 movdqu (%ecx), %xmm0
102 pshufd $0, %xmm1, %xmm1
103 pcmpeqb %xmm1, %xmm0
104
105 pmovmskb %xmm0, %eax
106 test %eax, %eax
107 jnz L(exit_dispatch)
108
/* Step down to the next 64-byte block.  If its address is not 16-byte
   aligned, round the block start to a 16-byte boundary (the block then
   overlaps part of the 16 bytes already tested) and adjust %edx to match.
   On arrival at L(loop_prolog), %edx is the distance from the buffer start
   to %ecx + 64. */
109 sub $64, %ecx
110 mov %ecx, %eax
111 and $15, %eax
112 jz L(loop_prolog)
113
114 add $16, %ecx
115 add $16, %edx
116 and $-16, %ecx
117 sub %eax, %edx
118
119 .p2align 4
120/* Loop start on aligned string. */
121L(loop_prolog):
/* After the subtraction %edx is the distance from the buffer start to
   %ecx.  If the block at %ecx does not lie strictly above the buffer start,
   finish in L(exit_loop). */
122 sub $64, %edx
123 jbe L(exit_loop)
124
/* First unrolled step: test the four aligned chunks from highest to lowest
   address so the first hit is the last match. */
125 movdqa 48(%ecx), %xmm0
126 pcmpeqb %xmm1, %xmm0
127 pmovmskb %xmm0, %eax
128 test %eax, %eax
129 jnz L(matches48)
130
131 movdqa 32(%ecx), %xmm2
132 pcmpeqb %xmm1, %xmm2
133 pmovmskb %xmm2, %eax
134 test %eax, %eax
135 jnz L(matches32)
136
137 movdqa 16(%ecx), %xmm3
138 pcmpeqb %xmm1, %xmm3
139 pmovmskb %xmm3, %eax
140 test %eax, %eax
141 jnz L(matches16)
142
143 movdqa (%ecx), %xmm4
144 pcmpeqb %xmm1, %xmm4
145 pmovmskb %xmm4, %eax
146 test %eax, %eax
147 jnz L(exit_dispatch)
148
/* Second unrolled step, one 64-byte block lower. */
149 sub $64, %ecx
150 sub $64, %edx
151 jbe L(exit_loop)
152
153 movdqa 48(%ecx), %xmm0
154 pcmpeqb %xmm1, %xmm0
155 pmovmskb %xmm0, %eax
156 test %eax, %eax
157 jnz L(matches48)
158
159 movdqa 32(%ecx), %xmm2
160 pcmpeqb %xmm1, %xmm2
161 pmovmskb %xmm2, %eax
162 test %eax, %eax
163 jnz L(matches32)
164
165 movdqa 16(%ecx), %xmm3
166 pcmpeqb %xmm1, %xmm3
167 pmovmskb %xmm3, %eax
168 test %eax, %eax
169 jnz L(matches16)
170
171 movdqa (%ecx), %xmm3
172 pcmpeqb %xmm1, %xmm3
173 pmovmskb %xmm3, %eax
174 test %eax, %eax
175 jnz L(exit_dispatch)
176
/* Bring %ecx to a 64-byte boundary for the main loop.  When it is not
   already aligned, %ecx is rounded up to the next multiple of 64 and %edx
   is adjusted by the same amount, so the first loop iteration re-examines
   some bytes that were already tested. */
177 mov %ecx, %eax
178 and $63, %eax
179 test %eax, %eax
180 jz L(align64_loop)
181
182 add $64, %ecx
183 add $64, %edx
184 and $-64, %ecx
185 sub %eax, %edx
186
187 .p2align 4
/* Main loop: one 64-byte-aligned block per iteration, moving downwards. */
188L(align64_loop):
189 sub $64, %ecx
190 sub $64, %edx
191 jbe L(exit_loop)
192
193 movdqa (%ecx), %xmm0
194 movdqa 16(%ecx), %xmm2
195 movdqa 32(%ecx), %xmm3
196 movdqa 48(%ecx), %xmm4
197
198 pcmpeqb %xmm1, %xmm0
199 pcmpeqb %xmm1, %xmm2
200 pcmpeqb %xmm1, %xmm3
201 pcmpeqb %xmm1, %xmm4
202
/* Merge the four compare results (pmaxub of 0x00/0xff bytes) so a single
   mask test tells whether the block contains any match.  This overwrites
   the individual results held in %xmm0 and %xmm2. */
203 pmaxub %xmm3, %xmm0
204 pmaxub %xmm4, %xmm2
205 pmaxub %xmm0, %xmm2
206 pmovmskb %xmm2, %eax
207
208 test %eax, %eax
209 jz L(align64_loop)
210
/* There is a match somewhere in the block: locate the highest chunk that
   has one.  %xmm4 and %xmm3 still hold their own compare results. */
211 pmovmskb %xmm4, %eax
212 test %eax, %eax
213 jnz L(matches48)
214
215 pmovmskb %xmm3, %eax
216 test %eax, %eax
217 jnz L(matches32)
218
/* The results for the two low chunks were overwritten above, so recompute
   them.  The compare of the lowest chunk is done in place in %xmm1, which
   destroys the broadcast byte; every path from here returns. */
219 movdqa 16(%ecx), %xmm2
220
221 pcmpeqb %xmm1, %xmm2
222 pcmpeqb (%ecx), %xmm1
223
224 pmovmskb %xmm2, %eax
225 test %eax, %eax
226 jnz L(matches16)
227
/* The match is in the lowest chunk; same highest-set-bit dispatch as
   L(exit_dispatch), written inline. */
228 pmovmskb %xmm1, %eax
229 test %ah, %ah
230 jnz L(exit_dispatch_high)
231 mov %al, %dl
232 and $15 << 4, %dl
233 jnz L(exit_dispatch_8)
234 test $0x08, %al
235 jnz L(exit_4)
236 test $0x04, %al
237 jnz L(exit_3)
238 test $0x02, %al
239 jnz L(exit_2)
240 mov %ecx, %eax
241 ret
242
243 .p2align 4
/* Final block.  After the add, %edx is the distance from the buffer start
   to %ecx + 64, i.e. how many bytes at the top of the block at %ecx belong
   to the buffer.  With more than 32 such bytes the chunks at +48 and +32
   are entirely inside the buffer; lower chunks need a bounds check. */
244L(exit_loop):
245 add $64, %edx
246 cmp $32, %edx
247 jbe L(exit_loop_32)
248
249 movdqa 48(%ecx), %xmm0
250 pcmpeqb %xmm1, %xmm0
251 pmovmskb %xmm0, %eax
252 test %eax, %eax
253 jnz L(matches48)
254
255 movdqa 32(%ecx), %xmm2
256 pcmpeqb %xmm1, %xmm2
257 pmovmskb %xmm2, %eax
258 test %eax, %eax
259 jnz L(matches32)
260
261 movdqa 16(%ecx), %xmm3
262 pcmpeqb %xmm1, %xmm3
263 pmovmskb %xmm3, %eax
264 test %eax, %eax
265 jnz L(matches16_1)
/* With 48 or fewer valid bytes the lowest chunk is wholly below the
   buffer start. */
266 cmp $48, %edx
267 jbe L(return_null)
268
269 pcmpeqb (%ecx), %xmm1
270 pmovmskb %xmm1, %eax
271 test %eax, %eax
272 jnz L(matches0_1)
273 xor %eax, %eax
274 ret
275
276 .p2align 4
/* Final block with at most 32 valid bytes: only the chunks at +48 and +32
   can contain buffer bytes, and both are bounds-checked. */
277L(exit_loop_32):
278 movdqa 48(%ecx), %xmm0
279 pcmpeqb %xmm1, %xmm0
280 pmovmskb %xmm0, %eax
281 test %eax, %eax
282 jnz L(matches48_1)
283 cmp $16, %edx
284 jbe L(return_null)
285
286 pcmpeqb 32(%ecx), %xmm1
287 pmovmskb %xmm1, %eax
288 test %eax, %eax
289 jnz L(matches32_1)
290 xor %eax, %eax
291 ret
292
293 .p2align 4
/* matches16 / matches32 / matches48: %eax holds a non-zero mask for the
   chunk at that offset from %ecx.  Point %ecx at the chunk and return the
   address of the byte for the highest set bit.  %dl is used as scratch. */
294L(matches16):
295 lea 16(%ecx), %ecx
296 test %ah, %ah
297 jnz L(exit_dispatch_high)
298 mov %al, %dl
299 and $15 << 4, %dl
300 jnz L(exit_dispatch_8)
301 test $0x08, %al
302 jnz L(exit_4)
303 test $0x04, %al
304 jnz L(exit_3)
305 test $0x02, %al
306 jnz L(exit_2)
307 mov %ecx, %eax
308 ret
309
310 .p2align 4
311L(matches32):
312 lea 32(%ecx), %ecx
313 test %ah, %ah
314 jnz L(exit_dispatch_high)
315 mov %al, %dl
316 and $15 << 4, %dl
317 jnz L(exit_dispatch_8)
318 test $0x08, %al
319 jnz L(exit_4)
320 test $0x04, %al
321 jnz L(exit_3)
322 test $0x02, %al
323 jnz L(exit_2)
324 mov %ecx, %eax
325 ret
326
327 .p2align 4
328L(matches48):
329 lea 48(%ecx), %ecx
330
331 .p2align 4
/* exit_dispatch: %ecx = chunk address, %eax = non-zero 16-bit match mask.
   Find the highest set bit with a decision tree: upper byte (%ah) first,
   then the upper nibble of %al, then bits 3, 2 and 1; if none of those is
   set, bit 0 must be, so the match is the chunk's first byte. */
332L(exit_dispatch):
333 test %ah, %ah
334 jnz L(exit_dispatch_high)
335 mov %al, %dl
336 and $15 << 4, %dl
337 jnz L(exit_dispatch_8)
338 test $0x08, %al
339 jnz L(exit_4)
340 test $0x04, %al
341 jnz L(exit_3)
342 test $0x02, %al
343 jnz L(exit_2)
344 mov %ecx, %eax
345 ret
346
347 .p2align 4
/* Highest set bit is among bits 4-7; bit 4 is the fall-through case. */
348L(exit_dispatch_8):
349 test $0x80, %al
350 jnz L(exit_8)
351 test $0x40, %al
352 jnz L(exit_7)
353 test $0x20, %al
354 jnz L(exit_6)
355 lea 4(%ecx), %eax
356 ret
357
358 .p2align 4
/* Highest set bit is among bits 8-15 (%ah non-zero).  %dh is scratch. */
359L(exit_dispatch_high):
360 mov %ah, %dh
361 and $15 << 4, %dh
362 jnz L(exit_dispatch_high_8)
363 test $0x08, %ah
364 jnz L(exit_12)
365 test $0x04, %ah
366 jnz L(exit_11)
367 test $0x02, %ah
368 jnz L(exit_10)
369 lea 8(%ecx), %eax
370 ret
371
372 .p2align 4
/* Highest set bit is among bits 12-15; bit 12 is the fall-through case. */
373L(exit_dispatch_high_8):
374 test $0x80, %ah
375 jnz L(exit_16)
376 test $0x40, %ah
377 jnz L(exit_15)
378 test $0x20, %ah
379 jnz L(exit_14)
380 lea 12(%ecx), %eax
381 ret
382
383 .p2align 4
/* exit_N: the match is byte N-1 of the chunk at %ecx; return its address. */
384L(exit_2):
385 lea 1(%ecx), %eax
386 ret
387
388 .p2align 4
389L(exit_3):
390 lea 2(%ecx), %eax
391 ret
392
393 .p2align 4
394L(exit_4):
395 lea 3(%ecx), %eax
396 ret
397
398 .p2align 4
399L(exit_6):
400 lea 5(%ecx), %eax
401 ret
402
403 .p2align 4
404L(exit_7):
405 lea 6(%ecx), %eax
406 ret
407
408 .p2align 4
409L(exit_8):
410 lea 7(%ecx), %eax
411 ret
412
413 .p2align 4
414L(exit_10):
415 lea 9(%ecx), %eax
416 ret
417
418 .p2align 4
419L(exit_11):
420 lea 10(%ecx), %eax
421 ret
422
423 .p2align 4
424L(exit_12):
425 lea 11(%ecx), %eax
426 ret
427
428 .p2align 4
429L(exit_14):
430 lea 13(%ecx), %eax
431 ret
432
433 .p2align 4
434L(exit_15):
435 lea 14(%ecx), %eax
436 ret
437
438 .p2align 4
439L(exit_16):
440 lea 15(%ecx), %eax
441 ret
442
443 .p2align 4
/* matchesNN_1: bounds-checked variants used for the final block.  On entry
   %edx is the number of valid bytes at the top of the 64-byte block.  It is
   rebased so that (%edx + byte index within the chunk) is negative exactly
   when that byte lies below the buffer start; such a match is rejected by
   returning 0.  Because %edx is live here, the dispatch uses %ah (and %al
   in the high half) as scratch instead of %dl/%dh. */
444L(matches0_1):
445 lea -64(%edx), %edx
446
447 test %ah, %ah
448 jnz L(exit_dispatch_1_high)
449 mov %al, %ah
450 and $15 << 4, %ah
451 jnz L(exit_dispatch_1_8)
452 test $0x08, %al
453 jnz L(exit_1_4)
454 test $0x04, %al
455 jnz L(exit_1_3)
456 test $0x02, %al
457 jnz L(exit_1_2)
458
/* Match at index 0: "add $0" only sets the flags from %edx for the sign
   test below. */
459 add $0, %edx
460 jl L(return_null)
461 mov %ecx, %eax
462 ret
463
464 .p2align 4
465L(matches16_1):
466 lea -48(%edx), %edx
467 lea 16(%ecx), %ecx
468
469 test %ah, %ah
470 jnz L(exit_dispatch_1_high)
471 mov %al, %ah
472 and $15 << 4, %ah
473 jnz L(exit_dispatch_1_8)
474 test $0x08, %al
475 jnz L(exit_1_4)
476 test $0x04, %al
477 jnz L(exit_1_3)
478 test $0x02, %al
479 jnz L(exit_1_2)
480
481 add $0, %edx
482 jl L(return_null)
483 mov %ecx, %eax
484 ret
485
486 .p2align 4
487L(matches32_1):
488 lea -32(%edx), %edx
489 lea 32(%ecx), %ecx
490
491 test %ah, %ah
492 jnz L(exit_dispatch_1_high)
493 mov %al, %ah
494 and $15 << 4, %ah
495 jnz L(exit_dispatch_1_8)
496 test $0x08, %al
497 jnz L(exit_1_4)
498 test $0x04, %al
499 jnz L(exit_1_3)
500 test $0x02, %al
501 jnz L(exit_1_2)
502
503 add $0, %edx
504 jl L(return_null)
505 mov %ecx, %eax
506 ret
507
508 .p2align 4
509L(matches48_1):
510 lea -16(%edx), %edx
511 lea 48(%ecx), %ecx
512
513 .p2align 4
/* Bounds-checked counterpart of L(exit_dispatch): %ecx = chunk address,
   %eax = non-zero mask, %edx = rebased bound described above. */
514L(exit_dispatch_1):
515 test %ah, %ah
516 jnz L(exit_dispatch_1_high)
517 mov %al, %ah
518 and $15 << 4, %ah
519 jnz L(exit_dispatch_1_8)
520 test $0x08, %al
521 jnz L(exit_1_4)
522 test $0x04, %al
523 jnz L(exit_1_3)
524 test $0x02, %al
525 jnz L(exit_1_2)
526
527 add $0, %edx
528 jl L(return_null)
529 mov %ecx, %eax
530 ret
531
532 .p2align 4
/* Highest set bit is among bits 4-7; bit 4 is the fall-through case. */
533L(exit_dispatch_1_8):
534 test $0x80, %al
535 jnz L(exit_1_8)
536 test $0x40, %al
537 jnz L(exit_1_7)
538 test $0x20, %al
539 jnz L(exit_1_6)
540
541 add $4, %edx
542 jl L(return_null)
543 lea 4(%ecx), %eax
544 ret
545
546 .p2align 4
/* Highest set bit is among bits 8-15.  %al is scratch from here on; only
   %ah is examined. */
547L(exit_dispatch_1_high):
548 mov %ah, %al
549 and $15 << 4, %al
550 jnz L(exit_dispatch_1_high_8)
551 test $0x08, %ah
552 jnz L(exit_1_12)
553 test $0x04, %ah
554 jnz L(exit_1_11)
555 test $0x02, %ah
556 jnz L(exit_1_10)
557
558 add $8, %edx
559 jl L(return_null)
560 lea 8(%ecx), %eax
561 ret
562
563 .p2align 4
/* Highest set bit is among bits 12-15; bit 12 is the fall-through case. */
564L(exit_dispatch_1_high_8):
565 test $0x80, %ah
566 jnz L(exit_1_16)
567 test $0x40, %ah
568 jnz L(exit_1_15)
569 test $0x20, %ah
570 jnz L(exit_1_14)
571
572 add $12, %edx
573 jl L(return_null)
574 lea 12(%ecx), %eax
575 ret
576
577 .p2align 4
/* exit_1_N: the match is byte N-1 of the chunk at %ecx.  Return its address
   unless %edx + (N-1) is negative, which means the byte lies below the
   buffer start; in that case return 0. */
578L(exit_1_2):
579 add $1, %edx
580 jl L(return_null)
581 lea 1(%ecx), %eax
582 ret
583
584 .p2align 4
585L(exit_1_3):
586 add $2, %edx
587 jl L(return_null)
588 lea 2(%ecx), %eax
589 ret
590
591 .p2align 4
592L(exit_1_4):
593 add $3, %edx
594 jl L(return_null)
595 lea 3(%ecx), %eax
596 ret
597
598 .p2align 4
599L(exit_1_6):
600 add $5, %edx
601 jl L(return_null)
602 lea 5(%ecx), %eax
603 ret
604
605 .p2align 4
606L(exit_1_7):
607 add $6, %edx
608 jl L(return_null)
609 lea 6(%ecx), %eax
610 ret
611
612 .p2align 4
613L(exit_1_8):
614 add $7, %edx
615 jl L(return_null)
616 lea 7(%ecx), %eax
617 ret
618
619 .p2align 4
620L(exit_1_10):
621 add $9, %edx
622 jl L(return_null)
623 lea 9(%ecx), %eax
624 ret
625
626 .p2align 4
627L(exit_1_11):
628 add $10, %edx
629 jl L(return_null)
630 lea 10(%ecx), %eax
631 ret
632
633 .p2align 4
634L(exit_1_12):
635 add $11, %edx
636 jl L(return_null)
637 lea 11(%ecx), %eax
638 ret
639
640 .p2align 4
641L(exit_1_14):
642 add $13, %edx
643 jl L(return_null)
644 lea 13(%ecx), %eax
645 ret
646
647 .p2align 4
648L(exit_1_15):
649 add $14, %edx
650 jl L(return_null)
651 lea 14(%ecx), %eax
652 ret
653
654 .p2align 4
655L(exit_1_16):
656 add $15, %edx
657 jl L(return_null)
658 lea 15(%ecx), %eax
659 ret
660
661 .p2align 4
/* No match (nothing is on the stack on this path). */
662L(return_null):
663 xor %eax, %eax
664 ret
665
666 .p2align 4
/* Short buffer (length 1..16) starting on a 16-byte boundary.
   In: %eax = buffer address, %edx = length, %ecx = 0.
   One aligned compare; the mask is limited to the low `length` bits so
   bytes past the end of the buffer are ignored. */
667L(length_less16_offset0):
668 mov %dl, %cl
669 pcmpeqb (%eax), %xmm1
670
/* %edx = (1 << length) - 1 */
671 mov $1, %edx
672 sal %cl, %edx
673 sub $1, %edx
674
675 mov %eax, %ecx
676 pmovmskb %xmm1, %eax
677
678 and %edx, %eax
679 test %eax, %eax
680 jnz L(exit_dispatch)
681
682 xor %eax, %eax
683 ret
684
685 .p2align 4
/* Short buffer, length 1..16.  On entry %edx = length - 16 and %ecx = buffer
   address.  Only aligned 16-byte loads are used: the chunk containing the
   first byte and, when the buffer crosses a 16-byte boundary, the following
   chunk. */
686L(length_less16):
687 punpcklbw %xmm1, %xmm1
688 add $16, %edx
689 punpcklbw %xmm1, %xmm1
690
691 mov %ecx, %eax
692 pshufd $0, %xmm1, %xmm1
693
/* %ecx = offset of the buffer inside its 16-byte chunk. */
694 and $15, %ecx
695 jz L(length_less16_offset0)
696
697 PUSH (%edi)
698
/* %dh = offset + length - 16: positive when the buffer spills into the
   next chunk.  %eax is rounded down to the chunk base. */
699 mov %cl, %dh
700 add %dl, %dh
701 and $-16, %eax
702
703 sub $16, %dh
704 ja L(length_less16_part2)
705
/* The buffer fits in one chunk.  Shift the mask right by the offset so that
   bit 0 corresponds to the first buffer byte, then keep only the low
   `length` bits. */
706 pcmpeqb (%eax), %xmm1
707 pmovmskb %xmm1, %edi
708
709 sar %cl, %edi
710 add %ecx, %eax
711 mov %dl, %cl
712
713 mov $1, %edx
714 sal %cl, %edx
715 sub $1, %edx
716
717 and %edx, %edi
718 test %edi, %edi
719 jz L(ret_null)
720
/* bsr gives the index of the highest set bit, i.e. the last match; %eax
   already points at the first buffer byte. */
721 bsr %edi, %edi
722 add %edi, %eax
723 POP (%edi)
724 ret
725
/* No instruction is emitted here: the annotations undone by the POP above
   are re-applied because the code below is reached with %edi still on the
   stack. */
726 CFI_PUSH (%edi)
727
728 .p2align 4
/* The buffer spans two chunks.  Search the upper chunk first, restricted to
   its low %dh bytes (the part that belongs to the buffer). */
729L(length_less16_part2):
730 movdqa 16(%eax), %xmm2
731 pcmpeqb %xmm1, %xmm2
732 pmovmskb %xmm2, %edi
733
/* Keep the offset in %ch while %cl is used as the shift count. */
734 mov %cl, %ch
735
736 mov %dh, %cl
737 mov $1, %edx
738 sal %cl, %edx
739 sub $1, %edx
740
741 and %edx, %edi
742
743 test %edi, %edi
744 jnz L(length_less16_part2_return)
745
/* Nothing in the upper chunk: search the lower chunk, dropping the bytes
   that precede the buffer by shifting the mask right by the offset. */
746 pcmpeqb (%eax), %xmm1
747 pmovmskb %xmm1, %edi
748
749 mov %ch, %cl
750 sar %cl, %edi
751 test %edi, %edi
752 jz L(ret_null)
753
/* Result = chunk base + bit index + offset.  %ch is cleared first so that
   %ecx holds just the offset. */
754 bsr %edi, %edi
755 add %edi, %eax
756 xor %ch, %ch
757 add %ecx, %eax
758 POP (%edi)
759 ret
760
761 CFI_PUSH (%edi)
762
763 .p2align 4
/* Match in the upper chunk: result = chunk base + 16 + bit index. */
764L(length_less16_part2_return):
765 bsr %edi, %edi
766 lea 16(%eax, %edi), %eax
767 POP (%edi)
768 ret
769
770 CFI_PUSH (%edi)
771
772 .p2align 4
/* No match on the short-buffer path; unlike L(return_null), %edi must be
   popped before returning. */
773L(ret_null):
774 xor %eax, %eax
775 POP (%edi)
776 ret
777
777END (memrchr)
778END (memrchr)