/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
#ifdef USE_AS_STRNCMP
/*
 * UPDATE_STRNCMP_COUNTER
 *
 * Charge the first, partially valid 16-byte compare against the strncmp
 * byte budget held in %r11.  %rcx is the in-block offset that compare
 * started from, so (16 - %rcx) bytes were consumed and the new budget is
 * %r11 - 16 + %rcx.
 *
 * Since the counter, %r11, is unsigned, we branch to strcmp_exitz
 * if the new counter > the old one (the subtraction wrapped) or is 0.
 *
 * Clobbers: %r9, flags.  Updates: %r11.
 */
#define UPDATE_STRNCMP_COUNTER				\
	/* calculate left number to compare */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	L(strcmp_exitz);			\
	test	%r9, %r9;				\
	je	L(strcmp_exitz);			\
	mov	%r9, %r11

#else
/* Plain strcmp has no byte budget: the macro expands to nothing. */
#define UPDATE_STRNCMP_COUNTER
/* Default exported symbol name; a wrapper may predefine STRCMP. */
#ifndef STRCMP
#define STRCMP strcmp
#endif
#endif
49
/* L(label): spell a label with the ".L" prefix via token pasting. */
#ifndef L
# define L(label)	.L##label
#endif
53
/* Fallback spellings of the CFI open/close directives. */
#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif
61
/*
 * ENTRY(name): open a global function symbol.  Marks `name` as a function,
 * exports it, aligns it to 16 bytes, emits the label and opens the CFI
 * region.  Must be paired with END(name).
 */
#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif
70
/*
 * END(name): close the CFI region opened by ENTRY(name) and record the
 * symbol size as the distance from `name` to the current location.
 */
#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif
#define RETURN	ret
	/* Code below is emitted into its own allocatable, executable section. */
	.section .text.ssse3,"ax",@progbits
ENTRY (STRCMP)
/*
 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Registers on entry: %rdi and %rsi point at the two strings; when built
 * with USE_AS_STRNCMP, %rdx is the maximum number of bytes to compare and
 * is copied to %r11, which serves as the remaining-byte budget from then on.
 *
 * Fast path: if neither pointer is within the last 16 bytes of its 64-byte
 * cache line, the first 16 bytes of both strings are loaded unaligned and
 * compared directly.  Otherwise, or once those 16 bytes matched without a
 * null char, control continues at L(crosscache).
 */
#ifdef USE_AS_STRNCMP
	test	%rdx, %rdx
	je	L(strcmp_exitz)		/* n == 0 */
	cmp	$1, %rdx
	je	L(Byte0)		/* n == 1: single-byte compare */
	mov	%rdx, %r11		/* r11 = remaining byte budget */
#endif
	mov	%esi, %ecx
	mov	%edi, %eax
/* Use 64bit AND here to avoid long NOP padding.  */
	and	$0x3f, %rcx		/* rsi alignment in cache line */
	and	$0x3f, %rax		/* rdi alignment in cache line */
	cmp	$0x30, %ecx
	ja	L(crosscache)		/* rsi: 16-byte load will cross cache line */
	cmp	$0x30, %eax
	ja	L(crosscache)		/* rdi: 16-byte load will cross cache line */
	movlpd	(%rdi), %xmm1
	movlpd	(%rsi), %xmm2
	movhpd	8(%rdi), %xmm1
	movhpd	8(%rsi), %xmm2
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
	/*
	 * equal(0xff) - null(0xff) == 0, so a mask bit survives only for
	 * bytes that are both equal and non-null.
	 */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
	jnz	L(less16bytes)		/* If not, find different value or null char */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)		/* finish comparison */
#endif
	add	$16, %rsi		/* prepare to search next 16 bytes */
	add	$16, %rdi		/* prepare to search next 16 bytes */
115
/*
 * Determine source and destination string offsets from 16-byte alignment.
 * Use relative offset difference between the two to determine which case
 * below to use.
 *
 * Both pointers are rounded down to a 16-byte boundary and %ecx / %eax
 * become the strings' offsets inside those blocks.  The pointers are swapped
 * when needed so that %ecx always holds the larger offset; %r8d is set to
 * 0xffff when that swap happened and is 0 otherwise.  Equal offsets go to
 * L(ashr_0); every other case jumps through L(unaligned_table), indexed by
 * 15 + %rax - %rcx, whose 32-bit entries are offsets relative to the table
 * itself.  %edx leaves this block holding 0xffff.
 */
	.p2align 4
L(crosscache):
	and	$0xfffffffffffffff0, %rsi	/* force %rsi is 16 byte aligned */
	and	$0xfffffffffffffff0, %rdi	/* force %rdi is 16 byte aligned */
	mov	$0xffff, %edx			/* for equivalent offset */
	xor	%r8d, %r8d
	and	$0xf, %ecx			/* offset of rsi */
	and	$0xf, %eax			/* offset of rdi */
	cmp	%eax, %ecx
	je	L(ashr_0)			/* rsi and rdi relative offset same */
	ja	L(bigger)
	mov	%edx, %r8d			/* r8d is offset flag for exit tail */
	xchg	%ecx, %eax
	xchg	%rsi, %rdi
L(bigger):
	lea	15(%rax), %r9
	sub	%rcx, %r9			/* r9 = 15 + rax - rcx, table index */
	lea	L(unaligned_table)(%rip), %r10
	movslq	(%r10, %r9,4), %r9		/* sign-extended table-relative offset */
	lea	(%r10, %r9), %r10
	jmp	*%r10				/* jump to corresponding case */
142
/*
 * The following cases will be handled by ashr_0
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
 *
 * Both strings sit at the same offset inside their 16-byte blocks, so the
 * aligned blocks can be compared directly.  The first compare shifts out
 * the %cl low mask bits, which belong to bytes before the string start;
 * the loop then compares two aligned blocks per iteration.
 */
	.p2align 4
L(ashr_0):

	movdqa	(%rsi), %xmm1
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm1		/* packed sub of comparison results */
	pmovmskb %xmm1, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	/*
	 * edx equals r9d only if the remaining (16 - rcx) bytes of both
	 * blocks match and no null char was seen.
	 */
	jne	L(less32bytes)		/* mismatch or null char */
	UPDATE_STRNCMP_COUNTER
	mov	$16, %rcx
	mov	$16, %r9
	pxor	%xmm0, %xmm0		/* clear xmm0, may have changed above */

	/*
	 * Now both strings are aligned at 16-byte boundary. Loop over strings
	 * checking 32-bytes per iteration.
	 *
	 * %xmm0 is not re-cleared inside the loop: whenever execution falls
	 * through a check no null char was found, so it is still all zero.
	 */
	.p2align 4
L(loop_ashr_0):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)			/* mismatch or null char seen */

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	jmp	L(loop_ashr_0)
206
/*
 * The following cases will be handled by ashr_1
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(15)               n - 15           0(15 +(n-15) - n)   ashr_1
 *
 * The leading blocks are compared with %rdi's block shifted left by 15
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $1 splices the upper
 * 15 bytes of the previous %rdi block (%xmm3) with the low 1 byte of the
 * next one.
 */
	.p2align 4
L(ashr_1):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$15, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$1, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	1(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_1):
	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

L(gobble_ashr_1):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_1)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif
	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_1)
290
/*
 * Nibble avoids loads across page boundary. This is to avoid a potential
 * access into unmapped memory.
 *
 * Only bytes 1..15 of %xmm3 (15 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 15 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_1):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffe, %edx
	jnz	L(ashr_1_exittail)	/* find null char */

#ifdef USE_AS_STRNCMP
	cmp	$15, %r11		/* 16 - 1 bytes are still held in %xmm3 */
	jbe	L(ashr_1_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_1)
310
/*
 * Once find null char, determine if there is a string mismatch
 * before the null char.
 *
 * Loads the current %rsi block and shifts the null mask (%xmm0) and the
 * saved %rdi block (%xmm3) right by 1 byte so their byte 0 lines up with
 * byte 0 of %xmm1, then continues at L(aftertail).
 */
	.p2align 4
L(ashr_1_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$1, %xmm0
	psrldq	$1, %xmm3
	jmp	L(aftertail)
321
/*
 * The following cases will be handled by ashr_2
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(14~15)            n - 14           1(15 +(n-14) - n)   ashr_2
 *
 * The leading blocks are compared with %rdi's block shifted left by 14
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $2 splices the upper
 * 14 bytes of the previous %rdi block (%xmm3) with the low 2 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$14, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$2, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	2(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_2):
	add	$16, %r10
	jg	L(nibble_ashr_2)	/* cross page boundary */

L(gobble_ashr_2):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_2)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_2)
407
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 2..15 of %xmm3 (14 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 14 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_2):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffc, %edx
	jnz	L(ashr_2_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$14, %r11		/* 16 - 2 bytes are still held in %xmm3 */
	jbe	L(ashr_2_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_2)
423
/*
 * Exit tail for ashr_2: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 2 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_2_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$2, %xmm0
	psrldq	$2, %xmm3
	jmp	L(aftertail)
430
/*
 * The following cases will be handled by ashr_3
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(13~15)            n - 13           2(15 +(n-13) - n)   ashr_3
 *
 * The leading blocks are compared with %rdi's block shifted left by 13
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $3 splices the upper
 * 13 bytes of the previous %rdi block (%xmm3) with the low 3 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_3):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$13, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$3, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	3(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_3):
	add	$16, %r10
	jg	L(nibble_ashr_3)	/* cross page boundary */

L(gobble_ashr_3):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_3)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_3)
517
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 3..15 of %xmm3 (13 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 13 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_3):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff8, %edx
	jnz	L(ashr_3_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$13, %r11		/* 16 - 3 bytes are still held in %xmm3 */
	jbe	L(ashr_3_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_3)
533
/*
 * Exit tail for ashr_3: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 3 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_3_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$3, %xmm0
	psrldq	$3, %xmm3
	jmp	L(aftertail)
540
/*
 * The following cases will be handled by ashr_4
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(12~15)            n - 12           3(15 +(n-12) - n)   ashr_4
 *
 * The leading blocks are compared with %rdi's block shifted left by 12
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $4 splices the upper
 * 12 bytes of the previous %rdi block (%xmm3) with the low 4 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_4):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$12, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$4, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	4(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_4):
	add	$16, %r10
	jg	L(nibble_ashr_4)	/* cross page boundary */

L(gobble_ashr_4):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_4)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_4)
627
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 4..15 of %xmm3 (12 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 12 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_4):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff0, %edx
	jnz	L(ashr_4_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$12, %r11		/* 16 - 4 bytes are still held in %xmm3 */
	jbe	L(ashr_4_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_4)
643
/*
 * Exit tail for ashr_4: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 4 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_4_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$4, %xmm0
	psrldq	$4, %xmm3
	jmp	L(aftertail)
650
/*
 * The following cases will be handled by ashr_5
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(11~15)            n - 11           4(15 +(n-11) - n)   ashr_5
 *
 * The leading blocks are compared with %rdi's block shifted left by 11
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $5 splices the upper
 * 11 bytes of the previous %rdi block (%xmm3) with the low 5 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_5):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$11, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$5, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	5(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_5):
	add	$16, %r10
	jg	L(nibble_ashr_5)	/* cross page boundary */

L(gobble_ashr_5):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_5)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_5)
737
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 5..15 of %xmm3 (11 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 11 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_5):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffe0, %edx
	jnz	L(ashr_5_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$11, %r11		/* 16 - 5 bytes are still held in %xmm3 */
	jbe	L(ashr_5_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_5)
753
/*
 * Exit tail for ashr_5: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 5 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_5_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$5, %xmm0
	psrldq	$5, %xmm3
	jmp	L(aftertail)
760
/*
 * The following cases will be handled by ashr_6
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(10~15)            n - 10           5(15 +(n-10) - n)   ashr_6
 *
 * The leading blocks are compared with %rdi's block shifted left by 10
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $6 splices the upper
 * 10 bytes of the previous %rdi block (%xmm3) with the low 6 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$10, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$6, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_6):
	add	$16, %r10
	jg	L(nibble_ashr_6)	/* cross page boundary */

L(gobble_ashr_6):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_6)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_6)
847
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 6..15 of %xmm3 (10 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 10 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_6):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffc0, %edx
	jnz	L(ashr_6_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$10, %r11		/* 16 - 6 bytes are still held in %xmm3 */
	jbe	L(ashr_6_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_6)
863
/*
 * Exit tail for ashr_6: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 6 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_6_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$6, %xmm0
	psrldq	$6, %xmm3
	jmp	L(aftertail)
870
/*
 * The following cases will be handled by ashr_7
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(9~15)             n - 9            6(15 +(n - 9) - n)  ashr_7
 *
 * The leading blocks are compared with %rdi's block shifted left by 9
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $7 splices the upper
 * 9 bytes of the previous %rdi block (%xmm3) with the low 7 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_7):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$9, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$7, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_7):
	add	$16, %r10
	jg	L(nibble_ashr_7)	/* cross page boundary */

L(gobble_ashr_7):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_7)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_7)
957
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 7..15 of %xmm3 (9 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 9 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx
	jnz	L(ashr_7_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$9, %r11		/* 16 - 7 bytes are still held in %xmm3 */
	jbe	L(ashr_7_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_7)
973
/*
 * Exit tail for ashr_7: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 7 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0
	psrldq	$7, %xmm3
	jmp	L(aftertail)
980
/*
 * The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(8~15)             n - 8            7(15 +(n - 8) - n)  ashr_8
 *
 * The leading blocks are compared with %rdi's block shifted left by 8
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $8 splices the upper
 * 8 bytes of the previous %rdi block (%xmm3) with the low 8 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_8):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$8, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$8, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	8(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_8):
	add	$16, %r10
	jg	L(nibble_ashr_8)	/* cross page boundary */

L(gobble_ashr_8):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_8)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_8)
1067
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 8..15 of %xmm3 (8 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 8 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_8):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff00, %edx
	jnz	L(ashr_8_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$8, %r11		/* 16 - 8 bytes are still held in %xmm3 */
	jbe	L(ashr_8_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_8)
1083
/*
 * Exit tail for ashr_8: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 8 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_8_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$8, %xmm0
	psrldq	$8, %xmm3
	jmp	L(aftertail)
1090
/*
 * The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(7~15)             n - 7            8(15 +(n - 7) - n)  ashr_9
 *
 * The leading blocks are compared with %rdi's block shifted left by 7
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $9 splices the upper
 * 7 bytes of the previous %rdi block (%xmm3) with the low 9 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_9):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$7, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$9, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	9(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_9):
	add	$16, %r10
	jg	L(nibble_ashr_9)	/* cross page boundary */

L(gobble_ashr_9):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_9)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3		/* store for next cycle */
	jmp	L(loop_ashr_9)
1177
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 9..15 of %xmm3 (7 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 7 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_9):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfe00, %edx
	jnz	L(ashr_9_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$7, %r11		/* 16 - 9 bytes are still held in %xmm3 */
	jbe	L(ashr_9_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_9)
1193
/*
 * Exit tail for ashr_9: load the current %rsi block and shift the null
 * mask (%xmm0) and the saved %rdi block (%xmm3) right by 9 bytes so their
 * byte 0 lines up with byte 0 of %xmm1, then continue at L(aftertail).
 */
	.p2align 4
L(ashr_9_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$9, %xmm0
	psrldq	$9, %xmm3
	jmp	L(aftertail)
1200
/*
 * The following cases will be handled by ashr_10
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(6~15)             n - 6            9(15 +(n - 6) - n)  ashr_10
 *
 * The leading blocks are compared with %rdi's block shifted left by 6
 * bytes (pslldq) so that its bytes line up with %rsi's; the %cl low mask
 * bits, which precede the string start, are shifted out.  The loop then
 * handles 16 bytes per step (unrolled twice): palignr $10 splices the upper
 * 6 bytes of the previous %rdi block (%xmm3) with the low 10 bytes of the
 * next one.
 */
	.p2align 4
L(ashr_10):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$6, %xmm2		/* shift first string to align with second */
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* packed sub of comparison results */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	L(less32bytes)		/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$10, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	10(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_10):
	add	$16, %r10
	jg	L(nibble_ashr_10)	/* cross page boundary */

L(gobble_ashr_10):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	add	$16, %r10
	jg	L(nibble_ashr_10)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* store for next cycle */

	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_10)
1287
/*
 * Nibble: the next 16-byte load from %rdi would start a new 4K page.
 * Only bytes 10..15 of %xmm3 (6 bytes already fetched from %rdi) are still
 * uncompared.  If they contain a null char, or - for strncmp - the whole
 * remaining budget %r11 fits inside those 6 bytes, finish in the exit tail
 * without loading the next page of %rdi.
 */
	.p2align 4
L(nibble_ashr_10):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfc00, %edx
	jnz	L(ashr_10_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$6, %r11		/* 16 - 10 bytes are still held in %xmm3 */
	jbe	L(ashr_10_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	L(gobble_ashr_10)
1303
1304 .p2align 4
1305L(ashr_10_exittail):
1306 movdqa (%rsi, %rcx), %xmm1
1307 psrldq $10, %xmm0
1308 psrldq $10, %xmm3
1309 jmp L(aftertail)
1310
/*
 * The following cases will be handled by ashr_11
 *
 *  rcx (offset of rsi)  rax (offset of rdi)  relative offset         case
 *      n (5~15)              n - 5            10 (15 + (n - 5) - n)   ashr_11
 */
	.p2align 4
L(ashr_11):
	/*
	 * First 16-byte blocks of both operands.  The (%rdi) block is shifted
	 * up by 5 bytes so that its bytes line up with the (%rsi) block; after
	 * that every 16 bytes of (%rsi) are compared against a window that
	 * starts 11 bytes into the previous (%rdi) block.
	 *
	 * In:	%rcx = offset of the string inside the (%rsi) block, %rax =
	 *	       offset inside the (%rdi) block (see the table above).
	 *	%edx = mask prepared before this section (not visible here).
	 */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$5, %xmm2		/* line the (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm2, %r9d
	shr	%cl, %edx		/* drop the bits in front of the string */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only when both masks agree */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block: low part of the next window */

	/* Uses %r9 as scratch, so %r9d is (re)loaded only after it. */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$11, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * %r10 = ((%rdi + 11) & 0xfff) - 0x1000 and grows by 16 per block.
	 * When %r10 goes positive the (%rdi) block about to be loaded lies on
	 * the next 4K page, and the leftover bytes of the previous block have
	 * to be checked first (the "nibble").
	 */
	lea	11(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_11):
	add	$16, %r10
	jg	L(nibble_ashr_11)

L(gobble_ashr_11):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 is zero here: mark NUL bytes of (%rsi) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes of the limit consumed */
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the step above. */
	add	$16, %r10
	jg	L(nibble_ashr_11)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_11)

	.p2align 4
L(nibble_ashr_11):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb	%xmm0, %edx
	test	$0xf800, %edx		/* bytes 11..15: the part not compared yet */
	jnz	L(ashr_11_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$4, %r11		/* at most 4 bytes left: finish from the leftover */
	jbe	L(ashr_11_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
	jmp	L(gobble_ashr_11)

	.p2align 4
L(ashr_11_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$11, %xmm0		/* NUL mask of the leftover bytes, moved to byte 0 */
	psrldq	$11, %xmm3		/* leftover (%rdi) bytes, moved to byte 0 */
	jmp	L(aftertail)
1420
/*
 * The following cases will be handled by ashr_12
 *
 *  rcx (offset of rsi)  rax (offset of rdi)  relative offset         case
 *      n (4~15)              n - 4            11 (15 + (n - 4) - n)   ashr_12
 */
	.p2align 4
L(ashr_12):
	/*
	 * First 16-byte blocks of both operands.  The (%rdi) block is shifted
	 * up by 4 bytes so that its bytes line up with the (%rsi) block; after
	 * that every 16 bytes of (%rsi) are compared against a window that
	 * starts 12 bytes into the previous (%rdi) block.
	 *
	 * In:	%rcx = offset of the string inside the (%rsi) block, %rax =
	 *	       offset inside the (%rdi) block (see the table above).
	 *	%edx = mask prepared before this section (not visible here).
	 */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$4, %xmm2		/* line the (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm2, %r9d
	shr	%cl, %edx		/* drop the bits in front of the string */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only when both masks agree */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block: low part of the next window */

	/* Uses %r9 as scratch, so %r9d is (re)loaded only after it. */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$12, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * %r10 = ((%rdi + 12) & 0xfff) - 0x1000 and grows by 16 per block.
	 * When %r10 goes positive the (%rdi) block about to be loaded lies on
	 * the next 4K page, and the leftover bytes of the previous block have
	 * to be checked first (the "nibble").
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_12):
	add	$16, %r10
	jg	L(nibble_ashr_12)

L(gobble_ashr_12):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 is zero here: mark NUL bytes of (%rsi) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes of the limit consumed */
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the step above. */
	add	$16, %r10
	jg	L(nibble_ashr_12)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_12)

	.p2align 4
L(nibble_ashr_12):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb	%xmm0, %edx
	test	$0xf000, %edx		/* bytes 12..15: the part not compared yet */
	jnz	L(ashr_12_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$3, %r11		/* at most 3 bytes left: finish from the leftover */
	jbe	L(ashr_12_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
	jmp	L(gobble_ashr_12)

	.p2align 4
L(ashr_12_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$12, %xmm0		/* NUL mask of the leftover bytes, moved to byte 0 */
	psrldq	$12, %xmm3		/* leftover (%rdi) bytes, moved to byte 0 */
	jmp	L(aftertail)
1530
/*
 * The following cases will be handled by ashr_13
 *
 *  rcx (offset of rsi)  rax (offset of rdi)  relative offset         case
 *      n (3~15)              n - 3            12 (15 + (n - 3) - n)   ashr_13
 */
	.p2align 4
L(ashr_13):
	/*
	 * First 16-byte blocks of both operands.  The (%rdi) block is shifted
	 * up by 3 bytes so that its bytes line up with the (%rsi) block; after
	 * that every 16 bytes of (%rsi) are compared against a window that
	 * starts 13 bytes into the previous (%rdi) block.
	 *
	 * In:	%rcx = offset of the string inside the (%rsi) block, %rax =
	 *	       offset inside the (%rdi) block (see the table above).
	 *	%edx = mask prepared before this section (not visible here).
	 */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$3, %xmm2		/* line the (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm2, %r9d
	shr	%cl, %edx		/* drop the bits in front of the string */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only when both masks agree */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block: low part of the next window */

	/* Uses %r9 as scratch, so %r9d is (re)loaded only after it. */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$13, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * %r10 = ((%rdi + 13) & 0xfff) - 0x1000 and grows by 16 per block.
	 * When %r10 goes positive the (%rdi) block about to be loaded lies on
	 * the next 4K page, and the leftover bytes of the previous block have
	 * to be checked first (the "nibble").
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_13):
	add	$16, %r10
	jg	L(nibble_ashr_13)

L(gobble_ashr_13):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 is zero here: mark NUL bytes of (%rsi) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes of the limit consumed */
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the step above. */
	add	$16, %r10
	jg	L(nibble_ashr_13)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_13)

	.p2align 4
L(nibble_ashr_13):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb	%xmm0, %edx
	test	$0xe000, %edx		/* bytes 13..15: the part not compared yet */
	jnz	L(ashr_13_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$2, %r11		/* at most 2 bytes left: finish from the leftover */
	jbe	L(ashr_13_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
	jmp	L(gobble_ashr_13)

	.p2align 4
L(ashr_13_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$13, %xmm0		/* NUL mask of the leftover bytes, moved to byte 0 */
	psrldq	$13, %xmm3		/* leftover (%rdi) bytes, moved to byte 0 */
	jmp	L(aftertail)
1640
/*
 * The following cases will be handled by ashr_14
 *
 *  rcx (offset of rsi)  rax (offset of rdi)  relative offset         case
 *      n (2~15)              n - 2            13 (15 + (n - 2) - n)   ashr_14
 */
	.p2align 4
L(ashr_14):
	/*
	 * First 16-byte blocks of both operands.  The (%rdi) block is shifted
	 * up by 2 bytes so that its bytes line up with the (%rsi) block; after
	 * that every 16 bytes of (%rsi) are compared against a window that
	 * starts 14 bytes into the previous (%rdi) block.
	 *
	 * In:	%rcx = offset of the string inside the (%rsi) block, %rax =
	 *	       offset inside the (%rdi) block (see the table above).
	 *	%edx = mask prepared before this section (not visible here).
	 */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$2, %xmm2		/* line the (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm2, %r9d
	shr	%cl, %edx		/* drop the bits in front of the string */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only when both masks agree */
	jnz	L(less32bytes)
	movdqa	(%rdi), %xmm3		/* unshifted block: low part of the next window */

	/* Uses %r9 as scratch, so %r9d is (re)loaded only after it. */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$14, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * %r10 = ((%rdi + 14) & 0xfff) - 0x1000 and grows by 16 per block.
	 * When %r10 goes positive the (%rdi) block about to be loaded lies on
	 * the next 4K page, and the leftover bytes of the previous block have
	 * to be checked first (the "nibble").
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_14):
	add	$16, %r10
	jg	L(nibble_ashr_14)

L(gobble_ashr_14):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 is zero here: mark NUL bytes of (%rsi) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes of the limit consumed */
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the step above. */
	add	$16, %r10
	jg	L(nibble_ashr_14)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_14)

	.p2align 4
L(nibble_ashr_14):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb	%xmm0, %edx
	test	$0xc000, %edx		/* bytes 14..15: the part not compared yet */
	jnz	L(ashr_14_exittail)

#ifdef USE_AS_STRNCMP
	cmp	$1, %r11		/* at most 1 byte left: finish from the leftover */
	jbe	L(ashr_14_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
	jmp	L(gobble_ashr_14)

	.p2align 4
L(ashr_14_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$14, %xmm0		/* NUL mask of the leftover bytes, moved to byte 0 */
	psrldq	$14, %xmm3		/* leftover (%rdi) bytes, moved to byte 0 */
	jmp	L(aftertail)
1750
/*
 * The following cases will be handled by ashr_15
 *
 *  rcx (offset of rsi)  rax (offset of rdi)  relative offset         case
 *      n (1~15)              n - 1            14 (15 + (n - 1) - n)   ashr_15
 */
	.p2align 4
L(ashr_15):
	/*
	 * First 16-byte blocks of both operands.  The (%rdi) block is shifted
	 * up by 1 byte so that its bytes line up with the (%rsi) block; after
	 * that every 16 bytes of (%rsi) are compared against a window that
	 * starts 15 bytes into the previous (%rdi) block.
	 *
	 * In:	%rcx = offset of the string inside the (%rsi) block, %rax =
	 *	       offset inside the (%rdi) block (see the table above).
	 *	%edx = mask prepared before this section (not visible here).
	 */
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where the (%rsi) byte is NUL */
	pslldq	$1, %xmm2		/* line the (%rdi) bytes up with (%rsi) */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm2		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm2, %r9d
	shr	%cl, %edx		/* drop the bits in front of the string */
	shr	%cl, %r9d
	sub	%r9d, %edx		/* zero only when both masks agree */
	jnz	L(less32bytes)

	movdqa	(%rdi), %xmm3		/* unshifted block: low part of the next window */

	/* Uses %r9 as scratch, so %r9d is (re)loaded only after it. */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$15, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * %r10 = ((%rdi + 15) & 0xfff) - 0x1000 and grows by 16 per block.
	 * When %r10 goes positive the (%rdi) block about to be loaded lies on
	 * the next 4K page, and the leftover byte of the previous block has
	 * to be checked first (the "nibble").
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */

	sub	$0x1000, %r10		/* subtract 4K pagesize */

	.p2align 4
L(loop_ashr_15):
	add	$16, %r10
	jg	L(nibble_ashr_15)

L(gobble_ashr_15):
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4		/* keep the raw block for the next round */

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0		/* %xmm0 is zero here: mark NUL bytes of (%rsi) */
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx		/* zero only if all 16 bytes are equal and non-NUL */
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11		/* 16 more bytes of the limit consumed */
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3

	/* Second, unrolled copy of the step above. */
	add	$16, %r10
	jg	L(nibble_ashr_15)	/* cross page boundary */

	movdqa	(%rsi, %rcx), %xmm1
	movdqa	(%rdi, %rcx), %xmm2
	movdqa	%xmm2, %xmm4

	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm2, %xmm1
	psubb	%xmm0, %xmm1
	pmovmskb	%xmm1, %edx
	sub	$0xffff, %edx
	jnz	L(exit)

#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	L(strcmp_exitz)
#endif

	add	$16, %rcx
	movdqa	%xmm4, %xmm3
	jmp	L(loop_ashr_15)

	.p2align 4
L(nibble_ashr_15):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb	%xmm0, %edx
	test	$0x8000, %edx		/* byte 15: the part not compared yet */
	jnz	L(ashr_15_exittail)

#ifdef USE_AS_STRNCMP
	test	%r11, %r11		/* limit already used up: finish from the leftover */
	je	L(ashr_15_exittail)
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* re-arm the page-boundary counter */
	jmp	L(gobble_ashr_15)

	.p2align 4
L(ashr_15_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$15, %xmm3		/* leftover (%rdi) byte, moved to byte 0 */
	psrldq	$15, %xmm0		/* NUL mask of the leftover byte, moved to byte 0 */
	/* No jump: execution continues into the code that follows. */
1861
/*
 * Common exit path.  The labels below are successive entry points; each one
 * falls through into the next.
 *
 *   L(aftertail)   %xmm1 = (%rsi) block, %xmm3 = leftover (%rdi) bytes moved
 *                  down to byte 0, %xmm0 = NUL mask of those leftover bytes.
 *   L(exit)        %edx = difference mask, %r9 = byte position of the case,
 *                  %rcx = index of the block being compared.
 *   L(less32bytes) %rax / %rcx = offsets to add to %rdi / %rsi, %r8d = flag
 *                  telling whether the operands have to be exchanged back.
 *   L(less16bytes) %rdi / %rsi point at the compared data, the lowest set bit
 *                  of %rdx is the index of the deciding byte.
 *
 * Returns the difference of the two deciding bytes (zero-extended) in %eax.
 */
	.p2align 4
L(aftertail):
	pcmpeqb	%xmm3, %xmm1		/* 0xff where the bytes are equal */
	psubb	%xmm0, %xmm1		/* sign bit survives only for equal && !NUL */
	pmovmskb	%xmm1, %edx
	not	%edx			/* set bit = byte differs or is a NUL */

	.p2align 4
L(exit):
	lea	-16(%r9, %rcx), %rax	/* locate the exact offset for rdi */
L(less32bytes):
	lea	(%rdi, %rax), %rdi	/* locate the exact address for first operand(rdi) */
	lea	(%rsi, %rcx), %rsi	/* locate the exact address for second operand(rsi) */
	test	%r8d, %r8d
	jz	L(ret)
	xchg	%rsi, %rdi		/* recover original order according to flag(%r8d) */

	.p2align 4
L(ret):
L(less16bytes):
	bsf	%rdx, %rdx		/* find and store bit index in %rdx */

#ifdef USE_AS_STRNCMP
	sub	%rdx, %r11		/* deciding byte at or past the limit: equal */
	jbe	L(strcmp_exitz)
#endif
	movzbl	(%rsi, %rdx), %ecx
	movzbl	(%rdi, %rdx), %eax

	sub	%ecx, %eax
	ret
1893
/* Return 0 in %eax: the operands compared equal. */
L(strcmp_exitz):
	xor	%eax, %eax
	ret
1897
/*
 * Return the difference of the bytes at (%rdi) and (%rsi), both
 * zero-extended, in %eax.  The byte-sized source is spelled out with
 * movzbl (as L(less16bytes) does) instead of relying on the assembler's
 * default operand size for a suffix-less movzx from memory.
 */
	.p2align 4
L(Byte0):
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
1905END (STRCMP)
1906
/*
 * Table of the L(ashr_N) entry points, stored as 32-bit distances from
 * the start of the table itself.  Entry i (counting from 0) refers to
 * L(ashr_<i + 1>); the sixteenth entry refers to L(ashr_0).
 */
	.section .rodata,"a",@progbits
	.p2align 3
L(unaligned_table):
	.int	L(ashr_1) - L(unaligned_table)
	.int	L(ashr_2) - L(unaligned_table)
	.int	L(ashr_3) - L(unaligned_table)
	.int	L(ashr_4) - L(unaligned_table)
	.int	L(ashr_5) - L(unaligned_table)
	.int	L(ashr_6) - L(unaligned_table)
	.int	L(ashr_7) - L(unaligned_table)
	.int	L(ashr_8) - L(unaligned_table)
	.int	L(ashr_9) - L(unaligned_table)
	.int	L(ashr_10) - L(unaligned_table)
	.int	L(ashr_11) - L(unaligned_table)
	.int	L(ashr_12) - L(unaligned_table)
	.int	L(ashr_13) - L(unaligned_table)
	.int	L(ashr_14) - L(unaligned_table)
	.int	L(ashr_15) - L(unaligned_table)
	.int	L(ashr_0) - L(unaligned_table)