/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
/* Configuration macros consumed by the shared implementation included
   below.  Both STRCPY and STRLEN are mapped to the symbol strlcpy.
   NOTE(review): the exact effect of the USE_AS_* switches lives in
   ssse3-strcpy-atom.S -- confirm there.  */
#define USE_AS_STRLCPY
#define USE_AS_STRNCPY
#define STRCPY	strlcpy
#define STRLEN	strlcpy
#include "ssse3-strcpy-atom.S"
36
/* Finish a copy of 1..16 bytes selected by the 16-bit mask in %ax: the
   lowest set bit k leads to L(Exit<k+1>).  %esi is an offset added to both
   the source cursor (%ecx) and the destination cursor (%edx) before the
   saved %esi is popped.
   NOTE(review): the mask presumably marks the terminating NUL found by the
   code in ssse3-strcpy-atom.S -- confirm there.  */
	.p2align 4
L(CopyFrom1To16Bytes):
	addl	%esi, %ecx
	addl	%esi, %edx

	POP (%esi)
	testb	%al, %al
	jz	L(ExitHigh8)		/* nothing in bits 0..7, inspect %ah */

L(CopyFrom1To16BytesLess8):
	movb	%al, %ah		/* %ah is only scratch from here on */
	andb	$15, %ah
	jz	L(ExitHigh4)		/* bits 0..3 clear, inspect bits 4..7 */

	testb	$0x01, %al
	jnz	L(Exit1)
	testb	$0x02, %al
	jnz	L(Exit2)
	testb	$0x04, %al
	jnz	L(Exit3)
	/* Bits 0..2 are clear but the low nibble is not: fall through.  */
L(Exit4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	/* Result = (%ecx + 3) - %edi.  */
	leal	3(%ecx), %eax
	subl	%edi, %eax
	RETURN1
64
/* Dispatch on bits 4..6 of %al to L(Exit5)..L(Exit7); anything else falls
   through to the 8-byte copy.  Reached from L(CopyFrom1To16BytesLess8) when
   bits 0..3 of %al are clear.  */
	.p2align 4
L(ExitHigh4):
	testb	$0x10, %al
	jnz	L(Exit5)
	testb	$0x20, %al
	jnz	L(Exit6)
	testb	$0x40, %al
	jnz	L(Exit7)
L(Exit8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	/* Result = (%ecx + 7) - %edi.  */
	leal	7(%ecx), %eax
	subl	%edi, %eax
	RETURN1
80
/* Dispatch on the low nibble of %ah: bits 0..2 lead to L(Exit9)..L(Exit11),
   a zero nibble goes to L(ExitHigh12), otherwise 12 bytes are copied.  */
	.p2align 4
L(ExitHigh8):
	movb	%ah, %al		/* %al is used as scratch for the nibble test */
	andb	$15, %al
	jz	L(ExitHigh12)

	testb	$0x01, %ah
	jnz	L(Exit9)
	testb	$0x02, %ah
	jnz	L(Exit10)
	testb	$0x04, %ah
	jnz	L(Exit11)
L(Exit12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	/* Result = (%ecx + 11) - %edi.  */
	leal	11(%ecx), %eax
	subl	%edi, %eax
	RETURN1
102
/* Dispatch on bits 4..6 of %ah to L(Exit13)..L(Exit15); anything else falls
   through to the full 16-byte copy.  */
	.p2align 4
L(ExitHigh12):
	testb	$0x10, %ah
	jnz	L(Exit13)
	testb	$0x20, %ah
	jnz	L(Exit14)
	testb	$0x40, %ah
	jnz	L(Exit15)
L(Exit16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	/* Result = (%ecx + 15) - %edi.  */
	leal	15(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	/* The code that follows is entered with %esi still saved on the stack.  */
	CFI_PUSH(%esi)
122
/* Mask-driven exit that is additionally bounded by the counter in %ebx.
   After the +16 adjustment each mask bit k (low byte, %al) is tested first;
   if it is clear and %ebx == k+1 the copy is cut short via
   L(StrlcpyExit<k+1>).  When %ebx > 8 the bound cannot apply to the low
   byte, so the unbounded dispatcher is reused.  */
	.p2align 4
L(CopyFrom1To16BytesCase2):
	addl	$16, %ebx
	addl	%esi, %edx
	addl	%esi, %ecx

	POP (%esi)

	testb	%al, %al
	jz	L(ExitHighCase2)

	cmpl	$8, %ebx
	ja	L(CopyFrom1To16BytesLess8)

	testb	$0x01, %al
	jnz	L(Exit1)
	cmpl	$1, %ebx
	je	L(StrlcpyExit1)
	testb	$0x02, %al
	jnz	L(Exit2)
	cmpl	$2, %ebx
	je	L(StrlcpyExit2)
	testb	$0x04, %al
	jnz	L(Exit3)
	cmpl	$3, %ebx
	je	L(StrlcpyExit3)
	testb	$0x08, %al
	jnz	L(Exit4)
	cmpl	$4, %ebx
	je	L(StrlcpyExit4)
	testb	$0x10, %al
	jnz	L(Exit5)
	cmpl	$5, %ebx
	je	L(StrlcpyExit5)
	testb	$0x20, %al
	jnz	L(Exit6)
	cmpl	$6, %ebx
	je	L(StrlcpyExit6)
	testb	$0x40, %al
	jnz	L(Exit7)
	cmpl	$7, %ebx
	je	L(StrlcpyExit7)
	testb	$0x80, %al
	jnz	L(Exit8)
	jmp	L(StrlcpyExit8)
168
/* Continuation of L(CopyFrom1To16BytesCase2) for a zero %al.  With
   %ebx <= 8 only the counter matters, so the count-only dispatcher is used;
   otherwise bits of %ah and the values 9..16 of %ebx are checked in turn.  */
	.p2align 4
L(ExitHighCase2):
	cmpl	$8, %ebx
	jbe	L(CopyFrom1To16BytesLess8Case3)

	testb	$0x01, %ah
	jnz	L(Exit9)
	cmpl	$9, %ebx
	je	L(StrlcpyExit9)
	testb	$0x02, %ah
	jnz	L(Exit10)
	cmpl	$10, %ebx
	je	L(StrlcpyExit10)
	testb	$0x04, %ah
	jnz	L(Exit11)
	cmpl	$11, %ebx
	je	L(StrlcpyExit11)
	testb	$0x8, %ah
	jnz	L(Exit12)
	cmpl	$12, %ebx
	je	L(StrlcpyExit12)
	testb	$0x10, %ah
	jnz	L(Exit13)
	cmpl	$13, %ebx
	je	L(StrlcpyExit13)
	testb	$0x20, %ah
	jnz	L(Exit14)
	cmpl	$14, %ebx
	je	L(StrlcpyExit14)
	testb	$0x40, %ah
	jnz	L(Exit15)
	cmpl	$15, %ebx
	je	L(StrlcpyExit15)
	testb	$0x80, %ah
	jnz	L(Exit16)
	jmp	L(StrlcpyExit16)

	/* The code that follows is entered with %esi still saved on the stack.  */
	CFI_PUSH(%esi)
207
/* Non-zero mask in %eax: take the mask-and-count path (Case2).  A zero mask
   falls through to the count-only path that follows (Case3).  */
	.p2align 4
L(CopyFrom1To16BytesCase2OrCase3):
	testl	%eax, %eax
	jnz	L(CopyFrom1To16BytesCase2)
212
/* Count-only exit: the number of bytes to emit is taken from %ebx (after
   the +16 adjustment) and the matching L(StrlcpyExit<n>) is selected.
   Values 1..3 are matched explicitly; any other value <= 4 falls through
   to L(StrlcpyExit4) (presumably only 4 occurs -- confirm with the caller).  */
	.p2align 4
L(CopyFrom1To16BytesCase3):
	addl	$16, %ebx
	addl	%esi, %ecx
	addl	%esi, %edx

	POP (%esi)

	cmpl	$8, %ebx
	ja	L(ExitHigh8Case3)

L(CopyFrom1To16BytesLess8Case3):
	cmpl	$4, %ebx
	ja	L(ExitHigh4Case3)

	cmpl	$1, %ebx
	je	L(StrlcpyExit1)
	cmpl	$2, %ebx
	je	L(StrlcpyExit2)
	cmpl	$3, %ebx
	je	L(StrlcpyExit3)
	/* Store the byte in %bh at offset 3 (%bh is zero whenever %ebx < 256),
	   then copy 3 bytes.  */
L(StrlcpyExit4):
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	/* Continue scanning at %ecx + 4, measured relative to %edi.  */
	leal	4(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
246
/* Count-only dispatch for %ebx in 5..8: 5..7 are matched explicitly, the
   remaining value falls through to L(StrlcpyExit8).  */
	.p2align 4
L(ExitHigh4Case3):
	cmpl	$5, %ebx
	je	L(StrlcpyExit5)
	cmpl	$6, %ebx
	je	L(StrlcpyExit6)
	cmpl	$7, %ebx
	je	L(StrlcpyExit7)
	/* Store %bh at offset 7, then copy 7 bytes with two overlapping
	   4-byte moves.  */
L(StrlcpyExit8):
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	/* Continue scanning at %ecx + 8, measured relative to %edi.  */
	leal	8(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
267
/* Count-only dispatch for %ebx above 8: values above 12 go to
   L(ExitHigh12Case3), 9..11 are matched explicitly, the remaining value
   falls through to L(StrlcpyExit12).  */
	.p2align 4
L(ExitHigh8Case3):
	cmpl	$12, %ebx
	ja	L(ExitHigh12Case3)

	cmpl	$9, %ebx
	je	L(StrlcpyExit9)
	cmpl	$10, %ebx
	je	L(StrlcpyExit10)
	cmpl	$11, %ebx
	je	L(StrlcpyExit11)
	/* Store %bh at offset 11, then copy 11 bytes (8 + overlapping 4).  */
L(StrlcpyExit12):
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	/* Continue scanning at %ecx + 12, measured relative to %edi.  */
	leal	12(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
291
/* Count-only dispatch for %ebx above 12: 13..15 are matched explicitly, the
   remaining value falls through to L(StrlcpyExit16).  */
	.p2align 4
L(ExitHigh12Case3):
	cmpl	$13, %ebx
	je	L(StrlcpyExit13)
	cmpl	$14, %ebx
	je	L(StrlcpyExit14)
	cmpl	$15, %ebx
	je	L(StrlcpyExit15)
	/* Store %bh at offset 15, then copy 15 bytes with two overlapping
	   8-byte moves.  */
L(StrlcpyExit16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	/* Continue scanning at %ecx + 16, measured relative to %edi.  */
	leal	16(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
312
/* L(StrlcpyExit<n>): store %bh at offset n-1, copy the n-1 bytes before it,
   then hand over to L(CalculateLengthOfSrc) with %edx = %ecx + n and
   %ecx = %edi.
   L(Exit<n>): copy n bytes and return (%ecx + n - 1) - %edi.  */
	.p2align 4
L(StrlcpyExit1):
	movb	%bh, (%edx)

	leal	1(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	movl	%ecx, %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	leal	2(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
343
/* Copy 2 bytes from (%ecx) to (%edx) and return (%ecx + 1) - %edi.
   The former "movl %edi, %eax" was dead: %eax is overwritten by the lea
   below before anything reads it.  */
	.p2align 4
L(Exit2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	leal	1(%ecx), %eax
	subl	%edi, %eax
	RETURN1
353
/* L(StrlcpyExit3): store %bh at offset 2, copy 2 bytes, continue in
   L(CalculateLengthOfSrc) with %edx = %ecx + 3 and %ecx = %edi.
   L(Exit3): copy 3 bytes and return (%ecx + 2) - %edi.  */
	.p2align 4
L(StrlcpyExit3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	leal	3(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	leal	2(%ecx), %eax
	subl	%edi, %eax
	RETURN1
376
/* Store %bh at offset 4, copy 4 bytes, then continue in
   L(CalculateLengthOfSrc) with %edx = %ecx + 5 and %ecx = %edi.
   The former "movl %edi, %eax" was dead: L(CalculateLengthOfSrc) rewrites
   %eax before reading it.  */
	.p2align 4
L(StrlcpyExit5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	leal	5(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)
389
/* L(Exit<n>), n = 5..7: copy n bytes and return (%ecx + n - 1) - %edi.
   L(StrlcpyExit<n>), n = 6..7: store %bh at offset n-1, copy n-1 bytes and
   continue in L(CalculateLengthOfSrc) with %edx = %ecx + n, %ecx = %edi.  */
	.p2align 4
L(Exit5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	leal	4(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	leal	6(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	leal	5(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	leal	7(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	/* 7 bytes are copied with two overlapping 4-byte moves.  */
	.p2align 4
L(Exit7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	leal	6(%ecx), %eax
	subl	%edi, %eax
	RETURN1
450
/* L(StrlcpyExit<n>), n = 9..11: store %bh at offset n-1, copy n-1 bytes and
   continue in L(CalculateLengthOfSrc) with %edx = %ecx + n, %ecx = %edi.
   L(Exit<n>), n = 9..11: copy n bytes and return (%ecx + n - 1) - %edi.  */
	.p2align 4
L(StrlcpyExit9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	leal	9(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	leal	8(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	leal	10(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	leal	9(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	leal	11(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	/* 11 bytes: one 8-byte move plus an overlapping 4-byte move at +7.  */
	.p2align 4
L(Exit11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	leal	10(%ecx), %eax
	subl	%edi, %eax
	RETURN1
523
/* L(StrlcpyExit<n>), n = 13..15: store %bh at offset n-1, copy n-1 bytes and
   continue in L(CalculateLengthOfSrc) with %edx = %ecx + n, %ecx = %edi.
   L(Exit<n>), n = 13..15: copy n bytes (two overlapping 8-byte moves) and
   return (%ecx + n - 1) - %edi.  */
	.p2align 4
L(StrlcpyExit13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	leal	13(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	leal	12(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	leal	14(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	leal	13(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	.p2align 4
L(StrlcpyExit15):
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	leal	15(%ecx), %edx
	movl	%edi, %ecx
	POP (%edi)
	jmp	L(CalculateLengthOfSrc)
	CFI_PUSH (%edi)

	.p2align 4
L(Exit15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	leal	14(%ecx), %eax
	subl	%edi, %eax
	RETURN1

	/* The code that follows runs without %edi saved on the stack.  */
	CFI_POP (%edi)
600
/* Return 0 in %eax.  Uses the xor zeroing idiom instead of a 5-byte
   "movl $0"; the flags it clobbers are not live across the return.
   NOTE(review): this label is not referenced in this file; it is presumably
   used by ssse3-strcpy-atom.S -- confirm there.  */
	.p2align 4
L(StrlcpyExit0):
	xorl	%eax, %eax
	RETURN
605
/* Byte-wise tail for offsets 8..11 of (%ecx).  For each offset k: a zero
   byte there exits through L(ExitTail<k+1>); otherwise, if %ebx == k+1,
   the copy is cut short via L(StrlcpyExitTail<k+1>).  Counts above 12 are
   handled by L(StrncpyExit15Bytes1).
   NOTE(review): offsets 0..7 are not inspected here; presumably the caller
   has already checked them -- confirm.  */
	.p2align 4
L(StrncpyExit15Bytes):
	cmpl	$12, %ebx
	ja	L(StrncpyExit15Bytes1)

	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmpl	$9, %ebx
	je	L(StrlcpyExitTail9)

	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmpl	$10, %ebx
	je	L(StrlcpyExitTail10)

	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmpl	$11, %ebx
	je	L(StrlcpyExitTail11)

	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	/* Cut short after 11 bytes: store %bh at offset 11, copy 11 bytes.  */
	movb	%bh, 11(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	/* Continue scanning at %ecx + 12; %ecx itself stays the base.  */
	leal	12(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
637
/* Variant of L(StrncpyExit15Bytes) for %ebx > 12: offsets 8..11 are only
   checked for a zero byte, offsets 12..14 are checked for a zero byte and
   against the count.  */
	.p2align 4
L(StrncpyExit15Bytes1):
	cmpb	$0, 8(%ecx)
	jz	L(ExitTail9)
	cmpb	$0, 9(%ecx)
	jz	L(ExitTail10)
	cmpb	$0, 10(%ecx)
	jz	L(ExitTail11)
	cmpb	$0, 11(%ecx)
	jz	L(ExitTail12)

	cmpb	$0, 12(%ecx)
	jz	L(ExitTail13)
	cmpl	$13, %ebx
	je	L(StrlcpyExitTail13)

	cmpb	$0, 13(%ecx)
	jz	L(ExitTail14)
	cmpl	$14, %ebx
	je	L(StrlcpyExitTail14)

	cmpb	$0, 14(%ecx)
	jz	L(ExitTail15)

	/* Cut short after 14 bytes: store %bh at offset 14, copy 14 bytes.  */
	movb	%bh, 14(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	/* Continue scanning at %ecx + 15; %ecx itself stays the base.  */
	leal	15(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
670
/* Byte-wise tail for offsets 0..3 of (%ecx).  %ebx == 0 copies nothing and
   only measures the source.  For each offset k: a zero byte exits through
   L(ExitTail<k+1>); otherwise, if %ebx == k+1, the copy is cut short via
   L(StrlcpyExitTail<k+1>).  Counts above 4 are handled by
   L(StrncpyExit8Bytes1).  */
	.p2align 4
L(StrncpyExit8Bytes):
	cmpl	$4, %ebx
	ja	L(StrncpyExit8Bytes1)

	testl	%ebx, %ebx
	jz	L(StrlcpyExitTail0)

	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmpl	$1, %ebx
	je	L(StrlcpyExitTail1)

	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmpl	$2, %ebx
	je	L(StrlcpyExitTail2)

	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmpl	$3, %ebx
	je	L(StrlcpyExitTail3)

	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	/* Cut short after 3 bytes: store %bh at offset 3, copy 3 bytes.  */
	movb	%bh, 3(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	/* Continue scanning at %ecx + 4; %ecx itself stays the base.  */
	leal	4(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
705
/* Variant of L(StrncpyExit8Bytes) for %ebx > 4: offsets 0..3 are only
   checked for a zero byte, offsets 4..7 are checked for a zero byte and
   against the count.  */
	.p2align 4
L(StrncpyExit8Bytes1):
	cmpb	$0, (%ecx)
	jz	L(ExitTail1)
	cmpb	$0, 1(%ecx)
	jz	L(ExitTail2)
	cmpb	$0, 2(%ecx)
	jz	L(ExitTail3)
	cmpb	$0, 3(%ecx)
	jz	L(ExitTail4)

	cmpb	$0, 4(%ecx)
	jz	L(ExitTail5)
	cmpl	$5, %ebx
	je	L(StrlcpyExitTail5)

	cmpb	$0, 5(%ecx)
	jz	L(ExitTail6)
	cmpl	$6, %ebx
	je	L(StrlcpyExitTail6)

	cmpb	$0, 6(%ecx)
	jz	L(ExitTail7)
	cmpl	$7, %ebx
	je	L(StrlcpyExitTail7)

	cmpb	$0, 7(%ecx)
	jz	L(ExitTail8)

	/* Cut short after 7 bytes: store %bh at offset 7, copy 7 bytes.  */
	movb	%bh, 7(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	/* Continue scanning at %ecx + 8; %ecx itself stays the base.  */
	leal	8(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
743
/* L(StrlcpyExitTail0): write nothing; measure starting at %ecx.
   L(StrlcpyExitTail1): store only the byte in %bh at offset 0, then measure
   starting at %ecx + 1.  In both cases %ecx stays the base.  */
	.p2align 4
L(StrlcpyExitTail0):
	movl	%ecx, %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(StrlcpyExitTail1):
	movb	%bh, (%edx)

	leal	1(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
755
/* Copy 1 byte from (%ecx) to (%edx) and return 0.  Uses the xor zeroing
   idiom instead of "mov $0"; the flags it clobbers are not live across the
   return.  */
	.p2align 4
L(ExitTail1):
	movb	(%ecx), %al
	movb	%al, (%edx)

	xorl	%eax, %eax
	RETURN
763
/* Store %bh at offset 1, copy 1 byte, then measure the source starting at
   %ecx + 2 (with %ecx as the base).  */
	.p2align 4
L(StrlcpyExitTail2):
	movb	%bh, 1(%edx)
	movb	(%ecx), %al
	movb	%al, (%edx)

	leal	2(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
772
/* Copy 2 bytes from (%ecx) to (%edx) and return 1.  The former
   "movl %edx, %eax" was dead: %eax is overwritten by the immediate load
   right after it.  */
	.p2align 4
L(ExitTail2):
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	movl	$1, %eax
	RETURN
781
/* L(StrlcpyExitTail3): store %bh at offset 2, copy 2 bytes, then measure the
   source starting at %ecx + 3.
   L(ExitTail<n>), n = 3..4: copy n bytes and return n - 1.  */
	.p2align 4
L(StrlcpyExitTail3):
	movb	%bh, 2(%edx)
	movw	(%ecx), %ax
	movw	%ax, (%edx)

	leal	3(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail3):
	movw	(%ecx), %ax
	movw	%ax, (%edx)
	movb	2(%ecx), %al
	movb	%al, 2(%edx)

	movl	$2, %eax
	RETURN

	.p2align 4
L(ExitTail4):
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	movl	$3, %eax
	RETURN
808
/* Store %bh at offset 4, copy 4 bytes, then measure the source starting at
   %ecx + 5 (with %ecx as the base).  The former "movl %edx, %eax" was dead:
   L(CalculateLengthOfSrc) rewrites %eax before reading it.  */
	.p2align 4
L(StrlcpyExitTail5):
	movb	%bh, 4(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)

	leal	5(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)
818
/* L(ExitTail<n>), n = 5..8: copy n bytes and return n - 1.
   L(StrlcpyExitTail<n>), n = 6..7: store %bh at offset n-1, copy n-1 bytes,
   then measure the source starting at %ecx + n.  */
	.p2align 4
L(ExitTail5):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	movl	$4, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail6):
	movb	%bh, 5(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movb	4(%ecx), %al
	movb	%al, 4(%edx)

	leal	6(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail6):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	movl	$5, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail7):
	movb	%bh, 6(%edx)
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movw	4(%ecx), %ax
	movw	%ax, 4(%edx)

	leal	7(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail7):
	movl	(%ecx), %eax
	movl	%eax, (%edx)
	movl	3(%ecx), %eax
	movl	%eax, 3(%edx)

	movl	$6, %eax
	RETURN

	.p2align 4
L(ExitTail8):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	movl	$7, %eax
	RETURN
878
/* L(StrlcpyExitTail<n>), n = 9..11: store %bh at offset n-1, copy n-1 bytes,
   then measure the source starting at %ecx + n.
   L(ExitTail<n>), n = 9..12: copy n bytes and return n - 1.  */
	.p2align 4
L(StrlcpyExitTail9):
	movb	%bh, 8(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)

	leal	9(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail9):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	movl	$8, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail10):
	movb	%bh, 9(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movb	8(%ecx), %al
	movb	%al, 8(%edx)

	leal	10(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail10):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	movl	$9, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail11):
	movb	%bh, 10(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movw	8(%ecx), %ax
	movw	%ax, 8(%edx)

	leal	11(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail11):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	7(%ecx), %eax
	movl	%eax, 7(%edx)

	movl	$10, %eax
	RETURN

	.p2align 4
L(ExitTail12):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	movl	$11, %eax
	RETURN
949
/* L(StrlcpyExitTail<n>), n = 13, 14, 16: store %bh at offset n-1, copy n-1
   bytes, then measure the source starting at %ecx + n.
   L(ExitTail<n>), n = 13..16: copy n bytes and return n - 1.  */
	.p2align 4
L(StrlcpyExitTail13):
	movb	%bh, 12(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movl	8(%ecx), %eax
	movl	%eax, 8(%edx)

	leal	13(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail13):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	movl	$12, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail14):
	movb	%bh, 13(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	5(%ecx), %xmm0
	movlpd	%xmm0, 5(%edx)

	leal	14(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail14):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	6(%ecx), %xmm0
	movlpd	%xmm0, 6(%edx)

	movl	$13, %eax
	RETURN

	.p2align 4
L(ExitTail15):
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	movl	$14, %eax
	RETURN

	.p2align 4
L(StrlcpyExitTail16):
	movb	%bh, 15(%edx)
	movlpd	(%ecx), %xmm0
	movlpd	%xmm0, (%edx)
	movlpd	7(%ecx), %xmm0
	movlpd	%xmm0, 7(%edx)

	leal	16(%ecx), %edx
	jmp	L(CalculateLengthOfSrc)

	.p2align 4
L(ExitTail16):
	movlpd	(%ecx), %xmm0
	movlpd	8(%ecx), %xmm1
	movlpd	%xmm0, (%edx)
	movlpd	%xmm1, 8(%edx)

	movl	$15, %eax
	RETURN
1022
/* Find the first zero byte at or after %edx and return its address minus
   %ecx in %eax.
     In:  %edx = address to start scanning, %ecx = base to subtract.
   The first 16 bytes are tested one by one.  After that the scan moves in
   16-byte aligned steps: %eax is advanced *before* each branch to L(exit),
   so %ecx is biased by 16 up front to compensate.  */
	.p2align 4
L(CalculateLengthOfSrc):
	xorl	%eax, %eax
	cmpb	$0, (%edx)
	jz	L(exit_tail0)
	cmpb	$0, 1(%edx)
	jz	L(exit_tail1)
	cmpb	$0, 2(%edx)
	jz	L(exit_tail2)
	cmpb	$0, 3(%edx)
	jz	L(exit_tail3)

	cmpb	$0, 4(%edx)
	jz	L(exit_tail4)
	cmpb	$0, 5(%edx)
	jz	L(exit_tail5)
	cmpb	$0, 6(%edx)
	jz	L(exit_tail6)
	cmpb	$0, 7(%edx)
	jz	L(exit_tail7)

	cmpb	$0, 8(%edx)
	jz	L(exit_tail8)
	cmpb	$0, 9(%edx)
	jz	L(exit_tail9)
	cmpb	$0, 10(%edx)
	jz	L(exit_tail10)
	cmpb	$0, 11(%edx)
	jz	L(exit_tail11)

	cmpb	$0, 12(%edx)
	jz	L(exit_tail12)
	cmpb	$0, 13(%edx)
	jz	L(exit_tail13)
	cmpb	$0, 14(%edx)
	jz	L(exit_tail14)
	cmpb	$0, 15(%edx)
	jz	L(exit_tail15)

	/* %eax = first 16-byte boundary above %edx; every byte below it has
	   already been tested.  */
	leal	16(%edx), %eax
	andl	$-16, %eax
	addl	$16, %ecx
	pxor	%xmm0, %xmm0

	/* First three probes also zero the register used by the next probe.  */
	pcmpeqb	(%eax), %xmm0
	pmovmskb %xmm0, %edx
	pxor	%xmm1, %xmm1
	leal	16(%eax), %eax
	testl	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm1
	pmovmskb %xmm1, %edx
	pxor	%xmm2, %xmm2
	leal	16(%eax), %eax
	testl	%edx, %edx
	jnz	L(exit)

	pcmpeqb	(%eax), %xmm2
	pmovmskb %xmm2, %edx
	pxor	%xmm3, %xmm3
	leal	16(%eax), %eax
	testl	%edx, %edx
	jnz	L(exit)

/* One 16-byte probe.  XMM must be all-zero on entry; it is still all-zero
   afterwards whenever the branch is not taken (no byte compared equal).  */
#define STRLCPY_PROBE16(XMM)		\
	pcmpeqb	(%eax), XMM;		\
	pmovmskb XMM, %edx;		\
	leal	16(%eax), %eax;		\
	testl	%edx, %edx;		\
	jnz	L(exit)

	STRLCPY_PROBE16 (%xmm3)

	STRLCPY_PROBE16 (%xmm0)
	STRLCPY_PROBE16 (%xmm1)
	STRLCPY_PROBE16 (%xmm2)
	STRLCPY_PROBE16 (%xmm3)

	STRLCPY_PROBE16 (%xmm0)
	STRLCPY_PROBE16 (%xmm1)
	STRLCPY_PROBE16 (%xmm2)
	STRLCPY_PROBE16 (%xmm3)

	STRLCPY_PROBE16 (%xmm0)
	STRLCPY_PROBE16 (%xmm1)
	STRLCPY_PROBE16 (%xmm2)
	STRLCPY_PROBE16 (%xmm3)

#undef STRLCPY_PROBE16

	/* Round down to a 64-byte boundary for the loop that follows; blocks
	   that get re-read were already found to contain no zero byte.  */
	andl	$-0x40, %eax
1167
/* Scan 64 aligned bytes per iteration.  The byte-wise minimum of the four
   16-byte blocks contains a zero byte iff any block does.  %xmm3 must be
   all-zero on entry and stays zero until a compare finds a match.
   After the loop %eax points 64 bytes past the block group, so the four
   blocks are re-tested in order while %ecx is adjusted so that the
   "%eax - %ecx" in L(exit) yields the offset of the matching block.  */
	.p2align 4
L(aligned_64_loop):
	movaps	(%eax), %xmm0
	movaps	16(%eax), %xmm1
	movaps	32(%eax), %xmm2
	movaps	48(%eax), %xmm6
	pminub	%xmm1, %xmm0
	pminub	%xmm6, %xmm2
	pminub	%xmm0, %xmm2
	pcmpeqb	%xmm3, %xmm2
	pmovmskb %xmm2, %edx
	leal	64(%eax), %eax
	testl	%edx, %edx
	jz	L(aligned_64_loop)

	/* Block 0 (reloaded from memory: %xmm0 was overwritten by pminub).  */
	pcmpeqb	-64(%eax), %xmm3
	pmovmskb %xmm3, %edx
	leal	48(%ecx), %ecx
	testl	%edx, %edx
	jnz	L(exit)

	/* Block 1 is still intact in %xmm1.  */
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %edx
	leal	-16(%ecx), %ecx
	testl	%edx, %edx
	jnz	L(exit)

	/* Block 2 (reloaded from memory: %xmm2 was overwritten).  */
	pcmpeqb	-32(%eax), %xmm3
	pmovmskb %xmm3, %edx
	leal	-16(%ecx), %ecx
	testl	%edx, %edx
	jnz	L(exit)

	/* Block 3 is still intact in %xmm6; fall through into L(exit).  */
	pcmpeqb	%xmm6, %xmm3
	pmovmskb %xmm3, %edx
	leal	-16(%ecx), %ecx
1204
/* Turn the 16-bit match mask in %dx into a byte index and add it to
   %eax - %ecx.  The lowest set bit is located nibble by nibble; within a
   nibble bits 0..2 are tested and the top bit is the fall-through case.
   %cl/%ch are used as scratch once %ecx has been subtracted.  */
	.p2align 4
L(exit):
	subl	%ecx, %eax
	testb	%dl, %dl
	jz	L(exit_more_8)

	movb	%dl, %cl
	andb	$15, %cl
	jz	L(exit_more_4)
	testb	$0x01, %dl
	jnz	L(exit_0)
	testb	$0x02, %dl
	jnz	L(exit_1)
	testb	$0x04, %dl
	jnz	L(exit_2)
	addl	$3, %eax
	RETURN

	.p2align 4
L(exit_more_4):
	testb	$0x10, %dl
	jnz	L(exit_4)
	testb	$0x20, %dl
	jnz	L(exit_5)
	testb	$0x40, %dl
	jnz	L(exit_6)
	addl	$7, %eax
	RETURN

	.p2align 4
L(exit_more_8):
	movb	%dh, %ch
	andb	$15, %ch
	jz	L(exit_more_12)
	testb	$0x01, %dh
	jnz	L(exit_8)
	testb	$0x02, %dh
	jnz	L(exit_9)
	testb	$0x04, %dh
	jnz	L(exit_10)
	addl	$11, %eax
	RETURN

	.p2align 4
L(exit_more_12):
	testb	$0x10, %dh
	jnz	L(exit_12)
	testb	$0x20, %dh
	jnz	L(exit_13)
	testb	$0x40, %dh
	jnz	L(exit_14)
	addl	$15, %eax
	/* Shared return: also the target for index 0 (nothing to add).  */
L(exit_0):
	RETURN
1259
/* L(exit_<k>): add byte index k to the result in %eax and return.  */
	.p2align 4
L(exit_1):
	addl	$1, %eax
	RETURN

L(exit_2):
	addl	$2, %eax
	RETURN

L(exit_3):
	addl	$3, %eax
	RETURN

L(exit_4):
	addl	$4, %eax
	RETURN

L(exit_5):
	addl	$5, %eax
	RETURN

L(exit_6):
	addl	$6, %eax
	RETURN

L(exit_7):
	addl	$7, %eax
	RETURN

L(exit_8):
	addl	$8, %eax
	RETURN

L(exit_9):
	addl	$9, %eax
	RETURN

L(exit_10):
	addl	$10, %eax
	RETURN

L(exit_11):
	addl	$11, %eax
	RETURN

L(exit_12):
	addl	$12, %eax
	RETURN

L(exit_13):
	addl	$13, %eax
	RETURN

L(exit_14):
	addl	$14, %eax
	RETURN

L(exit_15):
	addl	$15, %eax
	RETURN
1320
/* L(exit_tail<k>): the zero byte is at %edx + k; return (%edx + k) - %ecx.  */
L(exit_tail0):
	movl	%edx, %eax
	subl	%ecx, %eax
	RETURN

	.p2align 4
L(exit_tail1):
	leal	1(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail2):
	leal	2(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail3):
	leal	3(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail4):
	leal	4(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail5):
	leal	5(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail6):
	leal	6(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail7):
	leal	7(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail8):
	leal	8(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail9):
	leal	9(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail10):
	leal	10(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail11):
	leal	11(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail12):
	leal	12(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail13):
	leal	13(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail14):
	leal	14(%edx), %eax
	subl	%ecx, %eax
	RETURN

L(exit_tail15):
	leal	15(%edx), %eax
	subl	%ecx, %eax
	RETURN

END (STRCPY)
1403