blob: 8ba84bca434b65466baac00e39848f1cacb4dedf [file] [log] [blame]
Liubov Dmitrieva0a490662012-01-17 12:55:46 +04001/*
2Copyright (c) 2011, Intel Corporation
3All rights reserved.
4
5Redistribution and use in source and binary forms, with or without
6modification, are permitted provided that the following conditions are met:
7
8 * Redistributions of source code must retain the above copyright notice,
9 * this list of conditions and the following disclaimer.
10
11 * Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14
15 * Neither the name of Intel Corporation nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18
19THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
23ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29*/
30
/* Stand-alone wcscpy prelude: helper macros plus the function entry.
   Everything down to the matching #endif is skipped when USE_AS_WCSCAT is
   defined.  NOTE(review): in that configuration the code after the #endif is
   presumably included by a wcscat implementation that has already loaded
   %ecx (source), %edx (destination), %edi (return value) and pushed %edi --
   confirm against the including file.  */
31#ifndef USE_AS_WCSCAT
32
/* L(name) produces an assembler-local ".Lname" label.  */
33# ifndef L
34# define L(label) .L##label
35# endif
36
/* Thin wrappers around the assembler's CFI (unwind info) directives.  */
37# ifndef cfi_startproc
38# define cfi_startproc .cfi_startproc
39# endif
40
41# ifndef cfi_endproc
42# define cfi_endproc .cfi_endproc
43# endif
44
45# ifndef cfi_rel_offset
46# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
47# endif
48
49# ifndef cfi_restore
50# define cfi_restore(reg) .cfi_restore reg
51# endif
52
53# ifndef cfi_adjust_cfa_offset
54# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
55# endif
56
/* ENTRY(name): declare a global, 16-byte aligned function symbol and open
   its CFI region.  */
57# ifndef ENTRY
58# define ENTRY(name) \
59 .type name, @function; \
60 .globl name; \
61 .p2align 4; \
62name: \
63 cfi_startproc
64# endif
65
/* END(name): close the CFI region and record the symbol size.  */
66# ifndef END
67# define END(name) \
68 cfi_endproc; \
69 .size name, .-name
70# endif
71
/* CFI bookkeeping for a 4-byte push / pop of REG.  These emit unwind
   annotations only, no machine instructions.  */
72# define CFI_PUSH(REG) \
73 cfi_adjust_cfa_offset (4); \
74 cfi_rel_offset (REG, 0)
75
76# define CFI_POP(REG) \
77 cfi_adjust_cfa_offset (-4); \
78 cfi_restore (REG)
79
/* Real push / pop paired with the matching CFI annotation.  */
80# define PUSH(REG) pushl REG; CFI_PUSH (REG)
81# define POP(REG) popl REG; CFI_POP (REG)
82
/* PARMS: stack offset of the first argument at function entry (just above
   the return address).
   RETURN: restore %edi and return; the trailing CFI_PUSH re-establishes the
   "%edi is still on the stack" unwind state for the code that follows the
   ret in the file.  */
83# define PARMS 4
84# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
85
/* Argument offsets relative to %esp at entry: STR1 = destination,
   STR2 = source.  LEN is defined but not referenced in this file.  */
86# define STR1 PARMS
87# define STR2 STR1+4
88# define LEN STR2+4
89
/* wcscpy: copy the zero-terminated string of 4-byte wide characters at
   STR2(%esp) to the buffer at STR1(%esp), terminator included, and return
   the destination pointer in %eax.

   Registers: %edx = destination cursor, %ecx = source cursor, %edi = saved
   destination (return value for the long paths), %esi = scratch/offset,
   %xmm0-%xmm7 = data.  %edi and %esi are pushed and restored; %eax, %ecx,
   %edx and the xmm registers are overwritten.  Uses palignr (SSSE3).

   NOTE(review): the aligned loads further down appear to assume both
   pointers are at least 4-byte aligned -- confirm with callers.  */
90.text
91ENTRY (wcscpy)
92 mov STR1(%esp), %edx /* edx = destination */
93 mov STR2(%esp), %ecx /* ecx = source */
94
/* Short-string fast path: if one of the first four wide characters is the
   terminator, copy 4/8/12/16 bytes and return without touching the stack.
   The 'l' suffix is required: an immediate compared with a memory operand
   gives the assembler no register from which to infer the 32-bit operand
   size.  */
95 cmpl $0, (%ecx)
96 jz L(ExitTail4)
97 cmpl $0, 4(%ecx)
98 jz L(ExitTail8)
99 cmpl $0, 8(%ecx)
100 jz L(ExitTail12)
101 cmpl $0, 12(%ecx)
102 jz L(ExitTail16)
103
/* Long path: keep the original destination in %edi for the return value.  */
104 PUSH (%edi)
105 mov %edx, %edi
106#endif
/* Common body.  On entry: %ecx = source, %edx = destination, and %edi has
   already been pushed (the exit paths below pop it and return it in %eax).
   This section copies the first 16 bytes, checks the first 16-byte aligned
   source block for a terminator, aligns the destination to 16 bytes and
   dispatches on the remaining source misalignment.  */
107 PUSH (%esi)
108 lea 16(%ecx), %esi
109
110 and $-16, %esi /* esi = 16-byte aligned address in (ecx, ecx+16] */
111
112 pxor %xmm0, %xmm0 /* xmm0 = 0 */
113 pcmpeqd (%esi), %xmm0 /* per-dword "== 0" mask of the aligned block at esi */
114 movdqu (%ecx), %xmm1
115 movdqu %xmm1, (%edx) /* copy the first 16 bytes unaligned */
116
117 pmovmskb %xmm0, %eax /* eax = 16-bit byte mask, 4 bits per wide char */
118 sub %ecx, %esi /* esi = byte offset of the aligned block from ecx */
119
120 test %eax, %eax
121 jnz L(CopyFrom1To16Bytes) /* terminator inside that block: finish from offset esi */
122
/* No terminator found, so xmm0 is all zero again.  Round the destination up
   to the next 16-byte boundary and advance the source by the same amount.  */
123 mov %edx, %eax
124 lea 16(%edx), %edx
125 and $-16, %edx /* edx = aligned destination */
126 sub %edx, %eax /* eax = old edx - new edx (negative advance) */
127
128 sub %eax, %ecx /* ecx advances by the same number of bytes */
129 mov %ecx, %eax
130 and $0xf, %eax /* eax = source offset within its 16-byte block; sets ZF */
131 mov $0, %esi /* mov (not xor) so the flags from the 'and' survive */
132
133 jz L(Align16Both) /* source is 16-byte aligned too */
134 cmp $4, %eax
135 je L(Shl4)
136 cmp $8, %eax
137 je L(Shl8)
138 jmp L(Shl12) /* any other offset is handled as 12 -- presumably only 4/8/12 occur for wchar-aligned pointers; verify */
139
/* Source and destination are both 16-byte aligned; %esi = 0 and %xmm0 = 0 on
   entry.  Software-pipelined copy: each step loads the next source block,
   stores the previously checked block, then tests the new block for a zero
   wide character.  %esi is the running byte offset; on a hit it is the
   offset of the block holding the terminator, which L(CopyFrom1To16Bytes)
   adds to both pointers.  A compare that finds nothing leaves %xmm0 zero for
   the next step.  */
140L(Align16Both):
141 movaps (%ecx), %xmm1
142 movaps 16(%ecx), %xmm2
143 movaps %xmm1, (%edx) /* block 0 was already known to be terminator-free */
144 pcmpeqd %xmm2, %xmm0
145 pmovmskb %xmm0, %eax
146 lea 16(%esi), %esi /* esi = 16 */
147
148 test %eax, %eax
149 jnz L(CopyFrom1To16Bytes)
150
151 movaps 16(%ecx, %esi), %xmm3
152 movaps %xmm2, (%edx, %esi)
153 pcmpeqd %xmm3, %xmm0
154 pmovmskb %xmm0, %eax
155 lea 16(%esi), %esi /* esi = 32 */
156
157 test %eax, %eax
158 jnz L(CopyFrom1To16Bytes)
159
160 movaps 16(%ecx, %esi), %xmm4
161 movaps %xmm3, (%edx, %esi)
162 pcmpeqd %xmm4, %xmm0
163 pmovmskb %xmm0, %eax
164 lea 16(%esi), %esi /* esi = 48 */
165
166 test %eax, %eax
167 jnz L(CopyFrom1To16Bytes)
168
169 movaps 16(%ecx, %esi), %xmm1
170 movaps %xmm4, (%edx, %esi)
171 pcmpeqd %xmm1, %xmm0
172 pmovmskb %xmm0, %eax
173 lea 16(%esi), %esi /* esi = 64 */
174
175 test %eax, %eax
176 jnz L(CopyFrom1To16Bytes)
177
178 movaps 16(%ecx, %esi), %xmm2
179 movaps %xmm1, (%edx, %esi)
180 pcmpeqd %xmm2, %xmm0
181 pmovmskb %xmm0, %eax
182 lea 16(%esi), %esi /* esi = 80 */
183
184 test %eax, %eax
185 jnz L(CopyFrom1To16Bytes)
186
187 movaps 16(%ecx, %esi), %xmm3
188 movaps %xmm2, (%edx, %esi)
189 pcmpeqd %xmm3, %xmm0
190 pmovmskb %xmm0, %eax
191 lea 16(%esi), %esi /* esi = 96 */
192
193 test %eax, %eax
194 jnz L(CopyFrom1To16Bytes)
195
/* 112 bytes copied without finding a terminator.  Round the source position
   down to a 64-byte boundary and move the destination by the same delta, so
   the 64-byte loop may store some bytes a second time.  */
196 movaps %xmm3, (%edx, %esi)
197 mov %ecx, %eax
198 lea 16(%ecx, %esi), %ecx
199 and $-0x40, %ecx /* ecx = (ecx + 112) rounded down to 64 */
200 sub %ecx, %eax /* eax = old ecx - new ecx */
201 sub %eax, %edx /* edx moves by the same distance as ecx */
202
203 mov $-0x40, %esi /* offset of the current 64-byte chunk relative to the post-incremented pointers */
204
/* Main loop for mutually aligned buffers: 64 source bytes per iteration.
   The four blocks are kept in xmm4..xmm7; a byte-wise minimum of all four
   (pminub) is compared against zero dword-wise, so a single mask test covers
   the whole chunk.  %xmm0 stays zero throughout.  Both pointers are advanced
   before the test, hence the negative store offsets.  */
205L(Aligned64Loop):
206 movaps (%ecx), %xmm2
207 movaps 32(%ecx), %xmm3
208 movaps %xmm2, %xmm4 /* xmm4 = block 0 */
209 movaps 16(%ecx), %xmm5 /* xmm5 = block 1 */
210 movaps %xmm3, %xmm6 /* xmm6 = block 2 */
211 movaps 48(%ecx), %xmm7 /* xmm7 = block 3 */
212 pminub %xmm5, %xmm2
213 pminub %xmm7, %xmm3
214 pminub %xmm2, %xmm3 /* xmm3 = byte-wise min of all four blocks */
215 lea 64(%edx), %edx
216 pcmpeqd %xmm0, %xmm3
217 lea 64(%ecx), %ecx
218 pmovmskb %xmm3, %eax
219
220 test %eax, %eax
221 jnz L(Aligned64Leave) /* possible terminator somewhere in this chunk */
222 movaps %xmm4, -64(%edx)
223 movaps %xmm5, -48(%edx)
224 movaps %xmm6, -32(%edx)
225 movaps %xmm7, -16(%edx)
226 jmp L(Aligned64Loop)
227
/* The combined test of L(Aligned64Loop) fired.  Check the four saved blocks
   (xmm4..xmm7) one at a time, storing each block that proves clean.  %esi
   starts at -0x40 and is bumped by 16 per block so that, on a hit,
   L(CopyFrom1To16Bytes) can locate the block relative to the already
   advanced %ecx / %edx.  */
228L(Aligned64Leave):
229 pcmpeqd %xmm4, %xmm0
230 pmovmskb %xmm0, %eax
231 test %eax, %eax
232 jnz L(CopyFrom1To16Bytes) /* terminator in block 0 (esi = -64) */
233
234 pcmpeqd %xmm5, %xmm0
235 pmovmskb %xmm0, %eax
236 movaps %xmm4, -64(%edx)
237 lea 16(%esi), %esi
238 test %eax, %eax
239 jnz L(CopyFrom1To16Bytes) /* terminator in block 1 (esi = -48) */
240
241 pcmpeqd %xmm6, %xmm0
242 pmovmskb %xmm0, %eax
243 movaps %xmm5, -48(%edx)
244 lea 16(%esi), %esi
245 test %eax, %eax
246 jnz L(CopyFrom1To16Bytes) /* terminator in block 2 (esi = -32) */
247
248 movaps %xmm6, -32(%edx)
249 pcmpeqd %xmm7, %xmm0
250 pmovmskb %xmm0, %eax
251 lea 16(%esi), %esi
252 test %eax, %eax
253 jnz L(CopyFrom1To16Bytes) /* terminator in block 3 (esi = -16) */
254
/* No individual block holds a zero wide character: the byte-wise minimum can
   produce a zero dword from zero bytes that belong to different blocks.
   Store the last block, reset the offset and resume the loop.  */
255 mov $-0x40, %esi
256 movaps %xmm7, -16(%edx)
257 jmp L(Aligned64Loop)
258
/* Source is 4 bytes past a 16-byte boundary while the destination is
   16-byte aligned.  All loads are aligned (from -4(%ecx), 12(%ecx), ...);
   palignr $4 stitches two neighbouring aligned blocks into the 16 source
   bytes starting at %ecx, which are then stored aligned.  %xmm0 is zero on
   entry and after every compare that finds nothing.  */
259 .p2align 4
260L(Shl4):
261 movaps -4(%ecx), %xmm1 /* aligned block containing the bytes at ecx .. ecx+11 */
262 movaps 12(%ecx), %xmm2 /* next aligned block */
/* Checked, 4x unrolled section; also re-entered from L(Shl4LoopStart) with
   xmm1 = previous block and xmm2 = block at 12(%ecx).  xmm1 and xmm3
   alternate as the "previous block" operand.  */
263L(Shl4Start):
264 pcmpeqd %xmm2, %xmm0
265 pmovmskb %xmm0, %eax
266 movaps %xmm2, %xmm3
267
268 test %eax, %eax
269 jnz L(Shl4LoopExit)
270
271 palignr $4, %xmm1, %xmm2 /* xmm2 = source bytes ecx .. ecx+15 */
272 movaps %xmm2, (%edx)
273 movaps 28(%ecx), %xmm2
274
275 pcmpeqd %xmm2, %xmm0
276 lea 16(%edx), %edx
277 pmovmskb %xmm0, %eax
278 lea 16(%ecx), %ecx
279 movaps %xmm2, %xmm1
280
281 test %eax, %eax
282 jnz L(Shl4LoopExit)
283
284 palignr $4, %xmm3, %xmm2
285 movaps %xmm2, (%edx)
286 movaps 28(%ecx), %xmm2
287
288 pcmpeqd %xmm2, %xmm0
289 lea 16(%edx), %edx
290 pmovmskb %xmm0, %eax
291 lea 16(%ecx), %ecx
292 movaps %xmm2, %xmm3
293
294 test %eax, %eax
295 jnz L(Shl4LoopExit)
296
297 palignr $4, %xmm1, %xmm2
298 movaps %xmm2, (%edx)
299 movaps 28(%ecx), %xmm2
300
301 pcmpeqd %xmm2, %xmm0
302 lea 16(%edx), %edx
303 pmovmskb %xmm0, %eax
304 lea 16(%ecx), %ecx
305
306 test %eax, %eax
307 jnz L(Shl4LoopExit)
308
309 palignr $4, %xmm3, %xmm2
310 movaps %xmm2, (%edx)
311 lea 28(%ecx), %ecx /* ecx = address of the next aligned source block */
312 lea 16(%edx), %edx
313
/* Round the aligned source address down to a 64-byte boundary, move the
   destination back by the same amount (already copied bytes may be stored
   again), then restore the "+12 is aligned" bias on %ecx.  */
314 mov %ecx, %eax
315 and $-0x40, %ecx
316 sub %ecx, %eax /* eax = bytes stepped back */
317 lea -12(%ecx), %ecx /* now 12(%ecx) is 64-byte aligned */
318 sub %eax, %edx
319
320 movaps -4(%ecx), %xmm1 /* previous aligned block for the first palignr */
321
/* 64 bytes per iteration.  Byte-wise minimum of the four aligned blocks is
   compared against zero dword-wise; on any hit control returns to the
   checked section at L(Shl4Start), which re-examines block by block.  */
322L(Shl4LoopStart):
323 movaps 12(%ecx), %xmm2
324 movaps 28(%ecx), %xmm3
325 movaps %xmm3, %xmm6
326 movaps 44(%ecx), %xmm4
327 movaps %xmm4, %xmm7
328 movaps 60(%ecx), %xmm5
329 pminub %xmm2, %xmm6
330 pminub %xmm5, %xmm7
331 pminub %xmm6, %xmm7 /* xmm7 = byte-wise min of the four blocks */
332 pcmpeqd %xmm0, %xmm7
333 pmovmskb %xmm7, %eax
334 movaps %xmm5, %xmm7 /* keep the unshifted last block for the next iteration */
335 palignr $4, %xmm4, %xmm5
336 palignr $4, %xmm3, %xmm4
337 test %eax, %eax
338 jnz L(Shl4Start)
339
340 palignr $4, %xmm2, %xmm3
341 lea 64(%ecx), %ecx
342 palignr $4, %xmm1, %xmm2
343 movaps %xmm7, %xmm1
344 movaps %xmm5, 48(%edx)
345 movaps %xmm4, 32(%edx)
346 movaps %xmm3, 16(%edx)
347 movaps %xmm2, (%edx)
348 lea 64(%edx), %edx
349 jmp L(Shl4LoopStart)
350
/* The aligned block at 12(%ecx) holds the terminator and %eax is its byte
   mask (4 bits per wide character).  First copy the 12 bytes at %ecx that
   precede that block, then copy up to and including the terminator.  */
351L(Shl4LoopExit):
352 movlpd (%ecx), %xmm0
353 movl 8(%ecx), %esi
354 movlpd %xmm0, (%edx)
355 movl %esi, 8(%edx)
356 POP (%esi)
357 add $12, %edx
358 add $12, %ecx
359 test %al, %al
360 jz L(ExitHigh) /* terminator is the 3rd or 4th wide char */
361 test $0x01, %al
362 jnz L(Exit4) /* terminator is the 1st wide char */
363 movlpd (%ecx), %xmm0 /* terminator is the 2nd wide char: copy 8 bytes */
364 movlpd %xmm0, (%edx)
365 movl %edi, %eax /* return value kept in edi */
366 RETURN
367
/* Unwind annotation only: the code that follows runs with %esi still on the
   stack.  */
368 CFI_PUSH (%esi)
369
/* Source is 8 bytes past a 16-byte boundary while the destination is
   16-byte aligned.  Aligned loads come from -8(%ecx), 8(%ecx), ...;
   palignr $8 combines two neighbouring aligned blocks into the 16 source
   bytes starting at %ecx.  %xmm0 is zero on entry and after every compare
   that finds nothing.  */
370 .p2align 4
371L(Shl8):
372 movaps -8(%ecx), %xmm1 /* aligned block containing the bytes at ecx .. ecx+7 */
373 movaps 8(%ecx), %xmm2 /* next aligned block */
/* Checked, 4x unrolled section; also re-entered from L(Shl8LoopStart) with
   xmm1 = previous block and xmm2 = block at 8(%ecx).  xmm1 and xmm3
   alternate as the "previous block" operand.  */
374L(Shl8Start):
375 pcmpeqd %xmm2, %xmm0
376 pmovmskb %xmm0, %eax
377 movaps %xmm2, %xmm3
378
379 test %eax, %eax
380 jnz L(Shl8LoopExit)
381
382 palignr $8, %xmm1, %xmm2 /* xmm2 = source bytes ecx .. ecx+15 */
383 movaps %xmm2, (%edx)
384 movaps 24(%ecx), %xmm2
385
386 pcmpeqd %xmm2, %xmm0
387 lea 16(%edx), %edx
388 pmovmskb %xmm0, %eax
389 lea 16(%ecx), %ecx
390 movaps %xmm2, %xmm1
391
392 test %eax, %eax
393 jnz L(Shl8LoopExit)
394
395 palignr $8, %xmm3, %xmm2
396 movaps %xmm2, (%edx)
397 movaps 24(%ecx), %xmm2
398
399 pcmpeqd %xmm2, %xmm0
400 lea 16(%edx), %edx
401 pmovmskb %xmm0, %eax
402 lea 16(%ecx), %ecx
403 movaps %xmm2, %xmm3
404
405 test %eax, %eax
406 jnz L(Shl8LoopExit)
407
408 palignr $8, %xmm1, %xmm2
409 movaps %xmm2, (%edx)
410 movaps 24(%ecx), %xmm2
411
412 pcmpeqd %xmm2, %xmm0
413 lea 16(%edx), %edx
414 pmovmskb %xmm0, %eax
415 lea 16(%ecx), %ecx
416
417 test %eax, %eax
418 jnz L(Shl8LoopExit)
419
420 palignr $8, %xmm3, %xmm2
421 movaps %xmm2, (%edx)
422 lea 24(%ecx), %ecx /* ecx = address of the next aligned source block */
423 lea 16(%edx), %edx
424
/* Round the aligned source address down to a 64-byte boundary, move the
   destination back by the same amount (already copied bytes may be stored
   again), then restore the "+8 is aligned" bias on %ecx.  */
425 mov %ecx, %eax
426 and $-0x40, %ecx
427 sub %ecx, %eax /* eax = bytes stepped back */
428 lea -8(%ecx), %ecx /* now 8(%ecx) is 64-byte aligned */
429 sub %eax, %edx
430
431 movaps -8(%ecx), %xmm1 /* previous aligned block for the first palignr */
432
/* 64 bytes per iteration.  Byte-wise minimum of the four aligned blocks is
   compared against zero dword-wise; on any hit control returns to the
   checked section at L(Shl8Start), which re-examines block by block.  */
433L(Shl8LoopStart):
434 movaps 8(%ecx), %xmm2
435 movaps 24(%ecx), %xmm3
436 movaps %xmm3, %xmm6
437 movaps 40(%ecx), %xmm4
438 movaps %xmm4, %xmm7
439 movaps 56(%ecx), %xmm5
440 pminub %xmm2, %xmm6
441 pminub %xmm5, %xmm7
442 pminub %xmm6, %xmm7 /* xmm7 = byte-wise min of the four blocks */
443 pcmpeqd %xmm0, %xmm7
444 pmovmskb %xmm7, %eax
445 movaps %xmm5, %xmm7 /* keep the unshifted last block for the next iteration */
446 palignr $8, %xmm4, %xmm5
447 palignr $8, %xmm3, %xmm4
448 test %eax, %eax
449 jnz L(Shl8Start)
450
451 palignr $8, %xmm2, %xmm3
452 lea 64(%ecx), %ecx
453 palignr $8, %xmm1, %xmm2
454 movaps %xmm7, %xmm1
455 movaps %xmm5, 48(%edx)
456 movaps %xmm4, 32(%edx)
457 movaps %xmm3, 16(%edx)
458 movaps %xmm2, (%edx)
459 lea 64(%edx), %edx
460 jmp L(Shl8LoopStart)
461
/* The aligned block at 8(%ecx) holds the terminator and %eax is its byte
   mask (4 bits per wide character).  First copy the 8 bytes at %ecx that
   precede that block, then copy up to and including the terminator.  */
462L(Shl8LoopExit):
463 movlpd (%ecx), %xmm0
464 movlpd %xmm0, (%edx)
465 POP (%esi)
466 add $8, %edx
467 add $8, %ecx
468 test %al, %al
469 jz L(ExitHigh) /* terminator is the 3rd or 4th wide char */
470 test $0x01, %al
471 jnz L(Exit4) /* terminator is the 1st wide char */
472 movlpd (%ecx), %xmm0 /* terminator is the 2nd wide char: copy 8 bytes */
473 movlpd %xmm0, (%edx)
474 movl %edi, %eax /* return value kept in edi */
475 RETURN
476
/* Unwind annotation only: the code that follows runs with %esi still on the
   stack.  */
477 CFI_PUSH (%esi)
478
/* Source is 12 bytes past a 16-byte boundary while the destination is
   16-byte aligned.  Aligned loads come from -12(%ecx), 4(%ecx), ...;
   palignr $12 combines two neighbouring aligned blocks into the 16 source
   bytes starting at %ecx.  %xmm0 is zero on entry and after every compare
   that finds nothing.  */
479 .p2align 4
480L(Shl12):
481 movaps -12(%ecx), %xmm1 /* aligned block containing the bytes at ecx .. ecx+3 */
482 movaps 4(%ecx), %xmm2 /* next aligned block */
/* Checked, 4x unrolled section; also re-entered from L(Shl12LoopStart) with
   xmm1 = previous block and xmm2 = block at 4(%ecx).  xmm1 and xmm3
   alternate as the "previous block" operand.  */
483L(Shl12Start):
484 pcmpeqd %xmm2, %xmm0
485 pmovmskb %xmm0, %eax
486 movaps %xmm2, %xmm3
487
488 test %eax, %eax
489 jnz L(Shl12LoopExit)
490
491 palignr $12, %xmm1, %xmm2 /* xmm2 = source bytes ecx .. ecx+15 */
492 movaps %xmm2, (%edx)
493 movaps 20(%ecx), %xmm2
494
495 pcmpeqd %xmm2, %xmm0
496 lea 16(%edx), %edx
497 pmovmskb %xmm0, %eax
498 lea 16(%ecx), %ecx
499 movaps %xmm2, %xmm1
500
501 test %eax, %eax
502 jnz L(Shl12LoopExit)
503
504 palignr $12, %xmm3, %xmm2
505 movaps %xmm2, (%edx)
506 movaps 20(%ecx), %xmm2
507
508 pcmpeqd %xmm2, %xmm0
509 lea 16(%edx), %edx
510 pmovmskb %xmm0, %eax
511 lea 16(%ecx), %ecx
512 movaps %xmm2, %xmm3
513
514 test %eax, %eax
515 jnz L(Shl12LoopExit)
516
517 palignr $12, %xmm1, %xmm2
518 movaps %xmm2, (%edx)
519 movaps 20(%ecx), %xmm2
520
521 pcmpeqd %xmm2, %xmm0
522 lea 16(%edx), %edx
523 pmovmskb %xmm0, %eax
524 lea 16(%ecx), %ecx
525
526 test %eax, %eax
527 jnz L(Shl12LoopExit)
528
529 palignr $12, %xmm3, %xmm2
530 movaps %xmm2, (%edx)
531 lea 20(%ecx), %ecx /* ecx = address of the next aligned source block */
532 lea 16(%edx), %edx
533
/* Round the aligned source address down to a 64-byte boundary, move the
   destination back by the same amount (already copied bytes may be stored
   again), then restore the "+4 is aligned" bias on %ecx.  */
534 mov %ecx, %eax
535 and $-0x40, %ecx
536 sub %ecx, %eax /* eax = bytes stepped back */
537 lea -4(%ecx), %ecx /* now 4(%ecx) is 64-byte aligned */
538 sub %eax, %edx
539
540 movaps -12(%ecx), %xmm1 /* previous aligned block for the first palignr */
541
/* 64 bytes per iteration.  Byte-wise minimum of the four aligned blocks is
   compared against zero dword-wise; on any hit control returns to the
   checked section at L(Shl12Start), which re-examines block by block.  */
542L(Shl12LoopStart):
543 movaps 4(%ecx), %xmm2
544 movaps 20(%ecx), %xmm3
545 movaps %xmm3, %xmm6
546 movaps 36(%ecx), %xmm4
547 movaps %xmm4, %xmm7
548 movaps 52(%ecx), %xmm5
549 pminub %xmm2, %xmm6
550 pminub %xmm5, %xmm7
551 pminub %xmm6, %xmm7 /* xmm7 = byte-wise min of the four blocks */
552 pcmpeqd %xmm0, %xmm7
553 pmovmskb %xmm7, %eax
554 movaps %xmm5, %xmm7 /* keep the unshifted last block for the next iteration */
555 palignr $12, %xmm4, %xmm5
556 palignr $12, %xmm3, %xmm4
557 test %eax, %eax
558 jnz L(Shl12Start)
559
560 palignr $12, %xmm2, %xmm3
561 lea 64(%ecx), %ecx
562 palignr $12, %xmm1, %xmm2
563 movaps %xmm7, %xmm1
564 movaps %xmm5, 48(%edx)
565 movaps %xmm4, 32(%edx)
566 movaps %xmm3, 16(%edx)
567 movaps %xmm2, (%edx)
568 lea 64(%edx), %edx
569 jmp L(Shl12LoopStart)
570
/* The aligned block at 4(%ecx) holds the terminator and %eax is its byte
   mask.  Copy the single wide character at %ecx that precedes that block,
   set the offset to 4 and fall through into the common tail below.  */
571L(Shl12LoopExit):
572 movl (%ecx), %esi
573 movl %esi, (%edx)
574 mov $4, %esi
575
/* Common tail.  In: %esi = byte offset (may be negative) from %ecx / %edx to
   the 16-byte block containing the terminator, %eax = pmovmskb mask of that
   block (bits 0-3 = 1st wide char, 4-7 = 2nd, 8-11 = 3rd, 12-15 = 4th),
   %esi and %edi saved on the stack.  Copies 4, 8, 12 or 16 bytes so that the
   terminator is included, then returns the value held in %edi.  */
576 .p2align 4
577L(CopyFrom1To16Bytes):
578 add %esi, %edx
579 add %esi, %ecx
580
581 POP (%esi)
582 test %al, %al
583 jz L(ExitHigh) /* terminator is the 3rd or 4th wide char */
584 test $0x01, %al
585 jnz L(Exit4) /* terminator is the 1st wide char */
/* Terminator is the 2nd wide char: copy 8 bytes.  */
586L(Exit8):
587 movlpd (%ecx), %xmm0
588 movlpd %xmm0, (%edx)
589 movl %edi, %eax
590 RETURN
591
/* Final-copy exits for the long paths.  %ecx / %edx point at the last
   16-byte block, %esi has already been popped, and each exit pops %edi via
   RETURN after loading it into %eax as the return value.  */
592 .p2align 4
/* Low mask byte was zero: the terminator is the 3rd wide char (bit 0 of %ah
   set) or, failing that, the 4th.  */
593L(ExitHigh):
594 test $0x01, %ah
595 jnz L(Exit12)
/* Copy 16 bytes (terminator is the 4th wide char).  */
596L(Exit16):
597 movdqu (%ecx), %xmm0
598 movdqu %xmm0, (%edx)
599 movl %edi, %eax
600 RETURN
601
602 .p2align 4
/* Copy 4 bytes (the terminator itself).  */
603L(Exit4):
604 movl (%ecx), %eax
605 movl %eax, (%edx)
606 movl %edi, %eax
607 RETURN
608
609 .p2align 4
/* Copy 12 bytes (terminator is the 3rd wide char).  */
610L(Exit12):
611 movlpd (%ecx), %xmm0
612 movlpd %xmm0, (%edx)
613 movl 8(%ecx), %eax
614 movl %eax, 8(%edx)
615 movl %edi, %eax
616 RETURN
617
/* Short-string exits, reached from the entry checks before anything has
   been pushed.  The CFI_POP is an unwind annotation only (no instruction):
   %edi is not on the stack for the code below.  Each exit copies the
   leading wide characters up to and including the terminator, returns the
   destination pointer still held in %edx, and uses a plain ret.  */
618CFI_POP (%edi)
619
620 .p2align 4
/* Terminator is the 1st wide char: copy 4 bytes.  */
621L(ExitTail4):
622 movl (%ecx), %eax
623 movl %eax, (%edx)
624 movl %edx, %eax
625 ret
626
627 .p2align 4
/* Terminator is the 2nd wide char: copy 8 bytes.  */
628L(ExitTail8):
629 movlpd (%ecx), %xmm0
630 movlpd %xmm0, (%edx)
631 movl %edx, %eax
632 ret
633
634 .p2align 4
/* Terminator is the 3rd wide char: copy 12 bytes.  */
635L(ExitTail12):
636 movlpd (%ecx), %xmm0
637 movlpd %xmm0, (%edx)
638 movl 8(%ecx), %eax
639 movl %eax, 8(%edx)
640 movl %edx, %eax
641 ret
642
643 .p2align 4
/* Terminator is the 4th wide char: copy 16 bytes.  */
644L(ExitTail16):
645 movdqu (%ecx), %xmm0
646 movdqu %xmm0, (%edx)
647 movl %edx, %eax
648 ret
649
/* Close the function only in the stand-alone build; with USE_AS_WCSCAT the
   symbol is presumably closed by the including file -- TODO confirm.  */
650#ifndef USE_AS_WCSCAT
651END (wcscpy)
652#endif