/*
Source-viewer provenance (not part of the original file):
blob: 013af9b663de3811cceeaa722f3413265e7d0dbb
Liubov Dmitrieva 0a49066 2012-01-17 12:55:46 +0400
*/
/*
Copyright (c) 2011, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
/* Portability shims.  Each macro is only defined when the including
   environment has not already provided it.  */

/* L(name) builds a function-local label with the ".L" prefix.  */
#ifndef L
# define L(label)	.L##label
#endif

/* cfi_* wrappers: map the helper names used below onto the assembler's
   .cfi_* call-frame-information directives.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ENTRY(name): declare NAME as a global function symbol, align it to a
   16-byte boundary, emit its label and open its CFI region.  */
#ifndef ENTRY
# define ENTRY(name)			\
	.type name, @function;		\
	.globl name;			\
	.p2align 4;			\
name:					\
	cfi_startproc
#endif

/* END(name): close the CFI region opened by ENTRY and record the
   symbol size as the distance from the label to this point.  */
#ifndef END
# define END(name)			\
	cfi_endproc;			\
	.size name, .-name
#endif
69
/* CFI bookkeeping for a 4-byte push: the CFA moves 4 bytes further from
   %esp and REG is now saved at offset 0 from the stack pointer.  */
#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

/* CFI bookkeeping for the matching 4-byte pop.  */
#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)

/* Push/pop a 32-bit register and keep the unwind info in sync.  */
#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

/* Prologue: %edi is used as the running pointer, so it is saved on entry.  */
#define ENTRANCE	PUSH (%edi);

/* Offset from %esp to the first argument once ENTRANCE has run:
   4 bytes of return address plus the 4 bytes pushed for %edi.  */
#define PARMS		8

/* Epilogue: restore %edi and return.  The trailing CFI_PUSH emits no
   instruction; it re-establishes the "%edi is on the stack" unwind state
   for the code that follows this return in the same function.  */
#define RETURN		POP (%edi); ret; CFI_PUSH (%edi);

/* Stack-argument offsets, used as displacements: STR1(%esp) etc.
   They expand to 8, 8+4 and 8+4+4 respectively.  */
#define STR1		PARMS
#define STR2		STR1+4
#define LEN		STR2+4
88
89 .text
90ENTRY (memchr)
91 ENTRANCE
92 mov STR1(%esp), %ecx
93 movd STR2(%esp), %xmm1
94 mov LEN(%esp), %edx
95 test %edx, %edx
96 jz L(return_null)
97
98 punpcklbw %xmm1, %xmm1
99 mov %ecx, %edi
100 punpcklbw %xmm1, %xmm1
101
102 and $63, %ecx
103 pshufd $0, %xmm1, %xmm1
104 cmp $48, %ecx
105 ja L(crosscache)
106
107 movdqu (%edi), %xmm0
108 pcmpeqb %xmm1, %xmm0
109 pmovmskb %xmm0, %eax
110 test %eax, %eax
111 jnz L(match_case2_prolog)
112
113 sub $16, %edx
114 jbe L(return_null)
115 lea 16(%edi), %edi
116 and $15, %ecx
117 and $-16, %edi
118 add %ecx, %edx
119 sub $64, %edx
120 jbe L(exit_loop)
121 jmp L(loop_prolog)
122
123 .p2align 4
124L(crosscache):
125 and $15, %ecx
126 and $-16, %edi
127 movdqa (%edi), %xmm0
128 pcmpeqb %xmm1, %xmm0
129 pmovmskb %xmm0, %eax
130 sar %cl, %eax
131 test %eax, %eax
132
133 jnz L(match_case2_prolog1)
134 lea -16(%edx), %edx
135 add %ecx, %edx
136 jle L(return_null)
137 lea 16(%edi), %edi
138 sub $64, %edx
139 jbe L(exit_loop)
140
141 .p2align 4
142L(loop_prolog):
143 movdqa (%edi), %xmm0
144 pcmpeqb %xmm1, %xmm0
145 xor %ecx, %ecx
146 pmovmskb %xmm0, %eax
147 test %eax, %eax
148 jnz L(match_case1)
149
150 movdqa 16(%edi), %xmm2
151 pcmpeqb %xmm1, %xmm2
152 lea 16(%ecx), %ecx
153 pmovmskb %xmm2, %eax
154 test %eax, %eax
155 jnz L(match_case1)
156
157 movdqa 32(%edi), %xmm3
158 pcmpeqb %xmm1, %xmm3
159 lea 16(%ecx), %ecx
160 pmovmskb %xmm3, %eax
161 test %eax, %eax
162 jnz L(match_case1)
163
164 movdqa 48(%edi), %xmm4
165 pcmpeqb %xmm1, %xmm4
166 lea 16(%ecx), %ecx
167 pmovmskb %xmm4, %eax
168 test %eax, %eax
169 jnz L(match_case1)
170
171 lea 64(%edi), %edi
172 sub $64, %edx
173 jbe L(exit_loop)
174
175 movdqa (%edi), %xmm0
176 pcmpeqb %xmm1, %xmm0
177 xor %ecx, %ecx
178 pmovmskb %xmm0, %eax
179 test %eax, %eax
180 jnz L(match_case1)
181
182 movdqa 16(%edi), %xmm2
183 pcmpeqb %xmm1, %xmm2
184 lea 16(%ecx), %ecx
185 pmovmskb %xmm2, %eax
186 test %eax, %eax
187 jnz L(match_case1)
188
189 movdqa 32(%edi), %xmm3
190 pcmpeqb %xmm1, %xmm3
191 lea 16(%ecx), %ecx
192 pmovmskb %xmm3, %eax
193 test %eax, %eax
194 jnz L(match_case1)
195
196 movdqa 48(%edi), %xmm4
197 pcmpeqb %xmm1, %xmm4
198 lea 16(%ecx), %ecx
199 pmovmskb %xmm4, %eax
200 test %eax, %eax
201 jnz L(match_case1)
202
203 lea 64(%edi), %edi
204 mov %edi, %ecx
205 and $-64, %edi
206 and $63, %ecx
207 add %ecx, %edx
208
209 .p2align 4
210L(align64_loop):
211 sub $64, %edx
212 jbe L(exit_loop)
213 movdqa (%edi), %xmm0
214 movdqa 16(%edi), %xmm2
215 movdqa 32(%edi), %xmm3
216 movdqa 48(%edi), %xmm4
217 pcmpeqb %xmm1, %xmm0
218 pcmpeqb %xmm1, %xmm2
219 pcmpeqb %xmm1, %xmm3
220 pcmpeqb %xmm1, %xmm4
221
222 pmaxub %xmm0, %xmm3
223 pmaxub %xmm2, %xmm4
224 pmaxub %xmm3, %xmm4
225 add $64, %edi
226 pmovmskb %xmm4, %eax
227
228 test %eax, %eax
229 jz L(align64_loop)
230
231 sub $64, %edi
232
233 pmovmskb %xmm0, %eax
234 xor %ecx, %ecx
235 test %eax, %eax
236 jnz L(match_case1)
237
238 pmovmskb %xmm2, %eax
239 lea 16(%ecx), %ecx
240 test %eax, %eax
241 jnz L(match_case1)
242
243 movdqa 32(%edi), %xmm3
244 pcmpeqb %xmm1, %xmm3
245 pmovmskb %xmm3, %eax
246 lea 16(%ecx), %ecx
247 test %eax, %eax
248 jnz L(match_case1)
249
250 pcmpeqb 48(%edi), %xmm1
251 pmovmskb %xmm1, %eax
252 lea 16(%ecx), %ecx
253
254 .p2align 4
255L(match_case1):
256 add %ecx, %edi
257 test %al, %al
258 jz L(match_case1_high)
259 mov %al, %cl
260 and $15, %cl
261 jz L(match_case1_8)
262 test $0x01, %al
263 jnz L(exit_case1_1)
264 test $0x02, %al
265 jnz L(exit_case1_2)
266 test $0x04, %al
267 jnz L(exit_case1_3)
268 lea 3(%edi), %eax
269 RETURN
270
271 .p2align 4
272L(match_case1_8):
273 test $0x10, %al
274 jnz L(exit_case1_5)
275 test $0x20, %al
276 jnz L(exit_case1_6)
277 test $0x40, %al
278 jnz L(exit_case1_7)
279 lea 7(%edi), %eax
280 RETURN
281
282 .p2align 4
283L(match_case1_high):
284 mov %ah, %ch
285 and $15, %ch
286 jz L(match_case1_high_8)
287 test $0x01, %ah
288 jnz L(exit_case1_9)
289 test $0x02, %ah
290 jnz L(exit_case1_10)
291 test $0x04, %ah
292 jnz L(exit_case1_11)
293 lea 11(%edi), %eax
294 RETURN
295
296 .p2align 4
297L(match_case1_high_8):
298 test $0x10, %ah
299 jnz L(exit_case1_13)
300 test $0x20, %ah
301 jnz L(exit_case1_14)
302 test $0x40, %ah
303 jnz L(exit_case1_15)
304 lea 15(%edi), %eax
305 RETURN
306
307 .p2align 4
308L(exit_loop):
309 add $64, %edx
310
311 movdqa (%edi), %xmm0
312 pcmpeqb %xmm1, %xmm0
313 xor %ecx, %ecx
314 pmovmskb %xmm0, %eax
315 test %eax, %eax
316 jnz L(match_case2)
317 cmp $16, %edx
318 jbe L(return_null)
319
320 movdqa 16(%edi), %xmm2
321 pcmpeqb %xmm1, %xmm2
322 lea 16(%ecx), %ecx
323 pmovmskb %xmm2, %eax
324 test %eax, %eax
325 jnz L(match_case2)
326 cmp $32, %edx
327 jbe L(return_null)
328
329 movdqa 32(%edi), %xmm3
330 pcmpeqb %xmm1, %xmm3
331 lea 16(%ecx), %ecx
332 pmovmskb %xmm3, %eax
333 test %eax, %eax
334 jnz L(match_case2)
335 cmp $48, %edx
336 jbe L(return_null)
337
338 pcmpeqb 48(%edi), %xmm1
339 lea 16(%ecx), %ecx
340 pmovmskb %xmm1, %eax
341 test %eax, %eax
342 jnz L(match_case2)
343
344 xor %eax, %eax
345 RETURN
346
347 .p2align 4
348L(exit_case1_1):
349 mov %edi, %eax
350 RETURN
351
352 .p2align 4
353L(exit_case1_2):
354 lea 1(%edi), %eax
355 RETURN
356
357 .p2align 4
358L(exit_case1_3):
359 lea 2(%edi), %eax
360 RETURN
361
362 .p2align 4
363L(exit_case1_5):
364 lea 4(%edi), %eax
365 RETURN
366
367 .p2align 4
368L(exit_case1_6):
369 lea 5(%edi), %eax
370 RETURN
371
372 .p2align 4
373L(exit_case1_7):
374 lea 6(%edi), %eax
375 RETURN
376
377 .p2align 4
378L(exit_case1_9):
379 lea 8(%edi), %eax
380 RETURN
381
382 .p2align 4
383L(exit_case1_10):
384 lea 9(%edi), %eax
385 RETURN
386
387 .p2align 4
388L(exit_case1_11):
389 lea 10(%edi), %eax
390 RETURN
391
392 .p2align 4
393L(exit_case1_13):
394 lea 12(%edi), %eax
395 RETURN
396
397 .p2align 4
398L(exit_case1_14):
399 lea 13(%edi), %eax
400 RETURN
401
402 .p2align 4
403L(exit_case1_15):
404 lea 14(%edi), %eax
405 RETURN
406
407 .p2align 4
408L(match_case2):
409 sub %ecx, %edx
410L(match_case2_prolog1):
411 add %ecx, %edi
412L(match_case2_prolog):
413 test %al, %al
414 jz L(match_case2_high)
415 mov %al, %cl
416 and $15, %cl
417 jz L(match_case2_8)
418 test $0x01, %al
419 jnz L(exit_case2_1)
420 test $0x02, %al
421 jnz L(exit_case2_2)
422 test $0x04, %al
423 jnz L(exit_case2_3)
424 sub $4, %edx
425 jb L(return_null)
426 lea 3(%edi), %eax
427 RETURN
428
429 .p2align 4
430L(match_case2_8):
431 test $0x10, %al
432 jnz L(exit_case2_5)
433 test $0x20, %al
434 jnz L(exit_case2_6)
435 test $0x40, %al
436 jnz L(exit_case2_7)
437 sub $8, %edx
438 jb L(return_null)
439 lea 7(%edi), %eax
440 RETURN
441
442 .p2align 4
443L(match_case2_high):
444 mov %ah, %ch
445 and $15, %ch
446 jz L(match_case2_high_8)
447 test $0x01, %ah
448 jnz L(exit_case2_9)
449 test $0x02, %ah
450 jnz L(exit_case2_10)
451 test $0x04, %ah
452 jnz L(exit_case2_11)
453 sub $12, %edx
454 jb L(return_null)
455 lea 11(%edi), %eax
456 RETURN
457
458 .p2align 4
459L(match_case2_high_8):
460 test $0x10, %ah
461 jnz L(exit_case2_13)
462 test $0x20, %ah
463 jnz L(exit_case2_14)
464 test $0x40, %ah
465 jnz L(exit_case2_15)
466 sub $16, %edx
467 jb L(return_null)
468 lea 15(%edi), %eax
469 RETURN
470
471 .p2align 4
472L(exit_case2_1):
473 mov %edi, %eax
474 RETURN
475
476 .p2align 4
477L(exit_case2_2):
478 sub $2, %edx
479 jb L(return_null)
480 lea 1(%edi), %eax
481 RETURN
482
483 .p2align 4
484L(exit_case2_3):
485 sub $3, %edx
486 jb L(return_null)
487 lea 2(%edi), %eax
488 RETURN
489
490 .p2align 4
491L(exit_case2_5):
492 sub $5, %edx
493 jb L(return_null)
494 lea 4(%edi), %eax
495 RETURN
496
497 .p2align 4
498L(exit_case2_6):
499 sub $6, %edx
500 jb L(return_null)
501 lea 5(%edi), %eax
502 RETURN
503
504 .p2align 4
505L(exit_case2_7):
506 sub $7, %edx
507 jb L(return_null)
508 lea 6(%edi), %eax
509 RETURN
510
511 .p2align 4
512L(exit_case2_9):
513 sub $9, %edx
514 jb L(return_null)
515 lea 8(%edi), %eax
516 RETURN
517
518 .p2align 4
519L(exit_case2_10):
520 sub $10, %edx
521 jb L(return_null)
522 lea 9(%edi), %eax
523 RETURN
524
525 .p2align 4
526L(exit_case2_11):
527 sub $11, %edx
528 jb L(return_null)
529 lea 10(%edi), %eax
530 RETURN
531
532 .p2align 4
533L(exit_case2_13):
534 sub $13, %edx
535 jb L(return_null)
536 lea 12(%edi), %eax
537 RETURN
538
539 .p2align 4
540L(exit_case2_14):
541 sub $14, %edx
542 jb L(return_null)
543 lea 13(%edi), %eax
544 RETURN
545
546 .p2align 4
547L(exit_case2_15):
548 sub $15, %edx
549 jb L(return_null)
550 lea 14(%edi), %eax
551 RETURN
552 .p2align 4
553L(return_null):
554 xor %eax, %eax
555 RETURN
556END (memchr)