/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

 * Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.

 * Neither the name of Intel Corporation nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"
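
/* cache.h is expected to provide SHARED_CACHE_SIZE_HALF, the size threshold
   (in bytes) at which the copy switches to non-temporal stores.  */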

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#ifdef USE_AS_BCOPY
# define SRC PARMS
# define DEST SRC+4
# define LEN DEST+4
#else
# define DEST PARMS
# define SRC DEST+4
# define LEN SRC+4
#endif
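
/* bcopy(src, dst, n) takes its first two arguments in the opposite order from
   memmove(dst, src, n), so USE_AS_BCOPY swaps the SRC and DEST stack slots.  */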

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8 /* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)
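
/* After ENTRANCE the saved %ebx and the return address occupy the first
   8 bytes above %esp, hence PARMS = 8.  RETURN re-applies CFI_PUSH (%ebx)
   so the unwind info stays correct for code following a mid-function return.  */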

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
	movl LEN(%esp), %ecx
	movl SRC(%esp), %eax
	movl DEST(%esp), %edx
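/* At this point %eax = src, %edx = dst, %ecx = len.  */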

/* Check whether we should copy backward or forward. */
	cmp %eax, %edx
	je L(mm_return)
	ja L(mm_len_0_or_more_backward)
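/* If dst == src there is nothing to copy.  If dst > src, a forward copy of
   overlapping buffers could overwrite source bytes before they are read,
   so copy backward; otherwise copy forward.  */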

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
   separately. */
	cmp $16, %ecx
	jbe L(mm_len_0_16_bytes_forward)

	cmpl $32, %ecx
	jg L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
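/* The first and last 16 bytes are both loaded before either store, so the
   copy is correct even if the (up to 32-byte) ranges overlap.  */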
	movdqu (%eax), %xmm0
	movdqu -16(%eax, %ecx), %xmm1
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl $64, %ecx
	jg L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu -16(%eax, %ecx), %xmm2
	movdqu -32(%eax, %ecx), %xmm3
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, -16(%edx, %ecx)
	movdqu %xmm3, -32(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl $128, %ecx
	jg L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqu -64(%eax, %ecx), %xmm4
	movdqu -48(%eax, %ecx), %xmm5
	movdqu -32(%eax, %ecx), %xmm6
	movdqu -16(%eax, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_128_or_more_forward):
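/* Copies of at least SHARED_CACHE_SIZE_HALF bytes take the non-temporal
   (movntdq) path to limit cache pollution; smaller copies use the cached,
   destination-aligned loop below.  */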

	cmp $SHARED_CACHE_SIZE_HALF, %ecx
	jae L(mm_large_page_forward)

	PUSH (%esi)
	PUSH (%edi)
	movl %eax, %esi
	movl %edx, %edi

/* Aligning the address of destination. */
	movdqu (%esi), %xmm0
	movdqu 16(%esi), %xmm1
	movdqu 32(%esi), %xmm2
	movdqu 48(%esi), %xmm3

	leal 64(%edi), %edx
	andl $-64, %edx

	movl %esi, %eax
	subl %edi, %eax
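/* Now %edx is dst rounded up to a 64-byte boundary and %eax = src - dst, so
   (%edx, %eax) addresses the source byte corresponding to destination
   position %edx.  */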

	movdqu (%edx, %eax), %xmm4
	movdqu 16(%edx, %eax), %xmm5
	movdqu 32(%edx, %eax), %xmm6
	movdqu 48(%edx, %eax), %xmm7

	movdqu %xmm0, (%edi)
	movdqu %xmm1, 16(%edi)
	movdqu %xmm2, 32(%edi)
	movdqu %xmm3, 48(%edi)
	movdqa %xmm4, (%edx)
	movdqa %xmm5, 16(%edx)
	movdqa %xmm6, 32(%edx)
	movdqa %xmm7, 48(%edx)
	addl $64, %edx

	leal (%edi, %ecx), %ebx
	andl $-64, %ebx
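/* %ebx = dst + len rounded down to a 64-byte boundary; the aligned main loop
   runs while %edx is below it.  */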

	cmp %edx, %ebx
	jbe L(mm_copy_remaining_forward)

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%edx, %eax)

	movdqu (%edx, %eax), %xmm0
	movdqu 16(%edx, %eax), %xmm1
	movdqu 32(%edx, %eax), %xmm2
	movdqu 48(%edx, %eax), %xmm3
	movdqa %xmm0, (%edx)
	movdqa %xmm1, 16(%edx)
	movdqa %xmm2, 32(%edx)
	movdqa %xmm3, 48(%edx)
	leal 64(%edx), %edx
	cmp %edx, %ebx
	ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl %edi, %ecx
	subl %edx, %ecx
/* Everything up to the %edx position in dst has been copied.  %ecx now holds
   the number of bytes left to copy; advance %esi to the matching source
   position. */
	leal (%edx, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
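/* Dispatch on the remaining length: 33..64, 17..32, 9..16, 5..8, 3..4,
   1..2, or 0 bytes.  */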
	cmp $32, %ecx
	ja L(mm_remaining_33_64_bytes_forward)
	cmp $16, %ecx
	ja L(mm_remaining_17_32_bytes_forward)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(mm_return_pop_all)

	cmpb $8, %cl
	ja L(mm_remaining_9_16_bytes_forward)
	cmpb $4, %cl
	.p2align 4,,5
	ja L(mm_remaining_5_8_bytes_forward)
	cmpb $2, %cl
	.p2align 4,,1
	ja L(mm_remaining_3_4_bytes_forward)
	movzbl -1(%esi,%ecx), %eax
	movzbl (%esi), %ebx
	movb %al, -1(%edx,%ecx)
	movb %bl, (%edx)
	jmp L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu (%esi), %xmm0
	movdqu 16(%esi), %xmm1
	movdqu -32(%esi, %ecx), %xmm2
	movdqu -16(%esi, %ecx), %xmm3
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, -32(%edx, %ecx)
	movdqu %xmm3, -16(%edx, %ecx)
	jmp L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu (%esi), %xmm0
	movdqu -16(%esi, %ecx), %xmm1
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jmp L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl -2(%esi,%ecx), %eax
	movzwl (%esi), %ebx
	movw %ax, -2(%edx,%ecx)
	movw %bx, (%edx)
	jmp L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl (%esi), %eax
	movl -4(%esi,%ecx), %ebx
	movl %eax, (%edx)
	movl %ebx, -4(%edx,%ecx)
	jmp L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq (%esi), %xmm0
	movq -8(%esi, %ecx), %xmm1
	movq %xmm0, (%edx)
	movq %xmm1, -8(%edx, %ecx)
	jmp L(mm_return_pop_all)


L(mm_len_0_16_bytes_forward):
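/* Lengths 0..16, dispatched on the bits of %cl: bit 3 or 4 set (8..16) ->
   two qword moves; bit 2 set (4..7) -> two dword moves; length 0 -> return;
   bit 1 set (2..3) -> two word moves; otherwise a single byte.  */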
	testb $24, %cl
	jne L(mm_len_9_16_bytes_forward)
	testb $4, %cl
	.p2align 4,,5
	jne L(mm_len_5_8_bytes_forward)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(mm_return)
	testb $2, %cl
	.p2align 4,,1
	jne L(mm_len_2_4_bytes_forward)
	movzbl -1(%eax,%ecx), %ebx
	movzbl (%eax), %eax
	movb %bl, -1(%edx,%ecx)
	movb %al, (%edx)
	jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl -2(%eax,%ecx), %ebx
	movzwl (%eax), %eax
	movw %bx, -2(%edx,%ecx)
	movw %ax, (%edx)
	jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl (%eax), %ebx
	movl -4(%eax,%ecx), %eax
	movl %ebx, (%edx)
	movl %eax, -4(%edx,%ecx)
	jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq (%eax), %xmm0
	movq -8(%eax, %ecx), %xmm1
	movq %xmm0, (%edx)
	movq %xmm1, -8(%edx, %ecx)
	jmp L(mm_return)

/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
   separately. */
	cmp $16, %ecx
	jbe L(mm_len_0_16_bytes_backward)

	cmpl $32, %ecx
	jg L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
	movdqu (%eax), %xmm0
	movdqu -16(%eax, %ecx), %xmm1
	movdqu %xmm0, (%edx)
	movdqu %xmm1, -16(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl $64, %ecx
	jg L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu -16(%eax, %ecx), %xmm2
	movdqu -32(%eax, %ecx), %xmm3
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, -16(%edx, %ecx)
	movdqu %xmm3, -32(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl $128, %ecx
	jg L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
	movdqu (%eax), %xmm0
	movdqu 16(%eax), %xmm1
	movdqu 32(%eax), %xmm2
	movdqu 48(%eax), %xmm3
	movdqu -64(%eax, %ecx), %xmm4
	movdqu -48(%eax, %ecx), %xmm5
	movdqu -32(%eax, %ecx), %xmm6
	movdqu -16(%eax, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)
	jmp L(mm_return)

L(mm_len_128_or_more_backward):

	cmp $SHARED_CACHE_SIZE_HALF, %ecx
	jae L(mm_large_page_backward)

	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination. We need to save the last 64 bytes of
   the source in order not to overwrite them. */
	movdqu -16(%eax, %ecx), %xmm0
	movdqu -32(%eax, %ecx), %xmm1
	movdqu -48(%eax, %ecx), %xmm2
	movdqu -64(%eax, %ecx), %xmm3

	leal (%edx, %ecx), %edi
	andl $-64, %edi

	movl %eax, %esi
	subl %edx, %esi
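/* Now %edi is dst + len rounded down to a 64-byte boundary and
   %esi = src - dst, so (%edi, %esi) addresses the source byte corresponding
   to destination position %edi.  */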

	movdqu -16(%edi, %esi), %xmm4
	movdqu -32(%edi, %esi), %xmm5
	movdqu -48(%edi, %esi), %xmm6
	movdqu -64(%edi, %esi), %xmm7

	movdqu %xmm0, -16(%edx, %ecx)
	movdqu %xmm1, -32(%edx, %ecx)
	movdqu %xmm2, -48(%edx, %ecx)
	movdqu %xmm3, -64(%edx, %ecx)
	movdqa %xmm4, -16(%edi)
	movdqa %xmm5, -32(%edi)
	movdqa %xmm6, -48(%edi)
	movdqa %xmm7, -64(%edi)
	leal -64(%edi), %edi

	leal 64(%edx), %ebx
	andl $-64, %ebx

/* Compute in %ecx how many bytes are left to copy after
   the main loop stops. */
	movl %ebx, %ecx
	subl %edx, %ecx
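/* At most 64 bytes remain at the head of the destination; after the aligned
   backward loop they are copied by re-entering the small-size backward path
   with this reduced %ecx.  */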

	cmp %edi, %ebx
	jb L(mm_main_loop_backward)

	POP (%edi)
	POP (%esi)
	jmp L(mm_len_0_or_more_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu -64(%edi, %esi), %xmm0
	movdqu -48(%edi, %esi), %xmm1
	movdqu -32(%edi, %esi), %xmm2
	movdqu -16(%edi, %esi), %xmm3
	movdqa %xmm0, -64(%edi)
	movdqa %xmm1, -48(%edi)
	movdqa %xmm2, -32(%edi)
	movdqa %xmm3, -16(%edi)
	leal -64(%edi), %edi
	cmp %edi, %ebx
	jb L(mm_main_loop_backward)
	POP (%edi)
	POP (%esi)
	jmp L(mm_len_0_or_more_backward)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
	testb $24, %cl
	jnz L(mm_len_9_16_bytes_backward)
	testb $4, %cl
	.p2align 4,,5
	jnz L(mm_len_5_8_bytes_backward)
	testl %ecx, %ecx
	.p2align 4,,2
	je L(mm_return)
	testb $2, %cl
	.p2align 4,,1
	jne L(mm_len_3_4_bytes_backward)
	movzbl -1(%eax,%ecx), %ebx
	movzbl (%eax), %eax
	movb %bl, -1(%edx,%ecx)
	movb %al, (%edx)
	jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl -2(%eax,%ecx), %ebx
	movzwl (%eax), %eax
	movw %bx, -2(%edx,%ecx)
	movw %ax, (%edx)
	jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
	PUSH (%esi)
	movl -4(%eax,%ecx), %ebx
	movl -8(%eax,%ecx), %esi
	movl %ebx, -4(%edx,%ecx)
	movl %esi, -8(%edx,%ecx)
	subl $8, %ecx
	POP (%esi)
	jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl (%eax), %ebx
	movl -4(%eax,%ecx), %eax
	movl %ebx, (%edx)
	movl %eax, -4(%edx,%ecx)

L(mm_return):
	movl %edx, %eax
	RETURN

L(mm_return_pop_all):
	movl %edi, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Forward copy for large lengths, using non-temporal stores. */

L(mm_large_page_forward):
/* Aligning the address of destination. We need to save the first 64 bytes of
   the source in order not to overwrite them. */

	PUSH (%esi)
	PUSH (%edi)
	movl %eax, %esi
	movl %edx, %edi

	movdqu (%esi), %xmm0
	movdqu 16(%esi), %xmm1
	movdqu 32(%esi), %xmm2
	movdqu 48(%esi), %xmm3

	leal 64(%edi), %edx
	andl $-64, %edx

	movl %esi, %eax
	subl %edi, %eax

	movdqu (%edx, %eax), %xmm4
	movdqu 16(%edx, %eax), %xmm5
	movdqu 32(%edx, %eax), %xmm6
	movdqu 48(%edx, %eax), %xmm7

	movdqu %xmm0, (%edi)
	movdqu %xmm1, 16(%edi)
	movdqu %xmm2, 32(%edi)
	movdqu %xmm3, 48(%edi)
	movntdq %xmm4, (%edx)
	movntdq %xmm5, 16(%edx)
	movntdq %xmm6, 32(%edx)
	movntdq %xmm7, 48(%edx)
	addl $64, %edx

	leal (%edi, %ecx), %ebx
	andl $-128, %ebx
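/* The end pointer is rounded down to a 128-byte boundary because the
   non-temporal loop below copies 128 bytes per iteration.  */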

	cmp %edx, %ebx
	jbe L(mm_copy_remaining_forward)

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu (%edx, %eax), %xmm0
	movdqu 16(%edx, %eax), %xmm1
	movdqu 32(%edx, %eax), %xmm2
	movdqu 48(%edx, %eax), %xmm3
	movdqu 64(%edx, %eax), %xmm4
	movdqu 80(%edx, %eax), %xmm5
	movdqu 96(%edx, %eax), %xmm6
	movdqu 112(%edx, %eax), %xmm7
	movntdq %xmm0, (%edx)
	movntdq %xmm1, 16(%edx)
	movntdq %xmm2, 32(%edx)
	movntdq %xmm3, 48(%edx)
	movntdq %xmm4, 64(%edx)
	movntdq %xmm5, 80(%edx)
	movntdq %xmm6, 96(%edx)
	movntdq %xmm7, 112(%edx)
	leal 128(%edx), %edx
	cmp %edx, %ebx
	ja L(mm_large_page_loop_forward)
	sfence
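/* sfence makes the weakly-ordered movntdq stores globally visible before the
   ordinary stores used for the tail below.  */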

	addl %edi, %ecx
	subl %edx, %ecx
/* Everything up to the %edx position in dst has been copied.  %ecx now holds
   the number of bytes left to copy; advance %esi to the matching source
   position. */
	leal (%edx, %eax), %esi

	cmp $64, %ecx
	jb L(mm_remaining_0_64_bytes_forward)

	movdqu (%esi), %xmm0
	movdqu 16(%esi), %xmm1
	movdqu 32(%esi), %xmm2
	movdqu 48(%esi), %xmm3
	movdqu -64(%esi, %ecx), %xmm4
	movdqu -48(%esi, %ecx), %xmm5
	movdqu -32(%esi, %ecx), %xmm6
	movdqu -16(%esi, %ecx), %xmm7
	movdqu %xmm0, (%edx)
	movdqu %xmm1, 16(%edx)
	movdqu %xmm2, 32(%edx)
	movdqu %xmm3, 48(%edx)
	movdqu %xmm4, -64(%edx, %ecx)
	movdqu %xmm5, -48(%edx, %ecx)
	movdqu %xmm6, -32(%edx, %ecx)
	movdqu %xmm7, -16(%edx, %ecx)
	jmp L(mm_return_pop_all)


/* Backward copy for large lengths, using non-temporal stores. */
L(mm_large_page_backward):
/* Aligning the address of destination. We need to save the last 64 bytes of
   the source in order not to overwrite them. */

	PUSH (%esi)
	PUSH (%edi)

	movdqu -16(%eax, %ecx), %xmm0
	movdqu -32(%eax, %ecx), %xmm1
	movdqu -48(%eax, %ecx), %xmm2
	movdqu -64(%eax, %ecx), %xmm3

	leal (%edx, %ecx), %edi
	andl $-64, %edi

	movl %eax, %esi
	subl %edx, %esi

	movdqu -16(%edi, %esi), %xmm4
	movdqu -32(%edi, %esi), %xmm5
	movdqu -48(%edi, %esi), %xmm6
	movdqu -64(%edi, %esi), %xmm7

	movdqu %xmm0, -16(%edx, %ecx)
	movdqu %xmm1, -32(%edx, %ecx)
	movdqu %xmm2, -48(%edx, %ecx)
	movdqu %xmm3, -64(%edx, %ecx)
	movntdq %xmm4, -16(%edi)
	movntdq %xmm5, -32(%edi)
	movntdq %xmm6, -48(%edi)
	movntdq %xmm7, -64(%edi)
	leal -64(%edi), %edi

	leal 128(%edx), %ebx
	andl $-64, %ebx

/* Compute in %ecx how many bytes are left to copy after
   the main loop stops. */
	movl %ebx, %ecx
	subl %edx, %ecx

	cmp %edi, %ebx
	jae L(mm_len_0_or_more_backward)

	.p2align 4
L(mm_large_page_loop_backward):
	movdqu -64(%edi, %esi), %xmm0
	movdqu -48(%edi, %esi), %xmm1
	movdqu -32(%edi, %esi), %xmm2
	movdqu -16(%edi, %esi), %xmm3
	movntdq %xmm0, -64(%edi)
	movntdq %xmm1, -48(%edi)
	movntdq %xmm2, -32(%edi)
	movntdq %xmm3, -16(%edi)
	leal -64(%edi), %edi
	cmp %edi, %ebx
	jb L(mm_large_page_loop_backward)
	POP (%edi)
	POP (%esi)
	jmp L(mm_len_0_or_more_backward)

END (MEMMOVE)