/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  * Redistributions of source code must retain the above copyright notice,
  * this list of conditions and the following disclaimer.

  * Redistributions in binary form must reproduce the above copyright notice,
  * this list of conditions and the following disclaimer in the documentation
  * and/or other materials provided with the distribution.

  * Neither the name of Intel Corporation nor the names of its contributors
  * may be used to endorse or promote products derived from this software
  * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG) push REG;
#define POP(REG) pop REG;

#define ENTRANCE PUSH (%rbx);
#define RETURN_END POP (%rbx); ret
#define RETURN RETURN_END;

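/* MEMMOVE (dst = %rdi, src = %rsi, len = %rdx), SSE2 version.
   The destination pointer is returned in %rax (memmove's return value).
   Overlap is handled by copying forward when dst < src and backward when
   dst > src.  Small lengths are copied with overlapping head/tail loads
   and stores, medium lengths with a loop of 64-byte aligned stores, and
   lengths of at least SHARED_CACHE_SIZE_HALF (expected to be provided by
   cache.h) with non-temporal stores.  When USE_AS_BCOPY is defined the
   first two arguments are swapped so the entry point behaves as
   bcopy(src, dst, len).  %rbx is used as scratch and is preserved by the
   ENTRANCE/RETURN push/pop.  */
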
        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
#ifdef USE_AS_BCOPY
        xchg %rsi, %rdi
#endif
        mov %rdi, %rax

/* Check whether we should copy backward or forward. */
        cmp %rsi, %rdi
        je L(mm_return)
        ja L(mm_len_0_or_more_backward)
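
/* dst == src: nothing to copy.  dst > src: copy backward, from the high
   addresses down, so that an overlapping source is read before it can be
   overwritten.  dst < src: fall through and copy forward.  */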

/* Now do checks for lengths. We handle [0..16], (16..32], (32..64],
   (64..128] and larger lengths separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_forward)

        cmp $32, %rdx
        jg L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)
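
/* The (16..32] case above loads both the first and the last 16 bytes
   before doing any store, so the two chunks may overlap in the middle and
   the code is correct for every length in the range, including
   overlapping src/dst buffers.  The (32..64] and (64..128] cases below
   use the same head/tail trick with more registers.  */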

L(mm_len_32_or_more_forward):
        cmp $64, %rdx
        jg L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_forward):
        cmp $128, %rdx
        jg L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_forward):

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_forward)

        mov %rsi, %r8  // copy src to r8
        mov %rdi, %r9  // copy dst to r9

/* Aligning the address of destination. */
/* save first unaligned 64 bytes */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3

        lea 64(%r9), %rdi
        and $-64, %rdi  /* rdi now aligned to next 64 byte boundary */

        sub %r9, %rsi  /* rsi = src - dst = diff */

        movdqu (%rdi, %rsi), %xmm4
        movdqu 16(%rdi, %rsi), %xmm5
        movdqu 32(%rdi, %rsi), %xmm6
        movdqu 48(%rdi, %rsi), %xmm7

        movdqu %xmm0, (%r9)
        movdqu %xmm1, 16(%r9)
        movdqu %xmm2, 32(%r9)
        movdqu %xmm3, 48(%r9)
        movdqa %xmm4, (%rdi)
        movdqa %xmm5, 16(%rdi)
        movdqa %xmm6, 32(%rdi)
        movdqa %xmm7, 48(%rdi)
        add $64, %rdi

        lea (%r9, %rdx), %rbx
        and $-64, %rbx

        cmp %rdi, %rbx
        jbe L(mm_copy_remaining_forward)
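
/* State here: %r9 = dst, %rsi = src - dst, %rdi = 64-byte-aligned position
   just past the bytes copied so far, %rbx = (dst + len) rounded down to a
   64-byte boundary.  The loop below copies one 64-byte block per iteration
   from the matching source position (%rdi,%rsi) to the aligned %rdi until
   %rdi reaches %rbx; the unaligned tail is finished in
   L(mm_copy_remaining_forward).  */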

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%rdi, %rsi)

        movdqu (%rdi, %rsi), %xmm0
        movdqu 16(%rdi, %rsi), %xmm1
        movdqu 32(%rdi, %rsi), %xmm2
        movdqu 48(%rdi, %rsi), %xmm3
        movdqa %xmm0, (%rdi)
        movdqa %xmm1, 16(%rdi)
        movdqa %xmm2, 32(%rdi)
        movdqa %xmm3, 48(%rdi)
        lea 64(%rdi), %rdi
        cmp %rdi, %rbx
        ja L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        add %r9, %rdx
        sub %rdi, %rdx
/* We have copied everything up to the current %rdi position in dst.
   %rdx now holds the number of bytes left to copy; %r8 is advanced to the
   matching source position. */
        lea (%rdi, %rsi), %r8

L(mm_remaining_0_64_bytes_forward):
        cmp $32, %rdx
        ja L(mm_remaining_33_64_bytes_forward)
        cmp $16, %rdx
        ja L(mm_remaining_17_32_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)

        cmpb $8, %dl
        ja L(mm_remaining_9_16_bytes_forward)
        cmpb $4, %dl
        .p2align 4,,5
        ja L(mm_remaining_5_8_bytes_forward)
        cmpb $2, %dl
        .p2align 4,,1
        ja L(mm_remaining_3_4_bytes_forward)
        movzbl -1(%r8,%rdx), %esi
        movzbl (%r8), %ebx
        movb %sil, -1(%rdi,%rdx)
        movb %bl, (%rdi)
        jmp L(mm_return)

L(mm_remaining_33_64_bytes_forward):
        movdqu (%r8), %xmm0
        movdqu 16(%r8), %xmm1
        movdqu -32(%r8, %rdx), %xmm2
        movdqu -16(%r8, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -32(%rdi, %rdx)
        movdqu %xmm3, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_remaining_17_32_bytes_forward):
        movdqu (%r8), %xmm0
        movdqu -16(%r8, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_remaining_3_4_bytes_forward):
        movzwl -2(%r8,%rdx), %esi
        movzwl (%r8), %ebx
        movw %si, -2(%rdi,%rdx)
        movw %bx, (%rdi)
        jmp L(mm_return)

L(mm_remaining_5_8_bytes_forward):
        movl (%r8), %esi
        movl -4(%r8,%rdx), %ebx
        movl %esi, (%rdi)
        movl %ebx, -4(%rdi,%rdx)
        jmp L(mm_return)

L(mm_remaining_9_16_bytes_forward):
        mov (%r8), %rsi
        mov -8(%r8, %rdx), %rbx
        mov %rsi, (%rdi)
        mov %rbx, -8(%rdi, %rdx)
        jmp L(mm_return)

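/* Lengths of at most 16 bytes are dispatched on individual bits of %dl:
   testb $24 catches any length in [8..16] (bit 3 or bit 4 set), testb $4
   catches [4..7], and testb $2 catches [2..3]; what remains is a length of
   0 or 1.  Each case copies a head chunk and a tail chunk that may
   overlap, so every length in its range is covered.  */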
L(mm_len_0_16_bytes_forward):
        testb $24, %dl
        jne L(mm_len_9_16_bytes_forward)
        testb $4, %dl
        .p2align 4,,5
        jne L(mm_len_5_8_bytes_forward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_2_4_bytes_forward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %esi
        movb %bl, -1(%rdi,%rdx)
        movb %sil, (%rdi)
        jmp L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %esi
        movw %bx, -2(%rdi,%rdx)
        movw %si, (%rdi)
        jmp L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %esi
        movl %ebx, (%rdi)
        movl %esi, -4(%rdi,%rdx)
        jmp L(mm_return)

L(mm_len_9_16_bytes_forward):
        mov (%rsi), %rbx
        mov -8(%rsi, %rdx), %rsi
        mov %rbx, (%rdi)
        mov %rsi, -8(%rdi, %rdx)
        jmp L(mm_return)

/* The code for copying backward, used when dst > src so that overlapping
   buffers are handled correctly; large copies walk from the end of the
   buffers toward the beginning. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We handle [0..16], (16..32], (32..64],
   (64..128] and larger lengths separately. */
        cmp $16, %rdx
        jbe L(mm_len_0_16_bytes_backward)

        cmp $32, %rdx
        jg L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu -16(%rsi, %rdx), %xmm1
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_32_or_more_backward):
        cmp $64, %rdx
        jg L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu -16(%rsi, %rdx), %xmm2
        movdqu -32(%rsi, %rdx), %xmm3
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, -16(%rdi, %rdx)
        movdqu %xmm3, -32(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_64_or_more_backward):
        cmp $128, %rdx
        jg L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return. */
        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3
        movdqu -64(%rsi, %rdx), %xmm4
        movdqu -48(%rsi, %rdx), %xmm5
        movdqu -32(%rsi, %rdx), %xmm6
        movdqu -16(%rsi, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)

L(mm_len_128_or_more_backward):

        cmp $SHARED_CACHE_SIZE_HALF, %rdx
        jae L(mm_large_page_backward)

/* Aligning the address of destination. The last 64 bytes of the source
   are saved in registers first so that the aligned stores below cannot
   overwrite them when the buffers overlap. */
        movdqu -16(%rsi, %rdx), %xmm0
        movdqu -32(%rsi, %rdx), %xmm1
        movdqu -48(%rsi, %rdx), %xmm2
        movdqu -64(%rsi, %rdx), %xmm3

        lea (%rdi, %rdx), %r9
        and $-64, %r9  /* r9 = (dst + len) rounded down to 64 */

        mov %rsi, %r8
        sub %rdi, %r8  /* r8 = src - dst, diff */

        movdqu -16(%r9, %r8), %xmm4
        movdqu -32(%r9, %r8), %xmm5
        movdqu -48(%r9, %r8), %xmm6
        movdqu -64(%r9, %r8), %xmm7

        movdqu %xmm0, -16(%rdi, %rdx)
        movdqu %xmm1, -32(%rdi, %rdx)
        movdqu %xmm2, -48(%rdi, %rdx)
        movdqu %xmm3, -64(%rdi, %rdx)
        movdqa %xmm4, -16(%r9)
        movdqa %xmm5, -32(%r9)
        movdqa %xmm6, -48(%r9)
        movdqa %xmm7, -64(%r9)
        lea -64(%r9), %r9

        lea 64(%rdi), %rbx
        and $-64, %rbx

/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
        mov %rbx, %rdx
        sub %rdi, %rdx

        cmp %r9, %rbx
        jb L(mm_main_loop_backward)
        jmp L(mm_len_0_or_more_backward)
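
/* Either nothing is left for the 64-byte loop, or the loop below copies
   aligned 64-byte blocks downward from %r9 until it reaches %rbx.  In both
   cases the jump back to L(mm_len_0_or_more_backward) finishes the
   remaining head bytes: %rdx has been reduced to that remainder (at most
   64 bytes), while %rsi and %rdi still point at the start of src and dst. */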

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%r9, %r8)

        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movdqa %xmm0, -64(%r9)
        movdqa %xmm1, -48(%r9)
        movdqa %xmm2, -32(%r9)
        movdqa %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_main_loop_backward)
        jmp L(mm_len_0_or_more_backward)

/* Copy [0..16] and return. */
L(mm_len_0_16_bytes_backward):
        testb $24, %dl
        jnz L(mm_len_9_16_bytes_backward)
        testb $4, %dl
        .p2align 4,,5
        jnz L(mm_len_5_8_bytes_backward)
        test %rdx, %rdx
        .p2align 4,,2
        je L(mm_return)
        testb $2, %dl
        .p2align 4,,1
        jne L(mm_len_3_4_bytes_backward)
        movzbl -1(%rsi,%rdx), %ebx
        movzbl (%rsi), %ecx
        movb %bl, -1(%rdi,%rdx)
        movb %cl, (%rdi)
        jmp L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl -2(%rsi,%rdx), %ebx
        movzwl (%rsi), %ecx
        movw %bx, -2(%rdi,%rdx)
        movw %cx, (%rdi)
        jmp L(mm_return)

L(mm_len_9_16_bytes_backward):
        movl -4(%rsi,%rdx), %ebx
        movl -8(%rsi,%rdx), %ecx
        movl %ebx, -4(%rdi,%rdx)
        movl %ecx, -8(%rdi,%rdx)
        sub $8, %rdx
        jmp L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl (%rsi), %ebx
        movl -4(%rsi,%rdx), %ecx
        movl %ebx, (%rdi)
        movl %ecx, -4(%rdi,%rdx)

L(mm_return):
        RETURN

/* Big length copy forward part. */

L(mm_large_page_forward):
/* Aligning the address of destination. The first 64 bytes of the source
   are saved in registers first so that the stores below cannot overwrite
   them when the buffers overlap. */

        mov %rsi, %r8
        mov %rdi, %r9

        movdqu (%rsi), %xmm0
        movdqu 16(%rsi), %xmm1
        movdqu 32(%rsi), %xmm2
        movdqu 48(%rsi), %xmm3

        lea 64(%r9), %rdi
        and $-64, %rdi  /* rdi = next 64-byte boundary above dst */

        sub %r9, %rsi  /* rsi = diff */

        movdqu (%rdi, %rsi), %xmm4
        movdqu 16(%rdi, %rsi), %xmm5
        movdqu 32(%rdi, %rsi), %xmm6
        movdqu 48(%rdi, %rsi), %xmm7

        movdqu %xmm0, (%r9)
        movdqu %xmm1, 16(%r9)
        movdqu %xmm2, 32(%r9)
        movdqu %xmm3, 48(%r9)
        movntdq %xmm4, (%rdi)
        movntdq %xmm5, 16(%rdi)
        movntdq %xmm6, 32(%rdi)
        movntdq %xmm7, 48(%rdi)
        add $64, %rdi

        lea (%r9, %rdx), %rbx
        and $-128, %rbx

        cmp %rdi, %rbx
        jbe L(mm_copy_remaining_forward)

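/* For copies of at least SHARED_CACHE_SIZE_HALF bytes the loop below
   streams 128 bytes per iteration with movntdq non-temporal stores, which
   write around the cache to avoid displacing useful data.  Non-temporal
   stores are weakly ordered, so an sfence follows the loop before the
   ordinary tail copy. */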
        .p2align 4
L(mm_large_page_loop_forward):
        movdqu (%rdi, %rsi), %xmm0
        movdqu 16(%rdi, %rsi), %xmm1
        movdqu 32(%rdi, %rsi), %xmm2
        movdqu 48(%rdi, %rsi), %xmm3
        movdqu 64(%rdi, %rsi), %xmm4
        movdqu 80(%rdi, %rsi), %xmm5
        movdqu 96(%rdi, %rsi), %xmm6
        movdqu 112(%rdi, %rsi), %xmm7
        movntdq %xmm0, (%rdi)
        movntdq %xmm1, 16(%rdi)
        movntdq %xmm2, 32(%rdi)
        movntdq %xmm3, 48(%rdi)
        movntdq %xmm4, 64(%rdi)
        movntdq %xmm5, 80(%rdi)
        movntdq %xmm6, 96(%rdi)
        movntdq %xmm7, 112(%rdi)
        lea 128(%rdi), %rdi
        cmp %rdi, %rbx
        ja L(mm_large_page_loop_forward)
        sfence

        add %r9, %rdx
        sub %rdi, %rdx
/* We have copied everything up to the current %rdi position in dst.
   %rdx now holds the number of bytes left to copy; %r8 is advanced to the
   matching source position. */
        lea (%rdi, %rsi), %r8

        cmp $64, %rdx
        jb L(mm_remaining_0_64_bytes_forward)

        movdqu (%r8), %xmm0
        movdqu 16(%r8), %xmm1
        movdqu 32(%r8), %xmm2
        movdqu 48(%r8), %xmm3
        movdqu -64(%r8, %rdx), %xmm4
        movdqu -48(%r8, %rdx), %xmm5
        movdqu -32(%r8, %rdx), %xmm6
        movdqu -16(%r8, %rdx), %xmm7
        movdqu %xmm0, (%rdi)
        movdqu %xmm1, 16(%rdi)
        movdqu %xmm2, 32(%rdi)
        movdqu %xmm3, 48(%rdi)
        movdqu %xmm4, -64(%rdi, %rdx)
        movdqu %xmm5, -48(%rdi, %rdx)
        movdqu %xmm6, -32(%rdi, %rdx)
        movdqu %xmm7, -16(%rdi, %rdx)
        jmp L(mm_return)


/* Big length copy backward part. */
L(mm_large_page_backward):
/* Aligning the address of destination. The last 64 bytes of the source
   are saved in registers first so that the stores below cannot overwrite
   them when the buffers overlap. */

        movdqu -16(%rsi, %rdx), %xmm0
        movdqu -32(%rsi, %rdx), %xmm1
        movdqu -48(%rsi, %rdx), %xmm2
        movdqu -64(%rsi, %rdx), %xmm3

        lea (%rdi, %rdx), %r9
        and $-64, %r9

        mov %rsi, %r8
        sub %rdi, %r8

        movdqu -16(%r9, %r8), %xmm4
        movdqu -32(%r9, %r8), %xmm5
        movdqu -48(%r9, %r8), %xmm6
        movdqu -64(%r9, %r8), %xmm7

        movdqu %xmm0, -16(%rdi, %rdx)
        movdqu %xmm1, -32(%rdi, %rdx)
        movdqu %xmm2, -48(%rdi, %rdx)
        movdqu %xmm3, -64(%rdi, %rdx)
        movntdq %xmm4, -16(%r9)
        movntdq %xmm5, -32(%r9)
        movntdq %xmm6, -48(%r9)
        movntdq %xmm7, -64(%r9)
        lea -64(%r9), %r9

        lea 128(%rdi), %rbx
        and $-64, %rbx

/* Compute in %rdx how many bytes are left to copy after
   the main loop stops. */
        mov %rbx, %rdx
        sub %rdi, %rdx

        cmp %r9, %rbx
        jae L(mm_len_0_or_more_backward)

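/* As in the forward large-copy path, the loop below uses movntdq
   non-temporal stores for the aligned 64-byte blocks, working downward
   from %r9 toward %rbx.  Once it finishes (or if there is nothing for it
   to do), the jump back to L(mm_len_0_or_more_backward) copies the
   remaining head bytes, whose count is already in %rdx. */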
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu -64(%r9, %r8), %xmm0
        movdqu -48(%r9, %r8), %xmm1
        movdqu -32(%r9, %r8), %xmm2
        movdqu -16(%r9, %r8), %xmm3
        movntdq %xmm0, -64(%r9)
        movntdq %xmm1, -48(%r9)
        movntdq %xmm2, -32(%r9)
        movntdq %xmm3, -16(%r9)
        lea -64(%r9), %r9
        cmp %r9, %rbx
        jb L(mm_large_page_loop_backward)
        jmp L(mm_len_0_or_more_backward)

END (MEMMOVE)