blob: b971f0b39a19c9e3c50ad3abaadee03c4555584f [file] [log] [blame]
/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
30
#include "cache.h"

/* Name the routine is assembled under; a wrapper build may pre-define it
   (e.g. to build a differently-named variant of this same source).  */
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

/* Local-label helper: L(foo) expands to .Lfoo (assembler-local, not
   exported into the symbol table).  */
#ifndef L
# define L(label) .L##label
#endif

/* CFI wrappers: default to the plain .cfi_* directives, but allow a host
   environment to pre-define them (e.g. to no-ops where unsupported).  */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

/* Function entry: declare type and visibility, align the entry point to
   16 bytes, and open the CFI unwind region.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

/* Function exit: close the CFI region and record the symbol size.  */
#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* Stack offsets of the three arguments (IA-32 cdecl), measured from %esp
   AFTER the ENTRANCE push of %ebx.  bcopy takes (src, dst, len) while
   memmove takes (dst, src, len), hence the two layouts.  */
#ifdef USE_AS_BCOPY
# define SRC PARMS
# define DEST SRC+4
# define LEN DEST+4
#else
# define DEST PARMS
# define SRC DEST+4
# define LEN SRC+4
#endif

/* push/pop wrappers that keep the CFI unwind information in sync with
   the actual stack pointer adjustment.  */
#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

/* 4 (return address) + 4 (saved %ebx) = 8 bytes above the arguments.  */
#define PARMS 8 /* Preserve EBX.  */
#define ENTRANCE PUSH (%ebx);
/* RETURN is used on fall-through paths: after emitting the pop+ret it
   re-asserts the %ebx CFI state so the unwind info stays correct for the
   code that follows the ret in the same function.  */
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)
101
	.section .text.sse2,"ax",@progbits

/*
 * void *MEMMOVE(void *dst, const void *src, size_t len)
 *
 * IA-32 SSE2 memmove; also assembles as bcopy when USE_AS_BCOPY is
 * defined (only the argument stack offsets change, see SRC/DEST/LEN).
 *
 * ABI:   cdecl, all arguments on the stack; returns dst in %eax.
 * Regs:  %eax = src (becomes the src-dst delta inside the >128-byte
 *        forward path), %edx = dst, %ecx = len, %ebx = 64-byte-aligned
 *        copy bound (callee-saved; preserved by ENTRANCE/RETURN),
 *        %esi/%edi used only in the >128-byte paths and pushed/popped
 *        around that use.  All xmm registers are caller-saved scratch.
 *
 * Strategy: lengths up to 128 bytes are copied with straight-line,
 * possibly-overlapping unaligned loads of the head and tail of the
 * region (loads all complete before any store, so overlap is safe).
 * Longer copies align the DESTINATION to 64 bytes and run a 64-byte
 * main loop, forward or backward depending on src/dst order; copies of
 * at least SHARED_CACHE_SIZE_HALF (from cache.h; presumably half the
 * shared cache size — confirm there) switch to non-temporal stores.
 */
ENTRY (MEMMOVE)
	ENTRANCE			/* Saves %ebx.  */
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward: when dst > src the
   regions may overlap such that a forward copy would clobber unread
   source bytes, so copy backward.  dst == src needs no copying at all.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)

/* Now do checks for lengths.  We handle [0..16], (16..32], (32..64] and
   (64..128] separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy (16..32] bytes: first and last 16 bytes, which may overlap in the
   middle; both loads complete before the stores.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes: first 32 and last 32, load-all-then-store-all.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes: first 64 and last 64, load-all-then-store-all.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination.  Pre-load the first 64 source
   bytes before any store: with overlap, the aligned stores below could
   otherwise clobber source bytes that have not been copied yet.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi		/* %edi = first 64-byte-aligned address */
	andl	$-64, %edi		/*   strictly above dst (dst < edi <= dst+64).  */
	subl	%edx, %eax		/* %eax = src - dst; from here the source of a
					   dst address P is read as (%eax, P).  */

	movdqu	(%eax, %edi), %xmm4	/* First 64 bytes of the aligned region.  */
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)		/* Unaligned head (may overlap the aligned part).  */
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)		/* Aligned 64-byte chunk.  movaps below is the
					   shorter encoding of the same aligned store.  */
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx	/* %ebx = end of dst region, rounded */
	andl	$-64, %ebx		/*   down to a 64-byte boundary: loop bound.  */
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_forward)

	.p2align 4
L(mm_main_loop_forward):
/* Invariant: everything below %edi in dst is already copied; %edi is
   64-byte aligned; %edi < %ebx.  */
	prefetcht0 128(%eax, %edi)	/* Prefetch two iterations ahead.  */

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx		/* %ecx = dst + len ...  */
	subl	%edi, %ecx		/*   - %edi = bytes still to copy (< 64 after
					   the loop; may be up to the bound otherwise).  */
/* We copied everything up to the %edi position in dst.  %ecx now holds
   how many bytes are left to copy.  Advance %esi to the matching source
   position (%eax still holds src - dst).  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
/* Tail dispatch on %ecx in [0..64]; same head+tail overlap trick as the
   small-size paths, but with %esi/%edi as src/dst cursors.  */
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
/* 1 or 2 bytes: last byte and first byte (same byte when %ecx == 1).  */
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
/* Dispatch on length bits: bit 3 or 4 set => 9..16 bytes; bit 2 set
   => 5..8; then 0, 3..4, or 1..2.  */
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
/* 1 or 2 bytes.  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after the main
   backward loop stops (%ebx = 64-byte-aligned bound near dst; the
   remaining front part is [dst, %ebx), at most 64+63 bytes).  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths.  We handle [0..16], (16..32], (32..64] and
   (64..128] separately.  The small cases load everything before storing,
   so they are overlap-safe in either direction.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Aligning the address of destination.  We need to save the LAST 64
   BYTES of the source (four 16-byte registers; the original comment's
   "16 bits" was a typo) so the aligned stores below cannot overwrite
   them when the regions overlap.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi	/* %edi = end of dst region, rounded */
	andl	$-64, %edi		/*   down to a 64-byte boundary.  */

	movl	%eax, %esi		/* %esi = src - dst; source of dst address P */
	subl	%edx, %esi		/*   is read as (P, %esi).  */

	movdqu	-16(%edi, %esi), %xmm4	/* 64 source bytes just below the boundary.  */
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)	/* Unaligned tail first...  */
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)	/* ...then the aligned 64-byte chunk.  */
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx		/* %ebx = first 64-byte-aligned address */
	andl	$-64, %ebx		/*   strictly above dst: loop stop bound.  */

	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):
/* Invariant: everything at/above %edi in dst is already copied; %edi is
   64-byte aligned; %ebx < %edi.  */
	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)	/* Finish the sub-64-byte front part.  */

/* Copy [0..16] bytes and return.  Same bit dispatch as the forward path.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
/* 1 or 2 bytes.  */
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
/* Copy the top 8 bytes, shrink %ecx by 8, and re-dispatch for the
   remaining 1..8 bytes.  */
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	/* Falls through to the common return.  */

L(mm_return):
	movl	%edx, %eax		/* memmove returns dst.  */
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part: same loop as mm_main_loop_forward but
   with non-temporal (cache-bypassing) stores; sfence orders them before
   the normal stores of the tail code.  */

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part: non-temporal variant of
   mm_main_loop_backward.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)