/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

/* Symbol name for the routine; a build may predefine MEMCPY to emit this
   implementation under a different (e.g. variant-selected) name.  */
#ifndef MEMCPY
# define MEMCPY		memcpy
#endif

/* Assembler-local label helper: the .L prefix keeps internal labels out
   of the object's symbol table.  */
#ifndef L
# define L(label)	.L##label
#endif

/* DWARF CFI wrappers, overridable by the including build system.  */
#ifndef cfi_startproc
# define cfi_startproc			.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc			.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)		.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

/* ELF function entry: symbol type/visibility, 16-byte alignment of the
   entry point, and CFI open/close paired with END below.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* CFI bookkeeping for a push/pop of a callee-saved register.
   NOTE(review): the original adjusted the CFA by 4, but in 64-bit mode a
   push/pop moves %rsp by 8 bytes; corrected to 8 so unwind info is right
   wherever these macros are used.  (PUSH/POP below do not currently
   invoke them, so generated code is unchanged.)  */
#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (8); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-8); \
	cfi_restore (REG)

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

/* %rbx is callee-saved in the SysV AMD64 ABI; the short-copy path uses
   it as scratch, so it is saved on entry and restored on every return.  */
#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;
91 .section .text.sse2,"ax",@progbits
92ENTRY (MEMCPY)
93 ENTRANCE
94 cmp %rsi, %rdi
95 je L(return)
96
97 cmp $16, %rdx
98 jbe L(len_0_16_bytes)
99
100 cmp $SHARED_CACHE_SIZE_HALF, %rdx
101 jae L(large_page)
102
103 movdqu (%rsi), %xmm0
104 movdqu -16(%rsi, %rdx), %xmm1
105 cmp $32, %rdx
106 movdqu %xmm0, (%rdi)
107 movdqu %xmm1, -16(%rdi, %rdx)
108 jbe L(return)
109
110 movdqu 16(%rsi), %xmm0
111 movdqu -32(%rsi, %rdx), %xmm1
112 cmp $64, %rdx
113 movdqu %xmm0, 16(%rdi)
114 movdqu %xmm1, -32(%rdi, %rdx)
115 jbe L(return)
116
117 movdqu 32(%rsi), %xmm0
118 movdqu 48(%rsi), %xmm1
119 movdqu -48(%rsi, %rdx), %xmm2
120 movdqu -64(%rsi, %rdx), %xmm3
121 cmp $128, %rdx
122 movdqu %xmm0, 32(%rdi)
123 movdqu %xmm1, 48(%rdi)
124 movdqu %xmm2, -48(%rdi, %rdx)
125 movdqu %xmm3, -64(%rdi, %rdx)
126 jbe L(return)
127
128/* Now the main loop: we align the address of the destination. */
129 lea 64(%rdi), %r8
130 and $-64, %r8
131
132 add %rdi, %rdx
133 and $-64, %rdx
134
135 sub %rdi, %rsi
136
137/* We should stop two iterations before the termination
138 (in order not to misprefetch). */
139 sub $64, %rdx
140 cmp %r8, %rdx
141 je L(main_loop_just_one_iteration)
142
143 sub $64, %rdx
144 cmp %r8, %rdx
145 je L(main_loop_last_two_iterations)
146
147
148 .p2align 4
149L(main_loop_cache):
150
151 prefetcht0 128(%r8, %rsi)
152
153 movdqu (%r8, %rsi), %xmm0
154 movdqu 16(%r8, %rsi), %xmm1
155 movdqu 32(%r8, %rsi), %xmm2
156 movdqu 48(%r8, %rsi), %xmm3
157 movdqa %xmm0, (%r8)
158 movdqa %xmm1, 16(%r8)
159 movdqa %xmm2, 32(%r8)
160 movdqa %xmm3, 48(%r8)
161 lea 64(%r8), %r8
162 cmp %r8, %rdx
163 jne L(main_loop_cache)
164
165L(main_loop_last_two_iterations):
166 movdqu (%r8, %rsi), %xmm0
167 movdqu 16(%r8, %rsi), %xmm1
168 movdqu 32(%r8, %rsi), %xmm2
169 movdqu 48(%r8, %rsi), %xmm3
170 movdqu 64(%r8, %rsi), %xmm4
171 movdqu 80(%r8, %rsi), %xmm5
172 movdqu 96(%r8, %rsi), %xmm6
173 movdqu 112(%r8, %rsi), %xmm7
174 movdqa %xmm0, (%r8)
175 movdqa %xmm1, 16(%r8)
176 movdqa %xmm2, 32(%r8)
177 movdqa %xmm3, 48(%r8)
178 movdqa %xmm4, 64(%r8)
179 movdqa %xmm5, 80(%r8)
180 movdqa %xmm6, 96(%r8)
181 movdqa %xmm7, 112(%r8)
182 jmp L(return)
183
184L(main_loop_just_one_iteration):
185 movdqu (%r8, %rsi), %xmm0
186 movdqu 16(%r8, %rsi), %xmm1
187 movdqu 32(%r8, %rsi), %xmm2
188 movdqu 48(%r8, %rsi), %xmm3
189 movdqa %xmm0, (%r8)
190 movdqa %xmm1, 16(%r8)
191 movdqa %xmm2, 32(%r8)
192 movdqa %xmm3, 48(%r8)
193 jmp L(return)
194
195L(large_page):
196 movdqu (%rsi), %xmm0
197 movdqu 16(%rsi), %xmm1
198 movdqu 32(%rsi), %xmm2
199 movdqu 48(%rsi), %xmm3
200 movdqu -64(%rsi, %rdx), %xmm4
201 movdqu -48(%rsi, %rdx), %xmm5
202 movdqu -32(%rsi, %rdx), %xmm6
203 movdqu -16(%rsi, %rdx), %xmm7
204 movdqu %xmm0, (%rdi)
205 movdqu %xmm1, 16(%rdi)
206 movdqu %xmm2, 32(%rdi)
207 movdqu %xmm3, 48(%rdi)
208 movdqu %xmm4, -64(%rdi, %rdx)
209 movdqu %xmm5, -48(%rdi, %rdx)
210 movdqu %xmm6, -32(%rdi, %rdx)
211 movdqu %xmm7, -16(%rdi, %rdx)
212
213 movdqu 64(%rsi), %xmm0
214 movdqu 80(%rsi), %xmm1
215 movdqu 96(%rsi), %xmm2
216 movdqu 112(%rsi), %xmm3
217 movdqu -128(%rsi, %rdx), %xmm4
218 movdqu -112(%rsi, %rdx), %xmm5
219 movdqu -96(%rsi, %rdx), %xmm6
220 movdqu -80(%rsi, %rdx), %xmm7
221 movdqu %xmm0, 64(%rdi)
222 movdqu %xmm1, 80(%rdi)
223 movdqu %xmm2, 96(%rdi)
224 movdqu %xmm3, 112(%rdi)
225 movdqu %xmm4, -128(%rdi, %rdx)
226 movdqu %xmm5, -112(%rdi, %rdx)
227 movdqu %xmm6, -96(%rdi, %rdx)
228 movdqu %xmm7, -80(%rdi, %rdx)
229
230/* Now the main loop with non temporal stores. We align
231 the address of the destination. */
232 lea 128(%rdi), %r8
233 and $-128, %r8
234
235 add %rdi, %rdx
236 and $-128, %rdx
237
238 sub %rdi, %rsi
239
240 .p2align 4
241L(main_loop_large_page):
242 movdqu (%r8, %rsi), %xmm0
243 movdqu 16(%r8, %rsi), %xmm1
244 movdqu 32(%r8, %rsi), %xmm2
245 movdqu 48(%r8, %rsi), %xmm3
246 movdqu 64(%r8, %rsi), %xmm4
247 movdqu 80(%r8, %rsi), %xmm5
248 movdqu 96(%r8, %rsi), %xmm6
249 movdqu 112(%r8, %rsi), %xmm7
250 movntdq %xmm0, (%r8)
251 movntdq %xmm1, 16(%r8)
252 movntdq %xmm2, 32(%r8)
253 movntdq %xmm3, 48(%r8)
254 movntdq %xmm4, 64(%r8)
255 movntdq %xmm5, 80(%r8)
256 movntdq %xmm6, 96(%r8)
257 movntdq %xmm7, 112(%r8)
258 lea 128(%r8), %r8
259 cmp %r8, %rdx
260 jne L(main_loop_large_page)
261 sfence
262 jmp L(return)
263
264L(len_0_16_bytes):
265 testb $24, %dl
266 jne L(len_9_16_bytes)
267 testb $4, %dl
268 .p2align 4,,5
269 jne L(len_5_8_bytes)
270 test %rdx, %rdx
271 .p2align 4,,2
272 je L(return)
273 movzbl (%rsi), %ebx
274 testb $2, %dl
275 movb %bl, (%rdi)
276 je L(return)
277 movzwl -2(%rsi,%rdx), %ebx
278 movw %bx, -2(%rdi,%rdx)
279 jmp L(return)
280
281L(len_9_16_bytes):
282 movq (%rsi), %xmm0
283 movq -8(%rsi, %rdx), %xmm1
284 movq %xmm0, (%rdi)
285 movq %xmm1, -8(%rdi, %rdx)
286 jmp L(return)
287
288L(len_5_8_bytes):
289 movl (%rsi), %ebx
290 movl %ebx, (%rdi)
291 movl -4(%rsi,%rdx), %ebx
292 movl %ebx, -4(%rdi,%rdx)
293 jmp L(return)
294
295L(return):
296 mov %rdi, %rax
297 RETURN
298
299END (MEMCPY)