/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"	/* expected to provide SHARED_CACHE_SIZE_HALF, used below */

/* Symbol under which this implementation is exported; a build may
   predefine MEMCPY to emit it under a different name (e.g. an
   SSE2-specific variant alongside other implementations).  */
#ifndef MEMCPY
# define MEMCPY memcpy
#endif

/* L(x): assembler-local (non-exported) label .Lx.  */
#ifndef L
# define L(label) .L##label
#endif

/* DWARF call-frame-information (CFI) helpers.  Only defined here when
   the including environment has not already supplied its own.  */
#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

/* ENTRY/END: standard function bracketing — symbol type, global
   visibility, 16-byte entry alignment, CFI open/close, symbol size.  */
#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

/* Stack offsets of the three cdecl arguments relative to %esp after the
   prologue (PARMS = 4 bytes return address + 4 bytes saved %ebx).  */
#define DEST PARMS
#define SRC DEST+4
#define LEN SRC+4

/* Push/pop wrappers that keep the CFI in sync with the stack moves.  */
#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8		/* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
/* RETURN pops %ebx and returns; the trailing CFI_PUSH re-asserts the
   "%ebx saved" CFI state so unwind info stays balanced for any code
   emitted after this return point.  */
#define RETURN RETURN_END; CFI_PUSH (%ebx)
	.section .text.sse2,"ax",@progbits

/*----------------------------------------------------------------------
 * void *MEMCPY(void *dest, const void *src, size_t len)  — IA-32 cdecl.
 *
 * SSE2 memcpy with three strategies chosen by length:
 *   len <= 16                         -> scalar/8-byte small path;
 *   16 < len < SHARED_CACHE_SIZE_HALF -> unaligned head/tail SSE copies
 *                                        plus a 64-byte aligned-store
 *                                        loop with prefetch;
 *   len >= SHARED_CACHE_SIZE_HALF     -> 128-byte loop of non-temporal
 *                                        stores (bypass the cache),
 *                                        closed with sfence.
 *
 * Register roles:  %edx = dest, %eax = src (later: src - dest delta),
 * %ecx = len (later: aligned end of dest), %ebx = aligned dest cursor.
 * %ebx is callee-saved and preserved by ENTRANCE/RETURN.
 * Returns dest in %eax.
 *--------------------------------------------------------------------*/
ENTRY (MEMCPY)
	ENTRANCE			/* push %ebx (callee-saved scratch) */
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

	cmp	%eax, %edx		/* src == dest: nothing to copy */
	je	L(return)

	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

	/* Very large copies would evict useful cache lines; use the
	   non-temporal path instead.  */
	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(large_page)

	/* 17..128 bytes: copy head and tail with (possibly overlapping)
	   unaligned 16-byte moves, exiting as soon as len is covered.
	   Loads are issued before the compare's stores to overlap work.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)		/* 17..32 bytes fully covered */

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)		/* 33..64 bytes fully covered */

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)		/* 65..128 bytes fully covered */

/* Now the main loop: we align the address of the destination. */
	/* First and last 64 bytes are already written above, so the loop
	   only has to cover the interior in aligned 64-byte chunks.
	   %ebx = first 64-byte boundary strictly above dest.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	/* %ecx = 64-byte-aligned end of the destination range.  */
	addl	%edx, %ecx
	andl	$-64, %ecx

	/* Reuse %eax as (src - dest) so (%ebx,%eax) addresses the source
	   byte corresponding to destination cursor %ebx.  */
	subl	%edx, %eax

/* We should stop two iterations before the termination
   (in order not to misprefetch). */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)	/* exactly one 64B chunk */

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations) /* exactly two 64B chunks */


	.p2align 4
L(main_loop_cache):
	/* Unaligned loads, aligned stores; prefetch two chunks ahead of
	   the source (the loop stops 128 bytes early, so the prefetch
	   stays near the copied range).  */
	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

L(main_loop_last_two_iterations):
	/* Final 128 bytes of the aligned region, no prefetch.  */
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	movdqa	%xmm4, 64(%ebx)
	movdqa	%xmm5, 80(%ebx)
	movdqa	%xmm6, 96(%ebx)
	movdqa	%xmm7, 112(%ebx)
	jmp	L(return)

L(main_loop_just_one_iteration):
	/* Final (single) 64-byte aligned chunk.  */
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	jmp	L(return)

L(large_page):
	/* len >= SHARED_CACHE_SIZE_HALF.  Copy the first and last 128
	   bytes with ordinary unaligned stores (they may straddle
	   alignment), then stream the interior with non-temporal stores.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non temporal stores. We align
   the address of the destination. */
	/* Same setup as the cached loop but with 128-byte granularity:
	   %ebx = first 128-byte boundary above dest, %ecx = aligned end,
	   %eax = src - dest delta.  Head/tail already written above.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	/* Non-temporal (write-combining) stores: do not pollute caches.  */
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
	sfence				/* order NT stores before return */
	jmp	L(return)

L(len_0_16_bytes):
	/* len in [0,16].  Despite the label names below, len == 8 takes
	   the 9..16 path and len == 4 the 5..8 path; the overlapping
	   first/last moves make those cases correct as well.  */
	testb	$24, %cl		/* bit 3 or 4 set => len 8..16 */
	jne	L(len_9_16_bytes)
	testb	$4, %cl			/* bit 2 set => len 4..7 */
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx		/* len 0..3 */
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx		/* len >= 1: copy first byte */
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx	/* len 2..3: copy last two bytes */
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	/* Two (possibly overlapping) 8-byte moves cover len 8..16.  */
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	/* Two (possibly overlapping) 4-byte moves cover len 4..8.  */
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
	jmp	L(return)

L(return):
	movl	%edx, %eax		/* memcpy returns dest */
	RETURN				/* restore %ebx and return */

END (MEMCPY)