Blame - libc/arch-x86_64/string/sse2-memcpy-slm.S - platform_bionic

blob: 4c30fb62eeabb4b29566163eca01d9988d27d1f6 [file] [log] [blame]

Varvara Rainchik	a020a24	2014-04-29 17:44:56 +0400	[diff] [blame]	1	/*
				2	Copyright (c) 2014, Intel Corporation
				3	All rights reserved.
				4
				5	Redistribution and use in source and binary forms, with or without
				6	modification, are permitted provided that the following conditions are met:
				7
				8	* Redistributions of source code must retain the above copyright notice,
				9	* this list of conditions and the following disclaimer.
				10
				11	* Redistributions in binary form must reproduce the above copyright notice,
				12	* this list of conditions and the following disclaimer in the documentation
				13	* and/or other materials provided with the distribution.
				14
				15	* Neither the name of Intel Corporation nor the names of its contributors
				16	* may be used to endorse or promote products derived from this software
				17	* without specific prior written permission.
				18
				19	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
				20	ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
				21	WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
				22	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
				23	ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
				24	(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
				25	LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
				26	ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
				27	(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
				28	SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				29	*/
				30
				31	#include "cache.h"
				32
				33	#if !defined(MEMCPY)
					/* Default name of the routine assembled below; a definition of
					   MEMCPY made before this point takes precedence. */
				34	# define MEMCPY memcpy
				35	#endif
				36
				37	#if !defined(L)
					/* L(x) pastes the .L prefix onto x, giving a label that is not
					   exported from the object file. */
				38	# define L(lbl) .L##lbl
				39	#endif
				40
				41	#if !defined(cfi_startproc)
					/* One-to-one wrappers around the assembler's .cfi_* directives.
					   Each wrapper is defined only when no earlier definition exists. */
				42	# define cfi_startproc .cfi_startproc
				43	#endif
				44
				45	#if !defined(cfi_endproc)
				46	# define cfi_endproc .cfi_endproc
				47	#endif
				48
				49	#if !defined(cfi_rel_offset)
				50	# define cfi_rel_offset(r, o) .cfi_rel_offset r, o
				51	#endif
				52
				53	#if !defined(cfi_restore)
				54	# define cfi_restore(r) .cfi_restore r
				55	#endif
				56
				57	#if !defined(cfi_adjust_cfa_offset)
				58	# define cfi_adjust_cfa_offset(delta) .cfi_adjust_cfa_offset delta
				59	#endif
				60
				61	#if !defined(ENTRY)
					/* ENTRY(sym): export sym as a function symbol, align it to
					   16 bytes (.p2align 4), place the label and open a CFI region. */
				62	# define ENTRY(sym) \
				63	.globl sym; \
				64	.type sym, @function; \
				65	.p2align 4; \
				66	sym: \
				67	cfi_startproc
				68	#endif
				69
				70	#if !defined(END)
					/* END(sym): close the CFI region and record the size of sym as
					   the distance from its label to this point. */
				71	# define END(sym) \
				72	cfi_endproc; \
				73	.size sym, .-sym
				74	#endif
				75
				76	#define CFI_PUSH(REG) \
				77	cfi_adjust_cfa_offset (8); \
				78	cfi_rel_offset (REG, 0)
					/* CFI_PUSH(REG): unwind annotation that must follow a `push REG`.
					   In 64-bit mode a push lowers %rsp by 8 bytes, so the CFA offset
					   grows by 8 (not 4, which is the 32-bit push size) and REG is
					   recorded as saved at offset 0 from the new %rsp. */
				79
				80	#define CFI_POP(REG) \
				81	cfi_adjust_cfa_offset (-8); \
				82	cfi_restore (REG)
					/* CFI_POP(REG): unwind annotation that must follow a `pop REG`;
					   it undoes CFI_PUSH (CFA offset shrinks by 8, REG is marked as
					   holding its entry value again). */
				83
					/* PUSH/POP pair each stack instruction with its CFI annotation so
					   the unwind tables stay correct while REG is on the stack. */
				84	#define PUSH(REG) push REG; CFI_PUSH (REG);
				85	#define POP(REG) pop REG; CFI_POP (REG);
				86
					/* ENTRANCE saves %rbx on entry; RETURN_END/RETURN restore it and
					   return. RETURN must only be used where no code that still expects
					   %rbx on the stack follows it within the same CFI region, because
					   the CFI state after it describes the popped frame. */
				87	#define ENTRANCE PUSH (%rbx);
				88	#define RETURN_END POP (%rbx); ret
				89	#define RETURN RETURN_END;
				90
				91	.section .text.sse2,"ax",@progbits
					/*
					 * MEMCPY (memcpy unless MEMCPY was defined earlier).
					 *
					 * In:  %rdi = destination, %rsi = source, %rdx = byte count.
					 * Out: %rax = the destination pointer as passed in (%rdi is never
					 *      modified).
					 * Modifies %rdx, %rsi, %r8, %xmm0-%xmm7 and the flags. %rbx is used
					 * as scratch by the short-copy paths; ENTRANCE saves it and RETURN
					 * restores it.
					 *
					 * Size classes (each copies a head and a tail that are allowed to
					 * overlap, so no byte-by-byte remainder loop is needed):
					 *   0..16 bytes                    L(len_0_16_bytes)
					 *   17..128 bytes                  straight-line 16-byte moves
					 *   129..SHARED_CACHE_SIZE_HALF-1  64-byte loop, aligned stores
					 *   >= SHARED_CACHE_SIZE_HALF      128-byte loop, non-temporal stores
					 */
				92	ENTRY (MEMCPY)
				93	ENTRANCE
					/* Destination and source are the same address: nothing to copy. */
				94	cmp %rsi, %rdi
				95	je L(return)
				96
				97	cmp $16, %rdx
				98	jbe L(len_0_16_bytes)
				99
				100	cmp $SHARED_CACHE_SIZE_HALF, %rdx
				101	jae L(large_page)
				102
					/* 17..32 bytes: copy the first 16 and the last 16 bytes. Both loads
					   are issued before either store. The jbe below consumes the flags
					   of the cmp; the movdqu instructions in between do not alter them. */
				103	movdqu (%rsi), %xmm0
				104	movdqu -16(%rsi, %rdx), %xmm1
				105	cmp $32, %rdx
				106	movdqu %xmm0, (%rdi)
				107	movdqu %xmm1, -16(%rdi, %rdx)
				108	jbe L(return)
				109
					/* 33..64 bytes: additionally copy bytes [16,32) and
					   [len-32,len-16). */
				110	movdqu 16(%rsi), %xmm0
				111	movdqu -32(%rsi, %rdx), %xmm1
				112	cmp $64, %rdx
				113	movdqu %xmm0, 16(%rdi)
				114	movdqu %xmm1, -32(%rdi, %rdx)
				115	jbe L(return)
				116
					/* 65..128 bytes: additionally copy bytes [32,64) and
					   [len-64,len-32). After this the first 64 and the last 64 bytes
					   of the buffer have been copied. */
				117	movdqu 32(%rsi), %xmm0
				118	movdqu 48(%rsi), %xmm1
				119	movdqu -48(%rsi, %rdx), %xmm2
				120	movdqu -64(%rsi, %rdx), %xmm3
				121	cmp $128, %rdx
				122	movdqu %xmm0, 32(%rdi)
				123	movdqu %xmm1, 48(%rdi)
				124	movdqu %xmm2, -48(%rdi, %rdx)
				125	movdqu %xmm3, -64(%rdi, %rdx)
				126	jbe L(return)
				127
				128	/* Now the main loop: we align the address of the destination. */
					/* %r8 = first 64-byte-aligned address strictly above the
					   destination start; it is the running destination pointer. */
				129	lea 64(%rdi), %r8
				130	and $-64, %r8
				131
					/* %rdx = destination end (dst + len) rounded down to a multiple of
					   64. Bytes from there to the end were already stored above. */
				132	add %rdi, %rdx
				133	and $-64, %rdx
				134
					/* %rsi = src - dst, so (%r8, %rsi) addresses the source byte that
					   corresponds to destination address %r8. */
				135	sub %rdi, %rsi
				136
				137	/* We should stop two iterations before the termination
				138	(in order not to misprefetch). */
					/* %rdx = aligned end - 64: equal to %r8 means exactly one 64-byte
					   block is left. */
				139	sub $64, %rdx
				140	cmp %r8, %rdx
				141	je L(main_loop_just_one_iteration)
				142
					/* %rdx = aligned end - 128: equal to %r8 means exactly two 64-byte
					   blocks are left. */
				143	sub $64, %rdx
				144	cmp %r8, %rdx
				145	je L(main_loop_last_two_iterations)
				146
				147
					/* Copy one 64-byte block per iteration (unaligned loads, aligned
					   stores) while prefetching the source 128 bytes ahead of the block
					   being read. The loop ends when %r8 reaches %rdx, i.e. with two
					   blocks still to copy. */
				148	.p2align 4
				149	L(main_loop_cache):
				150
				151	prefetcht0 128(%r8, %rsi)
				152
				153	movdqu (%r8, %rsi), %xmm0
				154	movdqu 16(%r8, %rsi), %xmm1
				155	movdqu 32(%r8, %rsi), %xmm2
				156	movdqu 48(%r8, %rsi), %xmm3
				157	movdqa %xmm0, (%r8)
				158	movdqa %xmm1, 16(%r8)
				159	movdqa %xmm2, 32(%r8)
				160	movdqa %xmm3, 48(%r8)
				161	lea 64(%r8), %r8
				162	cmp %r8, %rdx
				163	jne L(main_loop_cache)
				164
					/* Final 128 bytes of the aligned region, copied without prefetch:
					   all eight loads first, then all eight aligned stores. */
				165	L(main_loop_last_two_iterations):
				166	movdqu (%r8, %rsi), %xmm0
				167	movdqu 16(%r8, %rsi), %xmm1
				168	movdqu 32(%r8, %rsi), %xmm2
				169	movdqu 48(%r8, %rsi), %xmm3
				170	movdqu 64(%r8, %rsi), %xmm4
				171	movdqu 80(%r8, %rsi), %xmm5
				172	movdqu 96(%r8, %rsi), %xmm6
				173	movdqu 112(%r8, %rsi), %xmm7
				174	movdqa %xmm0, (%r8)
				175	movdqa %xmm1, 16(%r8)
				176	movdqa %xmm2, 32(%r8)
				177	movdqa %xmm3, 48(%r8)
				178	movdqa %xmm4, 64(%r8)
				179	movdqa %xmm5, 80(%r8)
				180	movdqa %xmm6, 96(%r8)
				181	movdqa %xmm7, 112(%r8)
				182	jmp L(return)
				183
					/* The aligned region is a single 64-byte block. */
				184	L(main_loop_just_one_iteration):
				185	movdqu (%r8, %rsi), %xmm0
				186	movdqu 16(%r8, %rsi), %xmm1
				187	movdqu 32(%r8, %rsi), %xmm2
				188	movdqu 48(%r8, %rsi), %xmm3
				189	movdqa %xmm0, (%r8)
				190	movdqa %xmm1, 16(%r8)
				191	movdqa %xmm2, 32(%r8)
				192	movdqa %xmm3, 48(%r8)
				193	jmp L(return)
				194
					/* len >= SHARED_CACHE_SIZE_HALF. First copy bytes [0,64) and
					   [len-64,len) with unaligned moves. */
				195	L(large_page):
				196	movdqu (%rsi), %xmm0
				197	movdqu 16(%rsi), %xmm1
				198	movdqu 32(%rsi), %xmm2
				199	movdqu 48(%rsi), %xmm3
				200	movdqu -64(%rsi, %rdx), %xmm4
				201	movdqu -48(%rsi, %rdx), %xmm5
				202	movdqu -32(%rsi, %rdx), %xmm6
				203	movdqu -16(%rsi, %rdx), %xmm7
				204	movdqu %xmm0, (%rdi)
				205	movdqu %xmm1, 16(%rdi)
				206	movdqu %xmm2, 32(%rdi)
				207	movdqu %xmm3, 48(%rdi)
				208	movdqu %xmm4, -64(%rdi, %rdx)
				209	movdqu %xmm5, -48(%rdi, %rdx)
				210	movdqu %xmm6, -32(%rdi, %rdx)
				211	movdqu %xmm7, -16(%rdi, %rdx)
				212
					/* Then bytes [64,128) and [len-128,len-64), so the first and the
					   last 128 bytes are done before the aligned loop starts. */
				213	movdqu 64(%rsi), %xmm0
				214	movdqu 80(%rsi), %xmm1
				215	movdqu 96(%rsi), %xmm2
				216	movdqu 112(%rsi), %xmm3
				217	movdqu -128(%rsi, %rdx), %xmm4
				218	movdqu -112(%rsi, %rdx), %xmm5
				219	movdqu -96(%rsi, %rdx), %xmm6
				220	movdqu -80(%rsi, %rdx), %xmm7
				221	movdqu %xmm0, 64(%rdi)
				222	movdqu %xmm1, 80(%rdi)
				223	movdqu %xmm2, 96(%rdi)
				224	movdqu %xmm3, 112(%rdi)
				225	movdqu %xmm4, -128(%rdi, %rdx)
				226	movdqu %xmm5, -112(%rdi, %rdx)
				227	movdqu %xmm6, -96(%rdi, %rdx)
				228	movdqu %xmm7, -80(%rdi, %rdx)
				229
				230	/* Now the main loop with non temporal stores. We align
				231	the address of the destination. */
					/* %r8 = first 128-byte-aligned address strictly above the
					   destination start. */
				232	lea 128(%rdi), %r8
				233	and $-128, %r8
				234
					/* %rdx = destination end rounded down to a multiple of 128. */
				235	add %rdi, %rdx
				236	and $-128, %rdx
				237
					/* %rsi = src - dst, used as the index that maps %r8 to the source. */
				238	sub %rdi, %rsi
				239
					/* Copy 128 bytes per iteration with movntdq (non-temporal) stores
					   until %r8 reaches %rdx.
					   NOTE(review): the body runs before the bound is tested, so this
					   path relies on SHARED_CACHE_SIZE_HALF (not defined in this file;
					   presumably from cache.h) being at least 256 so that %r8 starts
					   below %rdx - confirm. */
				240	.p2align 4
				241	L(main_loop_large_page):
				242	movdqu (%r8, %rsi), %xmm0
				243	movdqu 16(%r8, %rsi), %xmm1
				244	movdqu 32(%r8, %rsi), %xmm2
				245	movdqu 48(%r8, %rsi), %xmm3
				246	movdqu 64(%r8, %rsi), %xmm4
				247	movdqu 80(%r8, %rsi), %xmm5
				248	movdqu 96(%r8, %rsi), %xmm6
				249	movdqu 112(%r8, %rsi), %xmm7
				250	movntdq %xmm0, (%r8)
				251	movntdq %xmm1, 16(%r8)
				252	movntdq %xmm2, 32(%r8)
				253	movntdq %xmm3, 48(%r8)
				254	movntdq %xmm4, 64(%r8)
				255	movntdq %xmm5, 80(%r8)
				256	movntdq %xmm6, 96(%r8)
				257	movntdq %xmm7, 112(%r8)
				258	lea 128(%r8), %r8
				259	cmp %r8, %rdx
				260	jne L(main_loop_large_page)
					/* Store fence after the non-temporal stores, before returning. */
				261	sfence
				262	jmp L(return)
				263
					/* len <= 16 here. Dispatch on the bits of the length:
					   bit 3 or bit 4 set -> 8..16 bytes, bit 2 set -> 4..7 bytes,
					   otherwise 0..3 bytes. */
				264	L(len_0_16_bytes):
				265	testb $24, %dl
				266	jne L(len_9_16_bytes)
				267	testb $4, %dl
					/* Align the next instruction to 16 bytes only if that needs at
					   most 5 bytes of padding. */
				268	.p2align 4,,5
				269	jne L(len_5_8_bytes)
				270	test %rdx, %rdx
					/* Same, with at most 2 bytes of padding. */
				271	.p2align 4,,2
				272	je L(return)
					/* 1..3 bytes: copy the first byte; if bit 1 of the length is set
					   (len is 2 or 3) also copy the last two bytes. The je uses the
					   flags of the testb; the movb in between does not change them. */
				273	movzbl (%rsi), %ebx
				274	testb $2, %dl
				275	movb %bl, (%rdi)
				276	je L(return)
				277	movzwl -2(%rsi,%rdx), %ebx
				278	movw %bx, -2(%rdi,%rdx)
				279	jmp L(return)
				280
					/* Reached for 8..16 bytes (despite the label name): copy the first
					   8 and the last 8 bytes through %xmm0/%xmm1. */
				281	L(len_9_16_bytes):
				282	movq (%rsi), %xmm0
				283	movq -8(%rsi, %rdx), %xmm1
				284	movq %xmm0, (%rdi)
				285	movq %xmm1, -8(%rdi, %rdx)
				286	jmp L(return)
				287
					/* Reached for 4..7 bytes (despite the label name): copy the first
					   4 and the last 4 bytes through %ebx. */
				288	L(len_5_8_bytes):
				289	movl (%rsi), %ebx
				290	movl %ebx, (%rdi)
				291	movl -4(%rsi,%rdx), %ebx
				292	movl %ebx, -4(%rdi,%rdx)
				293	jmp L(return)
				294
					/* Common exit: return the destination pointer; RETURN restores
					   %rbx and executes ret. */
				295	L(return):
				296	mov %rdi, %rax
				297	RETURN
				298
				299	END (MEMCPY)