/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"

#ifndef MEMCPY
# define MEMCPY memcpy
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
	.type name, @function; \
	.globl name; \
	.p2align 4; \
name: \
	cfi_startproc
#endif

#ifndef END
# define END(name) \
	cfi_endproc; \
	.size name, .-name
#endif

#define DEST PARMS
#define SRC DEST+4
#define LEN SRC+4

#define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8 /* Preserve EBX.  */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)
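
/* ENTRANCE saves %ebx (used below as the copy cursor), so once inside the
   function the stack holds the saved %ebx and the return address: the
   first argument therefore lives at 8(%esp), hence PARMS = 8.  */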

.section .text.sse2,"ax",@progbits
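/* void *MEMCPY (void *dst, const void *src, size_t len)
   cdecl entry point; returns dst in %eax.  */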
ENTRY (MEMCPY)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

	cmp	%eax, %edx
	je	L(return)

	cmp	$16, %ecx
	jbe	L(len_0_16_bytes)

	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
	jae	L(large_page)
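
/* Medium-size copy: move up to 128 bytes with unaligned 16-byte loads and
   stores working inward from both ends of the buffer, exiting as soon as
   the length is covered; longer copies fall through to the 64-byte
   aligned main loop.  */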

	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	cmpl	$32, %ecx
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jbe	L(return)

	movdqu	16(%eax), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	cmpl	$64, %ecx
	movdqu	%xmm0, 16(%edx)
	movdqu	%xmm1, -32(%edx, %ecx)
	jbe	L(return)

	movdqu	32(%eax), %xmm0
	movdqu	48(%eax), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3
	cmpl	$128, %ecx
	movdqu	%xmm0, 32(%edx)
	movdqu	%xmm1, 48(%edx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	jbe	L(return)

/* Now the main loop: align the destination address to a 64-byte boundary.  */
	leal	64(%edx), %ebx
	andl	$-64, %ebx

	addl	%edx, %ecx
	andl	$-64, %ecx

	subl	%edx, %eax
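/* %eax now holds src - dst, so (%ebx, %eax) addresses the source byte that
   corresponds to the destination byte at %ebx.  */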

/* Stop the copy loop two iterations before the aligned end, so that the
   prefetcht0 in the loop, which reaches 128 bytes ahead, never prefetches
   past the source data.  */
	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_just_one_iteration)

	subl	$64, %ecx
	cmpl	%ebx, %ecx
	je	L(main_loop_last_two_iterations)

	.p2align 4
L(main_loop_cache):
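/* Copy 64 bytes per iteration: unaligned loads from the source, aligned
   stores to the destination, prefetching 128 bytes ahead of the loads.  */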

	prefetcht0 128(%ebx, %eax)

	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	lea	64(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_cache)

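/* Exactly 128 bytes remain: copy them as two 64-byte blocks, with no
   prefetch past the end of the source.  */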
L(main_loop_last_two_iterations):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	movdqa	%xmm4, 64(%ebx)
	movdqa	%xmm5, 80(%ebx)
	movdqa	%xmm6, 96(%ebx)
	movdqa	%xmm7, 112(%ebx)
	jmp	L(return)

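/* Exactly 64 bytes remain.  */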
L(main_loop_just_one_iteration):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqa	%xmm0, (%ebx)
	movdqa	%xmm1, 16(%ebx)
	movdqa	%xmm2, 32(%ebx)
	movdqa	%xmm3, 48(%ebx)
	jmp	L(return)

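/* Huge copy (len >= half the shared cache size): copy a 128-byte head and
   tail with ordinary unaligned stores, then stream the aligned middle
   with non-temporal stores so the copy does not evict the whole cache.  */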
L(large_page):
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)

	movdqu	64(%eax), %xmm0
	movdqu	80(%eax), %xmm1
	movdqu	96(%eax), %xmm2
	movdqu	112(%eax), %xmm3
	movdqu	-128(%eax, %ecx), %xmm4
	movdqu	-112(%eax, %ecx), %xmm5
	movdqu	-96(%eax, %ecx), %xmm6
	movdqu	-80(%eax, %ecx), %xmm7
	movdqu	%xmm0, 64(%edx)
	movdqu	%xmm1, 80(%edx)
	movdqu	%xmm2, 96(%edx)
	movdqu	%xmm3, 112(%edx)
	movdqu	%xmm4, -128(%edx, %ecx)
	movdqu	%xmm5, -112(%edx, %ecx)
	movdqu	%xmm6, -96(%edx, %ecx)
	movdqu	%xmm7, -80(%edx, %ecx)

/* Now the main loop with non-temporal stores: align the destination
   address to a 128-byte boundary.  */
	leal	128(%edx), %ebx
	andl	$-128, %ebx

	addl	%edx, %ecx
	andl	$-128, %ecx

	subl	%edx, %eax

	.p2align 4
L(main_loop_large_page):
	movdqu	(%ebx, %eax), %xmm0
	movdqu	16(%ebx, %eax), %xmm1
	movdqu	32(%ebx, %eax), %xmm2
	movdqu	48(%ebx, %eax), %xmm3
	movdqu	64(%ebx, %eax), %xmm4
	movdqu	80(%ebx, %eax), %xmm5
	movdqu	96(%ebx, %eax), %xmm6
	movdqu	112(%ebx, %eax), %xmm7
	movntdq	%xmm0, (%ebx)
	movntdq	%xmm1, 16(%ebx)
	movntdq	%xmm2, 32(%ebx)
	movntdq	%xmm3, 48(%ebx)
	movntdq	%xmm4, 64(%ebx)
	movntdq	%xmm5, 80(%ebx)
	movntdq	%xmm6, 96(%ebx)
	movntdq	%xmm7, 112(%ebx)
	lea	128(%ebx), %ebx
	cmpl	%ebx, %ecx
	jne	L(main_loop_large_page)
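/* movntdq stores are weakly ordered; sfence makes them globally visible
   before we return.  */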
	sfence
	jmp	L(return)

L(len_0_16_bytes):
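/* Dispatch on individual bits of the length instead of comparing: bit 3
   or 4 set means 8..16 bytes, bit 2 set means 4..7 bytes, otherwise 0..3
   bytes are copied with byte/word moves.  */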
	testb	$24, %cl
	jne	L(len_9_16_bytes)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(len_5_8_bytes)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(return)
	movzbl	(%eax), %ebx
	testb	$2, %cl
	movb	%bl, (%edx)
	je	L(return)
	movzwl	-2(%eax,%ecx), %ebx
	movw	%bx, -2(%edx,%ecx)
	jmp	L(return)

L(len_9_16_bytes):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(return)

L(len_5_8_bytes):
	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	movl	-4(%eax,%ecx), %ebx
	movl	%ebx, -4(%edx,%ecx)
	jmp	L(return)

L(return):
	movl	%edx, %eax
	RETURN

END (MEMCPY)