/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "cache.h"
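
/* cache.h provides SHARED_CACHE_SIZE_HALF, the threshold used below to
   switch from ordinary cached stores to the non-temporal (movntdq) path.  */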

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name, @function;	\
	.globl name;		\
	.p2align 4;		\
name:				\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define CFI_PUSH(REG)			\
	cfi_adjust_cfa_offset (4);	\
	cfi_rel_offset (REG, 0)

#define CFI_POP(REG)			\
	cfi_adjust_cfa_offset (-4);	\
	cfi_restore (REG)
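
/* Note: CFI_PUSH/CFI_POP describe a 4-byte push/pop and appear to be
   carried over from the 32-bit variant of this file; they are not
   referenced below (a 64-bit push would move the CFA by 8).  */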

#define PUSH(REG)	push REG;
#define POP(REG)	pop REG;

#define ENTRANCE	PUSH (%rbx);
#define RETURN_END	POP (%rbx); ret
#define RETURN		RETURN_END;
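
/* %rbx is callee-saved in the SysV AMD64 ABI and is used as a scratch
   register below, so ENTRANCE saves it and every RETURN restores it.
   Arguments arrive as dst in %rdi, src in %rsi, len in %rdx
   (USE_AS_BCOPY swaps the first two, since bcopy takes src first).  */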

	.section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
	ENTRANCE
#ifdef USE_AS_BCOPY
	xchg	%rsi, %rdi
#endif
	mov	%rdi, %rax	/* return value is the original dst */

/* Check whether we should copy backward or forward.  */
	cmp	%rsi, %rdi
	je	L(mm_return)
	ja	L(mm_len_0_or_more_backward)
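
/* Illustrative sketch of the dispatch above (not part of the build):
     if (dst == src) return dst;      // nothing to move
     if (dst > src)  copy backward;   // dst may overlap the tail of src
     else            copy forward;    // dst may overlap the head of src
   Copying toward the overlap-free end makes the move safe for any
   pair of overlapping buffers.  */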

/* Dispatch on length.  The ranges [0..16], (16..32], (32..64] and
   (64..128] are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_forward)

	cmp	$32, %rdx
	jg	L(mm_len_32_or_more_forward)

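/* Each block below loads both the head and the tail of the range before
   storing anything, so the (possibly overlapping) 16-byte chunks can be
   written back safely even when source and destination overlap.  */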
/* Copy (16..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmp	$64, %rdx
	jg	L(mm_len_64_or_more_forward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmp	$128, %rdx
	jg	L(mm_len_128_or_more_forward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_forward)
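
/* A copy of at least half the shared cache would evict useful data if it
   went through the cache, so such lengths take the streaming (movntdq)
   path; smaller copies stay on ordinary cached stores below.  */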

	mov	%rsi, %r8	/* copy src to %r8 */
	mov	%rdi, %r9	/* copy dst to %r9 */

/* Align the destination address; first save the 64 (possibly unaligned)
   head bytes of the source.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%r9), %rdi
	and	$-64, %rdi	/* %rdi now aligned to the next 64-byte boundary */

	sub	%r9, %rsi	/* %rsi = src - dst, the difference */
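
/* From here on %rsi holds (src - dst), so for any destination address
   the matching source byte is at that address plus %rsi.  For example,
   with src = 0x1009 and dst = 0x2000, %rsi = -0xff7, and a load from
   (%rdi, %rsi) with %rdi = 0x2040 reads from 0x1049, i.e. src + 0x40.  */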

	movdqu	(%rdi, %rsi), %xmm4
	movdqu	16(%rdi, %rsi), %xmm5
	movdqu	32(%rdi, %rsi), %xmm6
	movdqu	48(%rdi, %rsi), %xmm7

	movdqu	%xmm0, (%r9)
	movdqu	%xmm1, 16(%r9)
	movdqu	%xmm2, 32(%r9)
	movdqu	%xmm3, 48(%r9)
	movdqa	%xmm4, (%rdi)
	movdqa	%xmm5, 16(%rdi)
	movdqa	%xmm6, 32(%rdi)
	movdqa	%xmm7, 48(%rdi)
	add	$64, %rdi

	lea	(%r9, %rdx), %rbx
	and	$-64, %rbx

	cmp	%rdi, %rbx
	jbe	L(mm_copy_remaining_forward)

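/* Main forward loop: the source may be misaligned, so loads use movdqu,
   while stores can use the aligned movdqa because %rdi is now 64-byte
   aligned.  %rbx marks the last 64-byte-aligned destination boundary.  */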
	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%rdi, %rsi)	/* prefetch the source two 64-byte blocks ahead */

	movdqu	(%rdi, %rsi), %xmm0
	movdqu	16(%rdi, %rsi), %xmm1
	movdqu	32(%rdi, %rsi), %xmm2
	movdqu	48(%rdi, %rsi), %xmm3
	movdqa	%xmm0, (%rdi)
	movdqa	%xmm1, 16(%rdi)
	movdqa	%xmm2, 32(%rdi)
	movdqa	%xmm3, 48(%rdi)
	lea	64(%rdi), %rdi
	cmp	%rdi, %rbx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	add	%r9, %rdx
	sub	%rdi, %rdx
/* Everything up to the %rdi position in dst has been copied; %rdx now
   holds the number of bytes still to copy.  Advance %r8 to the matching
   source position.  */
	lea	(%rdi, %rsi), %r8

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %rdx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %rdx
	ja	L(mm_remaining_17_32_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)

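/* 1..16 bytes remain: pick the widest scalar size that covers the length
   and copy a (possibly overlapping) head and tail pair.  */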
	cmpb	$8, %dl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %dl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %dl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%r8,%rdx), %esi
	movzbl	(%r8), %ebx
	movb	%sil, -1(%rdi,%rdx)
	movb	%bl, (%rdi)
	jmp	L(mm_return)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%r8), %xmm0
	movdqu	16(%r8), %xmm1
	movdqu	-32(%r8, %rdx), %xmm2
	movdqu	-16(%r8, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -32(%rdi, %rdx)
	movdqu	%xmm3, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%r8), %xmm0
	movdqu	-16(%r8, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%r8,%rdx), %esi
	movzwl	(%r8), %ebx
	movw	%si, -2(%rdi,%rdx)
	movw	%bx, (%rdi)
	jmp	L(mm_return)

L(mm_remaining_5_8_bytes_forward):
	movl	(%r8), %esi
	movl	-4(%r8,%rdx), %ebx
	movl	%esi, (%rdi)
	movl	%ebx, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_remaining_9_16_bytes_forward):
	mov	(%r8), %rsi
	mov	-8(%r8, %rdx), %rbx
	mov	%rsi, (%rdi)
	mov	%rbx, -8(%rdi, %rdx)
	jmp	L(mm_return)

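/* Length-bit dispatch for len in [0..16]: testb $24 (bits 3|4) is
   non-zero exactly when len >= 8; testb $4 catches 4..7; a zero length
   returns; testb $2 catches 2..3; what remains is a single byte.  */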
L(mm_len_0_16_bytes_forward):
	testb	$24, %dl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %dl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %esi
	movb	%bl, -1(%rdi,%rdx)
	movb	%sil, (%rdi)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %esi
	movw	%bx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %esi
	movl	%ebx, (%rdi)
	movl	%esi, -4(%rdi,%rdx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	mov	(%rsi), %rbx
	mov	-8(%rsi, %rdx), %rsi
	mov	%rbx, (%rdi)
	mov	%rsi, -8(%rdi, %rdx)
	jmp	L(mm_return)

/* The code for copying backward.  */
L(mm_len_0_or_more_backward):

/* Dispatch on length.  The ranges [0..16], (16..32], (32..64] and
   (64..128] are handled separately.  */
	cmp	$16, %rdx
	jbe	L(mm_len_0_16_bytes_backward)

	cmp	$32, %rdx
	jg	L(mm_len_32_or_more_backward)

/* Copy (16..32] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	-16(%rsi, %rdx), %xmm1
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmp	$64, %rdx
	jg	L(mm_len_64_or_more_backward)

/* Copy (32..64] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	-16(%rsi, %rdx), %xmm2
	movdqu	-32(%rsi, %rdx), %xmm3
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, -16(%rdi, %rdx)
	movdqu	%xmm3, -32(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmp	$128, %rdx
	jg	L(mm_len_128_or_more_backward)

/* Copy (64..128] bytes and return.  */
	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3
	movdqu	-64(%rsi, %rdx), %xmm4
	movdqu	-48(%rsi, %rdx), %xmm5
	movdqu	-32(%rsi, %rdx), %xmm6
	movdqu	-16(%rsi, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):

	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
	jae	L(mm_large_page_backward)

/* Align the destination address.  We need to save the last 64 bytes of
   the source so that they are not overwritten.  */
	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9	/* %r9 = aligned end of dst */

	mov	%rsi, %r8
	sub	%rdi, %r8	/* %r8 = src - dst, the difference */
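
/* As in the forward path, %r8 = (src - dst) means that (%r9, %r8)
   addresses the source bytes that correspond to the aligned destination
   tail %r9.  */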

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movdqa	%xmm4, -16(%r9)
	movdqa	%xmm5, -32(%r9)
	movdqa	%xmm6, -48(%r9)
	movdqa	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	64(%rdi), %rbx
	and	$-64, %rbx

/* Compute in %rdx how many bytes are left to copy after the main loop
   stops; the small-size code at L(mm_len_0_or_more_backward) then
   finishes the job with this reduced length.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx

	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_len_0_or_more_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%r9, %r8)	/* prefetch the source two 64-byte blocks ahead (moving down) */

	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movdqa	%xmm0, -64(%r9)
	movdqa	%xmm1, -48(%r9)
	movdqa	%xmm2, -32(%r9)
	movdqa	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_main_loop_backward)
	jmp	L(mm_len_0_or_more_backward)

/* Copy [0..16] bytes and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %dl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %dl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	test	%rdx, %rdx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %dl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%rsi,%rdx), %ebx
	movzbl	(%rsi), %ecx
	movb	%bl, -1(%rdi,%rdx)
	movb	%cl, (%rdi)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%rsi,%rdx), %ebx
	movzwl	(%rsi), %ecx
	movw	%bx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	jmp	L(mm_return)

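/* 9..16 bytes: copy the top 8 bytes with two 4-byte moves, then shrink
   the length by 8 and re-dispatch on the remaining 1..8 bytes.  */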
L(mm_len_9_16_bytes_backward):
	movl	-4(%rsi,%rdx), %ebx
	movl	-8(%rsi,%rdx), %ecx
	movl	%ebx, -4(%rdi,%rdx)
	movl	%ecx, -8(%rdi,%rdx)
	sub	$8, %rdx
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%rsi), %ebx
	movl	-4(%rsi,%rdx), %ecx
	movl	%ebx, (%rdi)
	movl	%ecx, -4(%rdi,%rdx)

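/* Falls through into the common return.  */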
L(mm_return):
	RETURN

/* Forward copy of large buffers (at least SHARED_CACHE_SIZE_HALF bytes).  */

L(mm_large_page_forward):
/* Align the destination address.  We need to save the first 64 bytes of
   the source so that they are not overwritten.  */

	mov	%rsi, %r8
	mov	%rdi, %r9

	movdqu	(%rsi), %xmm0
	movdqu	16(%rsi), %xmm1
	movdqu	32(%rsi), %xmm2
	movdqu	48(%rsi), %xmm3

	lea	64(%r9), %rdi
	and	$-64, %rdi	/* %rdi = aligned dst */

	sub	%r9, %rsi	/* %rsi = src - dst, the difference */

	movdqu	(%rdi, %rsi), %xmm4
	movdqu	16(%rdi, %rsi), %xmm5
	movdqu	32(%rdi, %rsi), %xmm6
	movdqu	48(%rdi, %rsi), %xmm7

	movdqu	%xmm0, (%r9)
	movdqu	%xmm1, 16(%r9)
	movdqu	%xmm2, 32(%r9)
	movdqu	%xmm3, 48(%r9)
	movntdq	%xmm4, (%rdi)
	movntdq	%xmm5, 16(%rdi)
	movntdq	%xmm6, 32(%rdi)
	movntdq	%xmm7, 48(%rdi)
	add	$64, %rdi

	lea	(%r9, %rdx), %rbx
	and	$-128, %rbx

	cmp	%rdi, %rbx
	jbe	L(mm_copy_remaining_forward)

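/* Streaming loop: movntdq stores write-combine directly to memory,
   bypassing the caches; with a copy this large the data would not be
   reused from cache anyway, and this avoids evicting the working set.
   The loop moves 128 bytes per iteration.  */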
	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%rdi, %rsi), %xmm0
	movdqu	16(%rdi, %rsi), %xmm1
	movdqu	32(%rdi, %rsi), %xmm2
	movdqu	48(%rdi, %rsi), %xmm3
	movdqu	64(%rdi, %rsi), %xmm4
	movdqu	80(%rdi, %rsi), %xmm5
	movdqu	96(%rdi, %rsi), %xmm6
	movdqu	112(%rdi, %rsi), %xmm7
	movntdq	%xmm0, (%rdi)
	movntdq	%xmm1, 16(%rdi)
	movntdq	%xmm2, 32(%rdi)
	movntdq	%xmm3, 48(%rdi)
	movntdq	%xmm4, 64(%rdi)
	movntdq	%xmm5, 80(%rdi)
	movntdq	%xmm6, 96(%rdi)
	movntdq	%xmm7, 112(%rdi)
	lea	128(%rdi), %rdi
	cmp	%rdi, %rbx
	ja	L(mm_large_page_loop_forward)
	sfence
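/* Non-temporal stores are weakly ordered; the sfence above makes them
   globally visible before the ordinary stores that copy the tail.  */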

	add	%r9, %rdx
	sub	%rdi, %rdx
/* Everything up to the %rdi position in dst has been copied; %rdx now
   holds the number of bytes still to copy.  Advance %r8 to the matching
   source position.  */
	lea	(%rdi, %rsi), %r8

	cmp	$64, %rdx
	jb	L(mm_remaining_0_64_bytes_forward)

	movdqu	(%r8), %xmm0
	movdqu	16(%r8), %xmm1
	movdqu	32(%r8), %xmm2
	movdqu	48(%r8), %xmm3
	movdqu	-64(%r8, %rdx), %xmm4
	movdqu	-48(%r8, %rdx), %xmm5
	movdqu	-32(%r8, %rdx), %xmm6
	movdqu	-16(%r8, %rdx), %xmm7
	movdqu	%xmm0, (%rdi)
	movdqu	%xmm1, 16(%rdi)
	movdqu	%xmm2, 32(%rdi)
	movdqu	%xmm3, 48(%rdi)
	movdqu	%xmm4, -64(%rdi, %rdx)
	movdqu	%xmm5, -48(%rdi, %rdx)
	movdqu	%xmm6, -32(%rdi, %rdx)
	movdqu	%xmm7, -16(%rdi, %rdx)
	jmp	L(mm_return)

/* Backward copy of large buffers (at least SHARED_CACHE_SIZE_HALF bytes).  */
L(mm_large_page_backward):
/* Align the destination address.  We need to save the last 64 bytes of
   the source so that they are not overwritten.  */

	movdqu	-16(%rsi, %rdx), %xmm0
	movdqu	-32(%rsi, %rdx), %xmm1
	movdqu	-48(%rsi, %rdx), %xmm2
	movdqu	-64(%rsi, %rdx), %xmm3

	lea	(%rdi, %rdx), %r9
	and	$-64, %r9

	mov	%rsi, %r8
	sub	%rdi, %r8

	movdqu	-16(%r9, %r8), %xmm4
	movdqu	-32(%r9, %r8), %xmm5
	movdqu	-48(%r9, %r8), %xmm6
	movdqu	-64(%r9, %r8), %xmm7

	movdqu	%xmm0, -16(%rdi, %rdx)
	movdqu	%xmm1, -32(%rdi, %rdx)
	movdqu	%xmm2, -48(%rdi, %rdx)
	movdqu	%xmm3, -64(%rdi, %rdx)
	movntdq	%xmm4, -16(%r9)
	movntdq	%xmm5, -32(%r9)
	movntdq	%xmm6, -48(%r9)
	movntdq	%xmm7, -64(%r9)
	lea	-64(%r9), %r9

	lea	128(%rdi), %rbx
	and	$-64, %rbx

/* Compute in %rdx how many bytes are left to copy after the main loop
   stops; the small-size code at L(mm_len_0_or_more_backward) then
   finishes the job with this reduced length.  */
	mov	%rbx, %rdx
	sub	%rdi, %rdx

	cmp	%r9, %rbx
	jae	L(mm_len_0_or_more_backward)

	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%r9, %r8), %xmm0
	movdqu	-48(%r9, %r8), %xmm1
	movdqu	-32(%r9, %r8), %xmm2
	movdqu	-16(%r9, %r8), %xmm3
	movntdq	%xmm0, -64(%r9)
	movntdq	%xmm1, -48(%r9)
	movntdq	%xmm2, -32(%r9)
	movntdq	%xmm3, -16(%r9)
	lea	-64(%r9), %r9
	cmp	%r9, %rbx
	jb	L(mm_large_page_loop_backward)
	jmp	L(mm_len_0_or_more_backward)
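
/* Note: unlike the forward large-copy path, this path issues no sfence
   between the weakly-ordered movntdq stores and the ordinary stores in
   the small-size code it jumps to.  */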

END (MEMMOVE)