Add 64-bit Silvermont-optimized string/memory functions.

Add the following functions:
bcopy, bzero, memcpy, memmove, memset, stpcpy, stpncpy, strcat, strcpy,
strlen, strncat, strncpy, memcmp, strcmp, strncmp.
Make all of these implementations the defaults for x86_64.
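
For reference, a minimal illustrative check of the expected semantics (a
sketch only, not part of this patch; it only uses standard libc calls):

    #include <assert.h>
    #include <string.h>

    int main(void) {
        char src[64], dst[64];
        memset(src, 0x5a, sizeof(src));            /* fill the source with a pattern */
        void *ret = memcpy(dst, src, sizeof(dst)); /* 64 bytes stays on the short-copy path */
        assert(ret == dst);                        /* memcpy returns its destination */
        assert(memcmp(dst, src, sizeof(dst)) == 0);
        return 0;
    }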

Change-Id: Ic66b250ad8c349a43d25e2d4dea075604f6df6ac
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
diff --git a/libc/arch-x86_64/string/sse2-memcpy-slm.S b/libc/arch-x86_64/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..4c30fb6
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memcpy-slm.S
@@ -0,0 +1,315 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY		memcpy
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)		\
+	cfi_adjust_cfa_offset (4);		\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+	cfi_adjust_cfa_offset (-4);		\
+	cfi_restore (REG)
+
+#define PUSH(REG)	push REG;
+#define POP(REG)	pop REG;
+
+#define ENTRANCE	PUSH (%rbx);
+#define RETURN_END	POP (%rbx); ret
+#define RETURN		RETURN_END;
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	cmp	%rsi, %rdi
+	je	L(return)
+
+	cmp	$16, %rdx
+	jbe	L(len_0_16_bytes)
+
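+/* Copies of at least half the shared cache size bypass this cache-resident
+	path and use non-temporal stores instead (see L(large_page) below).  */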
+	cmp	$SHARED_CACHE_SIZE_HALF, %rdx
+	jae	L(large_page)
+
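+/* Copy the first and last 16 bytes with unaligned loads and stores; for
+	lengths up to 32 the two stores may overlap, which is harmless.  */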
+	movdqu	(%rsi), %xmm0
+	movdqu	-16(%rsi, %rdx), %xmm1
+	cmp	$32, %rdx
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, -16(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	16(%rsi), %xmm0
+	movdqu	-32(%rsi, %rdx), %xmm1
+	cmp	$64, %rdx
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm1, -32(%rdi, %rdx)
+	jbe	L(return)
+
+	movdqu	32(%rsi), %xmm0
+	movdqu	48(%rsi), %xmm1
+	movdqu	-48(%rsi, %rdx), %xmm2
+	movdqu	-64(%rsi, %rdx), %xmm3
+	cmp	$128, %rdx
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm1, 48(%rdi)
+	movdqu	%xmm2, -48(%rdi, %rdx)
+	movdqu	%xmm3, -64(%rdi, %rdx)
+	jbe	L(return)
+
+/* Now the main loop: we align the destination address to a 64-byte boundary.  */
+	lea	64(%rdi), %r8
+	and	$-64, %r8
+
+	add	%rdi, %rdx
+	and	$-64, %rdx
+
+	sub	%rdi, %rsi
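+/* %r8 is now the first 64-byte-aligned destination address past the head,
+	%rdx the aligned end, and %rsi the source-destination displacement.  */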
+
+/* Peel the last two iterations off the prefetching loop so that we
+	do not prefetch past the end of the source buffer.  */
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_just_one_iteration)
+
+	sub	$64, %rdx
+	cmp	%r8, %rdx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
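+/* Prefetch the source two iterations (128 bytes) ahead of the loads.  */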
+	prefetcht0 128(%r8, %rsi)
+
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	lea	64(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	movdqa	%xmm4, 64(%r8)
+	movdqa	%xmm5, 80(%r8)
+	movdqa	%xmm6, 96(%r8)
+	movdqa	%xmm7, 112(%r8)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqa	%xmm0, (%r8)
+	movdqa	%xmm1, 16(%r8)
+	movdqa	%xmm2, 32(%r8)
+	movdqa	%xmm3, 48(%r8)
+	jmp	L(return)
+
+L(large_page):
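+/* Large copy: move the first and last 128 bytes with regular stores, then
+	handle the 128-byte-aligned middle with non-temporal stores so that a
+	copy this big does not evict the whole cache.  */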
+	movdqu	(%rsi), %xmm0
+	movdqu	16(%rsi), %xmm1
+	movdqu	32(%rsi), %xmm2
+	movdqu	48(%rsi), %xmm3
+	movdqu	-64(%rsi, %rdx), %xmm4
+	movdqu	-48(%rsi, %rdx), %xmm5
+	movdqu	-32(%rsi, %rdx), %xmm6
+	movdqu	-16(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm1, 16(%rdi)
+	movdqu	%xmm2, 32(%rdi)
+	movdqu	%xmm3, 48(%rdi)
+	movdqu	%xmm4, -64(%rdi, %rdx)
+	movdqu	%xmm5, -48(%rdi, %rdx)
+	movdqu	%xmm6, -32(%rdi, %rdx)
+	movdqu	%xmm7, -16(%rdi, %rdx)
+
+	movdqu	64(%rsi), %xmm0
+	movdqu	80(%rsi), %xmm1
+	movdqu	96(%rsi), %xmm2
+	movdqu	112(%rsi), %xmm3
+	movdqu	-128(%rsi, %rdx), %xmm4
+	movdqu	-112(%rsi, %rdx), %xmm5
+	movdqu	-96(%rsi, %rdx), %xmm6
+	movdqu	-80(%rsi, %rdx), %xmm7
+	movdqu	%xmm0, 64(%rdi)
+	movdqu	%xmm1, 80(%rdi)
+	movdqu	%xmm2, 96(%rdi)
+	movdqu	%xmm3, 112(%rdi)
+	movdqu	%xmm4, -128(%rdi, %rdx)
+	movdqu	%xmm5, -112(%rdi, %rdx)
+	movdqu	%xmm6, -96(%rdi, %rdx)
+	movdqu	%xmm7, -80(%rdi, %rdx)
+
+/* Now the main loop with non-temporal stores.  We align the address
+	of the destination to a 128-byte boundary.  */
+	lea	128(%rdi), %r8
+	and	$-128, %r8
+
+	add	%rdi, %rdx
+	and	$-128, %rdx
+
+	sub	%rdi, %rsi
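+/* Same register setup as above, but with 128-byte alignment.  */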
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%r8, %rsi), %xmm0
+	movdqu	16(%r8, %rsi), %xmm1
+	movdqu	32(%r8, %rsi), %xmm2
+	movdqu	48(%r8, %rsi), %xmm3
+	movdqu	64(%r8, %rsi), %xmm4
+	movdqu	80(%r8, %rsi), %xmm5
+	movdqu	96(%r8, %rsi), %xmm6
+	movdqu	112(%r8, %rsi), %xmm7
+	movntdq	%xmm0, (%r8)
+	movntdq	%xmm1, 16(%r8)
+	movntdq	%xmm2, 32(%r8)
+	movntdq	%xmm3, 48(%r8)
+	movntdq	%xmm4, 64(%r8)
+	movntdq	%xmm5, 80(%r8)
+	movntdq	%xmm6, 96(%r8)
+	movntdq	%xmm7, 112(%r8)
+	lea	128(%r8), %r8
+	cmp	%r8, %rdx
+	jne	L(main_loop_large_page)
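+/* Fence the weakly-ordered non-temporal stores before returning.  */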
+	sfence
+	jmp	L(return)
+
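+/* Lengths 0..16: dispatch on the low bits of the length.  Bit 3 or bit 4
+	set means 8..16 bytes, bit 2 set means 4..7 bytes; otherwise at most
+	3 bytes remain and are copied with byte and word moves.  */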
+L(len_0_16_bytes):
+	testb	$24, %dl
+	jne	L(len_9_16_bytes)
+	testb	$4, %dl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	test	%rdx, %rdx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%rsi), %ebx
+	testb	$2, %dl
+	movb	%bl, (%rdi)
+	je	L(return)
+	movzwl	-2(%rsi,%rdx), %ebx
+	movw	%bx, -2(%rdi,%rdx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%rsi), %xmm0
+	movq	-8(%rsi, %rdx), %xmm1
+	movq	%xmm0, (%rdi)
+	movq	%xmm1, -8(%rdi, %rdx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%rsi), %ebx
+	movl	%ebx, (%rdi)
+	movl	-4(%rsi,%rdx), %ebx
+	movl	%ebx, -4(%rdi,%rdx)
+	jmp	L(return)
+
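+/* memcpy returns its destination pointer.  */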
+L(return):
+	mov 	%rdi, %rax
+	RETURN
+
+END (MEMCPY)