Add 32-bit Silvermont-optimized string/memory functions.

Add the following functions:
bcopy, memcpy, memmove, memset, bzero, memcmp, wmemcmp, strlen,
strcpy, strncpy, stpcpy, stpncpy.
Create new directories inside arch-x86 to separate implementations by
architecture: atom, silvermont, and generic (architectures that are
neither atom nor silvermont are treated as generic).
Because optimized versions of stpcpy and stpncpy are introduced, the C
implementations of these functions are moved from the makefile shared by
all architectures into the arm- and mips-specific makefiles.

Change-Id: I990f8061c3e9bca1f154119303da9e781c5d086e
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
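
Note: the optimized stpcpy/stpncpy added below must keep the usual POSIX
return-value contract. A minimal C reference sketch of that contract
(illustrative only, not part of the patch):

    #include <stddef.h>

    /* stpcpy: like strcpy, but returns a pointer to the terminating '\0'
       written into dst, which makes chained copies cheap. */
    char *ref_stpcpy(char *dst, const char *src) {
        while ((*dst = *src) != '\0') {
            ++dst;
            ++src;
        }
        return dst;  /* points at the '\0' just stored */
    }

    /* stpncpy: copy at most n bytes, zero-fill the rest, and return a
       pointer to the first '\0' written, or dst + n if none was. */
    char *ref_stpncpy(char *dst, const char *src, size_t n) {
        size_t i = 0;
        for (; i < n && src[i] != '\0'; ++i)
            dst[i] = src[i];
        char *end = dst + i;
        for (; i < n; ++i)
            dst[i] = '\0';
        return end;
    }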
diff --git a/libc/arch-x86/atom/atom.mk b/libc/arch-x86/atom/atom.mk
new file mode 100644
index 0000000..bf408b4
--- /dev/null
+++ b/libc/arch-x86/atom/atom.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-bzero-atom.S \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-memset-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strlen-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/atom/string/ssse3-bcopy-atom.S \
+    arch-x86/atom/string/ssse3-memcmp-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-memcpy-atom.S \
+    arch-x86/atom/string/ssse3-memmove-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strcpy-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strncpy-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S \
+    arch-x86/atom/string/ssse3-wmemcmp-atom.S
+
+libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/atom/string/cache.h
similarity index 89%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/atom/string/cache.h
index 9d0a563..823bb1e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/atom/string/cache.h
@@ -28,15 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
 /* Values are optimized for Atom */
 #define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
 #define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
 #define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
 #define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/sse2-bzero-atom.S b/libc/arch-x86/atom/string/sse2-bzero-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-bzero-atom.S
rename to libc/arch-x86/atom/string/sse2-bzero-atom.S
diff --git a/libc/arch-x86/string/sse2-index-atom.S b/libc/arch-x86/atom/string/sse2-index-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-index-atom.S
rename to libc/arch-x86/atom/string/sse2-index-atom.S
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/atom/string/sse2-memchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/atom/string/sse2-memrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/atom/string/sse2-memset-atom.S
similarity index 98%
rename from libc/arch-x86/string/sse2-memset-atom.S
rename to libc/arch-x86/atom/string/sse2-memset-atom.S
index a54bf51..b0963a1 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/atom/string/sse2-memset-atom.S
@@ -29,7 +29,6 @@
 */
 
 #include "cache.h"
-#undef __i686
 
 #ifndef L
 # define L(label)	.L##label
@@ -107,7 +106,7 @@
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
     /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
+    call	__x86.get_pc_thunk.bx;				\
     /* Get the address of the jump table.  */			\
     add		$(TABLE - .), %ebx;				\
     /* Get the entry and convert the relative offset to the	\
@@ -117,12 +116,12 @@
     /* We loaded the jump table and adjuested EDX. Go.  */	\
     jmp		*%ebx
 
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
+	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
 	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+	.type	__x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
 	movl	(%esp), %ebx
 	ret
 #else
@@ -321,7 +320,7 @@
 	mov	$SHARED_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 # else
@@ -340,7 +339,7 @@
 #else
 # if (defined SHARED || defined __PIC__)
 #  define RESTORE_EBX_STATE
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
 # else
diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/atom/string/sse2-strchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strchr-atom.S
diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/atom/string/sse2-strlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strnlen-atom.S b/libc/arch-x86/atom/string/sse2-strnlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strnlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strnlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/atom/string/sse2-strrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/atom/string/sse2-wcschr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcschr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcschr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/atom/string/sse2-wcscmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcscmp-atom.S
rename to libc/arch-x86/atom/string/sse2-wcscmp-atom.S
diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/atom/string/sse2-wcslen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcslen-atom.S
rename to libc/arch-x86/atom/string/sse2-wcslen-atom.S
diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcsrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
diff --git a/libc/arch-x86/string/ssse3-bcopy-atom.S b/libc/arch-x86/atom/string/ssse3-bcopy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-bcopy-atom.S
rename to libc/arch-x86/atom/string/ssse3-bcopy-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp16-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp16-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-memcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcpy-atom.S
index 1080a38..ac5ec2d 100644
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
@@ -29,7 +29,6 @@
 */
 
 #include "cache.h"
-#undef __i686
 
 #ifndef MEMCPY
 # define MEMCPY	memcpy
@@ -101,9 +100,8 @@
 # define RETURN_END	POP (%ebx); ret
 # define RETURN		RETURN_END; CFI_PUSH (%ebx)
 # define JMPTBL(I, B)	I - B
-# undef __i686
 
-# define SETUP_PIC_REG(x)	call	__i686.get_pc_thunk.x
+# define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
 	jump table with relative offsets.  INDEX is a register contains the
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/atom/string/ssse3-memmove-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memmove-atom.S
rename to libc/arch-x86/atom/string/ssse3-memmove-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/atom/string/ssse3-strcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcmp-atom.S b/libc/arch-x86/atom/string/ssse3-strcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/atom/string/ssse3-strlcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncat-atom.S b/libc/arch-x86/atom/string/ssse3-strncat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncmp-atom.S b/libc/arch-x86/atom/string/ssse3-strncmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/atom/string/ssse3-strncpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/atom/string/ssse3-wcscat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscat-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscat-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wmemcmp-atom.S b/libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wmemcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
diff --git a/libc/arch-x86/generic/generic.mk b/libc/arch-x86/generic/generic.mk
new file mode 100644
index 0000000..c8b40ee
--- /dev/null
+++ b/libc/arch-x86/generic/generic.mk
@@ -0,0 +1,55 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/silvermont/string/sse2-bcopy-slm.S \
+    arch-x86/silvermont/string/sse2-bzero-slm.S \
+    arch-x86/silvermont/string/sse2-memcpy-slm.S \
+    arch-x86/silvermont/string/sse2-memmove-slm.S \
+    arch-x86/silvermont/string/sse2-memset-slm.S \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+    arch-x86/silvermont/string/sse2-strcpy-slm.S \
+    arch-x86/silvermont/string/sse2-strlen-slm.S \
+    arch-x86/silvermont/string/sse2-strncpy-slm.S
+
+ifeq ($(ARCH_X86_HAVE_SSSE3),true)
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S
+else
+libc_bionic_src_files_x86 += \
+    arch-x86/generic/string/strcmp.S \
+    arch-x86/generic/string/strncmp.S \
+    arch-x86/generic/string/strcat.S \
+    bionic/__memcmp16.cpp \
+    upstream-freebsd/lib/libc/string/wcscpy.c \
+    upstream-freebsd/lib/libc/string/wcscat.c \
+    upstream-openbsd/lib/libc/string/strlcat.c \
+    upstream-openbsd/lib/libc/string/strlcpy.c \
+    upstream-openbsd/lib/libc/string/strncat.c
+endif
+
+ifeq ($(ARCH_X86_HAVE_SSE4),true)
+ libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse4-memcmp-slm.S \
+    arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+else
+libc_bionic_src_files_x86 += \
+    arch-x86/generic/string/memcmp.S \
+    upstream-freebsd/lib/libc/string/wmemcmp.c
+endif
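
generic.mk above selects the SSSE3 and SSE4 variants at build time through
the ARCH_X86_HAVE_SSSE3/ARCH_X86_HAVE_SSE4 make variables; there is no
runtime dispatch. For contrast, a sketch of what a runtime check would look
like using the GCC/Clang builtins (illustrative only, not how bionic picks
these implementations):

    /* Illustrative runtime feature test; bionic instead decides at
       build time via the make variables above. */
    int have_ssse3(void) {
        __builtin_cpu_init();
        return __builtin_cpu_supports("ssse3");
    }

    int have_sse4(void) {
        __builtin_cpu_init();
        return __builtin_cpu_supports("sse4.1");
    }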
diff --git a/libc/arch-x86/string/bcopy.S b/libc/arch-x86/generic/string/bcopy.S
similarity index 100%
rename from libc/arch-x86/string/bcopy.S
rename to libc/arch-x86/generic/string/bcopy.S
diff --git a/libc/arch-x86/string/memcmp.S b/libc/arch-x86/generic/string/memcmp.S
similarity index 100%
rename from libc/arch-x86/string/memcmp.S
rename to libc/arch-x86/generic/string/memcmp.S
diff --git a/libc/arch-x86/string/memcpy.S b/libc/arch-x86/generic/string/memcpy.S
similarity index 100%
rename from libc/arch-x86/string/memcpy.S
rename to libc/arch-x86/generic/string/memcpy.S
diff --git a/libc/arch-x86/string/memmove.S b/libc/arch-x86/generic/string/memmove.S
similarity index 100%
rename from libc/arch-x86/string/memmove.S
rename to libc/arch-x86/generic/string/memmove.S
diff --git a/libc/arch-x86/string/strcat.S b/libc/arch-x86/generic/string/strcat.S
similarity index 100%
rename from libc/arch-x86/string/strcat.S
rename to libc/arch-x86/generic/string/strcat.S
diff --git a/libc/arch-x86/string/strcmp.S b/libc/arch-x86/generic/string/strcmp.S
similarity index 100%
rename from libc/arch-x86/string/strcmp.S
rename to libc/arch-x86/generic/string/strcmp.S
diff --git a/libc/arch-x86/string/strncmp.S b/libc/arch-x86/generic/string/strncmp.S
similarity index 100%
rename from libc/arch-x86/string/strncmp.S
rename to libc/arch-x86/generic/string/strncmp.S
diff --git a/libc/arch-x86/string/swab.S b/libc/arch-x86/generic/string/swab.S
similarity index 100%
rename from libc/arch-x86/string/swab.S
rename to libc/arch-x86/generic/string/swab.S
diff --git a/libc/arch-x86/silvermont/silvermont.mk b/libc/arch-x86/silvermont/silvermont.mk
new file mode 100644
index 0000000..b951ad5
--- /dev/null
+++ b/libc/arch-x86/silvermont/silvermont.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+    arch-x86/silvermont/string/sse2-bcopy-slm.S \
+    arch-x86/silvermont/string/sse2-bzero-slm.S \
+    arch-x86/silvermont/string/sse2-memcpy-slm.S \
+    arch-x86/silvermont/string/sse2-memmove-slm.S \
+    arch-x86/silvermont/string/sse2-memset-slm.S \
+    arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+    arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+    arch-x86/silvermont/string/sse2-strcpy-slm.S \
+    arch-x86/silvermont/string/sse2-strlen-slm.S \
+    arch-x86/silvermont/string/sse2-strncpy-slm.S \
+    arch-x86/silvermont/string/sse4-memcmp-slm.S \
+    arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+
+libc_bionic_src_files_x86 += \
+    arch-x86/atom/string/sse2-memchr-atom.S \
+    arch-x86/atom/string/sse2-memrchr-atom.S \
+    arch-x86/atom/string/sse2-strchr-atom.S \
+    arch-x86/atom/string/sse2-strrchr-atom.S \
+    arch-x86/atom/string/sse2-index-atom.S \
+    arch-x86/atom/string/sse2-strnlen-atom.S \
+    arch-x86/atom/string/sse2-wcschr-atom.S \
+    arch-x86/atom/string/sse2-wcsrchr-atom.S \
+    arch-x86/atom/string/sse2-wcslen-atom.S \
+    arch-x86/atom/string/sse2-wcscmp-atom.S \
+    arch-x86/atom/string/ssse3-strncat-atom.S \
+    arch-x86/atom/string/ssse3-strlcat-atom.S \
+    arch-x86/atom/string/ssse3-strlcpy-atom.S \
+    arch-x86/atom/string/ssse3-strcmp-atom.S \
+    arch-x86/atom/string/ssse3-strncmp-atom.S \
+    arch-x86/atom/string/ssse3-strcat-atom.S \
+    arch-x86/atom/string/ssse3-memcmp16-atom.S \
+    arch-x86/atom/string/ssse3-wcscat-atom.S \
+    arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/cache.h
similarity index 90%
rename from libc/arch-x86/string/cache.h
rename to libc/arch-x86/silvermont/string/cache.h
index 9d0a563..c342b1c 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/cache.h
@@ -28,15 +28,9 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
 /* Values are optimized for Silvermont */
 #define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
 #define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
 #define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
 #define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
index 9d0a563..190d52f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
 
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define MEMMOVE	bcopy
+#define USE_AS_BCOPY
+#include "sse2-memmove-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bzero-slm.S
index 9d0a563..b682ed6 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_BZERO
+#define MEMSET  bzero
+#include "sse2-memset-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..1b305c7
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
@@ -0,0 +1,308 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY	memcpy
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#define DEST		PARMS
+#define SRC		DEST+4
+#define LEN		SRC+4
+
+#define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		8		/* Preserve EBX.  */
+#define ENTRANCE	PUSH (%ebx);
+#define RETURN_END	POP (%ebx); ret
+#define RETURN		RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+	cmp	%eax, %edx
+	je	L(return)
+
+	cmp	$16, %ecx
+	jbe	L(len_0_16_bytes)
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(large_page)
+
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	cmpl    $32, %ecx
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	16(%eax), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	cmpl    $64, %ecx
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	jbe	L(return)
+
+	movdqu	32(%eax), %xmm0
+	movdqu	48(%eax), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+	cmpl    $128, %ecx
+	movdqu	%xmm0, 32(%edx)
+	movdqu	%xmm1, 48(%edx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	jbe	L(return)
+
+/* Now the main loop: we align the address of the destination.  */
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+	addl	%edx, %ecx
+	andl	$-64, %ecx
+
+	subl	%edx, %eax
+
+/* We should stop two iterations before the termination
+	(in order not to misprefetch).  */
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_just_one_iteration)
+
+	subl	$64, %ecx
+	cmpl	%ebx, %ecx
+	je	L(main_loop_last_two_iterations)
+
+
+	.p2align 4
+L(main_loop_cache):
+
+	prefetcht0 128(%ebx, %eax)
+
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	lea	64(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	movdqa	%xmm4, 64(%ebx)
+	movdqa	%xmm5, 80(%ebx)
+	movdqa	%xmm6, 96(%ebx)
+	movdqa	%xmm7, 112(%ebx)
+	jmp	L(return)
+
+L(main_loop_just_one_iteration):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqa	%xmm0, (%ebx)
+	movdqa	%xmm1, 16(%ebx)
+	movdqa	%xmm2, 32(%ebx)
+	movdqa	%xmm3, 48(%ebx)
+	jmp	L(return)
+
+L(large_page):
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+
+	movdqu	64(%eax), %xmm0
+	movdqu	80(%eax), %xmm1
+	movdqu	96(%eax), %xmm2
+	movdqu	112(%eax), %xmm3
+	movdqu	-128(%eax, %ecx), %xmm4
+	movdqu	-112(%eax, %ecx), %xmm5
+	movdqu	-96(%eax, %ecx), %xmm6
+	movdqu	-80(%eax, %ecx), %xmm7
+	movdqu	%xmm0, 64(%edx)
+	movdqu	%xmm1, 80(%edx)
+	movdqu	%xmm2, 96(%edx)
+	movdqu	%xmm3, 112(%edx)
+	movdqu	%xmm4, -128(%edx, %ecx)
+	movdqu	%xmm5, -112(%edx, %ecx)
+	movdqu	%xmm6, -96(%edx, %ecx)
+	movdqu	%xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non temporal stores. We align
+	the address of the destination.  */
+	leal	128(%edx), %ebx
+	andl	$-128, %ebx
+
+	addl	%edx, %ecx
+	andl	$-128, %ecx
+
+	subl	%edx, %eax
+
+	.p2align 4
+L(main_loop_large_page):
+	movdqu	(%ebx, %eax), %xmm0
+	movdqu	16(%ebx, %eax), %xmm1
+	movdqu	32(%ebx, %eax), %xmm2
+	movdqu	48(%ebx, %eax), %xmm3
+	movdqu	64(%ebx, %eax), %xmm4
+	movdqu	80(%ebx, %eax), %xmm5
+	movdqu	96(%ebx, %eax), %xmm6
+	movdqu	112(%ebx, %eax), %xmm7
+	movntdq	%xmm0, (%ebx)
+	movntdq	%xmm1, 16(%ebx)
+	movntdq	%xmm2, 32(%ebx)
+	movntdq	%xmm3, 48(%ebx)
+	movntdq	%xmm4, 64(%ebx)
+	movntdq	%xmm5, 80(%ebx)
+	movntdq	%xmm6, 96(%ebx)
+	movntdq	%xmm7, 112(%ebx)
+	lea	128(%ebx), %ebx
+	cmpl	%ebx, %ecx
+	jne	L(main_loop_large_page)
+	sfence
+	jmp	L(return)
+
+L(len_0_16_bytes):
+	testb	$24, %cl
+	jne	L(len_9_16_bytes)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(len_5_8_bytes)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(return)
+	movzbl	(%eax), %ebx
+	testb	$2, %cl
+	movb	%bl, (%edx)
+	je	L(return)
+	movzwl	-2(%eax,%ecx), %ebx
+	movw	%bx, -2(%edx,%ecx)
+	jmp	L(return)
+
+L(len_9_16_bytes):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(return)
+
+L(len_5_8_bytes):
+	movl	(%eax), %ebx
+	movl	%ebx, (%edx)
+	movl	-4(%eax,%ecx), %ebx
+	movl	%ebx, -4(%edx,%ecx)
+	jmp	L(return)
+
+L(return):
+	movl	%edx, %eax
+	RETURN
+
+END (MEMCPY)
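
A note on the structure of sse2-memcpy-slm.S above: lengths up to 16 bytes
take scalar paths, 17-128 bytes are handled with overlapping unaligned
16-byte loads and stores from both ends, copies below half the shared cache
size use a destination-aligned loop with prefetcht0, and larger copies use
non-temporal movntdq stores finished with sfence. The overlapping-ends
trick rendered in portable C (illustrative sketch, not part of the patch):

    #include <stdint.h>
    #include <string.h>

    /* C rendering of the 17..32-byte case: one unaligned 16-byte
       load/store from each end; the two stores may overlap in the
       middle, which is harmless since both carry source bytes. */
    void copy_17_to_32(uint8_t *dst, const uint8_t *src, size_t n) {
        uint8_t head[16], tail[16];
        memcpy(head, src, 16);           /* movdqu (%eax), %xmm0         */
        memcpy(tail, src + n - 16, 16);  /* movdqu -16(%eax,%ecx), %xmm1 */
        memcpy(dst, head, 16);           /* movdqu %xmm0, (%edx)         */
        memcpy(dst + n - 16, tail, 16);  /* movdqu %xmm1, -16(%edx,%ecx) */
    }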
diff --git a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
new file mode 100644
index 0000000..79a0a36
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
@@ -0,0 +1,673 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE	memmove
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)		\
+	.type name,  @function;		\
+	.globl name;		\
+	.p2align 4;		\
+name:		\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)		\
+	cfi_endproc;		\
+	.size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC           PARMS
+# define DEST          SRC+4
+# define LEN           DEST+4
+#else
+# define DEST          PARMS
+# define SRC           DEST+4
+# define LEN           SRC+4
+#endif
+
+#define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
+  cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
+  cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS		8		/* Preserve EBX.  */
+#define ENTRANCE	PUSH (%ebx);
+#define RETURN_END	POP (%ebx); ret
+#define RETURN		RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+/* Check whether we should copy backward or forward.  */
+	cmp	%eax, %edx
+	je	L(mm_return)
+	ja	L(mm_len_0_or_more_backward)
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_forward)
+
+	cmpl    $32, %ecx
+	jg	L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_forward):
+	cmpl    $64, %ecx
+	jg	L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_forward):
+	cmpl    $128, %ecx
+	jg	L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(mm_large_page_forward)
+
+	PUSH (%esi)
+	PUSH (%edi)
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+/* Aligning the address of destination.  */
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+
+	leal	64(%edi), %edx
+	andl	$-64, %edx
+
+	movl	%esi, %eax
+	subl	%edi, %eax
+
+	movdqu	(%edx, %eax), %xmm4
+	movdqu	16(%edx, %eax), %xmm5
+	movdqu	32(%edx, %eax), %xmm6
+	movdqu	48(%edx, %eax), %xmm7
+
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, 16(%edi)
+	movdqu	%xmm2, 32(%edi)
+	movdqu	%xmm3, 48(%edi)
+	movdqa	%xmm4, (%edx)
+	movdqa	%xmm5, 16(%edx)
+	movdqa	%xmm6, 32(%edx)
+	movdqa	%xmm7, 48(%edx)
+	addl	$64, %edx
+
+	leal	(%edi, %ecx), %ebx
+	andl	$-64, %ebx
+
+	cmp	%edx, %ebx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_main_loop_forward):
+
+	prefetcht0 128(%edx, %eax)
+
+	movdqu	(%edx, %eax), %xmm0
+	movdqu	16(%edx, %eax), %xmm1
+	movdqu	32(%edx, %eax), %xmm2
+	movdqu	48(%edx, %eax), %xmm3
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 16(%edx)
+	movdqa	%xmm2, 32(%edx)
+	movdqa	%xmm3, 48(%edx)
+	leal	64(%edx), %edx
+	cmp	%edx, %ebx
+	ja	L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+	addl	%edi, %ecx
+	subl	%edx, %ecx
+/* We copied all up till %edx position in the dst.
+	In %ecx now is how many bytes are left to copy.
+	Now we need to advance %esi. */
+	leal	(%edx, %eax), %esi
+
+L(mm_remaining_0_64_bytes_forward):
+	cmp	$32, %ecx
+	ja	L(mm_remaining_33_64_bytes_forward)
+	cmp	$16, %ecx
+	ja	L(mm_remaining_17_32_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return_pop_all)
+
+	cmpb	$8, %cl
+	ja	L(mm_remaining_9_16_bytes_forward)
+	cmpb	$4, %cl
+	.p2align 4,,5
+	ja	L(mm_remaining_5_8_bytes_forward)
+	cmpb	$2, %cl
+	.p2align 4,,1
+	ja	L(mm_remaining_3_4_bytes_forward)
+	movzbl	-1(%esi,%ecx), %eax
+	movzbl	(%esi), %ebx
+	movb	%al, -1(%edx,%ecx)
+	movb	%bl, (%edx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_33_64_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	-32(%esi, %ecx), %xmm2
+	movdqu	-16(%esi, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -32(%edx, %ecx)
+	movdqu	%xmm3, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_17_32_bytes_forward):
+	movdqu	(%esi), %xmm0
+	movdqu	-16(%esi, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_3_4_bytes_forward):
+	movzwl	-2(%esi,%ecx), %eax
+	movzwl	(%esi), %ebx
+	movw	%ax, -2(%edx,%ecx)
+	movw	%bx, (%edx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_5_8_bytes_forward):
+	movl	(%esi), %eax
+	movl	-4(%esi,%ecx), %ebx
+	movl	%eax, (%edx)
+	movl	%ebx, -4(%edx,%ecx)
+	jmp	L(mm_return_pop_all)
+
+L(mm_remaining_9_16_bytes_forward):
+	movq	(%esi), %xmm0
+	movq	-8(%esi, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+
+L(mm_len_0_16_bytes_forward):
+	testb	$24, %cl
+	jne	L(mm_len_9_16_bytes_forward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jne	L(mm_len_5_8_bytes_forward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_2_4_bytes_forward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+	movq	(%eax), %xmm0
+	movq	-8(%eax, %ecx), %xmm1
+	movq	%xmm0, (%edx)
+	movq	%xmm1, -8(%edx, %ecx)
+	jmp	L(mm_return)
+
+/* The code for copying backwards.  */
+L(mm_len_0_or_more_backward):
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+	separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl    $32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	-16(%eax, %ecx), %xmm1
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_32_or_more_backward):
+	cmpl    $64, %ecx
+	jg	L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	-16(%eax, %ecx), %xmm2
+	movdqu	-32(%eax, %ecx), %xmm3
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, -16(%edx, %ecx)
+	movdqu	%xmm3, -32(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_64_or_more_backward):
+	cmpl    $128, %ecx
+	jg	L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return.  */
+	movdqu	(%eax), %xmm0
+	movdqu	16(%eax), %xmm1
+	movdqu	32(%eax), %xmm2
+	movdqu	48(%eax), %xmm3
+	movdqu	-64(%eax, %ecx), %xmm4
+	movdqu	-48(%eax, %ecx), %xmm5
+	movdqu	-32(%eax, %ecx), %xmm6
+	movdqu	-16(%eax, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+	cmp     $SHARED_CACHE_SIZE_HALF, %ecx
+	jae     L(mm_large_page_backward)
+
+	PUSH (%esi)
+	PUSH (%edi)
+
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+	movdqu	-16(%eax, %ecx), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+
+	leal	(%edx, %ecx), %edi
+	andl	$-64, %edi
+
+	movl	%eax, %esi
+	subl	%edx, %esi
+
+	movdqu	-16(%edi, %esi), %xmm4
+	movdqu	-32(%edi, %esi), %xmm5
+	movdqu	-48(%edi, %esi), %xmm6
+	movdqu	-64(%edi, %esi), %xmm7
+
+	movdqu	%xmm0, -16(%edx, %ecx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	movdqa	%xmm4, -16(%edi)
+	movdqa	%xmm5, -32(%edi)
+	movdqa	%xmm6, -48(%edi)
+	movdqa	%xmm7, -64(%edi)
+	leal	-64(%edi), %edi
+
+	leal	64(%edx), %ebx
+	andl	$-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+	the main loop stops.  */
+	movl	%ebx, %ecx
+	subl	%edx, %ecx
+
+	cmp	%edi, %ebx
+	jb	L(mm_main_loop_backward)
+
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_main_loop_backward):
+
+	prefetcht0 -128(%edi, %esi)
+
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movdqa	%xmm0, -64(%edi)
+	movdqa	%xmm1, -48(%edi)
+	movdqa	%xmm2, -32(%edi)
+	movdqa	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_main_loop_backward)
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return.  */
+L(mm_len_0_16_bytes_backward):
+	testb	$24, %cl
+	jnz	L(mm_len_9_16_bytes_backward)
+	testb	$4, %cl
+	.p2align 4,,5
+	jnz	L(mm_len_5_8_bytes_backward)
+	testl	%ecx, %ecx
+	.p2align 4,,2
+	je	L(mm_return)
+	testb	$2, %cl
+	.p2align 4,,1
+	jne	L(mm_len_3_4_bytes_backward)
+	movzbl	-1(%eax,%ecx), %ebx
+	movzbl	(%eax), %eax
+	movb	%bl, -1(%edx,%ecx)
+	movb	%al, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+	movzwl	-2(%eax,%ecx), %ebx
+	movzwl	(%eax), %eax
+	movw	%bx, -2(%edx,%ecx)
+	movw	%ax, (%edx)
+	jmp	L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+	PUSH (%esi)
+	movl	-4(%eax,%ecx), %ebx
+	movl	-8(%eax,%ecx), %esi
+	movl	%ebx, -4(%edx,%ecx)
+	movl	%esi, -8(%edx,%ecx)
+	subl	$8, %ecx
+	POP (%esi)
+	jmp	L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+	movl	(%eax), %ebx
+	movl	-4(%eax,%ecx), %eax
+	movl	%ebx, (%edx)
+	movl	%eax, -4(%edx,%ecx)
+
+L(mm_return):
+	movl	%edx, %eax
+	RETURN
+
+L(mm_return_pop_all):
+	movl	%edi, %eax
+	POP (%edi)
+	POP (%esi)
+	RETURN
+
+/* Big length copy forward part.  */
+
+L(mm_large_page_forward):
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+
+	PUSH (%esi)
+	PUSH (%edi)
+	movl	%eax, %esi
+	movl	%edx, %edi
+
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+
+	leal	64(%edi), %edx
+	andl	$-64, %edx
+
+	movl	%esi, %eax
+	subl	%edi, %eax
+
+	movdqu	(%edx, %eax), %xmm4
+	movdqu	16(%edx, %eax), %xmm5
+	movdqu	32(%edx, %eax), %xmm6
+	movdqu	48(%edx, %eax), %xmm7
+
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm1, 16(%edi)
+	movdqu	%xmm2, 32(%edi)
+	movdqu	%xmm3, 48(%edi)
+	movntdq	%xmm4, (%edx)
+	movntdq	%xmm5, 16(%edx)
+	movntdq	%xmm6, 32(%edx)
+	movntdq	%xmm7, 48(%edx)
+	addl	$64, %edx
+
+	leal	(%edi, %ecx), %ebx
+	andl	$-128, %ebx
+
+	cmp	%edx, %ebx
+	jbe	L(mm_copy_remaining_forward)
+
+	.p2align 4
+L(mm_large_page_loop_forward):
+	movdqu	(%edx, %eax), %xmm0
+	movdqu	16(%edx, %eax), %xmm1
+	movdqu	32(%edx, %eax), %xmm2
+	movdqu	48(%edx, %eax), %xmm3
+	movdqu	64(%edx, %eax), %xmm4
+	movdqu	80(%edx, %eax), %xmm5
+	movdqu	96(%edx, %eax), %xmm6
+	movdqu	112(%edx, %eax), %xmm7
+	movntdq	%xmm0, (%edx)
+	movntdq	%xmm1, 16(%edx)
+	movntdq	%xmm2, 32(%edx)
+	movntdq	%xmm3, 48(%edx)
+	movntdq	%xmm4, 64(%edx)
+	movntdq	%xmm5, 80(%edx)
+	movntdq	%xmm6, 96(%edx)
+	movntdq	%xmm7, 112(%edx)
+	leal	128(%edx), %edx
+	cmp	%edx, %ebx
+	ja	L(mm_large_page_loop_forward)
+	sfence
+
+	addl	%edi, %ecx
+	subl	%edx, %ecx
+/* We copied all up till %edx position in the dst.
+	In %ecx now is how many bytes are left to copy.
+	Now we need to advance %esi. */
+	leal	(%edx, %eax), %esi
+
+	cmp	$64, %ecx
+	jb	L(mm_remaining_0_64_bytes_forward)
+
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm1
+	movdqu	32(%esi), %xmm2
+	movdqu	48(%esi), %xmm3
+	movdqu	-64(%esi, %ecx), %xmm4
+	movdqu	-48(%esi, %ecx), %xmm5
+	movdqu	-32(%esi, %ecx), %xmm6
+	movdqu	-16(%esi, %ecx), %xmm7
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm1, 16(%edx)
+	movdqu	%xmm2, 32(%edx)
+	movdqu	%xmm3, 48(%edx)
+	movdqu	%xmm4, -64(%edx, %ecx)
+	movdqu	%xmm5, -48(%edx, %ecx)
+	movdqu	%xmm6, -32(%edx, %ecx)
+	movdqu	%xmm7, -16(%edx, %ecx)
+	jmp	L(mm_return_pop_all)
+
+
+/* Big length copy backward part.  */
+L(mm_large_page_backward):
+/* Aligning the address of destination. We need to save
+	16 bits from the source in order not to overwrite them.  */
+
+	PUSH (%esi)
+	PUSH (%edi)
+
+	movdqu	-16(%eax, %ecx), %xmm0
+	movdqu	-32(%eax, %ecx), %xmm1
+	movdqu	-48(%eax, %ecx), %xmm2
+	movdqu	-64(%eax, %ecx), %xmm3
+
+	leal	(%edx, %ecx), %edi
+	andl	$-64, %edi
+
+	movl	%eax, %esi
+	subl	%edx, %esi
+
+	movdqu	-16(%edi, %esi), %xmm4
+	movdqu	-32(%edi, %esi), %xmm5
+	movdqu	-48(%edi, %esi), %xmm6
+	movdqu	-64(%edi, %esi), %xmm7
+
+	movdqu	%xmm0, -16(%edx, %ecx)
+	movdqu	%xmm1, -32(%edx, %ecx)
+	movdqu	%xmm2, -48(%edx, %ecx)
+	movdqu	%xmm3, -64(%edx, %ecx)
+	movntdq	%xmm4, -16(%edi)
+	movntdq	%xmm5, -32(%edi)
+	movntdq	%xmm6, -48(%edi)
+	movntdq	%xmm7, -64(%edi)
+	leal	-64(%edi), %edi
+
+	leal	128(%edx), %ebx
+	andl	$-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+	the main loop stops.  */
+	movl	%ebx, %ecx
+	subl	%edx, %ecx
+
+	cmp	%edi, %ebx
+	jae	L(mm_len_0_or_more_backward)
+
+	.p2align 4
+L(mm_large_page_loop_backward):
+	movdqu	-64(%edi, %esi), %xmm0
+	movdqu	-48(%edi, %esi), %xmm1
+	movdqu	-32(%edi, %esi), %xmm2
+	movdqu	-16(%edi, %esi), %xmm3
+	movntdq	%xmm0, -64(%edi)
+	movntdq	%xmm1, -48(%edi)
+	movntdq	%xmm2, -32(%edi)
+	movntdq	%xmm3, -16(%edi)
+	leal	-64(%edi), %edi
+	cmp	%edi, %ebx
+	jb	L(mm_large_page_loop_backward)
+	POP (%edi)
+	POP (%esi)
+	jmp	L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
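
The memmove above decides direction up front: if dst == src it returns, if
dst > src it copies backward from the end, otherwise forward. That is the
standard rule that makes overlapping moves safe. In C (illustrative
reference, not the optimized algorithm):

    #include <stddef.h>

    void *ref_memmove(void *dst, const void *src, size_t n) {
        unsigned char *d = dst;
        const unsigned char *s = src;
        if (d != s) {
            if (d > s) {                     /* cmp %eax, %edx; ja ...   */
                while (n--)
                    d[n] = s[n];             /* backward copy            */
            } else {
                for (size_t i = 0; i < n; ++i)
                    d[i] = s[i];             /* forward copy             */
            }
        }
        return dst;
    }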
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
similarity index 80%
copy from libc/arch-x86/string/sse2-memset-atom.S
copy to libc/arch-x86/silvermont/string/sse2-memset-slm.S
index a54bf51..c30bf74 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,10 @@
 */
 
 #include "cache.h"
-#undef __i686
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
 
 #ifndef L
 # define L(label)	.L##label
@@ -61,7 +64,7 @@
 
 #ifndef ENTRY
 # define ENTRY(name)			\
-	.type name,  @function; 	\
+	.type name,  @function;		\
 	.globl name;			\
 	.p2align 4;			\
 name:					\
@@ -107,7 +110,7 @@
    jump table with relative offsets.   */
 # define BRANCH_TO_JMPTBL_ENTRY(TABLE)				\
     /* We first load PC into EBX.  */				\
-    call	__i686.get_pc_thunk.bx;				\
+    call	__x86.get_pc_thunk.bx;				\
     /* Get the address of the jump table.  */			\
     add		$(TABLE - .), %ebx;				\
     /* Get the entry and convert the relative offset to the	\
@@ -117,12 +120,12 @@
     /* We loaded the jump table and adjuested EDX. Go.  */	\
     jmp		*%ebx
 
-	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
-	.globl	__i686.get_pc_thunk.bx
-	.hidden	__i686.get_pc_thunk.bx
+	.section	.gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+	.globl	__x86.get_pc_thunk.bx
+	.hidden	__x86.get_pc_thunk.bx
 	ALIGN (4)
-	.type	__i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+	.type	__x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
 	movl	(%esp), %ebx
 	ret
 #else
@@ -139,16 +142,18 @@
     jmp		*TABLE(,%ecx,4)
 #endif
 
-#ifndef MEMSET
-# define MEMSET memset
-#endif
-
 	.section .text.sse2,"ax",@progbits
 	ALIGN (4)
 ENTRY (MEMSET)
 	ENTRANCE
 
 	movl	LEN(%esp), %ecx
+	cmp	$0, %ecx
+	ja	L(1byteormore)
+	SETRTNVAL
+	RETURN
+
+L(1byteormore):
 #ifdef USE_AS_BZERO
 	xor	%eax, %eax
 #else
@@ -156,147 +161,62 @@
 	movb	%al, %ah
 	/* Fill the whole EAX with pattern.  */
 	movl	%eax, %edx
-	shl	$16, %eax
+	shl	 $16, %eax
 	or	%edx, %eax
 #endif
 	movl	DEST(%esp), %edx
-	cmp	$32, %ecx
-	jae	L(32bytesormore)
+	cmp	$1, %ecx
+	je	L(1byte)
+	cmp	$16, %ecx
+	jae	L(16bytesormore)
 
-L(write_less32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+	cmp	$4, %ecx
+	jb	L(4bytesless)
+	movl	%eax, (%edx)
+	movl	%eax, -4(%edx, %ecx)
+	cmp	$8, %ecx
+	jb	L(8bytesless)
+	movl	%eax, 4(%edx)
+	movl	%eax, -8(%edx, %ecx)
+L(8bytesless):
+	SETRTNVAL
+	RETURN
 
+L(4bytesless):
+	movw	%ax, (%edx)
+	movw	%ax, -2(%edx, %ecx)
+	SETRTNVAL
+	RETURN
 
-	.pushsection .rodata.sse2,"a",@progbits
-	ALIGN (2)
-L(table_less_32bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_32bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_32bytes))
-	.popsection
-
-	ALIGN (4)
-L(write_28bytes):
-	movl	%eax, -28(%edx)
-L(write_24bytes):
-	movl	%eax, -24(%edx)
-L(write_20bytes):
-	movl	%eax, -20(%edx)
-L(write_16bytes):
-	movl	%eax, -16(%edx)
-L(write_12bytes):
-	movl	%eax, -12(%edx)
-L(write_8bytes):
-	movl	%eax, -8(%edx)
-L(write_4bytes):
-	movl	%eax, -4(%edx)
-L(write_0bytes):
+L(1byte):
+	movb	%al, (%edx)
 	SETRTNVAL
 	RETURN
 
 	ALIGN (4)
-L(write_29bytes):
-	movl	%eax, -29(%edx)
-L(write_25bytes):
-	movl	%eax, -25(%edx)
-L(write_21bytes):
-	movl	%eax, -21(%edx)
-L(write_17bytes):
-	movl	%eax, -17(%edx)
-L(write_13bytes):
-	movl	%eax, -13(%edx)
-L(write_9bytes):
-	movl	%eax, -9(%edx)
-L(write_5bytes):
-	movl	%eax, -5(%edx)
-L(write_1bytes):
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_30bytes):
-	movl	%eax, -30(%edx)
-L(write_26bytes):
-	movl	%eax, -26(%edx)
-L(write_22bytes):
-	movl	%eax, -22(%edx)
-L(write_18bytes):
-	movl	%eax, -18(%edx)
-L(write_14bytes):
-	movl	%eax, -14(%edx)
-L(write_10bytes):
-	movl	%eax, -10(%edx)
-L(write_6bytes):
-	movl	%eax, -6(%edx)
-L(write_2bytes):
-	movw	%ax, -2(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-L(write_31bytes):
-	movl	%eax, -31(%edx)
-L(write_27bytes):
-	movl	%eax, -27(%edx)
-L(write_23bytes):
-	movl	%eax, -23(%edx)
-L(write_19bytes):
-	movl	%eax, -19(%edx)
-L(write_15bytes):
-	movl	%eax, -15(%edx)
-L(write_11bytes):
-	movl	%eax, -11(%edx)
-L(write_7bytes):
-	movl	%eax, -7(%edx)
-L(write_3bytes):
-	movw	%ax, -3(%edx)
-	movb	%al, -1(%edx)
-	SETRTNVAL
-	RETURN
-
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
-L(32bytesormore):
-	/* Fill xmm0 with the pattern.  */
+L(16bytesormore):
 #ifdef USE_AS_BZERO
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
+
+	cmp	$64, %ecx
+	ja	L(64bytesmore)
+	movdqu	%xmm0, (%edx)
+	movdqu	%xmm0, -16(%edx, %ecx)
+	cmp	$32, %ecx
+	jbe	L(32bytesless)
+	movdqu	%xmm0, 16(%edx)
+	movdqu	%xmm0, -32(%edx, %ecx)
+L(32bytesless):
+	SETRTNVAL
+	RETURN
+
+L(64bytesmore):
 	testl	$0xf, %edx
 	jz	L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned.  */
 L(not_aligned_16):
 	movdqu	%xmm0, (%edx)
 	movl	%edx, %eax
@@ -321,71 +241,73 @@
 	mov	$SHARED_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	mov	__x86_shared_cache_size@GOTOFF(%ebx), %ebx
+	mov	$__x86_shared_cache_size@GOTOFF(%ebx), %ebx
 # else
 	PUSH (%ebx)
-	mov	__x86_shared_cache_size, %ebx
+	mov	$__x86_shared_cache_size, %ebx
 # endif
 #endif
 	cmp	%ebx, %ecx
 	jae	L(128bytesormore_nt_start)
 
+	POP (%ebx)
 
 #ifdef DATA_CACHE_SIZE
-	POP (%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
-	cmp	$DATA_CACHE_SIZE, %ecx
+	PUSH (%ebx)
+	mov	$DATA_CACHE_SIZE, %ebx
 #else
 # if (defined SHARED || defined __PIC__)
-#  define RESTORE_EBX_STATE
-	call	__i686.get_pc_thunk.bx
+	call	__x86.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
+	mov	$__x86_data_cache_size@GOTOFF(%ebx), %ebx
 # else
-	POP (%ebx)
-#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
-	cmp	__x86_data_cache_size, %ecx
+	PUSH (%ebx)
+	mov	$__x86_data_cache_size, %ebx
 # endif
 #endif
 
+	cmp	%ebx, %ecx
 	jae	L(128bytes_L2_normal)
 	subl	$128, %ecx
 L(128bytesormore_normal):
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
 	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
 	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
-	add	$128, %ecx
+	lea	128(%ecx), %ecx
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+	POP (%ebx)
+#endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
 	ALIGN (4)
 L(128bytes_L2_normal):
-	prefetcht0	0x380(%edx)
-	prefetcht0	0x3c0(%edx)
+	prefetchnta	0x380(%edx)
+	prefetchnta	0x3c0(%edx)
 	sub	$128, %ecx
 	movdqa	%xmm0, (%edx)
 	movaps	%xmm0, 0x10(%edx)
@@ -400,28 +322,26 @@
 	jae	L(128bytes_L2_normal)
 
 L(128bytesless_L2_normal):
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+	POP (%ebx)
+#endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
-	RESTORE_EBX_STATE
 L(128bytesormore_nt_start):
 	sub	%ebx, %ecx
-	mov	%ebx, %eax
-	and	$0x7f, %eax
-	add	%eax, %ecx
-	movd	%xmm0, %eax
 	ALIGN (4)
 L(128bytesormore_shared_cache_loop):
-	prefetcht0	0x3c0(%edx)
-	prefetcht0	0x380(%edx)
+	prefetchnta	0x3c0(%edx)
+	prefetchnta	0x380(%edx)
 	sub	$0x80, %ebx
 	movdqa	%xmm0, (%edx)
-	movdqa	%xmm0, 0x10(%edx)
-	movdqa	%xmm0, 0x20(%edx)
-	movdqa	%xmm0, 0x30(%edx)
-	movdqa	%xmm0, 0x40(%edx)
-	movdqa	%xmm0, 0x50(%edx)
-	movdqa	%xmm0, 0x60(%edx)
-	movdqa	%xmm0, 0x70(%edx)
+	movaps	%xmm0, 0x10(%edx)
+	movaps	%xmm0, 0x20(%edx)
+	movaps	%xmm0, 0x30(%edx)
+	movaps	%xmm0, 0x40(%edx)
+	movaps	%xmm0, 0x50(%edx)
+	movaps	%xmm0, 0x60(%edx)
+	movaps	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ebx
 	jae	L(128bytesormore_shared_cache_loop)
@@ -443,7 +363,7 @@
 	jae	L(128bytesormore_nt)
 	sfence
 L(shared_cache_loop_end):
-#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
 	POP (%ebx)
 #endif
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
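
The memset rewrite above replaces the 32-entry jump table for short lengths
with a handful of compares plus overlapping stores, and moves the streaming
path from prefetcht0 to prefetchnta. The overlapping-store idea for
4..15-byte fills, in C (illustrative sketch):

    #include <stdint.h>
    #include <string.h>

    /* pattern is the fill byte replicated into all four bytes of a word.
       Stores from both ends may overlap; every byte still gets the fill. */
    void set_4_to_15(uint8_t *p, uint32_t pattern, size_t n) {
        memcpy(p, &pattern, 4);              /* movl %eax, (%edx)        */
        memcpy(p + n - 4, &pattern, 4);      /* movl %eax, -4(%edx,%ecx) */
        if (n >= 8) {
            memcpy(p + 4, &pattern, 4);      /* movl %eax, 4(%edx)       */
            memcpy(p + n - 8, &pattern, 4);  /* movl %eax, -8(%edx,%ecx) */
        }
    }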
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
index 9d0a563..5c43fa5
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "sse2-strcpy-slm.S"
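
The new file above is the entire stpcpy implementation: it parameterizes
the shared strcpy body through the preprocessor instead of duplicating it.
The same pattern rendered in C, with hypothetical file names:

    /* str-template.c (hypothetical): shared body, parameterized by the includer. */
    #ifndef STRCPY
    # define STRCPY strcpy
    #endif

    char *STRCPY(char *dst, const char *src) {
        char *d = dst;
        while ((*d = *src++) != '\0')
            d++;
    #ifdef USE_AS_STPCPY
        return d;    /* stpcpy returns a pointer to the terminating NUL */
    #else
        return dst;  /* strcpy returns the destination */
    #endif
    }

    /* stpcpy.c (hypothetical): the analogue of sse2-stpcpy-slm.S above. */
    #define USE_AS_STPCPY
    #define STRCPY stpcpy
    #include "str-template.c"
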
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
index 9d0a563..af5c0d3 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
@@ -1,42 +1,34 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY stpncpy
+#include "sse2-strcpy-slm.S"
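
stpncpy additionally defines USE_AS_STRNCPY, so the shared body also picks
up the length-limited and zero-fill paths. What the stp* variants buy
callers is the end pointer, which makes repeated appends linear instead of
quadratic; a small usage sketch (on glibc the prototype needs _GNU_SOURCE
or POSIX.1-2008):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        char buf[16];
        char *end = stpcpy(buf, "abc");  /* end points at the NUL, buf + 3 */
        end = stpcpy(end, "def");        /* append without rescanning "abc" */
        printf("%s %zu\n", buf, (size_t)(end - buf)); /* prints: abcdef 6 */
        return 0;
    }
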
diff --git a/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
new file mode 100755
index 0000000..b5d84b5
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
@@ -0,0 +1,2157 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name, @function;   \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name, .-name
+#endif
+
+#define CFI_PUSH(REG)                  \
+	cfi_adjust_cfa_offset (4);     \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)                   \
+	cfi_adjust_cfa_offset (-4);    \
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifndef STRCPY
+# define STRCPY  strcpy
+#endif
+
+#ifdef USE_AS_STPNCPY
+# define USE_AS_STRNCPY
+# define USE_AS_STPCPY
+#endif
+
+#ifdef USE_AS_STRNCPY
+# define PARMS  16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+#else
+# define PARMS  12
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN  POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+#endif
+
+#define STR1  PARMS
+#define STR2  STR1+4
+#define LEN  STR2+4
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into ECX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)            \
+	/* We first load PC into ECX.  */                       \
+	call	__x86.get_pc_thunk.cx;                         \
+	/* Get the address of the jump table.  */               \
+	addl	$(TABLE - .), %ecx;                             \
+	/* Get the entry and convert the relative offset to the \
+	   absolute address.  */                                \
+	addl	(%ecx,INDEX,SCALE), %ecx;                       \
+	/* We loaded the jump table and adjusted ECX. Go.  */   \
+	jmp	*%ecx
+#else
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
+
+.text
+ENTRY (STRCPY)
+	ENTRANCE
+	mov	STR1(%esp), %edi
+	mov	STR2(%esp), %esi
+#ifdef USE_AS_STRNCPY
+	movl	LEN(%esp), %ebx
+	test	%ebx, %ebx
+	jz	L(ExitZero)
+#endif
+
+	mov	%esi, %ecx
+#ifndef USE_AS_STPCPY
+	mov	%edi, %eax      /* save result */
+#endif
+	and	$15, %ecx
+	jz	L(SourceStringAlignmentZero)
+
+	and	$-16, %esi
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	(%esi), %xmm1
+#ifdef USE_AS_STRNCPY
+	add	%ecx, %ebx
+#endif
+	pmovmskb %xmm1, %edx
+	shr	%cl, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+#else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail)
+
+	pcmpeqb	16(%esi), %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+#else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes)
+
+	movdqu	(%esi, %ecx), %xmm1   /* copy 16 bytes */
+	movdqu	%xmm1, (%edi)
+
+	sub	%ecx, %edi
+	mov	%edi, %edx
+	mov	$16, %ecx
+	and	$15, %edx
+	jz	L(Align16Both)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(Unalign16Both):
+	movdqa	(%esi, %ecx), %xmm1
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm4
+	movdqu	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm1
+	movdqu	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm2
+	movdqu	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movaps	16(%esi, %ecx), %xmm3
+	movdqu	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqu	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+#ifdef USE_AS_STRNCPY
+	lea	64+64(%ebx, %edx), %ebx
+#endif
+L(Unaligned64Loop):
+	movaps	(%esi), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movaps	32(%esi), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jnz	L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+	add	$64, %edi
+	add	$64, %esi
+	movdqu	%xmm4, -64(%edi)
+	movaps	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movdqu	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movdqu	%xmm6, -32(%edi)
+	movaps	%xmm3, %xmm6
+	movdqu	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(UnalignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jz	L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16BytesUnaligned_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnaligned_32)
+
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+	movdqu	%xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+#endif
+	movdqu	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$48, %esi
+	add	$48, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+	pxor	%xmm0, %xmm0
+	movdqa	(%esi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$16, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+#else
+	cmp	$17, %ebx
+	jbe	L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1)
+
+	pcmpeqb	16(%esi), %xmm0
+	movdqu	%xmm1, (%edi)
+	pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	cmp	$32, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+#else
+	cmp	$33, %ebx
+	jbe	L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+#endif
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32Bytes1)
+
+	mov	%edi, %edx
+	mov	$16, %ecx
+	and	$15, %edx
+	jnz	L(Unalign16Both)
+
+L(Align16Both):
+	movdqa	(%esi, %ecx), %xmm1
+	movdqa	16(%esi, %ecx), %xmm2
+	movdqa	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$16, %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$48, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm3
+	movdqa	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm4
+	movdqa	%xmm3, (%edi, %ecx)
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm4)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm1
+	movdqa	%xmm4, (%edi, %ecx)
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm1)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm2
+	movdqa	%xmm1, (%edi, %ecx)
+	pcmpeqb	%xmm2, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm2)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	16(%esi, %ecx), %xmm3
+	movdqa	%xmm2, (%edi, %ecx)
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+	lea	16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm3)
+#else
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes)
+#endif
+
+	movdqa	%xmm3, (%edi, %ecx)
+	mov	%esi, %edx
+	lea	16(%esi, %ecx), %esi
+	and	$-0x40, %esi
+	sub	%esi, %edx
+	sub	%edx, %edi
+#ifdef USE_AS_STRNCPY
+	lea	64+64(%ebx, %edx), %ebx
+#endif
+L(Aligned64Loop):
+	movdqa	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movaps	16(%esi), %xmm5
+	movdqa	32(%esi), %xmm3
+	movdqa	%xmm3, %xmm6
+	movaps	48(%esi), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm0, %xmm3
+	pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(AlignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jnz	L(Aligned64Leave)
+
+L(Aligned64Loop_start):
+	add	$64, %esi
+	add	$64, %edi
+	movaps	%xmm4, -64(%edi)
+	movdqa	(%esi), %xmm2
+	movdqa	%xmm2, %xmm4
+	movaps	%xmm5, -48(%edi)
+	movaps	16(%esi), %xmm5
+	pminub	%xmm5, %xmm2
+	movaps	32(%esi), %xmm3
+	movaps	%xmm6, -32(%edi)
+	movdqa	%xmm3, %xmm6
+	movaps	%xmm7, -16(%edi)
+	movaps	48(%esi), %xmm7
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+	sub	$64, %ebx
+	jbe	L(AlignedLeaveCase2OrCase3)
+#endif
+	test	%edx, %edx
+	jz	L(Aligned64Loop_start)
+
+L(Aligned64Leave):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+
+	pcmpeqb	%xmm4, %xmm0
+	pcmpeqb	%xmm5, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes_0)
+	test	%ecx, %ecx
+	jnz	L(CopyFrom1To16Bytes_16)
+
+	pcmpeqb	%xmm6, %xmm0
+	pcmpeqb	%xmm7, %xmm1
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %ecx
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16Bytes_32)
+
+	bsf	%ecx, %edx
+	movdqa	%xmm4, (%edi)
+	movdqa	%xmm5, 16(%edi)
+	movdqa	%xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	48(%edi, %edx), %eax
+#endif
+	movdqa	%xmm7, 48(%edi)
+	add	$15, %ebx
+	sub	%edx, %ebx
+	lea	49(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$48, %esi
+	add	$48, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+/*----------------------------------------------------*/
+
+/* Case1 */
+#ifndef USE_AS_STRNCPY
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+	.p2align 4
+L(CopyFrom1To16BytesTail):
+#ifdef USE_AS_STRNCPY
+	sub	%ecx, %ebx
+#endif
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1):
+	add	$16, %esi
+	add	$16, %edi
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+#endif
+L(CopyFrom1To16BytesTail1):
+	bsf	%edx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes):
+#ifdef USE_AS_STRNCPY
+	sub	%ecx, %ebx
+#endif
+	bsf	%edx, %edx
+	add	%ecx, %esi
+	add	$16, %edx
+	sub	%ecx, %edx
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+	.p2align 4
+L(CopyFrom1To16Bytes_0):
+	bsf	%edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	movdqa	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16Bytes_16):
+	bsf	%ecx, %edx
+	movdqa	%xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+#endif
+	movdqa	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$16, %esi
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16Bytes_32):
+	bsf	%edx, %edx
+	movdqa	%xmm4, (%edi)
+	movdqa	%xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+#endif
+	movdqa	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$32, %esi
+	add	$32, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+	bsf	%edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	movdqu	%xmm4, (%edi)
+	add	$63, %ebx
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+	bsf	%ecx, %edx
+	movdqu	%xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	16(%edi, %edx), %eax
+#endif
+	movdqu	%xmm5, 16(%edi)
+	add	$47, %ebx
+	sub	%edx, %ebx
+	lea	17(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$16, %esi
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+	.p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+	bsf	%edx, %edx
+	movdqu	%xmm4, (%edi)
+	movdqu	%xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+	lea	32(%edi, %edx), %eax
+#endif
+	movdqu	%xmm6, 32(%edi)
+	add	$31, %ebx
+	sub	%edx, %ebx
+	lea	33(%edi, %edx), %edi
+	jmp	L(StrncpyFillTailWithZero)
+#else
+	add	$32, %esi
+	add	$32, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+	.p2align 4
+L(CopyFrom1To16BytesXmm6):
+	movdqa	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm5):
+	movdqa	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm4):
+	movdqa	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm3):
+	movdqa	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm2):
+	movdqa	%xmm2, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmm1):
+	movdqa	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+	movdqu	%xmm6, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+	movdqu	%xmm5, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+	movdqu	%xmm4, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+	movdqu	%xmm3, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+	movdqu	%xmm1, (%edi, %ecx)
+	jmp	L(CopyFrom1To16BytesXmmExit)
+
+	.p2align 4
+L(CopyFrom1To16BytesExit):
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	add	$16, %edx
+	sub	%ecx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3,  Case3 */
+
+	.p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+	add	$16, %ebx
+	add	%ecx, %edi
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To32BytesCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTailCase2)
+	sub	%ecx, %ebx
+	add	%ecx, %esi
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+	add	$16, %edi
+	add	$16, %esi
+	sub	$16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesTail1Case2)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+#endif
+
+/*-----------------------------------------------------------------*/
+	.p2align 4
+L(Exit0):
+#ifdef USE_AS_STPCPY
+	mov	%edi, %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit1):
+	movb	%dh, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$1, %ebx
+	lea	1(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$2, %ebx
+	lea	2(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit3):
+	movw	(%esi), %cx
+	movw	%cx, (%edi)
+	movb	%dh, 2(%edi)
+#ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$3, %ebx
+	lea	3(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$4, %ebx
+	lea	4(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit5):
+	movl	(%esi), %ecx
+	movb	%dh, 4(%edi)
+	movl	%ecx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$5, %ebx
+	lea	5(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$6, %ebx
+	lea	6(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$7, %ebx
+	lea	7(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$8, %ebx
+	lea	8(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit9):
+	movlpd	(%esi), %xmm0
+	movb	%dh, 8(%edi)
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$9, %ebx
+	lea	9(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$10, %ebx
+	lea	10(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$11, %ebx
+	lea	11(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$12, %ebx
+	lea	12(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$13, %ebx
+	lea	13(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$14, %ebx
+	lea	14(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$15, %ebx
+	lea	15(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$16, %ebx
+	lea	16(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit17):
+	movdqu	(%esi), %xmm0
+	xor	%cl, %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$17, %ebx
+	lea	17(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$18, %ebx
+	lea	18(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$19, %ebx
+	lea	19(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$20, %ebx
+	lea	20(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	xor	%dl, %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$21, %ebx
+	lea	21(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$22, %ebx
+	lea	22(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$23, %ebx
+	lea	23(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$24, %ebx
+	lea	24(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	xor	%cl, %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$25, %ebx
+	lea	25(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$26, %ebx
+	lea	26(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$27, %ebx
+	lea	27(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$28, %ebx
+	lea	28(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$29, %ebx
+	lea	29(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$30, %ebx
+	lea	30(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+
+	.p2align 4
+L(Exit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$31, %ebx
+	lea	31(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+	.p2align 4
+L(Exit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+	sub	$32, %ebx
+	lea	32(%edi), %edi
+	jnz	L(StrncpyFillTailWithZero)
+#endif
+	RETURN
+
+#ifdef USE_AS_STRNCPY
+
+	.p2align 4
+L(StrncpyExit1):
+	movb	(%esi), %dl
+	movb	%dl, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	1(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit2):
+	movw	(%esi), %dx
+	movw	%dx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	2(%edi), %eax
+#endif
+	RETURN
+	.p2align 4
+L(StrncpyExit3):
+	movw	(%esi), %cx
+	movb	2(%esi), %dl
+	movw	%cx, (%edi)
+	movb	%dl, 2(%edi)
+#ifdef USE_AS_STPCPY
+	lea	3(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit4):
+	movl	(%esi), %edx
+	movl	%edx, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	4(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit5):
+	movl	(%esi), %ecx
+	movb	4(%esi), %dl
+	movl	%ecx, (%edi)
+	movb	%dl, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	5(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit6):
+	movl	(%esi), %ecx
+	movw	4(%esi), %dx
+	movl	%ecx, (%edi)
+	movw	%dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+	lea	6(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit7):
+	movl	(%esi), %ecx
+	movl	3(%esi), %edx
+	movl	%ecx, (%edi)
+	movl	%edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+	lea	7(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit8):
+	movlpd	(%esi), %xmm0
+	movlpd	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	8(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit9):
+	movlpd	(%esi), %xmm0
+	movb	8(%esi), %dl
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	9(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit10):
+	movlpd	(%esi), %xmm0
+	movw	8(%esi), %dx
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	10(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit11):
+	movlpd	(%esi), %xmm0
+	movl	7(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	11(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit12):
+	movlpd	(%esi), %xmm0
+	movl	8(%esi), %edx
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+	lea	12(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit13):
+	movlpd	(%esi), %xmm0
+	movlpd	5(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+	lea	13(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit14):
+	movlpd	(%esi), %xmm0
+	movlpd	6(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+	lea	14(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit15):
+	movlpd	(%esi), %xmm0
+	movlpd	7(%esi), %xmm1
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+	lea	15(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit16):
+	movdqu	(%esi), %xmm0
+	movdqu	%xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+	lea	16(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit17):
+	movdqu	(%esi), %xmm0
+	movb	16(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movb	%cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	17(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit18):
+	movdqu	(%esi), %xmm0
+	movw	16(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movw	%cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	18(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit19):
+	movdqu	(%esi), %xmm0
+	movl	15(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	19(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit20):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	20(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit21):
+	movdqu	(%esi), %xmm0
+	movl	16(%esi), %ecx
+	movb	20(%esi), %dl
+	movdqu	%xmm0, (%edi)
+	movl	%ecx, 16(%edi)
+	movb	%dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+	lea	21(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit22):
+	movdqu	(%esi), %xmm0
+	movlpd	14(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	22(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit23):
+	movdqu	(%esi), %xmm0
+	movlpd	15(%esi), %xmm3
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	23(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit24):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	24(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit25):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movb	24(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movb	%cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	25(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit26):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movw	24(%esi), %cx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movw	%cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	26(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit27):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	23(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+	lea	27(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit28):
+	movdqu	(%esi), %xmm0
+	movlpd	16(%esi), %xmm2
+	movl	24(%esi), %ecx
+	movdqu	%xmm0, (%edi)
+	movlpd	%xmm2, 16(%edi)
+	movl	%ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+	lea	28(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit29):
+	movdqu	(%esi), %xmm0
+	movdqu	13(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+	lea	29(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit30):
+	movdqu	(%esi), %xmm0
+	movdqu	14(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+	lea	30(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit31):
+	movdqu	(%esi), %xmm0
+	movdqu	15(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+	lea	31(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit32):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+	lea	32(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(StrncpyExit33):
+	movdqu	(%esi), %xmm0
+	movdqu	16(%esi), %xmm2
+	movb	32(%esi), %cl
+	movdqu	%xmm0, (%edi)
+	movdqu	%xmm2, 16(%edi)
+	movb	%cl, 32(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill0):
+	RETURN
+
+	.p2align 4
+L(Fill1):
+	movb	%dl, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill2):
+	movw	%dx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill3):
+	movl	%edx, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill4):
+	movl	%edx, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill5):
+	movl	%edx, (%edi)
+	movb	%dl, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill6):
+	movl	%edx, (%edi)
+	movw	%dx, 4(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill7):
+	movlpd	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill8):
+	movlpd	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(Fill9):
+	movlpd	%xmm0, (%edi)
+	movb	%dl, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill10):
+	movlpd	%xmm0, (%edi)
+	movw	%dx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill11):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 7(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill12):
+	movlpd	%xmm0, (%edi)
+	movl	%edx, 8(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill13):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 5(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill14):
+	movlpd	%xmm0, (%edi)
+	movlpd	%xmm0, 6(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill15):
+	movdqu	%xmm0, -1(%edi)
+	RETURN
+
+	.p2align 4
+L(Fill16):
+	movdqu	%xmm0, (%edi)
+	RETURN
+
+	.p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+	movdqu	%xmm2, (%edi, %ecx)
+
+	.p2align 4
+L(CopyFrom1To16BytesXmmExit):
+	bsf	%edx, %edx
+	add	$15, %ebx
+	add	%ecx, %edi
+#ifdef USE_AS_STPCPY
+	lea	(%edi, %edx), %eax
+#endif
+	sub	%edx, %ebx
+	lea	1(%edi, %edx), %edi
+
+	.p2align 4
+L(StrncpyFillTailWithZero):
+	pxor	%xmm0, %xmm0
+	xor	%edx, %edx
+	sub	$16, %ebx
+	jbe	L(StrncpyFillExit)
+
+	movdqu	%xmm0, (%edi)
+	add	$16, %edi
+
+	mov	%edi, %esi
+	and	$0xf, %esi
+	sub	%esi, %edi
+	add	%esi, %ebx
+	sub	$64, %ebx
+	jb	L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	movdqa	%xmm0, 32(%edi)
+	movdqa	%xmm0, 48(%edi)
+	add	$64, %edi
+	sub	$64, %ebx
+	jae	L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+	add	$32, %ebx
+	jl	L(StrncpyFillLess32)
+	movdqa	%xmm0, (%edi)
+	movdqa	%xmm0, 16(%edi)
+	add	$32, %edi
+	sub	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+	add	$16, %ebx
+	jl	L(StrncpyFillExit)
+	movdqa	%xmm0, (%edi)
+	add	$16, %edi
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+	add	$16, %ebx
+	BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+	.p2align 4
+L(AlignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Aligned64LeaveCase2)
+L(Aligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqa	%xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Aligned64LeaveCase2):
+	pxor	%xmm0, %xmm0
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqa	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(UnalignedLeaveCase2OrCase3):
+	test	%edx, %edx
+	jnz	L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+	lea	64(%ebx), %ecx
+	and	$-16, %ecx
+	add	$48, %ebx
+	jl	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm4, (%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm5, 16(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm6, 32(%edi)
+	sub	$16, %ebx
+	jb	L(CopyFrom1To16BytesCase3)
+	movdqu	%xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+	lea	64(%edi), %eax
+#endif
+	RETURN
+
+	.p2align 4
+L(Unaligned64LeaveCase2):
+	pxor	%xmm0, %xmm0
+	xor	%ecx, %ecx
+	pcmpeqb	%xmm4, %xmm0
+	pmovmskb %xmm0, %edx
+	add	$48, %ebx
+	jle	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm4)
+
+	pcmpeqb	%xmm5, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm4, (%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm5)
+
+	pcmpeqb	%xmm6, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm5, 16(%edi)
+	add	$16, %ecx
+	sub	$16, %ebx
+	jbe	L(CopyFrom1To16BytesCase2OrCase3)
+	test	%edx, %edx
+	jnz	L(CopyFrom1To16BytesUnalignedXmm6)
+
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb %xmm0, %edx
+	movdqu	%xmm6, 32(%edi)
+	lea	16(%edi, %ecx), %edi
+	lea	16(%esi, %ecx), %esi
+	bsf	%edx, %edx
+	cmp	%ebx, %edx
+	jb	L(CopyFrom1To16BytesExit)
+	BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+	.p2align 4
+L(ExitZero):
+	movl	%edi, %eax
+	RETURN
+#endif
+
+END (STRCPY)
+
+	.p2align 4
+	.section .rodata
+L(ExitTable):
+	.int	JMPTBL(L(Exit1), L(ExitTable))
+	.int	JMPTBL(L(Exit2), L(ExitTable))
+	.int	JMPTBL(L(Exit3), L(ExitTable))
+	.int	JMPTBL(L(Exit4), L(ExitTable))
+	.int	JMPTBL(L(Exit5), L(ExitTable))
+	.int	JMPTBL(L(Exit6), L(ExitTable))
+	.int	JMPTBL(L(Exit7), L(ExitTable))
+	.int	JMPTBL(L(Exit8), L(ExitTable))
+	.int	JMPTBL(L(Exit9), L(ExitTable))
+	.int	JMPTBL(L(Exit10), L(ExitTable))
+	.int	JMPTBL(L(Exit11), L(ExitTable))
+	.int	JMPTBL(L(Exit12), L(ExitTable))
+	.int	JMPTBL(L(Exit13), L(ExitTable))
+	.int	JMPTBL(L(Exit14), L(ExitTable))
+	.int	JMPTBL(L(Exit15), L(ExitTable))
+	.int	JMPTBL(L(Exit16), L(ExitTable))
+	.int	JMPTBL(L(Exit17), L(ExitTable))
+	.int	JMPTBL(L(Exit18), L(ExitTable))
+	.int	JMPTBL(L(Exit19), L(ExitTable))
+	.int	JMPTBL(L(Exit20), L(ExitTable))
+	.int	JMPTBL(L(Exit21), L(ExitTable))
+	.int	JMPTBL(L(Exit22), L(ExitTable))
+	.int    JMPTBL(L(Exit23), L(ExitTable))
+	.int	JMPTBL(L(Exit24), L(ExitTable))
+	.int	JMPTBL(L(Exit25), L(ExitTable))
+	.int	JMPTBL(L(Exit26), L(ExitTable))
+	.int	JMPTBL(L(Exit27), L(ExitTable))
+	.int	JMPTBL(L(Exit28), L(ExitTable))
+	.int	JMPTBL(L(Exit29), L(ExitTable))
+	.int	JMPTBL(L(Exit30), L(ExitTable))
+	.int	JMPTBL(L(Exit31), L(ExitTable))
+	.int	JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+	.int	JMPTBL(L(Exit0), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+	.int    JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+	.int	JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+	.p2align 4
+L(FillTable):
+	.int	JMPTBL(L(Fill0), L(FillTable))
+	.int	JMPTBL(L(Fill1), L(FillTable))
+	.int	JMPTBL(L(Fill2), L(FillTable))
+	.int	JMPTBL(L(Fill3), L(FillTable))
+	.int	JMPTBL(L(Fill4), L(FillTable))
+	.int	JMPTBL(L(Fill5), L(FillTable))
+	.int	JMPTBL(L(Fill6), L(FillTable))
+	.int	JMPTBL(L(Fill7), L(FillTable))
+	.int	JMPTBL(L(Fill8), L(FillTable))
+	.int	JMPTBL(L(Fill9), L(FillTable))
+	.int	JMPTBL(L(Fill10), L(FillTable))
+	.int	JMPTBL(L(Fill11), L(FillTable))
+	.int	JMPTBL(L(Fill12), L(FillTable))
+	.int	JMPTBL(L(Fill13), L(FillTable))
+	.int	JMPTBL(L(Fill14), L(FillTable))
+	.int	JMPTBL(L(Fill15), L(FillTable))
+	.int	JMPTBL(L(Fill16), L(FillTable))
+#endif
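
The hot loops above, L(Aligned64Loop) and L(Unaligned64Loop), fold four
16-byte chunks with pminub before a single pcmpeqb/pmovmskb, exploiting
the fact that an unsigned byte minimum is zero exactly when one of its
inputs is zero: one mask test covers all 64 bytes. A minimal sketch of
that idiom with SSE2 intrinsics (the helper name is illustrative):

    #include <emmintrin.h>  /* SSE2 */

    /* Nonzero iff any of the 64 bytes at 16-byte-aligned p is zero;
       bit i of the result corresponds to byte i of the folded minimum. */
    static int has_zero_byte_64(const char *p) {
        __m128i a = _mm_load_si128((const __m128i *)(p +  0));
        __m128i b = _mm_load_si128((const __m128i *)(p + 16));
        __m128i c = _mm_load_si128((const __m128i *)(p + 32));
        __m128i d = _mm_load_si128((const __m128i *)(p + 48));
        __m128i m = _mm_min_epu8(_mm_min_epu8(a, b), _mm_min_epu8(c, d));
        __m128i z = _mm_cmpeq_epi8(m, _mm_setzero_si128());
        return _mm_movemask_epi8(z);
    }
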
diff --git a/libc/arch-x86/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
new file mode 100755
index 0000000..27cc025
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
@@ -0,0 +1,328 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name,  @function;  \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name,	.-name
+#endif
+
+#define CFI_PUSH(REG)                   \
+	cfi_adjust_cfa_offset (4);      \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)                    \
+	cfi_adjust_cfa_offset (-4);     \
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+	.section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+	mov	4(%esp), %edx
+	mov	%edx, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%edx), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit_less16)
+	mov	%edx, %eax
+	and	$-16, %eax
+	jmp	L(align16_start)
+L(next):
+	mov	%edx, %eax
+	and	$-16, %eax
+	PUSH	(%edi)
+	pcmpeqb	(%eax), %xmm0
+	mov	$-1, %edi
+	sub	%eax, %ecx
+	shl	%cl, %edi
+	pmovmskb %xmm0, %ecx
+	and	%edi, %ecx
+	POP	(%edi)
+	jnz	L(exit_unaligned)
+	pxor	%xmm0, %xmm0
+L(align16_start):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%eax), %xmm0
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$64, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit64)
+
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%eax), %xmm0
+	add	$80, %eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm1
+	add	$16, %eax
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm2
+	add	$16, %eax
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	test	$0x3f, %eax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%eax), %xmm3
+	add	$16, %eax
+	pmovmskb %xmm3, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit)
+
+	add	$16, %eax
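+
+/* Main loop: fold four 16-byte chunks with pminub.  The unsigned
+   byte-wise minimum contains a 0x00 byte iff at least one chunk does,
+   so a single pcmpeqb/pmovmskb tests 64 bytes per iteration; the code
+   after the loop re-scans the four chunks to locate the terminator. */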
+	.p2align 4
+L(align64_loop):
+	movaps	(%eax),	%xmm4
+	pminub	16(%eax), 	%xmm4
+	movaps	32(%eax), 	%xmm5
+	pminub	48(%eax), 	%xmm5
+	add	$64, 	%eax
+	pminub	%xmm4,	%xmm5
+	pcmpeqb	%xmm0,	%xmm5
+	pmovmskb %xmm5,	%ecx
+	test	%ecx,	%ecx
+	jz	L(align64_loop)
+
+
+	pcmpeqb	-64(%eax), %xmm0
+	sub	$80, 	%eax
+	pmovmskb %xmm0, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%eax), %xmm1
+	pmovmskb %xmm1, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%eax), %xmm2
+	pmovmskb %xmm2, %ecx
+	test	%ecx, %ecx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%eax), %xmm3
+	pmovmskb %xmm3, %ecx
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$64, %eax
+	ret
+
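+/* Exit paths: %eax holds the base of the 16-byte group containing the
+   terminator.  The length is (%eax - start) + bsf(mask) + the group's
+   offset (16/32/48/64); bsf picks the first set bit of the pcmpeqb
+   match mask, i.e. the first zero byte. */
+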
+	.p2align 4
+L(exit):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	ret
+
+L(exit_less16):
+	bsf	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_unaligned):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit16):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$16, %eax
+	ret
+
+	.p2align 4
+L(exit32):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$32, %eax
+	ret
+
+	.p2align 4
+L(exit48):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$48, %eax
+	ret
+
+	.p2align 4
+L(exit64):
+	sub	%edx, %eax
+	bsf	%ecx, %ecx
+	add	%ecx, %eax
+	add	$64, %eax
+	ret
+
+END (STRLEN)
+
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
index 9d0a563..591419f
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-    * Redistributions of source code must retain the above copyright notice,
-    * this list of conditions and the following disclaimer.
-
-    * Redistributions in binary form must reproduce the above copyright notice,
-    * this list of conditions and the following disclaimer in the documentation
-    * and/or other materials provided with the distribution.
-
-    * Neither the name of Intel Corporation nor the names of its contributors
-    * may be used to endorse or promote products derived from this software
-    * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
new file mode 100755
index 0000000..b302883
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
@@ -0,0 +1,1277 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg)	.cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state	.cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state	.cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)             \
+	.type name, @function;   \
+	.globl name;             \
+	.p2align 4;              \
+name:                            \
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)               \
+	cfi_endproc;             \
+	.size name, .-name
+#endif
+
+#ifndef MEMCMP
+# define MEMCMP	memcmp
+#endif
+
+#define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#define PARMS	4
+#define BLK1	PARMS
+#define BLK2	BLK1 + 4
+#define LEN	BLK2 + 4
+#define RETURN	POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+/* We first load PC into EBX.  */	\
+	call	__x86.get_pc_thunk.bx;	\
+/* Get the address of the jump table.  */	\
+	addl	$(TABLE - .), %ebx;	\
+/* Get the entry and convert the relative offset to the	\
+	absolute	address.  */	\
+	addl	(%ebx,INDEX,SCALE), %ebx;	\
+/* We computed the absolute target address in EBX.  Go.  */	\
+	jmp	*%ebx
+#else
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute addresses.  INDEX is a register containing the index into
+   the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
+
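+/*
+   PIC dispatch: position-independent .rodata cannot hold absolute code
+   addresses, so each table entry stores (label - table).  In rough C
+   terms (an illustrative sketch only):
+
+       char *base = current_pc + (TABLE - here);   // via __x86.get_pc_thunk.bx
+       goto *(base + ((int *)base)[index]);        // relative entry -> address
+
+   32-bit x86 has no PC-relative addressing, hence the get_pc_thunk call.
+*/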
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
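+
+/* A hedged C illustration of the distinction (not part of this file):
+
+       // memcmp: bytes compare as unsigned char, so 0x80 > 0x7f.
+       int r = (unsigned char)a[i] - (unsigned char)b[i];
+       // wmemcmp: whole wchar_t elements compare as signed ints here,
+       // so (wchar_t)0x80000000 < (wchar_t)0x7fffffff.
+
+   Hence the byte paths below use unsigned branches (ja) while the
+   wmemcmp paths use signed ones (jg). */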
+
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+	movl	BLK1(%esp), %eax
+	movl	BLK2(%esp), %edx
+	movl	LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+	shl	$2, %ecx
+	test	%ecx, %ecx
+	jz	L(return0)
+#else
+	cmp	$1, %ecx
+	jbe	L(less1bytes)
+#endif
+
+	pxor	%xmm0, %xmm0
+	cmp	$64, %ecx
+	ja	L(64bytesormore)
+	cmp	$8, %ecx
+
+#ifndef USE_AS_WMEMCMP
+	PUSH	(%ebx)
+	jb	L(less8bytes)
+#else
+	jb	L(less8bytes)
+	PUSH	(%ebx)
+#endif
+
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %bl
+	cmpb	(%edx), %bl
+	jne	L(nonzero)
+
+	mov	1(%eax), %bl
+	cmpb	1(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$2, %ecx
+	jz	L(0bytes)
+
+	mov	2(%eax), %bl
+	cmpb	2(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$3, %ecx
+	jz	L(0bytes)
+
+	mov	3(%eax), %bl
+	cmpb	3(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$4, %ecx
+	jz	L(0bytes)
+
+	mov	4(%eax), %bl
+	cmpb	4(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$5, %ecx
+	jz	L(0bytes)
+
+	mov	5(%eax), %bl
+	cmpb	5(%edx), %bl
+	jne	L(nonzero)
+
+	cmp	$6, %ecx
+	jz	L(0bytes)
+
+	mov	6(%eax), %bl
+	cmpb	6(%edx), %bl
+	je	L(0bytes)
+
+L(nonzero):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(above)
+	neg	%eax
+L(above):
+	ret
+	CFI_PUSH (%ebx)
+#endif
+
+	.p2align 4
+L(0bytes):
+	POP	(%ebx)
+	xor	%eax, %eax
+	ret
+
+#ifdef USE_AS_WMEMCMP
+
+/* For wmemcmp: the N == 1 case. */
+
+	.p2align 4
+L(less8bytes):
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	je	L(return0)
+	mov	$1, %eax
+	jg	L(find_diff_bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(find_diff_bigger):
+	ret
+
+	.p2align 4
+L(return0):
+	xor	%eax, %eax
+	ret
+#endif
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less1bytes):
+	jb	L(0bytesend)
+	movzbl	(%eax), %eax
+	movzbl	(%edx), %edx
+	sub	%edx, %eax
+	ret
+
+	.p2align 4
+L(0bytesend):
+	xor	%eax, %eax
+	ret
+#endif
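+
+/* Main loop: compare 64 bytes per iteration.  After pxor, ptest
+   %xmm2, %xmm0 with %xmm0 == 0 sets CF iff (%xmm2 & ~%xmm0) == 0,
+   i.e. iff the two 16-byte chunks were identical; jnc therefore
+   branches as soon as a differing chunk is found. */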
+	.p2align 4
+L(64bytesormore):
+	PUSH	(%ebx)
+	mov	%ecx, %ebx
+	mov	$64, %ecx
+	sub	$64, %ebx
+L(64bytesormore_loop):
+	movdqu	(%eax), %xmm1
+	movdqu	(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_16diff)
+
+	movdqu	16(%eax), %xmm1
+	movdqu	16(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_32diff)
+
+	movdqu	32(%eax), %xmm1
+	movdqu	32(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_48diff)
+
+	movdqu	48(%eax), %xmm1
+	movdqu	48(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(find_64diff)
+	add	%ecx, %eax
+	add	%ecx, %edx
+	sub	%ecx, %ebx
+	jae	L(64bytesormore_loop)
+	add	%ebx, %ecx
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifdef USE_AS_WMEMCMP
+
+/* This label is only needed to fill table_64bytes. */
+L(unreal_case):
+/* no code here */
+
+#endif
+	.p2align 4
+L(find_16diff):
+	sub	$16, %ecx
+L(find_32diff):
+	sub	$16, %ecx
+L(find_48diff):
+	sub	$16, %ecx
+L(find_64diff):
+	add	%ecx, %edx
+	add	%ecx, %eax
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#else
+	.p2align 4
+L(16bytes):
+	mov	-16(%eax), %ecx
+	cmp	-16(%edx), %ecx
+	jne	L(find_diff)
+L(12bytes):
+	mov	-12(%eax), %ecx
+	cmp	-12(%edx), %ecx
+	jne	L(find_diff)
+L(8bytes):
+	mov	-8(%eax), %ecx
+	cmp	-8(%edx), %ecx
+	jne	L(find_diff)
+L(4bytes):
+	mov	-4(%eax), %ecx
+	cmp	-4(%edx), %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#endif
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(49bytes):
+	movdqu	-49(%eax), %xmm1
+	movdqu	-49(%edx), %xmm2
+	mov	$-49, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(33bytes):
+	movdqu	-33(%eax), %xmm1
+	movdqu	-33(%edx), %xmm2
+	mov	$-33, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(17bytes):
+	mov	-17(%eax), %ecx
+	mov	-17(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(13bytes):
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(9bytes):
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(5bytes):
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(50bytes):
+	mov	$-50, %ebx
+	movdqu	-50(%eax), %xmm1
+	movdqu	-50(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(34bytes):
+	mov	$-34, %ebx
+	movdqu	-34(%eax), %xmm1
+	movdqu	-34(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(18bytes):
+	mov	-18(%eax), %ecx
+	mov	-18(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(14bytes):
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(10bytes):
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(6bytes):
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(2bytes):
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(51bytes):
+	mov	$-51, %ebx
+	movdqu	-51(%eax), %xmm1
+	movdqu	-51(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(35bytes):
+	mov	$-35, %ebx
+	movdqu	-35(%eax), %xmm1
+	movdqu	-35(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(19bytes):
+	movl	-19(%eax), %ecx
+	movl	-19(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(15bytes):
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(11bytes):
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(7bytes):
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+L(3bytes):
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+L(1bytes):
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(52bytes):
+	movdqu	-52(%eax), %xmm1
+	movdqu	-52(%edx), %xmm2
+	mov	$-52, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(36bytes):
+	movdqu	-36(%eax), %xmm1
+	movdqu	-36(%edx), %xmm2
+	mov	$-36, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(20bytes):
+	movdqu	-20(%eax), %xmm1
+	movdqu	-20(%edx), %xmm2
+	mov	$-20, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(53bytes):
+	movdqu	-53(%eax), %xmm1
+	movdqu	-53(%edx), %xmm2
+	mov	$-53, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(37bytes):
+	mov	$-37, %ebx
+	movdqu	-37(%eax), %xmm1
+	movdqu	-37(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(21bytes):
+	mov	$-21, %ebx
+	movdqu	-21(%eax), %xmm1
+	movdqu	-21(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(54bytes):
+	movdqu	-54(%eax), %xmm1
+	movdqu	-54(%edx), %xmm2
+	mov	$-54, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(38bytes):
+	mov	$-38, %ebx
+	movdqu	-38(%eax), %xmm1
+	movdqu	-38(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(22bytes):
+	mov	$-22, %ebx
+	movdqu	-22(%eax), %xmm1
+	movdqu	-22(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(55bytes):
+	movdqu	-55(%eax), %xmm1
+	movdqu	-55(%edx), %xmm2
+	mov	$-55, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(39bytes):
+	mov	$-39, %ebx
+	movdqu	-39(%eax), %xmm1
+	movdqu	-39(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(23bytes):
+	mov	$-23, %ebx
+	movdqu	-23(%eax), %xmm1
+	movdqu	-23(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(56bytes):
+	movdqu	-56(%eax), %xmm1
+	movdqu	-56(%edx), %xmm2
+	mov	$-56, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(40bytes):
+	mov	$-40, %ebx
+	movdqu	-40(%eax), %xmm1
+	movdqu	-40(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(24bytes):
+	mov	$-24, %ebx
+	movdqu	-24(%eax), %xmm1
+	movdqu	-24(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(57bytes):
+	movdqu	-57(%eax), %xmm1
+	movdqu	-57(%edx), %xmm2
+	mov	$-57, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(41bytes):
+	mov	$-41, %ebx
+	movdqu	-41(%eax), %xmm1
+	movdqu	-41(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(25bytes):
+	mov	$-25, %ebx
+	movdqu	-25(%eax), %xmm1
+	movdqu	-25(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(58bytes):
+	movdqu	-58(%eax), %xmm1
+	movdqu	-58(%edx), %xmm2
+	mov	$-58, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(42bytes):
+	mov	$-42, %ebx
+	movdqu	-42(%eax), %xmm1
+	movdqu	-42(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(26bytes):
+	mov	$-26, %ebx
+	movdqu	-26(%eax), %xmm1
+	movdqu	-26(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(59bytes):
+	movdqu	-59(%eax), %xmm1
+	movdqu	-59(%edx), %xmm2
+	mov	$-59, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(43bytes):
+	mov	$-43, %ebx
+	movdqu	-43(%eax), %xmm1
+	movdqu	-43(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(27bytes):
+	mov	$-27, %ebx
+	movdqu	-27(%eax), %xmm1
+	movdqu	-27(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+	.p2align 4
+L(60bytes):
+	movdqu	-60(%eax), %xmm1
+	movdqu	-60(%edx), %xmm2
+	mov	$-60, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(44bytes):
+	mov	$-44, %ebx
+	movdqu	-44(%eax), %xmm1
+	movdqu	-44(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(28bytes):
+	mov	$-28, %ebx
+	movdqu	-28(%eax), %xmm1
+	movdqu	-28(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-12(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(61bytes):
+	movdqu	-61(%eax), %xmm1
+	movdqu	-61(%edx), %xmm2
+	mov	$-61, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(45bytes):
+	mov	$-45, %ebx
+	movdqu	-45(%eax), %xmm1
+	movdqu	-45(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(29bytes):
+	mov	$-29, %ebx
+	movdqu	-29(%eax), %xmm1
+	movdqu	-29(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-13(%eax), %ecx
+	mov	-13(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-9(%eax), %ecx
+	mov	-9(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	-5(%eax), %ecx
+	mov	-5(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzbl	-1(%eax), %ecx
+	cmp	-1(%edx), %cl
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(62bytes):
+	movdqu	-62(%eax), %xmm1
+	movdqu	-62(%edx), %xmm2
+	mov	$-62, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(46bytes):
+	mov	$-46, %ebx
+	movdqu	-46(%eax), %xmm1
+	movdqu	-46(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(30bytes):
+	mov	$-30, %ebx
+	movdqu	-30(%eax), %xmm1
+	movdqu	-30(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+	mov	-14(%eax), %ecx
+	mov	-14(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-10(%eax), %ecx
+	mov	-10(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	mov	-6(%eax), %ecx
+	mov	-6(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-2(%eax), %ecx
+	movzwl	-2(%edx), %ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bh, %ch
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+
+	.p2align 4
+L(63bytes):
+	movdqu	-63(%eax), %xmm1
+	movdqu	-63(%edx), %xmm2
+	mov	$-63, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(47bytes):
+	mov	$-47, %ebx
+	movdqu	-47(%eax), %xmm1
+	movdqu	-47(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(31bytes):
+	mov	$-31, %ebx
+	movdqu	-31(%eax), %xmm1
+	movdqu	-31(%edx), %xmm2
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	movl	-15(%eax), %ecx
+	movl	-15(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-11(%eax), %ecx
+	movl	-11(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movl	-7(%eax), %ecx
+	movl	-7(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+	movzwl	-3(%eax), %ecx
+	movzwl	-3(%edx), %ebx
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	movzbl	-1(%eax), %eax
+	cmpb	-1(%edx), %al
+	mov	$0, %eax
+	jne	L(end)
+	RETURN
+#endif
+
+	.p2align 4
+L(64bytes):
+	movdqu	-64(%eax), %xmm1
+	movdqu	-64(%edx), %xmm2
+	mov	$-64, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(48bytes):
+	movdqu	-48(%eax), %xmm1
+	movdqu	-48(%edx), %xmm2
+	mov	$-48, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+L(32bytes):
+	movdqu	-32(%eax), %xmm1
+	movdqu	-32(%edx), %xmm2
+	mov	$-32, %ebx
+	pxor	%xmm1, %xmm2
+	ptest	%xmm2, %xmm0
+	jnc	L(less16bytes)
+
+	mov	-16(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-16(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-16(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-12(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-12(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-8(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-8(%edx), %ecx
+#endif
+	jne	L(find_diff)
+
+	mov	-4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+	mov	-4(%edx), %ebx
+	cmp	%ebx, %ecx
+#else
+	cmp	-4(%edx), %ecx
+#endif
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+
+#ifndef USE_AS_WMEMCMP
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	mov	(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	mov	4(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	mov	8(%edx), %ebx
+	cmp	%ebx, %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	mov	12(%edx), %ebx
+	cmp	%ebx, %ecx
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#else
+	.p2align 4
+L(less16bytes):
+	add	%ebx, %eax
+	add	%ebx, %edx
+
+	mov	(%eax), %ecx
+	cmp	(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	4(%eax), %ecx
+	cmp	4(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	8(%eax), %ecx
+	cmp	8(%edx), %ecx
+	jne	L(find_diff)
+
+	mov	12(%eax), %ecx
+	cmp	12(%edx), %ecx
+
+	mov	$0, %eax
+	jne	L(find_diff)
+	RETURN
+#endif
+
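+/* %ecx (and, for memcmp, %ebx) hold the first differing 4-byte group.
+   Bytes are re-compared lowest address first (low byte, low word, then
+   the high half after shifting right by 16); the flags of the last
+   unequal compare select +1 or -1.  For wmemcmp the signed flags from
+   the element compare are still live, since pop/mov preserve them. */
+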
+	.p2align 4
+L(find_diff):
+#ifndef USE_AS_WMEMCMP
+	cmpb	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+	jne	L(end)
+	shr	$16,%ecx
+	shr	$16,%ebx
+	cmp	%bl, %cl
+	jne	L(end)
+	cmp	%bx, %cx
+L(end):
+	POP	(%ebx)
+	mov	$1, %eax
+	ja	L(bigger)
+	neg	%eax
+L(bigger):
+	ret
+#else
+	POP	(%ebx)
+	mov	$1, %eax
+	jg	L(bigger)
+	neg	%eax
+	ret
+
+	.p2align 4
+L(bigger):
+	ret
+#endif
+END (MEMCMP)
+
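+/* Dispatch table: one entry per remaining length 0..64.  In the
+   wmemcmp build only multiples of 4 are reachable, so the remaining
+   slots are filled with L(unreal_case). */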
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 2
+	.type	L(table_64bytes), @object
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(1bytes), L(table_64bytes))
+	.int	JMPTBL (L(2bytes), L(table_64bytes))
+	.int	JMPTBL (L(3bytes), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(5bytes), L(table_64bytes))
+	.int	JMPTBL (L(6bytes), L(table_64bytes))
+	.int	JMPTBL (L(7bytes), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(9bytes), L(table_64bytes))
+	.int	JMPTBL (L(10bytes), L(table_64bytes))
+	.int	JMPTBL (L(11bytes), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(13bytes), L(table_64bytes))
+	.int	JMPTBL (L(14bytes), L(table_64bytes))
+	.int	JMPTBL (L(15bytes), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(17bytes), L(table_64bytes))
+	.int	JMPTBL (L(18bytes), L(table_64bytes))
+	.int	JMPTBL (L(19bytes), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(21bytes), L(table_64bytes))
+	.int	JMPTBL (L(22bytes), L(table_64bytes))
+	.int	JMPTBL (L(23bytes), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(25bytes), L(table_64bytes))
+	.int	JMPTBL (L(26bytes), L(table_64bytes))
+	.int	JMPTBL (L(27bytes), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(29bytes), L(table_64bytes))
+	.int	JMPTBL (L(30bytes), L(table_64bytes))
+	.int	JMPTBL (L(31bytes), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(33bytes), L(table_64bytes))
+	.int	JMPTBL (L(34bytes), L(table_64bytes))
+	.int	JMPTBL (L(35bytes), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(37bytes), L(table_64bytes))
+	.int	JMPTBL (L(38bytes), L(table_64bytes))
+	.int	JMPTBL (L(39bytes), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(41bytes), L(table_64bytes))
+	.int	JMPTBL (L(42bytes), L(table_64bytes))
+	.int	JMPTBL (L(43bytes), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(45bytes), L(table_64bytes))
+	.int	JMPTBL (L(46bytes), L(table_64bytes))
+	.int	JMPTBL (L(47bytes), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(49bytes), L(table_64bytes))
+	.int	JMPTBL (L(50bytes), L(table_64bytes))
+	.int	JMPTBL (L(51bytes), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(53bytes), L(table_64bytes))
+	.int	JMPTBL (L(54bytes), L(table_64bytes))
+	.int	JMPTBL (L(55bytes), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(57bytes), L(table_64bytes))
+	.int	JMPTBL (L(58bytes), L(table_64bytes))
+	.int	JMPTBL (L(59bytes), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(61bytes), L(table_64bytes))
+	.int	JMPTBL (L(62bytes), L(table_64bytes))
+	.int	JMPTBL (L(63bytes), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+	.int	JMPTBL (L(0bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(4bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(8bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(12bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(16bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(20bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(24bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(28bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(32bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(36bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(40bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(44bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(48bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(52bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(56bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(60bytes), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(unreal_case), L(table_64bytes))
+	.int	JMPTBL (L(64bytes), L(table_64bytes))
+#endif
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
index 9d0a563..2c350bb
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE	(1024*1024)			/* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE	(512*1024)			/* Atom L2 Cache */
-#define DATA_CACHE_SIZE		(24*1024)			/* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF	(SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF	(DATA_CACHE_SIZE / 2)
+#define USE_AS_WMEMCMP
+#define MEMCMP wmemcmp
+#include "sse4-memcmp-slm.S"
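+
+/* The template trick used by these files (sse2-strncpy-slm.S above does
+   the same for strncpy): the generic source keys off USE_AS_* and the
+   entry-name macro, then gets #included, so one implementation yields
+   both memcmp and wmemcmp. */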
diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk
index a1d55f0..34da0ce 100644
--- a/libc/arch-x86/x86.mk
+++ b/libc/arch-x86/x86.mk
@@ -32,60 +32,15 @@
     arch-x86/bionic/syscall.S \
     arch-x86/bionic/vfork.S \
 
-ifeq ($(ARCH_X86_HAVE_SSSE3),true)
-libc_bionic_src_files_x86 += \
-    arch-x86/string/ssse3-memcpy-atom.S \
-    arch-x86/string/ssse3-memmove-atom.S \
-    arch-x86/string/ssse3-bcopy-atom.S \
-    arch-x86/string/ssse3-strncat-atom.S \
-    arch-x86/string/ssse3-strncpy-atom.S \
-    arch-x86/string/ssse3-strlcat-atom.S \
-    arch-x86/string/ssse3-strlcpy-atom.S \
-    arch-x86/string/ssse3-strcmp-atom.S \
-    arch-x86/string/ssse3-strncmp-atom.S \
-    arch-x86/string/ssse3-strcat-atom.S \
-    arch-x86/string/ssse3-strcpy-atom.S \
-    arch-x86/string/ssse3-memcmp-atom.S \
-    arch-x86/string/ssse3-wmemcmp-atom.S \
-    arch-x86/string/ssse3-memcmp16-atom.S \
-    arch-x86/string/ssse3-wcscat-atom.S \
-    arch-x86/string/ssse3-wcscpy-atom.S
-else
-libc_bionic_src_files_x86 += \
-    arch-x86/string/memcpy.S \
-    arch-x86/string/memmove.S \
-    arch-x86/string/bcopy.S \
-    arch-x86/string/strcmp.S \
-    arch-x86/string/strncmp.S \
-    arch-x86/string/strcat.S \
-    arch-x86/string/memcmp.S \
-    bionic/__memcmp16.cpp \
-    upstream-freebsd/lib/libc/string/wcscpy.c \
-    upstream-freebsd/lib/libc/string/wcscat.c \
-    upstream-freebsd/lib/libc/string/wmemcmp.c \
-    upstream-openbsd/lib/libc/string/strcpy.c \
-    upstream-openbsd/lib/libc/string/strlcat.c \
-    upstream-openbsd/lib/libc/string/strlcpy.c \
-    upstream-openbsd/lib/libc/string/strncat.c \
-    upstream-openbsd/lib/libc/string/strncpy.c \
-
+## ARCH variant specific source files
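+# For example, TARGET_ARCH_VARIANT := silvermont picks up
+# arch-x86/silvermont/silvermont.mk; a variant without a makefile of its
+# own (hypothetically, TARGET_ARCH_VARIANT := x86) falls back to
+# arch-x86/generic/generic.mk.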
+arch_variant_mk := $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk
+ifeq ($(wildcard $(arch_variant_mk)),)
+    arch_variant_mk := $(LOCAL_PATH)/arch-x86/generic/generic.mk
 endif
+include $(arch_variant_mk)
+libc_common_additional_dependencies += $(arch_variant_mk)
 
-libc_bionic_src_files_x86 += \
-    arch-x86/string/sse2-memset-atom.S \
-    arch-x86/string/sse2-bzero-atom.S \
-    arch-x86/string/sse2-memchr-atom.S \
-    arch-x86/string/sse2-memrchr-atom.S \
-    arch-x86/string/sse2-strchr-atom.S \
-    arch-x86/string/sse2-strrchr-atom.S \
-    arch-x86/string/sse2-index-atom.S \
-    arch-x86/string/sse2-strlen-atom.S \
-    arch-x86/string/sse2-strnlen-atom.S \
-    arch-x86/string/sse2-wcschr-atom.S \
-    arch-x86/string/sse2-wcsrchr-atom.S \
-    arch-x86/string/sse2-wcslen-atom.S \
-    arch-x86/string/sse2-wcscmp-atom.S \
-
+arch_variant_mk :=
 
 libc_crt_target_cflags_x86 := \
     -m32 \