Merge "Reduce stack usage of tmpfile(3)."
diff --git a/libc/Android.mk b/libc/Android.mk
index b5a8088..d6c0599 100644
--- a/libc/Android.mk
+++ b/libc/Android.mk
@@ -448,8 +448,6 @@
upstream-openbsd/lib/libc/stdlib/strtoull.c \
upstream-openbsd/lib/libc/stdlib/strtoumax.c \
upstream-openbsd/lib/libc/stdlib/system.c \
- upstream-openbsd/lib/libc/string/stpcpy.c \
- upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strcasecmp.c \
upstream-openbsd/lib/libc/string/strcspn.c \
upstream-openbsd/lib/libc/string/strdup.c \
diff --git a/libc/SYSCALLS.TXT b/libc/SYSCALLS.TXT
index 220c713..93ed85c 100644
--- a/libc/SYSCALLS.TXT
+++ b/libc/SYSCALLS.TXT
@@ -61,7 +61,6 @@
int tkill(pid_t tid, int sig) all
int tgkill(pid_t tgid, pid_t tid, int sig) all
int __ptrace:ptrace(int request, int pid, void* addr, void* data) all
-int __set_thread_area:set_thread_area(void* user_desc) mips,mips64,x86
# <sys/resource.h>
int getrusage(int, struct rusage*) all
@@ -331,3 +330,7 @@
# MIPS-specific
int _flush_cache:cacheflush(char* addr, const int nbytes, const int op) mips,mips64
+int __set_tls:set_thread_area(void*) mips,mips64
+
+# x86-specific
+int __set_thread_area:set_thread_area(void*) x86
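Each SYSCALLS.TXT entry has the form `return-type bionic-name[:kernel-name](arguments) architecture-list`, and the build generates an assembler syscall stub named after the bionic side of the pair. Splitting the shared entry therefore makes the MIPS stubs come out of the generator already named __set_tls, while x86 keeps its __set_thread_area stub; that is what allows the hand-written C wrappers deleted below to go away. A minimal sketch of the effect on MIPS (the declarations are illustrative, not actual bionic headers):

    /* Before: a generated stub plus a one-line C wrapper on every call. */
    extern int __set_thread_area(void* user_desc);  /* generated stub */
    int __set_tls(void* ptr) {
      return __set_thread_area(ptr);                /* extra indirection */
    }

    /* After: the generated stub is itself the TLS entry point. */
    extern int __set_tls(void* ptr);                /* generated stub */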
diff --git a/libc/arch-arm/arm.mk b/libc/arch-arm/arm.mk
index 2dbcb56..06b1675 100644
--- a/libc/arch-arm/arm.mk
+++ b/libc/arch-arm/arm.mk
@@ -26,6 +26,8 @@
upstream-freebsd/lib/libc/string/wcsrchr.c \
upstream-freebsd/lib/libc/string/wmemcmp.c \
upstream-openbsd/lib/libc/string/bcopy.c \
+ upstream-openbsd/lib/libc/string/stpcpy.c \
+ upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strlcat.c \
upstream-openbsd/lib/libc/string/strlcpy.c \
upstream-openbsd/lib/libc/string/strncat.c \
diff --git a/libc/arch-arm64/arm64.mk b/libc/arch-arm64/arm64.mk
index 14cc3f4..7a9eb4e 100644
--- a/libc/arch-arm64/arm64.mk
+++ b/libc/arch-arm64/arm64.mk
@@ -15,6 +15,8 @@
upstream-freebsd/lib/libc/string/wcsrchr.c \
upstream-freebsd/lib/libc/string/wmemcmp.c \
upstream-openbsd/lib/libc/string/bcopy.c \
+ upstream-openbsd/lib/libc/string/stpcpy.c \
+ upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strcat.c \
upstream-openbsd/lib/libc/string/strcpy.c \
upstream-openbsd/lib/libc/string/strlcat.c \
diff --git a/libc/arch-mips/bionic/__set_tls.c b/libc/arch-mips/bionic/__set_tls.c
deleted file mode 100644
index 38e3a50..0000000
--- a/libc/arch-mips/bionic/__set_tls.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <pthread.h>
-
-extern int __set_thread_area(void *u_info);
-
-int __set_tls(void *ptr)
-{
- return __set_thread_area(ptr);
-}
diff --git a/libc/arch-mips/mips.mk b/libc/arch-mips/mips.mk
index 53fa223..d7d1df4 100644
--- a/libc/arch-mips/mips.mk
+++ b/libc/arch-mips/mips.mk
@@ -27,6 +27,8 @@
upstream-freebsd/lib/libc/string/wcsrchr.c \
upstream-freebsd/lib/libc/string/wmemcmp.c \
upstream-openbsd/lib/libc/string/bcopy.c \
+ upstream-openbsd/lib/libc/string/stpcpy.c \
+ upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strcat.c \
upstream-openbsd/lib/libc/string/strcmp.c \
upstream-openbsd/lib/libc/string/strcpy.c \
@@ -61,7 +63,6 @@
arch-mips/bionic/memcmp16.S \
arch-mips/bionic/_setjmp.S \
arch-mips/bionic/setjmp.S \
- arch-mips/bionic/__set_tls.c \
arch-mips/bionic/sigsetjmp.S \
arch-mips/bionic/syscall.S \
arch-mips/bionic/vfork.S \
diff --git a/libc/arch-mips/syscalls/__set_thread_area.S b/libc/arch-mips/syscalls/__set_tls.S
similarity index 84%
rename from libc/arch-mips/syscalls/__set_thread_area.S
rename to libc/arch-mips/syscalls/__set_tls.S
index f83249e..e5b0ca2 100644
--- a/libc/arch-mips/syscalls/__set_thread_area.S
+++ b/libc/arch-mips/syscalls/__set_tls.S
@@ -2,7 +2,7 @@
#include <private/bionic_asm.h>
-ENTRY(__set_thread_area)
+ENTRY(__set_tls)
.set noreorder
.cpload t9
li v0, __NR_set_thread_area
@@ -16,4 +16,4 @@
j t9
nop
.set reorder
-END(__set_thread_area)
+END(__set_tls)
diff --git a/libc/arch-mips64/bionic/__set_tls.c b/libc/arch-mips64/bionic/__set_tls.c
deleted file mode 100644
index 38e3a50..0000000
--- a/libc/arch-mips64/bionic/__set_tls.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-#include <pthread.h>
-
-extern int __set_thread_area(void *u_info);
-
-int __set_tls(void *ptr)
-{
- return __set_thread_area(ptr);
-}
diff --git a/libc/arch-mips64/mips64.mk b/libc/arch-mips64/mips64.mk
index 75620d8..b6e0209 100644
--- a/libc/arch-mips64/mips64.mk
+++ b/libc/arch-mips64/mips64.mk
@@ -17,6 +17,8 @@
upstream-freebsd/lib/libc/string/wcsrchr.c \
upstream-freebsd/lib/libc/string/wmemcmp.c \
upstream-openbsd/lib/libc/string/bcopy.c \
+ upstream-openbsd/lib/libc/string/stpcpy.c \
+ upstream-openbsd/lib/libc/string/stpncpy.c \
upstream-openbsd/lib/libc/string/strcat.c \
upstream-openbsd/lib/libc/string/strcmp.c \
upstream-openbsd/lib/libc/string/strcpy.c \
@@ -47,7 +49,6 @@
arch-mips64/bionic/memcmp16.S \
arch-mips64/bionic/_setjmp.S \
arch-mips64/bionic/setjmp.S \
- arch-mips64/bionic/__set_tls.c \
arch-mips64/bionic/sigsetjmp.S \
arch-mips64/bionic/syscall.S \
arch-mips64/bionic/vfork.S \
diff --git a/libc/arch-mips64/syscalls/__set_thread_area.S b/libc/arch-mips64/syscalls/__set_tls.S
similarity index 82%
rename from libc/arch-mips64/syscalls/__set_thread_area.S
rename to libc/arch-mips64/syscalls/__set_tls.S
index c28ee4a..f1c31b4 100644
--- a/libc/arch-mips64/syscalls/__set_thread_area.S
+++ b/libc/arch-mips64/syscalls/__set_tls.S
@@ -2,7 +2,7 @@
#include <private/bionic_asm.h>
-ENTRY(__set_thread_area)
+ENTRY(__set_tls)
.set push
.set noreorder
li v0, __NR_set_thread_area
@@ -22,5 +22,5 @@
j t9
move ra, t0
.set pop
-END(__set_thread_area)
-.hidden __set_thread_area
+END(__set_tls)
+.hidden __set_tls
diff --git a/libc/arch-x86/atom/atom.mk b/libc/arch-x86/atom/atom.mk
new file mode 100644
index 0000000..bf408b4
--- /dev/null
+++ b/libc/arch-x86/atom/atom.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+ arch-x86/atom/string/sse2-bzero-atom.S \
+ arch-x86/atom/string/sse2-index-atom.S \
+ arch-x86/atom/string/sse2-memchr-atom.S \
+ arch-x86/atom/string/sse2-memrchr-atom.S \
+ arch-x86/atom/string/sse2-memset-atom.S \
+ arch-x86/atom/string/sse2-strchr-atom.S \
+ arch-x86/atom/string/sse2-strlen-atom.S \
+ arch-x86/atom/string/sse2-strnlen-atom.S \
+ arch-x86/atom/string/sse2-strrchr-atom.S \
+ arch-x86/atom/string/sse2-wcschr-atom.S \
+ arch-x86/atom/string/sse2-wcsrchr-atom.S \
+ arch-x86/atom/string/sse2-wcslen-atom.S \
+ arch-x86/atom/string/sse2-wcscmp-atom.S \
+ arch-x86/atom/string/ssse3-bcopy-atom.S \
+ arch-x86/atom/string/ssse3-memcmp-atom.S \
+ arch-x86/atom/string/ssse3-memcmp16-atom.S \
+ arch-x86/atom/string/ssse3-memcpy-atom.S \
+ arch-x86/atom/string/ssse3-memmove-atom.S \
+ arch-x86/atom/string/ssse3-strcat-atom.S \
+ arch-x86/atom/string/ssse3-strcmp-atom.S \
+ arch-x86/atom/string/ssse3-strcpy-atom.S \
+ arch-x86/atom/string/ssse3-strlcat-atom.S \
+ arch-x86/atom/string/ssse3-strlcpy-atom.S \
+ arch-x86/atom/string/ssse3-strncat-atom.S \
+ arch-x86/atom/string/ssse3-strncmp-atom.S \
+ arch-x86/atom/string/ssse3-strncpy-atom.S \
+ arch-x86/atom/string/ssse3-wcscat-atom.S \
+ arch-x86/atom/string/ssse3-wcscpy-atom.S \
+ arch-x86/atom/string/ssse3-wmemcmp-atom.S
+
+libc_bionic_src_files_x86 += \
+ arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+ arch-x86/silvermont/string/sse2-stpncpy-slm.S
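This new makefile fragment begins the split of arch-x86/string into per-microarchitecture directories: atom/, generic/, and silvermont/ each get a <variant>.mk that appends the appropriate sources to libc_bionic_src_files_x86, presumably selected by the build system according to the configured x86 architecture variant. Note that even the Atom build pulls in the Silvermont stpcpy/stpncpy implementations (the last two lines above); this matches the Android.mk hunk at the top of the patch, which drops the generic OpenBSD stpcpy/stpncpy in favor of per-architecture versions.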
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/atom/string/cache.h
similarity index 89%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/atom/string/cache.h
index 9d0a563..823bb1e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/atom/string/cache.h
@@ -28,15 +28,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
/* Values are optimized for Atom */
#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
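With the directory split, the compile-time __slm__ switch in cache.h disappears: each variant directory carries its own copy with the constants baked in. The Atom copy above keeps only the 512 KB L2 / 24 KB L1 figures, while the Silvermont copy further down keeps only the 1 MB L2 figures, so each build sees exactly the cache geometry its tuned routines were sized for.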
diff --git a/libc/arch-x86/string/sse2-bzero-atom.S b/libc/arch-x86/atom/string/sse2-bzero-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-bzero-atom.S
rename to libc/arch-x86/atom/string/sse2-bzero-atom.S
diff --git a/libc/arch-x86/string/sse2-index-atom.S b/libc/arch-x86/atom/string/sse2-index-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-index-atom.S
rename to libc/arch-x86/atom/string/sse2-index-atom.S
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/atom/string/sse2-memchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/atom/string/sse2-memrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-memrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-memrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/atom/string/sse2-memset-atom.S
similarity index 98%
rename from libc/arch-x86/string/sse2-memset-atom.S
rename to libc/arch-x86/atom/string/sse2-memset-atom.S
index a54bf51..b0963a1 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/atom/string/sse2-memset-atom.S
@@ -29,7 +29,6 @@
*/
#include "cache.h"
-#undef __i686
#ifndef L
# define L(label) .L##label
@@ -107,7 +106,7 @@
jump table with relative offsets. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
/* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
+ call __x86.get_pc_thunk.bx; \
/* Get the address of the jump table. */ \
add $(TABLE - .), %ebx; \
/* Get the entry and convert the relative offset to the \
@@ -117,12 +116,12 @@
/* We loaded the jump table and adjusted EDX. Go. */ \
jmp *%ebx
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
+ .section .gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+ .globl __x86.get_pc_thunk.bx
+ .hidden __x86.get_pc_thunk.bx
ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+ .type __x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
movl (%esp), %ebx
ret
#else
@@ -321,7 +320,7 @@
mov $SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
- call __i686.get_pc_thunk.bx
+ call __x86.get_pc_thunk.bx
add $_GLOBAL_OFFSET_TABLE_, %ebx
mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
@@ -340,7 +339,7 @@
#else
# if (defined SHARED || defined __PIC__)
# define RESTORE_EBX_STATE
- call __i686.get_pc_thunk.bx
+ call __x86.get_pc_thunk.bx
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
# else
diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/atom/string/sse2-strchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strchr-atom.S
diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/atom/string/sse2-strlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strnlen-atom.S b/libc/arch-x86/atom/string/sse2-strnlen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strnlen-atom.S
rename to libc/arch-x86/atom/string/sse2-strnlen-atom.S
diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/atom/string/sse2-strrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-strrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-strrchr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/atom/string/sse2-wcschr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcschr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcschr-atom.S
diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/atom/string/sse2-wcscmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcscmp-atom.S
rename to libc/arch-x86/atom/string/sse2-wcscmp-atom.S
diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/atom/string/sse2-wcslen-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcslen-atom.S
rename to libc/arch-x86/atom/string/sse2-wcslen-atom.S
diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
similarity index 100%
rename from libc/arch-x86/string/sse2-wcsrchr-atom.S
rename to libc/arch-x86/atom/string/sse2-wcsrchr-atom.S
diff --git a/libc/arch-x86/string/ssse3-bcopy-atom.S b/libc/arch-x86/atom/string/ssse3-bcopy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-bcopy-atom.S
rename to libc/arch-x86/atom/string/ssse3-bcopy-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcmp16-atom.S b/libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memcmp16-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcmp16-atom.S
diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-memcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-memcpy-atom.S
index 1080a38..ac5ec2d 100644
--- a/libc/arch-x86/string/ssse3-memcpy-atom.S
+++ b/libc/arch-x86/atom/string/ssse3-memcpy-atom.S
@@ -29,7 +29,6 @@
*/
#include "cache.h"
-#undef __i686
#ifndef MEMCPY
# define MEMCPY memcpy
@@ -101,9 +100,8 @@
# define RETURN_END POP (%ebx); ret
# define RETURN RETURN_END; CFI_PUSH (%ebx)
# define JMPTBL(I, B) I - B
-# undef __i686
-# define SETUP_PIC_REG(x) call __i686.get_pc_thunk.x
+# define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
jump table with relative offsets. INDEX is a register contains the
diff --git a/libc/arch-x86/string/ssse3-memmove-atom.S b/libc/arch-x86/atom/string/ssse3-memmove-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-memmove-atom.S
rename to libc/arch-x86/atom/string/ssse3-memmove-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/atom/string/ssse3-strcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcmp-atom.S b/libc/arch-x86/atom/string/ssse3-strcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/atom/string/ssse3-strlcat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strlcpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strlcpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncat-atom.S b/libc/arch-x86/atom/string/ssse3-strncat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncat-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncat-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncmp-atom.S b/libc/arch-x86/atom/string/ssse3-strncmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncmp-atom.S
diff --git a/libc/arch-x86/string/ssse3-strncpy-atom.S b/libc/arch-x86/atom/string/ssse3-strncpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-strncpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-strncpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/atom/string/ssse3-wcscat-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscat-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscat-atom.S
diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wcscpy-atom.S
rename to libc/arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/ssse3-wmemcmp-atom.S b/libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
similarity index 100%
rename from libc/arch-x86/string/ssse3-wmemcmp-atom.S
rename to libc/arch-x86/atom/string/ssse3-wmemcmp-atom.S
diff --git a/libc/arch-x86/generic/generic.mk b/libc/arch-x86/generic/generic.mk
new file mode 100644
index 0000000..c8b40ee
--- /dev/null
+++ b/libc/arch-x86/generic/generic.mk
@@ -0,0 +1,55 @@
+libc_bionic_src_files_x86 += \
+ arch-x86/atom/string/sse2-index-atom.S \
+ arch-x86/atom/string/sse2-memchr-atom.S \
+ arch-x86/atom/string/sse2-memrchr-atom.S \
+ arch-x86/atom/string/sse2-strchr-atom.S \
+ arch-x86/atom/string/sse2-strnlen-atom.S \
+ arch-x86/atom/string/sse2-strrchr-atom.S \
+ arch-x86/atom/string/sse2-wcschr-atom.S \
+ arch-x86/atom/string/sse2-wcsrchr-atom.S \
+ arch-x86/atom/string/sse2-wcslen-atom.S \
+ arch-x86/atom/string/sse2-wcscmp-atom.S \
+ arch-x86/silvermont/string/sse2-bcopy-slm.S \
+ arch-x86/silvermont/string/sse2-bzero-slm.S \
+ arch-x86/silvermont/string/sse2-memcpy-slm.S \
+ arch-x86/silvermont/string/sse2-memmove-slm.S \
+ arch-x86/silvermont/string/sse2-memset-slm.S \
+ arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+ arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+ arch-x86/silvermont/string/sse2-strcpy-slm.S \
+ arch-x86/silvermont/string/sse2-strlen-slm.S \
+ arch-x86/silvermont/string/sse2-strncpy-slm.S
+
+ifeq ($(ARCH_X86_HAVE_SSSE3),true)
+libc_bionic_src_files_x86 += \
+ arch-x86/atom/string/ssse3-strncat-atom.S \
+ arch-x86/atom/string/ssse3-strlcat-atom.S \
+ arch-x86/atom/string/ssse3-strlcpy-atom.S \
+ arch-x86/atom/string/ssse3-strcmp-atom.S \
+ arch-x86/atom/string/ssse3-strncmp-atom.S \
+ arch-x86/atom/string/ssse3-strcat-atom.S \
+ arch-x86/atom/string/ssse3-memcmp16-atom.S \
+ arch-x86/atom/string/ssse3-wcscat-atom.S \
+ arch-x86/atom/string/ssse3-wcscpy-atom.S
+else
+libc_bionic_src_files_x86 += \
+ arch-x86/generic/string/strcmp.S \
+ arch-x86/generic/string/strncmp.S \
+ arch-x86/generic/string/strcat.S \
+ bionic/__memcmp16.cpp \
+ upstream-freebsd/lib/libc/string/wcscpy.c \
+ upstream-freebsd/lib/libc/string/wcscat.c \
+ upstream-openbsd/lib/libc/string/strlcat.c \
+ upstream-openbsd/lib/libc/string/strlcpy.c \
+ upstream-openbsd/lib/libc/string/strncat.c
+endif
+
+ifeq ($(ARCH_X86_HAVE_SSE4),true)
+ libc_bionic_src_files_x86 += \
+ arch-x86/silvermont/string/sse4-memcmp-slm.S \
+ arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+else
+libc_bionic_src_files_x86 += \
+ arch-x86/generic/string/memcmp.S \
+ upstream-freebsd/lib/libc/string/wmemcmp.c
+endif
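The generic x86 variant takes the Silvermont SSE2 routines as its baseline (presumably because SSE2 can be assumed on Android x86 targets) and gates the hand-tuned assembly on build-time CPU feature flags: with ARCH_X86_HAVE_SSSE3 unset it falls back to the plain-assembly strcmp/strncmp/strcat and the portable C implementations from upstream FreeBSD and OpenBSD, and with ARCH_X86_HAVE_SSE4 unset it likewise falls back for memcmp and wmemcmp.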
diff --git a/libc/arch-x86/string/bcopy.S b/libc/arch-x86/generic/string/bcopy.S
similarity index 100%
rename from libc/arch-x86/string/bcopy.S
rename to libc/arch-x86/generic/string/bcopy.S
diff --git a/libc/arch-x86/string/memcmp.S b/libc/arch-x86/generic/string/memcmp.S
similarity index 100%
rename from libc/arch-x86/string/memcmp.S
rename to libc/arch-x86/generic/string/memcmp.S
diff --git a/libc/arch-x86/string/memcpy.S b/libc/arch-x86/generic/string/memcpy.S
similarity index 100%
rename from libc/arch-x86/string/memcpy.S
rename to libc/arch-x86/generic/string/memcpy.S
diff --git a/libc/arch-x86/string/memmove.S b/libc/arch-x86/generic/string/memmove.S
similarity index 100%
rename from libc/arch-x86/string/memmove.S
rename to libc/arch-x86/generic/string/memmove.S
diff --git a/libc/arch-x86/string/strcat.S b/libc/arch-x86/generic/string/strcat.S
similarity index 100%
rename from libc/arch-x86/string/strcat.S
rename to libc/arch-x86/generic/string/strcat.S
diff --git a/libc/arch-x86/string/strcmp.S b/libc/arch-x86/generic/string/strcmp.S
similarity index 100%
rename from libc/arch-x86/string/strcmp.S
rename to libc/arch-x86/generic/string/strcmp.S
diff --git a/libc/arch-x86/string/strncmp.S b/libc/arch-x86/generic/string/strncmp.S
similarity index 100%
rename from libc/arch-x86/string/strncmp.S
rename to libc/arch-x86/generic/string/strncmp.S
diff --git a/libc/arch-x86/string/swab.S b/libc/arch-x86/generic/string/swab.S
similarity index 100%
rename from libc/arch-x86/string/swab.S
rename to libc/arch-x86/generic/string/swab.S
diff --git a/libc/arch-x86/silvermont/silvermont.mk b/libc/arch-x86/silvermont/silvermont.mk
new file mode 100644
index 0000000..b951ad5
--- /dev/null
+++ b/libc/arch-x86/silvermont/silvermont.mk
@@ -0,0 +1,34 @@
+libc_bionic_src_files_x86 += \
+ arch-x86/silvermont/string/sse2-bcopy-slm.S \
+ arch-x86/silvermont/string/sse2-bzero-slm.S \
+ arch-x86/silvermont/string/sse2-memcpy-slm.S \
+ arch-x86/silvermont/string/sse2-memmove-slm.S \
+ arch-x86/silvermont/string/sse2-memset-slm.S \
+ arch-x86/silvermont/string/sse2-stpcpy-slm.S \
+ arch-x86/silvermont/string/sse2-stpncpy-slm.S \
+ arch-x86/silvermont/string/sse2-strcpy-slm.S \
+ arch-x86/silvermont/string/sse2-strlen-slm.S \
+ arch-x86/silvermont/string/sse2-strncpy-slm.S \
+ arch-x86/silvermont/string/sse4-memcmp-slm.S \
+ arch-x86/silvermont/string/sse4-wmemcmp-slm.S
+
+libc_bionic_src_files_x86 += \
+ arch-x86/atom/string/sse2-memchr-atom.S \
+ arch-x86/atom/string/sse2-memrchr-atom.S \
+ arch-x86/atom/string/sse2-strchr-atom.S \
+ arch-x86/atom/string/sse2-strrchr-atom.S \
+ arch-x86/atom/string/sse2-index-atom.S \
+ arch-x86/atom/string/sse2-strnlen-atom.S \
+ arch-x86/atom/string/sse2-wcschr-atom.S \
+ arch-x86/atom/string/sse2-wcsrchr-atom.S \
+ arch-x86/atom/string/sse2-wcslen-atom.S \
+ arch-x86/atom/string/sse2-wcscmp-atom.S \
+ arch-x86/atom/string/ssse3-strncat-atom.S \
+ arch-x86/atom/string/ssse3-strlcat-atom.S \
+ arch-x86/atom/string/ssse3-strlcpy-atom.S \
+ arch-x86/atom/string/ssse3-strcmp-atom.S \
+ arch-x86/atom/string/ssse3-strncmp-atom.S \
+ arch-x86/atom/string/ssse3-strcat-atom.S \
+ arch-x86/atom/string/ssse3-memcmp16-atom.S \
+ arch-x86/atom/string/ssse3-wcscat-atom.S \
+ arch-x86/atom/string/ssse3-wcscpy-atom.S
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/cache.h
similarity index 90%
rename from libc/arch-x86/string/cache.h
rename to libc/arch-x86/silvermont/string/cache.h
index 9d0a563..c342b1c 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/cache.h
@@ -28,15 +28,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
/* Values are optimized for Silvermont */
#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
index 9d0a563..190d52f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bcopy-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+#define MEMMOVE bcopy
+#define USE_AS_BCOPY
+#include "sse2-memmove-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-bzero-slm.S
index 9d0a563..b682ed6 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-bzero-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+#define USE_AS_BZERO
+#define MEMSET bzero
+#include "sse2-memset-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..1b305c7
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memcpy-slm.S
@@ -0,0 +1,308 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define DEST PARMS
+#define SRC DEST+4
+#define LEN SRC+4
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 8 /* Preserve EBX. */
+#define ENTRANCE PUSH (%ebx);
+#define RETURN_END POP (%ebx); ret
+#define RETURN RETURN_END; CFI_PUSH (%ebx)
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+ cmp %eax, %edx
+ je L(return)
+
+ cmp $16, %ecx
+ jbe L(len_0_16_bytes)
+
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+ jae L(large_page)
+
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ cmpl $32, %ecx
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 16(%eax), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ cmpl $64, %ecx
+ movdqu %xmm0, 16(%edx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ jbe L(return)
+
+ movdqu 32(%eax), %xmm0
+ movdqu 48(%eax), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+ cmpl $128, %ecx
+ movdqu %xmm0, 32(%edx)
+ movdqu %xmm1, 48(%edx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ jbe L(return)
+
+/* Now the main loop: we align the address of the destination. */
+ leal 64(%edx), %ebx
+ andl $-64, %ebx
+
+ addl %edx, %ecx
+ andl $-64, %ecx
+
+ subl %edx, %eax
+
+/* We should stop two iterations before the end so that the
+ prefetches do not run past the buffer. */
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_just_one_iteration)
+
+ subl $64, %ecx
+ cmpl %ebx, %ecx
+ je L(main_loop_last_two_iterations)
+
+
+ .p2align 4
+L(main_loop_cache):
+
+ prefetcht0 128(%ebx, %eax)
+
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movdqa %xmm1, 16(%ebx)
+ movdqa %xmm2, 32(%ebx)
+ movdqa %xmm3, 48(%ebx)
+ lea 64(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movdqa %xmm0, (%ebx)
+ movdqa %xmm1, 16(%ebx)
+ movdqa %xmm2, 32(%ebx)
+ movdqa %xmm3, 48(%ebx)
+ movdqa %xmm4, 64(%ebx)
+ movdqa %xmm5, 80(%ebx)
+ movdqa %xmm6, 96(%ebx)
+ movdqa %xmm7, 112(%ebx)
+ jmp L(return)
+
+L(main_loop_just_one_iteration):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqa %xmm0, (%ebx)
+ movdqa %xmm1, 16(%ebx)
+ movdqa %xmm2, 32(%ebx)
+ movdqa %xmm3, 48(%ebx)
+ jmp L(return)
+
+L(large_page):
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+
+ movdqu 64(%eax), %xmm0
+ movdqu 80(%eax), %xmm1
+ movdqu 96(%eax), %xmm2
+ movdqu 112(%eax), %xmm3
+ movdqu -128(%eax, %ecx), %xmm4
+ movdqu -112(%eax, %ecx), %xmm5
+ movdqu -96(%eax, %ecx), %xmm6
+ movdqu -80(%eax, %ecx), %xmm7
+ movdqu %xmm0, 64(%edx)
+ movdqu %xmm1, 80(%edx)
+ movdqu %xmm2, 96(%edx)
+ movdqu %xmm3, 112(%edx)
+ movdqu %xmm4, -128(%edx, %ecx)
+ movdqu %xmm5, -112(%edx, %ecx)
+ movdqu %xmm6, -96(%edx, %ecx)
+ movdqu %xmm7, -80(%edx, %ecx)
+
+/* Now the main loop with non-temporal stores. We align
+ the address of the destination. */
+ leal 128(%edx), %ebx
+ andl $-128, %ebx
+
+ addl %edx, %ecx
+ andl $-128, %ecx
+
+ subl %edx, %eax
+
+ .p2align 4
+L(main_loop_large_page):
+ movdqu (%ebx, %eax), %xmm0
+ movdqu 16(%ebx, %eax), %xmm1
+ movdqu 32(%ebx, %eax), %xmm2
+ movdqu 48(%ebx, %eax), %xmm3
+ movdqu 64(%ebx, %eax), %xmm4
+ movdqu 80(%ebx, %eax), %xmm5
+ movdqu 96(%ebx, %eax), %xmm6
+ movdqu 112(%ebx, %eax), %xmm7
+ movntdq %xmm0, (%ebx)
+ movntdq %xmm1, 16(%ebx)
+ movntdq %xmm2, 32(%ebx)
+ movntdq %xmm3, 48(%ebx)
+ movntdq %xmm4, 64(%ebx)
+ movntdq %xmm5, 80(%ebx)
+ movntdq %xmm6, 96(%ebx)
+ movntdq %xmm7, 112(%ebx)
+ lea 128(%ebx), %ebx
+ cmpl %ebx, %ecx
+ jne L(main_loop_large_page)
+ sfence
+ jmp L(return)
+
+L(len_0_16_bytes):
+ testb $24, %cl
+ jne L(len_9_16_bytes)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(len_5_8_bytes)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%eax), %ebx
+ testb $2, %cl
+ movb %bl, (%edx)
+ je L(return)
+ movzwl -2(%eax,%ecx), %ebx
+ movw %bx, -2(%edx,%ecx)
+ jmp L(return)
+
+L(len_9_16_bytes):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(return)
+
+L(len_5_8_bytes):
+ movl (%eax), %ebx
+ movl %ebx, (%edx)
+ movl -4(%eax,%ecx), %ebx
+ movl %ebx, -4(%edx,%ecx)
+ jmp L(return)
+
+L(return):
+ movl %edx, %eax
+ RETURN
+
+END (MEMCPY)
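The new memcpy follows a layered strategy: sizes up to 16 bytes take scalar paths, sizes up to 128 bytes are handled with pairs of unaligned 16-byte loads and stores that work inward from both ends (and may overlap in the middle), medium sizes use a destination-aligned loop with prefetching, and copies of at least half the shared cache size switch to non-temporal stores followed by an sfence. A compact C sketch of the same idea using SSE2 intrinsics, where NT_THRESHOLD stands in for SHARED_CACHE_SIZE_HALF and the tiny-size path is simplified to a byte loop:

    #include <emmintrin.h>  /* SSE2 intrinsics */
    #include <stddef.h>
    #include <stdint.h>

    #define NT_THRESHOLD (512 * 1024)  /* stand-in for SHARED_CACHE_SIZE_HALF */

    void* memcpy_slm_sketch(void* dst, const void* src, size_t n) {
      unsigned char* d = (unsigned char*)dst;
      const unsigned char* s = (const unsigned char*)src;
      if (n <= 16) {  /* the real code uses branchy scalar copies here */
        for (size_t i = 0; i < n; ++i) d[i] = s[i];
        return dst;
      }
      /* Head and tail: unaligned 16-byte copies that may overlap in the
         middle, so no byte is missed regardless of alignment. */
      _mm_storeu_si128((__m128i*)d, _mm_loadu_si128((const __m128i*)s));
      _mm_storeu_si128((__m128i*)(d + n - 16),
                       _mm_loadu_si128((const __m128i*)(s + n - 16)));
      /* Bulk: advance to a 16-byte-aligned destination, then copy 16 bytes
         at a time with aligned stores (the edges are already covered). */
      size_t skew = 16 - ((uintptr_t)d & 15);
      unsigned char* da = d + skew;
      const unsigned char* sa = s + skew;
      size_t bulk = (n - skew) & ~(size_t)15;
      int nontemporal = n >= NT_THRESHOLD;
      for (size_t i = 0; i < bulk; i += 16) {
        __m128i v = _mm_loadu_si128((const __m128i*)(sa + i));
        if (nontemporal)
          _mm_stream_si128((__m128i*)(da + i), v);  /* bypass the caches */
        else
          _mm_store_si128((__m128i*)(da + i), v);
      }
      if (nontemporal) _mm_sfence();  /* order the streaming stores */
      return dst;
    }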
diff --git a/libc/arch-x86/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
new file mode 100644
index 0000000..79a0a36
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-memmove-slm.S
@@ -0,0 +1,673 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+#else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 8 /* Preserve EBX. */
+#define ENTRANCE PUSH (%ebx);
+#define RETURN_END POP (%ebx); ret
+#define RETURN RETURN_END; CFI_PUSH (%ebx)
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+ ENTRANCE
+ movl LEN(%esp), %ecx
+ movl SRC(%esp), %eax
+ movl DEST(%esp), %edx
+
+/* Check whether we should copy backward or forward. */
+ cmp %eax, %edx
+ je L(mm_return)
+ ja L(mm_len_0_or_more_backward)
+
+/* Now check the length. The [0..16], [0..32], [0..64], and [0..128]
+ cases are handled separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmpl $32, %ecx
+ jg L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_forward):
+ cmpl $64, %ecx
+ jg L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_forward):
+ cmpl $128, %ecx
+ jg L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+ jae L(mm_large_page_forward)
+
+ PUSH (%esi)
+ PUSH (%edi)
+ movl %eax, %esi
+ movl %edx, %edi
+
+/* Aligning the address of the destination. */
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm1
+ movdqu 32(%esi), %xmm2
+ movdqu 48(%esi), %xmm3
+
+ leal 64(%edi), %edx
+ andl $-64, %edx
+
+ movl %esi, %eax
+ subl %edi, %eax
+
+ movdqu (%edx, %eax), %xmm4
+ movdqu 16(%edx, %eax), %xmm5
+ movdqu 32(%edx, %eax), %xmm6
+ movdqu 48(%edx, %eax), %xmm7
+
+ movdqu %xmm0, (%edi)
+ movdqu %xmm1, 16(%edi)
+ movdqu %xmm2, 32(%edi)
+ movdqu %xmm3, 48(%edi)
+ movdqa %xmm4, (%edx)
+ movdqa %xmm5, 16(%edx)
+ movdqa %xmm6, 32(%edx)
+ movdqa %xmm7, 48(%edx)
+ addl $64, %edx
+
+ leal (%edi, %ecx), %ebx
+ andl $-64, %ebx
+
+ cmp %edx, %ebx
+ jbe L(mm_copy_remaining_forward)
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%edx, %eax)
+
+ movdqu (%edx, %eax), %xmm0
+ movdqu 16(%edx, %eax), %xmm1
+ movdqu 32(%edx, %eax), %xmm2
+ movdqu 48(%edx, %eax), %xmm3
+ movdqa %xmm0, (%edx)
+ movdqa %xmm1, 16(%edx)
+ movdqa %xmm2, 32(%edx)
+ movdqa %xmm3, 48(%edx)
+ leal 64(%edx), %edx
+ cmp %edx, %ebx
+ ja L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+ addl %edi, %ecx
+ subl %edx, %ecx
+/* Everything up to the %edx position in the destination has been
+ copied. %ecx now holds the number of bytes left to copy, so we
+ advance %esi to match. */
+ leal (%edx, %eax), %esi
+
+L(mm_remaining_0_64_bytes_forward):
+ cmp $32, %ecx
+ ja L(mm_remaining_33_64_bytes_forward)
+ cmp $16, %ecx
+ ja L(mm_remaining_17_32_bytes_forward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(mm_return_pop_all)
+
+ cmpb $8, %cl
+ ja L(mm_remaining_9_16_bytes_forward)
+ cmpb $4, %cl
+ .p2align 4,,5
+ ja L(mm_remaining_5_8_bytes_forward)
+ cmpb $2, %cl
+ .p2align 4,,1
+ ja L(mm_remaining_3_4_bytes_forward)
+ movzbl -1(%esi,%ecx), %eax
+ movzbl (%esi), %ebx
+ movb %al, -1(%edx,%ecx)
+ movb %bl, (%edx)
+ jmp L(mm_return_pop_all)
+
+L(mm_remaining_33_64_bytes_forward):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm1
+ movdqu -32(%esi, %ecx), %xmm2
+ movdqu -16(%esi, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -32(%edx, %ecx)
+ movdqu %xmm3, -16(%edx, %ecx)
+ jmp L(mm_return_pop_all)
+
+L(mm_remaining_17_32_bytes_forward):
+ movdqu (%esi), %xmm0
+ movdqu -16(%esi, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(mm_return_pop_all)
+
+L(mm_remaining_3_4_bytes_forward):
+ movzwl -2(%esi,%ecx), %eax
+ movzwl (%esi), %ebx
+ movw %ax, -2(%edx,%ecx)
+ movw %bx, (%edx)
+ jmp L(mm_return_pop_all)
+
+L(mm_remaining_5_8_bytes_forward):
+ movl (%esi), %eax
+ movl -4(%esi,%ecx), %ebx
+ movl %eax, (%edx)
+ movl %ebx, -4(%edx,%ecx)
+ jmp L(mm_return_pop_all)
+
+L(mm_remaining_9_16_bytes_forward):
+ movq (%esi), %xmm0
+ movq -8(%esi, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(mm_return_pop_all)
+
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %cl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %cl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+ movq (%eax), %xmm0
+ movq -8(%eax, %ecx), %xmm1
+ movq %xmm0, (%edx)
+ movq %xmm1, -8(%edx, %ecx)
+ jmp L(mm_return)
+
+/* The code for copying backwards. */
+L(mm_len_0_or_more_backward):
+
+/* Now check the length. The [0..16], [0..32], [0..64], and [0..128]
+ cases are handled separately. */
+ cmp $16, %ecx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmpl $32, %ecx
+ jg L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%eax), %xmm0
+ movdqu -16(%eax, %ecx), %xmm1
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, -16(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_backward):
+ cmpl $64, %ecx
+ jg L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu -16(%eax, %ecx), %xmm2
+ movdqu -32(%eax, %ecx), %xmm3
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, -16(%edx, %ecx)
+ movdqu %xmm3, -32(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_backward):
+ cmpl $128, %ecx
+ jg L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%eax), %xmm0
+ movdqu 16(%eax), %xmm1
+ movdqu 32(%eax), %xmm2
+ movdqu 48(%eax), %xmm3
+ movdqu -64(%eax, %ecx), %xmm4
+ movdqu -48(%eax, %ecx), %xmm5
+ movdqu -32(%eax, %ecx), %xmm6
+ movdqu -16(%eax, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+ cmp $SHARED_CACHE_SIZE_HALF, %ecx
+ jae L(mm_large_page_backward)
+
+ PUSH (%esi)
+ PUSH (%edi)
+
+/* Aligning the address of the destination. We need to save
+ 64 bytes from the source so that they are not overwritten. */
+ movdqu -16(%eax, %ecx), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+
+ leal (%edx, %ecx), %edi
+ andl $-64, %edi
+
+ movl %eax, %esi
+ subl %edx, %esi
+
+ movdqu -16(%edi, %esi), %xmm4
+ movdqu -32(%edi, %esi), %xmm5
+ movdqu -48(%edi, %esi), %xmm6
+ movdqu -64(%edi, %esi), %xmm7
+
+ movdqu %xmm0, -16(%edx, %ecx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ movdqa %xmm4, -16(%edi)
+ movdqa %xmm5, -32(%edi)
+ movdqa %xmm6, -48(%edi)
+ movdqa %xmm7, -64(%edi)
+ leal -64(%edi), %edi
+
+ leal 64(%edx), %ebx
+ andl $-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+ the main loop stops. */
+ movl %ebx, %ecx
+ subl %edx, %ecx
+
+ cmp %edi, %ebx
+ jb L(mm_main_loop_backward)
+
+ POP (%edi)
+ POP (%esi)
+ jmp L(mm_len_0_or_more_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%edi, %esi)
+
+ movdqu -64(%edi, %esi), %xmm0
+ movdqu -48(%edi, %esi), %xmm1
+ movdqu -32(%edi, %esi), %xmm2
+ movdqu -16(%edi, %esi), %xmm3
+ movdqa %xmm0, -64(%edi)
+ movdqa %xmm1, -48(%edi)
+ movdqa %xmm2, -32(%edi)
+ movdqa %xmm3, -16(%edi)
+ leal -64(%edi), %edi
+ cmp %edi, %ebx
+ jb L(mm_main_loop_backward)
+ POP (%edi)
+ POP (%esi)
+ jmp L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %cl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %cl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ testl %ecx, %ecx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %cl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%eax,%ecx), %ebx
+ movzbl (%eax), %eax
+ movb %bl, -1(%edx,%ecx)
+ movb %al, (%edx)
+ jmp L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%eax,%ecx), %ebx
+ movzwl (%eax), %eax
+ movw %bx, -2(%edx,%ecx)
+ movw %ax, (%edx)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+ PUSH (%esi)
+ movl -4(%eax,%ecx), %ebx
+ movl -8(%eax,%ecx), %esi
+ movl %ebx, -4(%edx,%ecx)
+ movl %esi, -8(%edx,%ecx)
+ subl $8, %ecx
+ POP (%esi)
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%eax), %ebx
+ movl -4(%eax,%ecx), %eax
+ movl %ebx, (%edx)
+ movl %eax, -4(%edx,%ecx)
+
+L(mm_return):
+ movl %edx, %eax
+ RETURN
+
+L(mm_return_pop_all):
+ movl %edi, %eax
+ POP (%edi)
+ POP (%esi)
+ RETURN
+
+/* Big length copy forward part. */
+
+L(mm_large_page_forward):
+/* Aligning the address of the destination. We need to save
+ 64 bytes from the source so that they are not overwritten. */
+
+ PUSH (%esi)
+ PUSH (%edi)
+ movl %eax, %esi
+ movl %edx, %edi
+
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm1
+ movdqu 32(%esi), %xmm2
+ movdqu 48(%esi), %xmm3
+
+ leal 64(%edi), %edx
+ andl $-64, %edx
+
+ movl %esi, %eax
+ subl %edi, %eax
+
+ movdqu (%edx, %eax), %xmm4
+ movdqu 16(%edx, %eax), %xmm5
+ movdqu 32(%edx, %eax), %xmm6
+ movdqu 48(%edx, %eax), %xmm7
+
+ movdqu %xmm0, (%edi)
+ movdqu %xmm1, 16(%edi)
+ movdqu %xmm2, 32(%edi)
+ movdqu %xmm3, 48(%edi)
+ movntdq %xmm4, (%edx)
+ movntdq %xmm5, 16(%edx)
+ movntdq %xmm6, 32(%edx)
+ movntdq %xmm7, 48(%edx)
+ addl $64, %edx
+
+ leal (%edi, %ecx), %ebx
+ andl $-128, %ebx
+
+ cmp %edx, %ebx
+ jbe L(mm_copy_remaining_forward)
+
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%edx, %eax), %xmm0
+ movdqu 16(%edx, %eax), %xmm1
+ movdqu 32(%edx, %eax), %xmm2
+ movdqu 48(%edx, %eax), %xmm3
+ movdqu 64(%edx, %eax), %xmm4
+ movdqu 80(%edx, %eax), %xmm5
+ movdqu 96(%edx, %eax), %xmm6
+ movdqu 112(%edx, %eax), %xmm7
+ movntdq %xmm0, (%edx)
+ movntdq %xmm1, 16(%edx)
+ movntdq %xmm2, 32(%edx)
+ movntdq %xmm3, 48(%edx)
+ movntdq %xmm4, 64(%edx)
+ movntdq %xmm5, 80(%edx)
+ movntdq %xmm6, 96(%edx)
+ movntdq %xmm7, 112(%edx)
+ leal 128(%edx), %edx
+ cmp %edx, %ebx
+ ja L(mm_large_page_loop_forward)
+ sfence
+
+ addl %edi, %ecx
+ subl %edx, %ecx
+/* Everything up to the %edx position in the destination has been
+ copied. %ecx now holds the number of bytes left to copy, so we
+ advance %esi to match. */
+ leal (%edx, %eax), %esi
+
+ cmp $64, %ecx
+ jb L(mm_remaining_0_64_bytes_forward)
+
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm1
+ movdqu 32(%esi), %xmm2
+ movdqu 48(%esi), %xmm3
+ movdqu -64(%esi, %ecx), %xmm4
+ movdqu -48(%esi, %ecx), %xmm5
+ movdqu -32(%esi, %ecx), %xmm6
+ movdqu -16(%esi, %ecx), %xmm7
+ movdqu %xmm0, (%edx)
+ movdqu %xmm1, 16(%edx)
+ movdqu %xmm2, 32(%edx)
+ movdqu %xmm3, 48(%edx)
+ movdqu %xmm4, -64(%edx, %ecx)
+ movdqu %xmm5, -48(%edx, %ecx)
+ movdqu %xmm6, -32(%edx, %ecx)
+ movdqu %xmm7, -16(%edx, %ecx)
+ jmp L(mm_return_pop_all)
+
+
+/* Big length copy backward part. */
+L(mm_large_page_backward):
+/* Aligning the address of the destination. We need to save
+ 64 bytes from the source so that they are not overwritten. */
+
+ PUSH (%esi)
+ PUSH (%edi)
+
+ movdqu -16(%eax, %ecx), %xmm0
+ movdqu -32(%eax, %ecx), %xmm1
+ movdqu -48(%eax, %ecx), %xmm2
+ movdqu -64(%eax, %ecx), %xmm3
+
+ leal (%edx, %ecx), %edi
+ andl $-64, %edi
+
+ movl %eax, %esi
+ subl %edx, %esi
+
+ movdqu -16(%edi, %esi), %xmm4
+ movdqu -32(%edi, %esi), %xmm5
+ movdqu -48(%edi, %esi), %xmm6
+ movdqu -64(%edi, %esi), %xmm7
+
+ movdqu %xmm0, -16(%edx, %ecx)
+ movdqu %xmm1, -32(%edx, %ecx)
+ movdqu %xmm2, -48(%edx, %ecx)
+ movdqu %xmm3, -64(%edx, %ecx)
+ movntdq %xmm4, -16(%edi)
+ movntdq %xmm5, -32(%edi)
+ movntdq %xmm6, -48(%edi)
+ movntdq %xmm7, -64(%edi)
+ leal -64(%edi), %edi
+
+ leal 128(%edx), %ebx
+ andl $-64, %ebx
+
+/* Compute in %ecx how many bytes are left to copy after
+ the main loop stops. */
+ movl %ebx, %ecx
+ subl %edx, %ecx
+
+ cmp %edi, %ebx
+ jae L(mm_len_0_or_more_backward)
+
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%edi, %esi), %xmm0
+ movdqu -48(%edi, %esi), %xmm1
+ movdqu -32(%edi, %esi), %xmm2
+ movdqu -16(%edi, %esi), %xmm3
+ movntdq %xmm0, -64(%edi)
+ movntdq %xmm1, -48(%edi)
+ movntdq %xmm2, -32(%edi)
+ movntdq %xmm3, -16(%edi)
+ leal -64(%edi), %edi
+ cmp %edi, %ebx
+ jb L(mm_large_page_loop_backward)
+ POP (%edi)
+ POP (%esi)
+ jmp L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
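memmove differs from memcpy only in its first decision: if the destination starts above the source, an overlapping copy must run backward from the tail so that bytes are not clobbered before they are read; otherwise it runs forward. The size buckets, destination alignment, and non-temporal large-copy paths then mirror the memcpy above in each direction. The core invariant, as a C sketch:

    #include <stddef.h>

    void* memmove_sketch(void* dst, const void* src, size_t n) {
      unsigned char* d = (unsigned char*)dst;
      const unsigned char* s = (const unsigned char*)src;
      if (d == s) return dst;
      if (d < s) {
        for (size_t i = 0; i < n; ++i) d[i] = s[i];  /* forward is safe */
      } else {
        for (size_t i = n; i-- > 0;) d[i] = s[i];    /* backward is safe */
      }
      return dst;
    }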
diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
similarity index 80%
copy from libc/arch-x86/string/sse2-memset-atom.S
copy to libc/arch-x86/silvermont/string/sse2-memset-slm.S
index a54bf51..c30bf74 100644
--- a/libc/arch-x86/string/sse2-memset-atom.S
+++ b/libc/arch-x86/silvermont/string/sse2-memset-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -29,7 +29,10 @@
*/
#include "cache.h"
-#undef __i686
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
#ifndef L
# define L(label) .L##label
@@ -61,7 +64,7 @@
#ifndef ENTRY
# define ENTRY(name) \
- .type name, @function; \
+ .type name, @function; \
.globl name; \
.p2align 4; \
name: \
@@ -107,7 +110,7 @@
jump table with relative offsets. */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \
/* We first load PC into EBX. */ \
- call __i686.get_pc_thunk.bx; \
+ call __x86.get_pc_thunk.bx; \
/* Get the address of the jump table. */ \
add $(TABLE - .), %ebx; \
/* Get the entry and convert the relative offset to the \
@@ -117,12 +120,12 @@
/* We loaded the jump table and adjusted EDX. Go. */ \
jmp *%ebx
- .section .gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
- .globl __i686.get_pc_thunk.bx
- .hidden __i686.get_pc_thunk.bx
+ .section .gnu.linkonce.t.__x86.get_pc_thunk.bx,"ax",@progbits
+ .globl __x86.get_pc_thunk.bx
+ .hidden __x86.get_pc_thunk.bx
ALIGN (4)
- .type __i686.get_pc_thunk.bx,@function
-__i686.get_pc_thunk.bx:
+ .type __x86.get_pc_thunk.bx,@function
+__x86.get_pc_thunk.bx:
movl (%esp), %ebx
ret
#else
@@ -139,16 +142,18 @@
jmp *TABLE(,%ecx,4)
#endif
-#ifndef MEMSET
-# define MEMSET memset
-#endif
-
.section .text.sse2,"ax",@progbits
ALIGN (4)
ENTRY (MEMSET)
ENTRANCE
movl LEN(%esp), %ecx
+ cmp $0, %ecx
+ ja L(1byteormore)
+ SETRTNVAL
+ RETURN
+
+L(1byteormore):
#ifdef USE_AS_BZERO
xor %eax, %eax
#else
@@ -156,147 +161,62 @@
movb %al, %ah
/* Fill the whole EAX with pattern. */
movl %eax, %edx
- shl $16, %eax
+ shl $16, %eax
or %edx, %eax
#endif
movl DEST(%esp), %edx
- cmp $32, %ecx
- jae L(32bytesormore)
+ cmp $1, %ecx
+ je L(1byte)
+ cmp $16, %ecx
+ jae L(16bytesormore)
-L(write_less32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes))
+ cmp $4, %ecx
+ jb L(4bytesless)
+ movl %eax, (%edx)
+ movl %eax, -4(%edx, %ecx)
+ cmp $8, %ecx
+ jb L(8bytesless)
+ movl %eax, 4(%edx)
+ movl %eax, -8(%edx, %ecx)
+L(8bytesless):
+ SETRTNVAL
+ RETURN
+L(4bytesless):
+ movw %ax, (%edx)
+ movw %ax, -2(%edx, %ecx)
+ SETRTNVAL
+ RETURN
- .pushsection .rodata.sse2,"a",@progbits
- ALIGN (2)
-L(table_less_32bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_32bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_32bytes))
- .popsection
-
- ALIGN (4)
-L(write_28bytes):
- movl %eax, -28(%edx)
-L(write_24bytes):
- movl %eax, -24(%edx)
-L(write_20bytes):
- movl %eax, -20(%edx)
-L(write_16bytes):
- movl %eax, -16(%edx)
-L(write_12bytes):
- movl %eax, -12(%edx)
-L(write_8bytes):
- movl %eax, -8(%edx)
-L(write_4bytes):
- movl %eax, -4(%edx)
-L(write_0bytes):
+L(1byte):
+ movb %al, (%edx)
SETRTNVAL
RETURN
ALIGN (4)
-L(write_29bytes):
- movl %eax, -29(%edx)
-L(write_25bytes):
- movl %eax, -25(%edx)
-L(write_21bytes):
- movl %eax, -21(%edx)
-L(write_17bytes):
- movl %eax, -17(%edx)
-L(write_13bytes):
- movl %eax, -13(%edx)
-L(write_9bytes):
- movl %eax, -9(%edx)
-L(write_5bytes):
- movl %eax, -5(%edx)
-L(write_1bytes):
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN (4)
-L(write_30bytes):
- movl %eax, -30(%edx)
-L(write_26bytes):
- movl %eax, -26(%edx)
-L(write_22bytes):
- movl %eax, -22(%edx)
-L(write_18bytes):
- movl %eax, -18(%edx)
-L(write_14bytes):
- movl %eax, -14(%edx)
-L(write_10bytes):
- movl %eax, -10(%edx)
-L(write_6bytes):
- movl %eax, -6(%edx)
-L(write_2bytes):
- movw %ax, -2(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN (4)
-L(write_31bytes):
- movl %eax, -31(%edx)
-L(write_27bytes):
- movl %eax, -27(%edx)
-L(write_23bytes):
- movl %eax, -23(%edx)
-L(write_19bytes):
- movl %eax, -19(%edx)
-L(write_15bytes):
- movl %eax, -15(%edx)
-L(write_11bytes):
- movl %eax, -11(%edx)
-L(write_7bytes):
- movl %eax, -7(%edx)
-L(write_3bytes):
- movw %ax, -3(%edx)
- movb %al, -1(%edx)
- SETRTNVAL
- RETURN
-
- ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned. */
-L(32bytesormore):
- /* Fill xmm0 with the pattern. */
+L(16bytesormore):
#ifdef USE_AS_BZERO
pxor %xmm0, %xmm0
#else
movd %eax, %xmm0
pshufd $0, %xmm0, %xmm0
#endif
+
+ cmp $64, %ecx
+ ja L(64bytesmore)
+ movdqu %xmm0, (%edx)
+ movdqu %xmm0, -16(%edx, %ecx)
+ cmp $32, %ecx
+ jbe L(32bytesless)
+ movdqu %xmm0, 16(%edx)
+ movdqu %xmm0, -32(%edx, %ecx)
+L(32bytesless):
+ SETRTNVAL
+ RETURN
+
+L(64bytesmore):
testl $0xf, %edx
jz L(aligned_16)
-/* ECX > 32 and EDX is not 16 byte aligned. */
L(not_aligned_16):
movdqu %xmm0, (%edx)
movl %edx, %eax
@@ -321,71 +241,73 @@
mov $SHARED_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
- call __i686.get_pc_thunk.bx
+ call __x86.get_pc_thunk.bx
add $_GLOBAL_OFFSET_TABLE_, %ebx
- mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx
+ mov $__x86_shared_cache_size@GOTOFF(%ebx), %ebx
# else
PUSH (%ebx)
- mov __x86_shared_cache_size, %ebx
+ mov $__x86_shared_cache_size, %ebx
# endif
#endif
cmp %ebx, %ecx
jae L(128bytesormore_nt_start)
+ POP (%ebx)
#ifdef DATA_CACHE_SIZE
- POP (%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
- cmp $DATA_CACHE_SIZE, %ecx
+ PUSH (%ebx)
+ mov $DATA_CACHE_SIZE, %ebx
#else
# if (defined SHARED || defined __PIC__)
-# define RESTORE_EBX_STATE
- call __i686.get_pc_thunk.bx
+ call __x86.get_pc_thunk.bx
add $_GLOBAL_OFFSET_TABLE_, %ebx
- cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx
+ mov $__x86_data_cache_size@GOTOFF(%ebx), %ebx
# else
- POP (%ebx)
-# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
- cmp __x86_data_cache_size, %ecx
+ PUSH (%ebx)
+ mov $__x86_data_cache_size, %ebx
# endif
#endif
+ cmp %ebx, %ecx
jae L(128bytes_L2_normal)
subl $128, %ecx
L(128bytesormore_normal):
sub $128, %ecx
movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
lea 128(%edx), %edx
jb L(128bytesless_normal)
sub $128, %ecx
movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
lea 128(%edx), %edx
jae L(128bytesormore_normal)
L(128bytesless_normal):
- add $128, %ecx
+ lea 128(%ecx), %ecx
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+ POP (%ebx)
+#endif
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
ALIGN (4)
L(128bytes_L2_normal):
- prefetcht0 0x380(%edx)
- prefetcht0 0x3c0(%edx)
+ prefetchnta 0x380(%edx)
+ prefetchnta 0x3c0(%edx)
sub $128, %ecx
movdqa %xmm0, (%edx)
movaps %xmm0, 0x10(%edx)
@@ -400,28 +322,26 @@
jae L(128bytes_L2_normal)
L(128bytesless_L2_normal):
+#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+ POP (%ebx)
+#endif
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
- RESTORE_EBX_STATE
L(128bytesormore_nt_start):
sub %ebx, %ecx
- mov %ebx, %eax
- and $0x7f, %eax
- add %eax, %ecx
- movd %xmm0, %eax
ALIGN (4)
L(128bytesormore_shared_cache_loop):
- prefetcht0 0x3c0(%edx)
- prefetcht0 0x380(%edx)
+ prefetchnta 0x3c0(%edx)
+ prefetchnta 0x380(%edx)
sub $0x80, %ebx
movdqa %xmm0, (%edx)
- movdqa %xmm0, 0x10(%edx)
- movdqa %xmm0, 0x20(%edx)
- movdqa %xmm0, 0x30(%edx)
- movdqa %xmm0, 0x40(%edx)
- movdqa %xmm0, 0x50(%edx)
- movdqa %xmm0, 0x60(%edx)
- movdqa %xmm0, 0x70(%edx)
+ movaps %xmm0, 0x10(%edx)
+ movaps %xmm0, 0x20(%edx)
+ movaps %xmm0, 0x30(%edx)
+ movaps %xmm0, 0x40(%edx)
+ movaps %xmm0, 0x50(%edx)
+ movaps %xmm0, 0x60(%edx)
+ movaps %xmm0, 0x70(%edx)
add $0x80, %edx
cmp $0x80, %ebx
jae L(128bytesormore_shared_cache_loop)
@@ -443,7 +363,7 @@
jae L(128bytesormore_nt)
sfence
L(shared_cache_loop_end):
-#if defined DATA_CACHE_SIZE || !(defined SHARED || defined __PIC__)
+#if defined SHARED_CACHE_SIZE || !(defined SHARED || defined __PIC__)
POP (%ebx)
#endif
BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
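
The memset rework above drops the 32-entry byte-dispatch table in favor of overlapping stores (two word stores cover any 4..15-byte length, movdqu pairs cover 16..64 bytes) and switches the streaming loops from prefetcht0 to prefetchnta. A rough C sketch of the overlapping-store idea for the small path follows; it assumes nothing beyond standard C and is an illustration, not the shipped code:

    #include <stdint.h>
    #include <string.h>

    /* Illustrative only: the overlapping-store scheme for 1..15 bytes. */
    static void *memset_small_sketch(void *dst, int c, size_t n) {
        unsigned char *d = dst;
        uint32_t v = (uint8_t)c * 0x01010101u; /* replicate byte into a word */
        if (n == 0) return dst;
        if (n == 1) { d[0] = (uint8_t)c; return dst; }
        if (n < 4) {                           /* 2..3 bytes: head + tail */
            uint16_t w = (uint16_t)v;
            memcpy(d, &w, 2);
            memcpy(d + n - 2, &w, 2);          /* overlaps when n < 4 */
            return dst;
        }
        memcpy(d, &v, 4);                      /* 4..15 bytes: up to 4 stores */
        memcpy(d + n - 4, &v, 4);
        if (n >= 8) {
            memcpy(d + 4, &v, 4);
            memcpy(d + n - 8, &v, 4);
        }
        return dst;
    }

The same trick scales up: for 16..64 bytes the assembly issues one movdqu pair anchored at the head and, if needed, a second pair anchored at the tail, so the length-indexed L(table_16_128bytes) table is only consulted for the residue of the large-block loops.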
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
index 9d0a563..5c43fa5
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpcpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "sse2-strcpy-slm.S"
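
This wrapper is the entire file: the generic body lives in sse2-strcpy-slm.S, and each entry point is minted by redefining a couple of macros before including it. A hypothetical C analogue of the pattern (COPY_NAME and COPY_RESULT are illustrative names, not from the tree):

    /* One generic body, specialized per entry point at preprocessing time. */
    #ifndef COPY_NAME
    # define COPY_NAME my_strcpy
    #endif

    #ifdef USE_AS_STPCPY
    # define COPY_RESULT(dst, end) (end)  /* stpcpy-style: return the end   */
    #else
    # define COPY_RESULT(dst, end) (dst)  /* strcpy-style: return the start */
    #endif

    char *COPY_NAME(char *dst, const char *src) {
        char *d = dst;
        while ((*d = *src++) != '\0')
            d++;
        return COPY_RESULT(dst, d);
    }

Building the same body once with USE_AS_STPCPY and once without yields stpcpy and strcpy from a single source, which is exactly what the two #define lines above arrange for the assembly.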
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
index 9d0a563..af5c0d3 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-stpncpy-slm.S
@@ -1,42 +1,34 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY stpncpy
+#include "sse2-strcpy-slm.S"
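
For orientation, these are the standard semantics the two new entry points provide (a usage sketch of POSIX behaviour, not code from this change):

    #define _POSIX_C_SOURCE 200809L
    #include <assert.h>
    #include <string.h>

    int main(void) {
        char buf[8];
        char *end = stpcpy(buf, "abc");   /* returns a pointer to the '\0' */
        assert(end == buf + 3 && *end == '\0');

        /* stpncpy zero-pads up to n and returns a pointer to the first
           '\0' it wrote, or buf + n if the source did not fit. */
        char *p = stpncpy(buf, "ab", 5);
        assert(p == buf + 2 && buf[4] == '\0');
        return 0;
    }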
diff --git a/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
new file mode 100755
index 0000000..b5d84b5
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strcpy-slm.S
@@ -0,0 +1,2157 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifndef STRCPY
+# define STRCPY strcpy
+#endif
+
+#ifdef USE_AS_STPNCPY
+# define USE_AS_STRNCPY
+# define USE_AS_STPCPY
+#endif
+
+#ifdef USE_AS_STRNCPY
+# define PARMS 16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+#else
+# define PARMS 12
+# define ENTRANCE PUSH(%esi); PUSH(%edi)
+# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+#endif
+
+#define STR1 PARMS
+#define STR2 STR1+4
+#define LEN STR2+4
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+ jump table with relative offsets. INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ call __x86.get_pc_thunk.cx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+#else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register containing the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
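
The PIC variant of BRANCH_TO_JMPTBL_ENTRY above works on table-relative offsets: JMPTBL(I, B) stores I - B, so the table needs no load-time relocations, and the dispatch adds the table's run-time address back before the indirect jump. The same dispatch shape in GNU C computed-goto form, as a sketch only (this is the absolute, non-PIC flavor):

    /* Sketch of jump-table dispatch using the GNU C labels-as-values
       extension.  The PIC assembly stores entry - table instead and adds
       the table address back before the indirect jump. */
    int dispatch(unsigned idx) {
        static const void *const table[] = { &&exit0, &&exit1, &&exit2 };
        if (idx > 2)
            return -1;
        goto *table[idx];
    exit0: return 0;
    exit1: return 10;
    exit2: return 20;
    }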
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edi
+ mov STR2(%esp), %esi
+#ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(ExitZero)
+#endif
+
+ mov %esi, %ecx
+#ifndef USE_AS_STPCPY
+ mov %edi, %eax /* save result */
+#endif
+ and $15, %ecx
+ jz L(SourceStringAlignmentZero)
+
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%esi), %xmm1
+#ifdef USE_AS_STRNCPY
+ add %ecx, %ebx
+#endif
+ pmovmskb %xmm1, %edx
+ shr %cl, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+#else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+#endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%esi), %xmm0
+ pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+#else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+#endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%edi)
+
+ sub %ecx, %edi
+ mov %edi, %edx
+ mov $16, %ecx
+ and $15, %edx
+ jz L(Align16Both)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqu %xmm3, (%edi, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %edi
+#ifdef USE_AS_STRNCPY
+ lea 64+64(%ebx, %edx), %ebx
+#endif
+L(Unaligned64Loop):
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+ add $64, %edi
+ add $64, %esi
+ movdqu %xmm4, -64(%edi)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%edi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %edx, %edx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+ movdqu %xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax
+#endif
+ movdqu %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx
+ lea 49(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $48, %esi
+ add $48, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero):
+ pxor %xmm0, %xmm0
+ movdqa (%esi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+#else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+#endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb 16(%esi), %xmm0
+ movdqu %xmm1, (%edi)
+ pmovmskb %xmm0, %edx
+
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+#else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+#endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1)
+
+ mov %edi, %edx
+ mov $16, %ecx
+ and $15, %edx
+ jnz L(Unalign16Both)
+
+L(Align16Both):
+ movdqa (%esi, %ecx), %xmm1
+ movdqa 16(%esi, %ecx), %xmm2
+ movdqa %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+#ifdef USE_AS_STRNCPY
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm2)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa 16(%esi, %ecx), %xmm3
+ movdqa %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm3)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa 16(%esi, %ecx), %xmm4
+ movdqa %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm4)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa 16(%esi, %ecx), %xmm1
+ movdqa %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm1)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa 16(%esi, %ecx), %xmm2
+ movdqa %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm2)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa 16(%esi, %ecx), %xmm3
+ movdqa %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm3)
+#else
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqa %xmm3, (%edi, %ecx)
+ mov %esi, %edx
+ lea 16(%esi, %ecx), %esi
+ and $-0x40, %esi
+ sub %esi, %edx
+ sub %edx, %edi
+#ifdef USE_AS_STRNCPY
+ lea 64+64(%ebx, %edx), %ebx
+#endif
+L(Aligned64Loop):
+ movdqa (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movaps 16(%esi), %xmm5
+ movdqa 32(%esi), %xmm3
+ movdqa %xmm3, %xmm6
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(AlignedLeaveCase2OrCase3)
+#endif
+ test %edx, %edx
+ jnz L(Aligned64Leave)
+
+L(Aligned64Loop_start):
+ add $64, %esi
+ add $64, %edi
+ movaps %xmm4, -64(%edi)
+ movdqa (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movaps %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movaps %xmm6, -32(%edi)
+ movdqa %xmm3, %xmm6
+ movaps %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(AlignedLeaveCase2OrCase3)
+#endif
+ test %edx, %edx
+ jz L(Aligned64Loop_start)
+
+L(Aligned64Leave):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16Bytes_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx
+ pmovmskb %xmm1, %ecx
+ test %edx, %edx
+ jnz L(CopyFrom1To16Bytes_32)
+
+ bsf %ecx, %edx
+ movdqa %xmm4, (%edi)
+ movdqa %xmm5, 16(%edi)
+ movdqa %xmm6, 32(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax
+#endif
+ movdqa %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx
+ lea 49(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $48, %esi
+ add $48, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+/*----------------------------------------------------*/
+
+/* Case1 */
+#ifndef USE_AS_STRNCPY
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %ecx, %edi
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+#ifdef USE_AS_STRNCPY
+ sub %ecx, %ebx
+#endif
+ add %ecx, %esi
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %esi
+ add $16, %edi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+#endif
+L(CopyFrom1To16BytesTail1):
+ bsf %edx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+#ifdef USE_AS_STRNCPY
+ sub %ecx, %ebx
+#endif
+ bsf %edx, %edx
+ add %ecx, %esi
+ add $16, %edx
+ sub %ecx, %edx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16Bytes_0):
+ bsf %edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+#endif
+ movdqa %xmm4, (%edi)
+ add $63, %ebx
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16Bytes_16):
+ bsf %ecx, %edx
+ movdqa %xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax
+#endif
+ movdqa %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx
+ lea 17(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $16, %esi
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16Bytes_32):
+ bsf %edx, %edx
+ movdqa %xmm4, (%edi)
+ movdqa %xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax
+#endif
+ movdqa %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx
+ lea 33(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $32, %esi
+ add $32, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %edx, %edx
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+#endif
+ movdqu %xmm4, (%edi)
+ add $63, %ebx
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %ecx, %edx
+ movdqu %xmm4, (%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax
+#endif
+ movdqu %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx
+ lea 17(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $16, %esi
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %edx, %edx
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+#ifdef USE_AS_STRNCPY
+#ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax
+#endif
+ movdqu %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx
+ lea 33(%edi, %edx), %edi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $32, %esi
+ add $32, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+ .p2align 4
+L(CopyFrom1To16BytesXmm6):
+ movdqa %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmm5):
+ movdqa %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmm4):
+ movdqa %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmm3):
+ movdqa %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmm2):
+ movdqa %xmm2, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmm1):
+ movdqa %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ sub %ecx, %ebx
+ add %ecx, %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %ecx, %edi
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx
+ add %ecx, %esi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %edi
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+#endif
+
+/*-----------------------------------------------------------------*/
+ .p2align 4
+L(Exit0):
+#ifdef USE_AS_STPCPY
+ mov %edi, %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ movb %dh, (%edi)
+#ifdef USE_AS_STPCPY
+ lea (%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%edi)
+ movb %dh, 2(%edi)
+#ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ movl (%esi), %ecx
+ movb %dh, 4(%edi)
+ movl %ecx, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ movlpd (%esi), %xmm0
+ movb %dh, 8(%edi)
+ movlpd %xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%esi), %xmm0
+ xor %cl, %cl
+ movdqu %xmm0, (%edi)
+ movb %cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $17, %ebx
+ lea 17(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $18, %ebx
+ lea 18(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $19, %ebx
+ lea 19(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $20, %ebx
+ lea 20(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ xor %dl, %dl
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $21, %ebx
+ lea 21(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $22, %ebx
+ lea 22(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $23, %ebx
+ lea 23(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $24, %ebx
+ lea 24(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ xor %cl, %cl
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $25, %ebx
+ lea 25(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $26, %ebx
+ lea 26(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $27, %ebx
+ lea 27(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $28, %ebx
+ lea 28(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $29, %ebx
+ lea 29(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $30, %ebx
+ lea 30(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+
+ .p2align 4
+L(Exit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $31, %ebx
+ lea 31(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+#endif
+#ifdef USE_AS_STRNCPY
+ sub $32, %ebx
+ lea 32(%edi), %edi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+#ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit1):
+ movb (%esi), %dl
+ movb %dl, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ movw (%esi), %dx
+ movw %dx, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 2(%edi), %eax
+#endif
+ RETURN
+ .p2align 4
+L(StrncpyExit3):
+ movw (%esi), %cx
+ movb 2(%esi), %dl
+ movw %cx, (%edi)
+ movb %dl, 2(%edi)
+#ifdef USE_AS_STPCPY
+ lea 3(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ movl (%esi), %edx
+ movl %edx, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 4(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ movl (%esi), %ecx
+ movb 4(%esi), %dl
+ movl %ecx, (%edi)
+ movb %dl, 4(%edi)
+#ifdef USE_AS_STPCPY
+ lea 5(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ movl (%esi), %ecx
+ movw 4(%esi), %dx
+ movl %ecx, (%edi)
+ movw %dx, 4(%edi)
+#ifdef USE_AS_STPCPY
+ lea 6(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ movl (%esi), %ecx
+ movl 3(%esi), %edx
+ movl %ecx, (%edi)
+ movl %edx, 3(%edi)
+#ifdef USE_AS_STPCPY
+ lea 7(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ movlpd (%esi), %xmm0
+ movlpd %xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 8(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ movlpd (%esi), %xmm0
+ movb 8(%esi), %dl
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+#ifdef USE_AS_STPCPY
+ lea 9(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ movlpd (%esi), %xmm0
+ movw 8(%esi), %dx
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+#ifdef USE_AS_STPCPY
+ lea 10(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ movlpd (%esi), %xmm0
+ movl 7(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+#ifdef USE_AS_STPCPY
+ lea 11(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ movlpd (%esi), %xmm0
+ movl 8(%esi), %edx
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+#ifdef USE_AS_STPCPY
+ lea 12(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ movlpd (%esi), %xmm0
+ movlpd 5(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 5(%edi)
+#ifdef USE_AS_STPCPY
+ lea 13(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ movlpd (%esi), %xmm0
+ movlpd 6(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 6(%edi)
+#ifdef USE_AS_STPCPY
+ lea 14(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ movlpd (%esi), %xmm0
+ movlpd 7(%esi), %xmm1
+ movlpd %xmm0, (%edi)
+ movlpd %xmm1, 7(%edi)
+#ifdef USE_AS_STPCPY
+ lea 15(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%esi), %xmm0
+ movdqu %xmm0, (%edi)
+#ifdef USE_AS_STPCPY
+ lea 16(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%esi), %xmm0
+ movb 16(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movb %cl, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 17(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%esi), %xmm0
+ movw 16(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movw %cx, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 18(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%esi), %xmm0
+ movl 15(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 19(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 20(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%esi), %xmm0
+ movl 16(%esi), %ecx
+ movb 20(%esi), %dl
+ movdqu %xmm0, (%edi)
+ movl %ecx, 16(%edi)
+ movb %dl, 20(%edi)
+#ifdef USE_AS_STPCPY
+ lea 21(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%esi), %xmm0
+ movlpd 14(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 14(%edi)
+#ifdef USE_AS_STPCPY
+ lea 22(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%esi), %xmm0
+ movlpd 15(%esi), %xmm3
+ movdqu %xmm0, (%edi)
+ movlpd %xmm3, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 23(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 24(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movb 24(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movb %cl, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 25(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movw 24(%esi), %cx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movw %cx, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 26(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 23(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 23(%edi)
+#ifdef USE_AS_STPCPY
+ lea 27(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%esi), %xmm0
+ movlpd 16(%esi), %xmm2
+ movl 24(%esi), %ecx
+ movdqu %xmm0, (%edi)
+ movlpd %xmm2, 16(%edi)
+ movl %ecx, 24(%edi)
+#ifdef USE_AS_STPCPY
+ lea 28(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%esi), %xmm0
+ movdqu 13(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 13(%edi)
+#ifdef USE_AS_STPCPY
+ lea 29(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%esi), %xmm0
+ movdqu 14(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 14(%edi)
+#ifdef USE_AS_STPCPY
+ lea 30(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%esi), %xmm0
+ movdqu 15(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 15(%edi)
+#ifdef USE_AS_STPCPY
+ lea 31(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+#ifdef USE_AS_STPCPY
+ lea 32(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%esi), %xmm0
+ movdqu 16(%esi), %xmm2
+ movb 32(%esi), %cl
+ movdqu %xmm0, (%edi)
+ movdqu %xmm2, 16(%edi)
+ movb %cl, 32(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movl %edx, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%edi)
+ movb %dl, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%edi)
+ movw %dx, 4(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movlpd %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%edi)
+ movb %dl, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%edi)
+ movw %dx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%edi)
+ movl %edx, 7(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%edi)
+ movl %edx, 8(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 5(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%edi)
+ movlpd %xmm0, 6(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%edi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%edi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%edi, %ecx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %edx, %edx
+ add $15, %ebx
+ add %ecx, %edi
+#ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax
+#endif
+ sub %edx, %ebx
+ lea 1(%edi, %edx), %edi
+
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%edi)
+ add $16, %edi
+
+ mov %edi, %esi
+ and $0xf, %esi
+ sub %esi, %edi
+ add %esi, %ebx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ movdqa %xmm0, 32(%edi)
+ movdqa %xmm0, 48(%edi)
+ add $64, %edi
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%edi)
+ movdqa %xmm0, 16(%edi)
+ add $32, %edi
+ sub $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%edi)
+ add $16, %edi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+L(StrncpyFillExit):
+ add $16, %ebx
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4)
+
+ .p2align 4
+L(AlignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Aligned64LeaveCase2)
+L(Aligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqa %xmm4, (%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqa %xmm5, 16(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqa %xmm6, 32(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqa %xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+ lea 64(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(Aligned64LeaveCase2):
+ pxor %xmm0, %xmm0
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqa %xmm4, (%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqa %xmm5, 16(%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqa %xmm6, 32(%edi)
+ lea 16(%edi, %ecx), %edi
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %edx, %edx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%ebx), %ecx
+ and $-16, %ecx
+ add $48, %ebx
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%edi)
+ sub $16, %ebx
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%edi)
+#ifdef USE_AS_STPCPY
+ lea 64(%edi), %eax
+#endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ pxor %xmm0, %xmm0
+ xor %ecx, %ecx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm4, (%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm5, 16(%edi)
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %edx
+ movdqu %xmm6, 32(%edi)
+ lea 16(%edi, %ecx), %edi
+ lea 16(%esi, %ecx), %esi
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(ExitZero):
+ movl %edi, %eax
+ RETURN
+#endif
+
+END (STRCPY)
+
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+ .int JMPTBL(L(Exit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+#endif
diff --git a/libc/arch-x86/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
new file mode 100755
index 0000000..27cc025
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse2-strlen-slm.S
@@ -0,0 +1,328 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+ mov 4(%esp), %edx
+ mov %edx, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
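+/* If the offset within the 64-byte block is at most 0x30, an unaligned
+   16-byte load cannot cross the block boundary (and so cannot fault on
+   the next page), so it is safe. */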
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%edx), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit_less16)
+ mov %edx, %eax
+ and $-16, %eax
+ jmp L(align16_start)
+L(next):
+ mov %edx, %eax
+ and $-16, %eax
+ PUSH (%edi)
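+/* Compare the whole aligned 16-byte block, then shift an all-ones mask
+   left by the misalignment so that match bits for bytes before the
+   start of the string are discarded. */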
+ pcmpeqb (%eax), %xmm0
+ mov $-1, %edi
+ sub %eax, %ecx
+ shl %cl, %edi
+ pmovmskb %xmm0, %ecx
+ and %edi, %ecx
+ POP (%edi)
+ jnz L(exit_unaligned)
+ pxor %xmm0, %xmm0
+L(align16_start):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pcmpeqb 16(%eax), %xmm0
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %ecx
+ test %ecx, %ecx
+ jnz L(exit64)
+
+ pcmpeqb 80(%eax), %xmm0
+ add $64, %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %ecx
+ test %ecx, %ecx
+ jnz L(exit64)
+
+ pcmpeqb 80(%eax), %xmm0
+ add $64, %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %ecx
+ test %ecx, %ecx
+ jnz L(exit64)
+
+ pcmpeqb 80(%eax), %xmm0
+ add $64, %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %ecx
+ test %ecx, %ecx
+ jnz L(exit64)
+
+
+ test $0x3f, %eax
+ jz L(align64_loop)
+
+ pcmpeqb 80(%eax), %xmm0
+ add $80, %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit)
+
+ test $0x3f, %eax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%eax), %xmm1
+ add $16, %eax
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit)
+
+ test $0x3f, %eax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%eax), %xmm2
+ add $16, %eax
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit)
+
+ test $0x3f, %eax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%eax), %xmm3
+ add $16, %eax
+ pmovmskb %xmm3, %ecx
+ test %ecx, %ecx
+ jnz L(exit)
+
+ add $16, %eax
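+/* Main loop: pminub folds four 16-byte blocks into one; a zero byte
+   anywhere makes the minimum zero, so a single pcmpeqb/pmovmskb pair
+   scans 64 bytes per iteration. */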
+ .p2align 4
+L(align64_loop):
+ movaps (%eax), %xmm4
+ pminub 16(%eax), %xmm4
+ movaps 32(%eax), %xmm5
+ pminub 48(%eax), %xmm5
+ add $64, %eax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %ecx
+ test %ecx, %ecx
+ jz L(align64_loop)
+
+
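+/* A zero byte lies in the 64 bytes just scanned; re-test each 16-byte
+   block to locate it. */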
+ pcmpeqb -64(%eax), %xmm0
+ sub $80, %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(exit16)
+
+ pcmpeqb 32(%eax), %xmm1
+ pmovmskb %xmm1, %ecx
+ test %ecx, %ecx
+ jnz L(exit32)
+
+ pcmpeqb 48(%eax), %xmm2
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(exit48)
+
+ pcmpeqb 64(%eax), %xmm3
+ pmovmskb %xmm3, %ecx
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ add $64, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ ret
+
+L(exit_less16):
+ bsf %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_unaligned):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit16):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ add $16, %eax
+ ret
+
+ .p2align 4
+L(exit32):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ add $32, %eax
+ ret
+
+ .p2align 4
+L(exit48):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ add $48, %eax
+ ret
+
+ .p2align 4
+L(exit64):
+ sub %edx, %eax
+ bsf %ecx, %ecx
+ add %ecx, %eax
+ add $64, %eax
+ ret
+
+END (STRLEN)
+
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
index 9d0a563..591419f
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse2-strncpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
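+/* strncpy reuses the strcpy implementation; USE_AS_STRNCPY selects
+   the length-limited code paths. */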
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
new file mode 100755
index 0000000..b302883
--- /dev/null
+++ b/libc/arch-x86/silvermont/string/sse4-memcmp-slm.S
@@ -0,0 +1,1277 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state .cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state .cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 4
+#define BLK1 PARMS
+#define BLK2 BLK1 + 4
+#define LEN BLK2 + 4
+#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx)
+
+
+#if (defined SHARED || defined __PIC__)
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+/* We first load PC into EBX. */ \
+ call __x86.get_pc_thunk.bx; \
+/* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+/* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx,INDEX,SCALE), %ebx; \
+/* We loaded the jump table and adjusted EDX/ESI. Go. */		\
+ jmp *%ebx
+#else
+# define JMPTBL(I, B) I
+
+/* Load an entry in a jump table into EBX and branch to it. TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+ index into the jump table. SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+#endif
+
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (MEMCMP)
+ movl BLK1(%esp), %eax
+ movl BLK2(%esp), %edx
+ movl LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ test %ecx, %ecx
+ jz L(return0)
+#else
+ cmp $1, %ecx
+ jbe L(less1bytes)
+#endif
+
+ pxor %xmm0, %xmm0
+ cmp $64, %ecx
+ ja L(64bytesormore)
+ cmp $8, %ecx
+
+#ifndef USE_AS_WMEMCMP
+ PUSH (%ebx)
+ jb L(less8bytes)
+#else
+ jb L(less8bytes)
+ PUSH (%ebx)
+#endif
+
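+/* Point both pointers at the end of the buffers; the per-length
+   handlers reached through the jump table compare with negative
+   offsets. */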
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %bl
+ cmpb (%edx), %bl
+ jne L(nonzero)
+
+ mov 1(%eax), %bl
+ cmpb 1(%edx), %bl
+ jne L(nonzero)
+
+ cmp $2, %ecx
+ jz L(0bytes)
+
+ mov 2(%eax), %bl
+ cmpb 2(%edx), %bl
+ jne L(nonzero)
+
+ cmp $3, %ecx
+ jz L(0bytes)
+
+ mov 3(%eax), %bl
+ cmpb 3(%edx), %bl
+ jne L(nonzero)
+
+ cmp $4, %ecx
+ jz L(0bytes)
+
+ mov 4(%eax), %bl
+ cmpb 4(%edx), %bl
+ jne L(nonzero)
+
+ cmp $5, %ecx
+ jz L(0bytes)
+
+ mov 5(%eax), %bl
+ cmpb 5(%edx), %bl
+ jne L(nonzero)
+
+ cmp $6, %ecx
+ jz L(0bytes)
+
+ mov 6(%eax), %bl
+ cmpb 6(%edx), %bl
+ je L(0bytes)
+
+L(nonzero):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(above)
+ neg %eax
+L(above):
+ ret
+ CFI_PUSH (%ebx)
+#endif
+
+ .p2align 4
+L(0bytes):
+ POP (%ebx)
+ xor %eax, %eax
+ ret
+
+#ifdef USE_AS_WMEMCMP
+
+/* for wmemcmp, case N == 1 */
+
+ .p2align 4
+L(less8bytes):
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ je L(return0)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+ .p2align 4
+L(return0):
+ xor %eax, %eax
+ ret
+#endif
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less1bytes):
+ jb L(0bytesend)
+ movzbl (%eax), %eax
+ movzbl (%edx), %edx
+ sub %edx, %eax
+ ret
+
+ .p2align 4
+L(0bytesend):
+ xor %eax, %eax
+ ret
+#endif
+ .p2align 4
+L(64bytesormore):
+ PUSH (%ebx)
+ mov %ecx, %ebx
+ mov $64, %ecx
+ sub $64, %ebx
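+/* Compare 64 bytes per iteration.  %xmm0 is zero, so after the pxor
+   ptest sets CF only when the blocks are equal; jnc branches on the
+   first difference. */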
+L(64bytesormore_loop):
+ movdqu (%eax), %xmm1
+ movdqu (%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_16diff)
+
+ movdqu 16(%eax), %xmm1
+ movdqu 16(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_32diff)
+
+ movdqu 32(%eax), %xmm1
+ movdqu 32(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_48diff)
+
+ movdqu 48(%eax), %xmm1
+ movdqu 48(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(find_64diff)
+ add %ecx, %eax
+ add %ecx, %edx
+ sub %ecx, %ebx
+ jae L(64bytesormore_loop)
+ add %ebx, %ecx
+ add %ecx, %edx
+ add %ecx, %eax
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4)
+
+#ifdef USE_AS_WMEMCMP
+
+/* This label exists only so table_64bytes can be filled. */
+L(unreal_case):
+/* no code here */
+
+#endif
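+/* The fall-through chain below rewinds %ecx so that %eax and %edx end
+   up pointing just past the 16-byte block that differed. */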
+ .p2align 4
+L(find_16diff):
+ sub $16, %ecx
+L(find_32diff):
+ sub $16, %ecx
+L(find_48diff):
+ sub $16, %ecx
+L(find_64diff):
+ add %ecx, %edx
+ add %ecx, %eax
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#else
+ .p2align 4
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ cmp -4(%edx), %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#endif
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(49bytes):
+ movdqu -49(%eax), %xmm1
+ movdqu -49(%edx), %xmm2
+ mov $-49, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%eax), %xmm1
+ movdqu -33(%edx), %xmm2
+ mov $-33, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%eax), %ecx
+ mov -17(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(13bytes):
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(9bytes):
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(5bytes):
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(50bytes):
+ mov $-50, %ebx
+ movdqu -50(%eax), %xmm1
+ movdqu -50(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ mov $-34, %ebx
+ movdqu -34(%eax), %xmm1
+ movdqu -34(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%eax), %ecx
+ mov -18(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(14bytes):
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(10bytes):
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(6bytes):
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(2bytes):
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(51bytes):
+ mov $-51, %ebx
+ movdqu -51(%eax), %xmm1
+ movdqu -51(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ mov $-35, %ebx
+ movdqu -35(%eax), %xmm1
+ movdqu -35(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ movl -19(%eax), %ecx
+ movl -19(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(15bytes):
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(11bytes):
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(7bytes):
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+L(3bytes):
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+L(1bytes):
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(52bytes):
+ movdqu -52(%eax), %xmm1
+ movdqu -52(%edx), %xmm2
+ mov $-52, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%eax), %xmm1
+ movdqu -36(%edx), %xmm2
+ mov $-36, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%eax), %xmm1
+ movdqu -20(%edx), %xmm2
+ mov $-20, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(53bytes):
+ movdqu -53(%eax), %xmm1
+ movdqu -53(%edx), %xmm2
+ mov $-53, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ mov $-37, %ebx
+ movdqu -37(%eax), %xmm1
+ movdqu -37(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ mov $-21, %ebx
+ movdqu -21(%eax), %xmm1
+ movdqu -21(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(54bytes):
+ movdqu -54(%eax), %xmm1
+ movdqu -54(%edx), %xmm2
+ mov $-54, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ mov $-38, %ebx
+ movdqu -38(%eax), %xmm1
+ movdqu -38(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ mov $-22, %ebx
+ movdqu -22(%eax), %xmm1
+ movdqu -22(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(55bytes):
+ movdqu -55(%eax), %xmm1
+ movdqu -55(%edx), %xmm2
+ mov $-55, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ mov $-39, %ebx
+ movdqu -39(%eax), %xmm1
+ movdqu -39(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ mov $-23, %ebx
+ movdqu -23(%eax), %xmm1
+ movdqu -23(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(56bytes):
+ movdqu -56(%eax), %xmm1
+ movdqu -56(%edx), %xmm2
+ mov $-56, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ mov $-40, %ebx
+ movdqu -40(%eax), %xmm1
+ movdqu -40(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ mov $-24, %ebx
+ movdqu -24(%eax), %xmm1
+ movdqu -24(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(57bytes):
+ movdqu -57(%eax), %xmm1
+ movdqu -57(%edx), %xmm2
+ mov $-57, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ mov $-41, %ebx
+ movdqu -41(%eax), %xmm1
+ movdqu -41(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ mov $-25, %ebx
+ movdqu -25(%eax), %xmm1
+ movdqu -25(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(58bytes):
+ movdqu -58(%eax), %xmm1
+ movdqu -58(%edx), %xmm2
+ mov $-58, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ mov $-42, %ebx
+ movdqu -42(%eax), %xmm1
+ movdqu -42(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ mov $-26, %ebx
+ movdqu -26(%eax), %xmm1
+ movdqu -26(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(59bytes):
+ movdqu -59(%eax), %xmm1
+ movdqu -59(%edx), %xmm2
+ mov $-59, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ mov $-43, %ebx
+ movdqu -43(%eax), %xmm1
+ movdqu -43(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ mov $-27, %ebx
+ movdqu -27(%eax), %xmm1
+ movdqu -27(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+ .p2align 4
+L(60bytes):
+ movdqu -60(%eax), %xmm1
+ movdqu -60(%edx), %xmm2
+ mov $-60, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ mov $-44, %ebx
+ movdqu -44(%eax), %xmm1
+ movdqu -44(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ mov $-28, %ebx
+ movdqu -28(%eax), %xmm1
+ movdqu -28(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -12(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(61bytes):
+ movdqu -61(%eax), %xmm1
+ movdqu -61(%edx), %xmm2
+ mov $-61, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ mov $-45, %ebx
+ movdqu -45(%eax), %xmm1
+ movdqu -45(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ mov $-29, %ebx
+ movdqu -29(%eax), %xmm1
+ movdqu -29(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%eax), %ecx
+ mov -13(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -9(%eax), %ecx
+ mov -9(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov -5(%eax), %ecx
+ mov -5(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzbl -1(%eax), %ecx
+ cmp -1(%edx), %cl
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(62bytes):
+ movdqu -62(%eax), %xmm1
+ movdqu -62(%edx), %xmm2
+ mov $-62, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ mov $-46, %ebx
+ movdqu -46(%eax), %xmm1
+ movdqu -46(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ mov $-30, %ebx
+ movdqu -30(%eax), %xmm1
+ movdqu -30(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%eax), %ecx
+ mov -14(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -10(%eax), %ecx
+ mov -10(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ mov -6(%eax), %ecx
+ mov -6(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -2(%eax), %ecx
+ movzwl -2(%edx), %ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bh, %ch
+ mov $0, %eax
+ jne L(end)
+ RETURN
+
+ .p2align 4
+L(63bytes):
+ movdqu -63(%eax), %xmm1
+ movdqu -63(%edx), %xmm2
+ mov $-63, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ mov $-47, %ebx
+ movdqu -47(%eax), %xmm1
+ movdqu -47(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ mov $-31, %ebx
+ movdqu -31(%eax), %xmm1
+ movdqu -31(%edx), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ movl -15(%eax), %ecx
+ movl -15(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -11(%eax), %ecx
+ movl -11(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movl -7(%eax), %ecx
+ movl -7(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+ movzwl -3(%eax), %ecx
+ movzwl -3(%edx), %ebx
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ movzbl -1(%eax), %eax
+ cmpb -1(%edx), %al
+ mov $0, %eax
+ jne L(end)
+ RETURN
+#endif
+
+ .p2align 4
+L(64bytes):
+ movdqu -64(%eax), %xmm1
+ movdqu -64(%edx), %xmm2
+ mov $-64, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%eax), %xmm1
+ movdqu -48(%edx), %xmm2
+ mov $-48, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%eax), %xmm1
+ movdqu -32(%edx), %xmm2
+ mov $-32, %ebx
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -16(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -16(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -12(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -12(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -12(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -8(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -8(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -8(%edx), %ecx
+#endif
+ jne L(find_diff)
+
+ mov -4(%eax), %ecx
+#ifndef USE_AS_WMEMCMP
+ mov -4(%edx), %ebx
+ cmp %ebx, %ecx
+#else
+ cmp -4(%edx), %ecx
+#endif
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ mov (%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ mov 4(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ mov 8(%edx), %ebx
+ cmp %ebx, %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ mov 12(%edx), %ebx
+ cmp %ebx, %ecx
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#else
+ .p2align 4
+L(less16bytes):
+ add %ebx, %eax
+ add %ebx, %edx
+
+ mov (%eax), %ecx
+ cmp (%edx), %ecx
+ jne L(find_diff)
+
+ mov 4(%eax), %ecx
+ cmp 4(%edx), %ecx
+ jne L(find_diff)
+
+ mov 8(%eax), %ecx
+ cmp 8(%edx), %ecx
+ jne L(find_diff)
+
+ mov 12(%eax), %ecx
+ cmp 12(%edx), %ecx
+
+ mov $0, %eax
+ jne L(find_diff)
+ RETURN
+#endif
+
+ .p2align 4
+L(find_diff):
+#ifndef USE_AS_WMEMCMP
+ cmpb %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+ jne L(end)
+ shr $16,%ecx
+ shr $16,%ebx
+ cmp %bl, %cl
+ jne L(end)
+ cmp %bx, %cx
+L(end):
+ POP (%ebx)
+ mov $1, %eax
+ ja L(bigger)
+ neg %eax
+L(bigger):
+ ret
+#else
+ POP (%ebx)
+ mov $1, %eax
+ jg L(bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(bigger):
+ ret
+#endif
+END (MEMCMP)
+
+ .section .rodata.sse4.2,"a",@progbits
+ .p2align 2
+ .type L(table_64bytes), @object
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+#else
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+#endif
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
old mode 100644
new mode 100755
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
index 9d0a563..2c350bb
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86/silvermont/string/sse4-wmemcmp-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
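+/* wmemcmp reuses the memcmp implementation: USE_AS_WMEMCMP switches it
+   to 4-byte elements with signed comparison. */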
+#define USE_AS_WMEMCMP
+#define MEMCMP wmemcmp
+#include "sse4-memcmp-slm.S"
diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk
index a1d55f0..34da0ce 100644
--- a/libc/arch-x86/x86.mk
+++ b/libc/arch-x86/x86.mk
@@ -32,60 +32,15 @@
arch-x86/bionic/syscall.S \
arch-x86/bionic/vfork.S \
-ifeq ($(ARCH_X86_HAVE_SSSE3),true)
-libc_bionic_src_files_x86 += \
- arch-x86/string/ssse3-memcpy-atom.S \
- arch-x86/string/ssse3-memmove-atom.S \
- arch-x86/string/ssse3-bcopy-atom.S \
- arch-x86/string/ssse3-strncat-atom.S \
- arch-x86/string/ssse3-strncpy-atom.S \
- arch-x86/string/ssse3-strlcat-atom.S \
- arch-x86/string/ssse3-strlcpy-atom.S \
- arch-x86/string/ssse3-strcmp-atom.S \
- arch-x86/string/ssse3-strncmp-atom.S \
- arch-x86/string/ssse3-strcat-atom.S \
- arch-x86/string/ssse3-strcpy-atom.S \
- arch-x86/string/ssse3-memcmp-atom.S \
- arch-x86/string/ssse3-wmemcmp-atom.S \
- arch-x86/string/ssse3-memcmp16-atom.S \
- arch-x86/string/ssse3-wcscat-atom.S \
- arch-x86/string/ssse3-wcscpy-atom.S
-else
-libc_bionic_src_files_x86 += \
- arch-x86/string/memcpy.S \
- arch-x86/string/memmove.S \
- arch-x86/string/bcopy.S \
- arch-x86/string/strcmp.S \
- arch-x86/string/strncmp.S \
- arch-x86/string/strcat.S \
- arch-x86/string/memcmp.S \
- bionic/__memcmp16.cpp \
- upstream-freebsd/lib/libc/string/wcscpy.c \
- upstream-freebsd/lib/libc/string/wcscat.c \
- upstream-freebsd/lib/libc/string/wmemcmp.c \
- upstream-openbsd/lib/libc/string/strcpy.c \
- upstream-openbsd/lib/libc/string/strlcat.c \
- upstream-openbsd/lib/libc/string/strlcpy.c \
- upstream-openbsd/lib/libc/string/strncat.c \
- upstream-openbsd/lib/libc/string/strncpy.c \
-
+## Arch-variant-specific source files
+arch_variant_mk := $(LOCAL_PATH)/arch-x86/$(TARGET_ARCH_VARIANT)/$(TARGET_ARCH_VARIANT).mk
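+# Fall back to the generic variant when no makefile exists for
+# $(TARGET_ARCH_VARIANT).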
+ifeq ($(wildcard $(arch_variant_mk)),)
+ arch_variant_mk := $(LOCAL_PATH)/arch-x86/generic/generic.mk
endif
+include $(arch_variant_mk)
+libc_common_additional_dependencies += $(arch_variant_mk)
-libc_bionic_src_files_x86 += \
- arch-x86/string/sse2-memset-atom.S \
- arch-x86/string/sse2-bzero-atom.S \
- arch-x86/string/sse2-memchr-atom.S \
- arch-x86/string/sse2-memrchr-atom.S \
- arch-x86/string/sse2-strchr-atom.S \
- arch-x86/string/sse2-strrchr-atom.S \
- arch-x86/string/sse2-index-atom.S \
- arch-x86/string/sse2-strlen-atom.S \
- arch-x86/string/sse2-strnlen-atom.S \
- arch-x86/string/sse2-wcschr-atom.S \
- arch-x86/string/sse2-wcsrchr-atom.S \
- arch-x86/string/sse2-wcslen-atom.S \
- arch-x86/string/sse2-wcscmp-atom.S \
-
+arch_variant_mk :=
libc_crt_target_cflags_x86 := \
-m32 \
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/cache.h
similarity index 88%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/cache.h
index 9d0a563..38acc6e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/cache.h
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
/* Values are optimized for Silvermont */
#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-bcopy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-bcopy-slm.S
index 9d0a563..effab0e 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-bcopy-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
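+/* bcopy maps onto memmove with swapped source and destination
+   arguments (USE_AS_BCOPY in sse2-memmove-slm.S). */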
+#define USE_AS_BCOPY
+#define MEMMOVE bcopy
+#include "sse2-memmove-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-bzero-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-bzero-slm.S
index 9d0a563..446ea5b 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-bzero-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
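+/* bzero maps onto memset; USE_AS_BZERO_P selects the zero-fill
+   variant in sse2-memset-slm.S. */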
+#define USE_AS_BZERO_P
+#define MEMSET bzero
+#include "sse2-memset-slm.S"
diff --git a/libc/arch-x86_64/string/sse2-memcpy-slm.S b/libc/arch-x86_64/string/sse2-memcpy-slm.S
new file mode 100644
index 0000000..4c30fb6
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memcpy-slm.S
@@ -0,0 +1,299 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) push REG;
+#define POP(REG) pop REG;
+
+#define ENTRANCE PUSH (%rbx);
+#define RETURN_END POP (%rbx); ret
+#define RETURN RETURN_END;
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMCPY)
+ ENTRANCE
+ cmp %rsi, %rdi
+ je L(return)
+
+ cmp $16, %rdx
+ jbe L(len_0_16_bytes)
+
+ cmp $SHARED_CACHE_SIZE_HALF, %rdx
+ jae L(large_page)
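+/* Copies of at least half the shared cache size take the
+   non-temporal-store path to avoid displacing useful cache lines. */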
+
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi, %rdx), %xmm1
+ cmp $32, %rdx
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jbe L(return)
+
+ movdqu 16(%rsi), %xmm0
+ movdqu -32(%rsi, %rdx), %xmm1
+ cmp $64, %rdx
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm1, -32(%rdi, %rdx)
+ jbe L(return)
+
+ movdqu 32(%rsi), %xmm0
+ movdqu 48(%rsi), %xmm1
+ movdqu -48(%rsi, %rdx), %xmm2
+ movdqu -64(%rsi, %rdx), %xmm3
+ cmp $128, %rdx
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm1, 48(%rdi)
+ movdqu %xmm2, -48(%rdi, %rdx)
+ movdqu %xmm3, -64(%rdi, %rdx)
+ jbe L(return)
+
+/* Now the main loop: we align the address of the destination. */
+ lea 64(%rdi), %r8
+ and $-64, %r8
+
+ add %rdi, %rdx
+ and $-64, %rdx
+
+ sub %rdi, %rsi
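+/* %rsi now holds src - dst, so (%r8, %rsi) addresses the source bytes
+   that correspond to the aligned destination pointer %r8. */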
+
+/* Stop two iterations before the end so the prefetch in the main loop
+   never reads past the buffer. */
+ sub $64, %rdx
+ cmp %r8, %rdx
+ je L(main_loop_just_one_iteration)
+
+ sub $64, %rdx
+ cmp %r8, %rdx
+ je L(main_loop_last_two_iterations)
+
+
+ .p2align 4
+L(main_loop_cache):
+
+ prefetcht0 128(%r8, %rsi)
+
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movdqa %xmm0, (%r8)
+ movdqa %xmm1, 16(%r8)
+ movdqa %xmm2, 32(%r8)
+ movdqa %xmm3, 48(%r8)
+ lea 64(%r8), %r8
+ cmp %r8, %rdx
+ jne L(main_loop_cache)
+
+L(main_loop_last_two_iterations):
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movdqu 64(%r8, %rsi), %xmm4
+ movdqu 80(%r8, %rsi), %xmm5
+ movdqu 96(%r8, %rsi), %xmm6
+ movdqu 112(%r8, %rsi), %xmm7
+ movdqa %xmm0, (%r8)
+ movdqa %xmm1, 16(%r8)
+ movdqa %xmm2, 32(%r8)
+ movdqa %xmm3, 48(%r8)
+ movdqa %xmm4, 64(%r8)
+ movdqa %xmm5, 80(%r8)
+ movdqa %xmm6, 96(%r8)
+ movdqa %xmm7, 112(%r8)
+ jmp L(return)
+
+L(main_loop_just_one_iteration):
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movdqa %xmm0, (%r8)
+ movdqa %xmm1, 16(%r8)
+ movdqa %xmm2, 32(%r8)
+ movdqa %xmm3, 48(%r8)
+ jmp L(return)
+
+L(large_page):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqu -64(%rsi, %rdx), %xmm4
+ movdqu -48(%rsi, %rdx), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm6
+ movdqu -16(%rsi, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+
+ movdqu 64(%rsi), %xmm0
+ movdqu 80(%rsi), %xmm1
+ movdqu 96(%rsi), %xmm2
+ movdqu 112(%rsi), %xmm3
+ movdqu -128(%rsi, %rdx), %xmm4
+ movdqu -112(%rsi, %rdx), %xmm5
+ movdqu -96(%rsi, %rdx), %xmm6
+ movdqu -80(%rsi, %rdx), %xmm7
+ movdqu %xmm0, 64(%rdi)
+ movdqu %xmm1, 80(%rdi)
+ movdqu %xmm2, 96(%rdi)
+ movdqu %xmm3, 112(%rdi)
+ movdqu %xmm4, -128(%rdi, %rdx)
+ movdqu %xmm5, -112(%rdi, %rdx)
+ movdqu %xmm6, -96(%rdi, %rdx)
+ movdqu %xmm7, -80(%rdi, %rdx)
+
+/* Now the main loop with non-temporal stores.  We align
+   the address of the destination. */
+ lea 128(%rdi), %r8
+ and $-128, %r8
+
+ add %rdi, %rdx
+ and $-128, %rdx
+
+ sub %rdi, %rsi
+
+ .p2align 4
+L(main_loop_large_page):
+ movdqu (%r8, %rsi), %xmm0
+ movdqu 16(%r8, %rsi), %xmm1
+ movdqu 32(%r8, %rsi), %xmm2
+ movdqu 48(%r8, %rsi), %xmm3
+ movdqu 64(%r8, %rsi), %xmm4
+ movdqu 80(%r8, %rsi), %xmm5
+ movdqu 96(%r8, %rsi), %xmm6
+ movdqu 112(%r8, %rsi), %xmm7
+ movntdq %xmm0, (%r8)
+ movntdq %xmm1, 16(%r8)
+ movntdq %xmm2, 32(%r8)
+ movntdq %xmm3, 48(%r8)
+ movntdq %xmm4, 64(%r8)
+ movntdq %xmm5, 80(%r8)
+ movntdq %xmm6, 96(%r8)
+ movntdq %xmm7, 112(%r8)
+ lea 128(%r8), %r8
+ cmp %r8, %rdx
+ jne L(main_loop_large_page)
+ sfence
+ jmp L(return)
+
+L(len_0_16_bytes):
+ testb $24, %dl
+ jne L(len_9_16_bytes)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(len_5_8_bytes)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%rsi), %ebx
+ testb $2, %dl
+ movb %bl, (%rdi)
+ je L(return)
+ movzwl -2(%rsi,%rdx), %ebx
+ movw %bx, -2(%rdi,%rdx)
+ jmp L(return)
+
+L(len_9_16_bytes):
+ movq (%rsi), %xmm0
+ movq -8(%rsi, %rdx), %xmm1
+ movq %xmm0, (%rdi)
+ movq %xmm1, -8(%rdi, %rdx)
+ jmp L(return)
+
+L(len_5_8_bytes):
+ movl (%rsi), %ebx
+ movl %ebx, (%rdi)
+ movl -4(%rsi,%rdx), %ebx
+ movl %ebx, -4(%rdi,%rdx)
+ jmp L(return)
+
+L(return):
+ mov %rdi, %rax
+ RETURN
+
+END (MEMCPY)
diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/string/sse2-memmove-slm.S
new file mode 100644
index 0000000..ee8440e
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memmove-slm.S
@@ -0,0 +1,635 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) push REG;
+#define POP(REG) pop REG;
+
+#define ENTRANCE PUSH (%rbx);
+#define RETURN_END POP (%rbx); ret
+#define RETURN RETURN_END;
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMMOVE)
+ ENTRANCE
+#ifdef USE_AS_BCOPY
+ xchg %rsi, %rdi
+#endif
+ mov %rdi, %rax
+
+/* Check whether we should copy backward or forward. */
+ cmp %rsi, %rdi
+ je L(mm_return)
+ ja L(mm_len_0_or_more_backward)
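+/* dst < src is always safe to copy front to back; when dst > src an
+   overlapping forward copy could read bytes already overwritten, so the
+   backward path handles that case. */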
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+ separately. */
+ cmp $16, %rdx
+ jbe L(mm_len_0_16_bytes_forward)
+
+ cmp $32, %rdx
+ jg L(mm_len_32_or_more_forward)
+
+/* Copy [0..32] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_forward):
+ cmp $64, %rdx
+ jg L(mm_len_64_or_more_forward)
+
+/* Copy [0..64] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu -16(%rsi, %rdx), %xmm2
+ movdqu -32(%rsi, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, -16(%rdi, %rdx)
+ movdqu %xmm3, -32(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_forward):
+ cmp $128, %rdx
+ jg L(mm_len_128_or_more_forward)
+
+/* Copy [0..128] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqu -64(%rsi, %rdx), %xmm4
+ movdqu -48(%rsi, %rdx), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm6
+ movdqu -16(%rsi, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_forward):
+
+ cmp $SHARED_CACHE_SIZE_HALF, %rdx
+ jae L(mm_large_page_forward)
+
+ mov %rsi, %r8 // copy src to r8
+ mov %rdi, %r9 // copy dst to r9
+
+/* Align the destination address: copy the first (unaligned) 64 bytes
+   separately, then continue with 64-byte-aligned stores. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+
+ lea 64(%r9), %rdi
+ and $-64, %rdi /* rdi now aligned to next 64 byte boundary */
+
+ sub %r9, %rsi /* rsi = src - dst = diff */
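+/* From here on %rsi holds the constant (src - dst): a single advancing
+   pointer addresses both buffers, with (%rdi, %rsi) reading the source
+   byte that corresponds to the destination byte at (%rdi). */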
+
+ movdqu (%rdi, %rsi), %xmm4
+ movdqu 16(%rdi, %rsi), %xmm5
+ movdqu 32(%rdi, %rsi), %xmm6
+ movdqu 48(%rdi, %rsi), %xmm7
+
+ movdqu %xmm0, (%r9)
+ movdqu %xmm1, 16(%r9)
+ movdqu %xmm2, 32(%r9)
+ movdqu %xmm3, 48(%r9)
+ movdqa %xmm4, (%rdi)
+ movdqa %xmm5, 16(%rdi)
+ movdqa %xmm6, 32(%rdi)
+ movdqa %xmm7, 48(%rdi)
+ add $64, %rdi
+
+ lea (%r9, %rdx), %rbx
+ and $-64, %rbx
+
+ cmp %rdi, %rbx
+ jbe L(mm_copy_remaining_forward)
+
+ .p2align 4
+L(mm_main_loop_forward):
+
+ prefetcht0 128(%rdi, %rsi)
+
+ movdqu (%rdi, %rsi), %xmm0
+ movdqu 16(%rdi, %rsi), %xmm1
+ movdqu 32(%rdi, %rsi), %xmm2
+ movdqu 48(%rdi, %rsi), %xmm3
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 16(%rdi)
+ movdqa %xmm2, 32(%rdi)
+ movdqa %xmm3, 48(%rdi)
+ lea 64(%rdi), %rdi
+ cmp %rdi, %rbx
+ ja L(mm_main_loop_forward)
+
+L(mm_copy_remaining_forward):
+ add %r9, %rdx
+ sub %rdi, %rdx
+/* Everything up to %rdi in the destination has been copied.
+   %rdx now holds the number of bytes still to copy.
+   Advance %r8 to the matching source position. */
+ lea (%rdi, %rsi), %r8
+
+L(mm_remaining_0_64_bytes_forward):
+ cmp $32, %rdx
+ ja L(mm_remaining_33_64_bytes_forward)
+ cmp $16, %rdx
+ ja L(mm_remaining_17_32_bytes_forward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+
+ cmpb $8, %dl
+ ja L(mm_remaining_9_16_bytes_forward)
+ cmpb $4, %dl
+ .p2align 4,,5
+ ja L(mm_remaining_5_8_bytes_forward)
+ cmpb $2, %dl
+ .p2align 4,,1
+ ja L(mm_remaining_3_4_bytes_forward)
+ movzbl -1(%r8,%rdx), %esi
+ movzbl (%r8), %ebx
+ movb %sil, -1(%rdi,%rdx)
+ movb %bl, (%rdi)
+ jmp L(mm_return)
+
+L(mm_remaining_33_64_bytes_forward):
+ movdqu (%r8), %xmm0
+ movdqu 16(%r8), %xmm1
+ movdqu -32(%r8, %rdx), %xmm2
+ movdqu -16(%r8, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, -32(%rdi, %rdx)
+ movdqu %xmm3, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_17_32_bytes_forward):
+ movdqu (%r8), %xmm0
+ movdqu -16(%r8, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_3_4_bytes_forward):
+ movzwl -2(%r8,%rdx), %esi
+ movzwl (%r8), %ebx
+ movw %si, -2(%rdi,%rdx)
+ movw %bx, (%rdi)
+ jmp L(mm_return)
+
+L(mm_remaining_5_8_bytes_forward):
+ movl (%r8), %esi
+ movl -4(%r8,%rdx), %ebx
+ movl %esi, (%rdi)
+ movl %ebx, -4(%rdi,%rdx)
+ jmp L(mm_return)
+
+L(mm_remaining_9_16_bytes_forward):
+ mov (%r8), %rsi
+ mov -8(%r8, %rdx), %rbx
+ mov %rsi, (%rdi)
+ mov %rbx, -8(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_0_16_bytes_forward):
+ testb $24, %dl
+ jne L(mm_len_9_16_bytes_forward)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(mm_len_5_8_bytes_forward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %dl
+ .p2align 4,,1
+ jne L(mm_len_2_4_bytes_forward)
+ movzbl -1(%rsi,%rdx), %ebx
+ movzbl (%rsi), %esi
+ movb %bl, -1(%rdi,%rdx)
+ movb %sil, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_2_4_bytes_forward):
+ movzwl -2(%rsi,%rdx), %ebx
+ movzwl (%rsi), %esi
+ movw %bx, -2(%rdi,%rdx)
+ movw %si, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_5_8_bytes_forward):
+ movl (%rsi), %ebx
+ movl -4(%rsi,%rdx), %esi
+ movl %ebx, (%rdi)
+ movl %esi, -4(%rdi,%rdx)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_forward):
+ mov (%rsi), %rbx
+ mov -8(%rsi, %rdx), %rsi
+ mov %rbx, (%rdi)
+ mov %rsi, -8(%rdi, %rdx)
+ jmp L(mm_return)
+
+/* The code for copying backwards. */
+L(mm_len_0_or_more_backward):
+
+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128]
+ separately. */
+ cmp $16, %rdx
+ jbe L(mm_len_0_16_bytes_backward)
+
+ cmp $32, %rdx
+ jg L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi, %rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_32_or_more_backward):
+ cmp $64, %rdx
+ jg L(mm_len_64_or_more_backward)
+
+/* Copy [0..64] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu -16(%rsi, %rdx), %xmm2
+ movdqu -32(%rsi, %rdx), %xmm3
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, -16(%rdi, %rdx)
+ movdqu %xmm3, -32(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_64_or_more_backward):
+ cmp $128, %rdx
+ jg L(mm_len_128_or_more_backward)
+
+/* Copy [0..128] and return. */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+ movdqu -64(%rsi, %rdx), %xmm4
+ movdqu -48(%rsi, %rdx), %xmm5
+ movdqu -32(%rsi, %rdx), %xmm6
+ movdqu -16(%rsi, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+L(mm_len_128_or_more_backward):
+
+ cmp $SHARED_CACHE_SIZE_HALF, %rdx
+ jae L(mm_large_page_backward)
+
+/* Align the destination address. Save the last 64 bytes of the
+   source first so that they cannot be overwritten. */
+ movdqu -16(%rsi, %rdx), %xmm0
+ movdqu -32(%rsi, %rdx), %xmm1
+ movdqu -48(%rsi, %rdx), %xmm2
+ movdqu -64(%rsi, %rdx), %xmm3
+
+ lea (%rdi, %rdx), %r9
+ and $-64, %r9 /* r9 = aligned dst */
+
+ mov %rsi, %r8
+ sub %rdi, %r8 /* r8 = src - dst, diff */
+
+ movdqu -16(%r9, %r8), %xmm4
+ movdqu -32(%r9, %r8), %xmm5
+ movdqu -48(%r9, %r8), %xmm6
+ movdqu -64(%r9, %r8), %xmm7
+
+ movdqu %xmm0, -16(%rdi, %rdx)
+ movdqu %xmm1, -32(%rdi, %rdx)
+ movdqu %xmm2, -48(%rdi, %rdx)
+ movdqu %xmm3, -64(%rdi, %rdx)
+ movdqa %xmm4, -16(%r9)
+ movdqa %xmm5, -32(%r9)
+ movdqa %xmm6, -48(%r9)
+ movdqa %xmm7, -64(%r9)
+ lea -64(%r9), %r9
+
+ lea 64(%rdi), %rbx
+ and $-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+ the main loop stops. */
+ mov %rbx, %rdx
+ sub %rdi, %rdx
+
+ cmp %r9, %rbx
+ jb L(mm_main_loop_backward)
+ jmp L(mm_len_0_or_more_backward)
+
+ .p2align 4
+L(mm_main_loop_backward):
+
+ prefetcht0 -128(%r9, %r8)
+
+ movdqu -64(%r9, %r8), %xmm0
+ movdqu -48(%r9, %r8), %xmm1
+ movdqu -32(%r9, %r8), %xmm2
+ movdqu -16(%r9, %r8), %xmm3
+ movdqa %xmm0, -64(%r9)
+ movdqa %xmm1, -48(%r9)
+ movdqa %xmm2, -32(%r9)
+ movdqa %xmm3, -16(%r9)
+ lea -64(%r9), %r9
+ cmp %r9, %rbx
+ jb L(mm_main_loop_backward)
+ jmp L(mm_len_0_or_more_backward)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+ testb $24, %dl
+ jnz L(mm_len_9_16_bytes_backward)
+ testb $4, %dl
+ .p2align 4,,5
+ jnz L(mm_len_5_8_bytes_backward)
+ test %rdx, %rdx
+ .p2align 4,,2
+ je L(mm_return)
+ testb $2, %dl
+ .p2align 4,,1
+ jne L(mm_len_3_4_bytes_backward)
+ movzbl -1(%rsi,%rdx), %ebx
+ movzbl (%rsi), %ecx
+ movb %bl, -1(%rdi,%rdx)
+ movb %cl, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+ movzwl -2(%rsi,%rdx), %ebx
+ movzwl (%rsi), %ecx
+ movw %bx, -2(%rdi,%rdx)
+ movw %cx, (%rdi)
+ jmp L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+ movl -4(%rsi,%rdx), %ebx
+ movl -8(%rsi,%rdx), %ecx
+ movl %ebx, -4(%rdi,%rdx)
+ movl %ecx, -8(%rdi,%rdx)
+ sub $8, %rdx
+ jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+ movl (%rsi), %ebx
+ movl -4(%rsi,%rdx), %ecx
+ movl %ebx, (%rdi)
+ movl %ecx, -4(%rdi,%rdx)
+
+L(mm_return):
+ RETURN
+
+/* Forward copy path for big lengths. */
+
+L(mm_large_page_forward):
+/* Align the destination address. Save the first 64 bytes of the
+   source first so that they cannot be overwritten. */
+
+ mov %rsi, %r8
+ mov %rdi, %r9
+
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ movdqu 48(%rsi), %xmm3
+
+ lea 64(%r9), %rdi
+ and $-64, %rdi /* rdi = aligned dst */
+
+ sub %r9, %rsi /* rsi = diff */
+
+ movdqu (%rdi, %rsi), %xmm4
+ movdqu 16(%rdi, %rsi), %xmm5
+ movdqu 32(%rdi, %rsi), %xmm6
+ movdqu 48(%rdi, %rsi), %xmm7
+
+ movdqu %xmm0, (%r9)
+ movdqu %xmm1, 16(%r9)
+ movdqu %xmm2, 32(%r9)
+ movdqu %xmm3, 48(%r9)
+ movntdq %xmm4, (%rdi)
+ movntdq %xmm5, 16(%rdi)
+ movntdq %xmm6, 32(%rdi)
+ movntdq %xmm7, 48(%rdi)
+ add $64, %rdi
+
+ lea (%r9, %rdx), %rbx
+ and $-128, %rbx
+
+ cmp %rdi, %rbx
+ jbe L(mm_copy_remaining_forward)
+
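+/* The movntdq streaming stores below bypass the cache, which pays off for
+   these copies of at least half the shared cache size; the sfence after the
+   loop orders the non-temporal stores before any later stores. */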
+ .p2align 4
+L(mm_large_page_loop_forward):
+ movdqu (%rdi, %rsi), %xmm0
+ movdqu 16(%rdi, %rsi), %xmm1
+ movdqu 32(%rdi, %rsi), %xmm2
+ movdqu 48(%rdi, %rsi), %xmm3
+ movdqu 64(%rdi, %rsi), %xmm4
+ movdqu 80(%rdi, %rsi), %xmm5
+ movdqu 96(%rdi, %rsi), %xmm6
+ movdqu 112(%rdi, %rsi), %xmm7
+ movntdq %xmm0, (%rdi)
+ movntdq %xmm1, 16(%rdi)
+ movntdq %xmm2, 32(%rdi)
+ movntdq %xmm3, 48(%rdi)
+ movntdq %xmm4, 64(%rdi)
+ movntdq %xmm5, 80(%rdi)
+ movntdq %xmm6, 96(%rdi)
+ movntdq %xmm7, 112(%rdi)
+ lea 128(%rdi), %rdi
+ cmp %rdi, %rbx
+ ja L(mm_large_page_loop_forward)
+ sfence
+
+ add %r9, %rdx
+ sub %rdi, %rdx
+/* Everything up to %rdi in the destination has been copied.
+   %rdx now holds the number of bytes still to copy.
+   Advance %r8 to the matching source position. */
+ lea (%rdi, %rsi), %r8
+
+ cmp $64, %rdx
+ jb L(mm_remaining_0_64_bytes_forward)
+
+ movdqu (%r8), %xmm0
+ movdqu 16(%r8), %xmm1
+ movdqu 32(%r8), %xmm2
+ movdqu 48(%r8), %xmm3
+ movdqu -64(%r8, %rdx), %xmm4
+ movdqu -48(%r8, %rdx), %xmm5
+ movdqu -32(%r8, %rdx), %xmm6
+ movdqu -16(%r8, %rdx), %xmm7
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, 16(%rdi)
+ movdqu %xmm2, 32(%rdi)
+ movdqu %xmm3, 48(%rdi)
+ movdqu %xmm4, -64(%rdi, %rdx)
+ movdqu %xmm5, -48(%rdi, %rdx)
+ movdqu %xmm6, -32(%rdi, %rdx)
+ movdqu %xmm7, -16(%rdi, %rdx)
+ jmp L(mm_return)
+
+
+/* Backward copy path for big lengths. */
+L(mm_large_page_backward):
+/* Align the destination address. Save the last 64 bytes of the
+   source first so that they cannot be overwritten. */
+
+ movdqu -16(%rsi, %rdx), %xmm0
+ movdqu -32(%rsi, %rdx), %xmm1
+ movdqu -48(%rsi, %rdx), %xmm2
+ movdqu -64(%rsi, %rdx), %xmm3
+
+ lea (%rdi, %rdx), %r9
+ and $-64, %r9
+
+ mov %rsi, %r8
+ sub %rdi, %r8
+
+ movdqu -16(%r9, %r8), %xmm4
+ movdqu -32(%r9, %r8), %xmm5
+ movdqu -48(%r9, %r8), %xmm6
+ movdqu -64(%r9, %r8), %xmm7
+
+ movdqu %xmm0, -16(%rdi, %rdx)
+ movdqu %xmm1, -32(%rdi, %rdx)
+ movdqu %xmm2, -48(%rdi, %rdx)
+ movdqu %xmm3, -64(%rdi, %rdx)
+ movntdq %xmm4, -16(%r9)
+ movntdq %xmm5, -32(%r9)
+ movntdq %xmm6, -48(%r9)
+ movntdq %xmm7, -64(%r9)
+ lea -64(%r9), %r9
+
+ lea 128(%rdi), %rbx
+ and $-64, %rbx
+
+/* Compute in %rdx how many bytes are left to copy after
+ the main loop stops. */
+ mov %rbx, %rdx
+ sub %rdi, %rdx
+
+ cmp %r9, %rbx
+ jae L(mm_len_0_or_more_backward)
+
+ .p2align 4
+L(mm_large_page_loop_backward):
+ movdqu -64(%r9, %r8), %xmm0
+ movdqu -48(%r9, %r8), %xmm1
+ movdqu -32(%r9, %r8), %xmm2
+ movdqu -16(%r9, %r8), %xmm3
+ movntdq %xmm0, -64(%r9)
+ movntdq %xmm1, -48(%r9)
+ movntdq %xmm2, -32(%r9)
+ movntdq %xmm3, -16(%r9)
+ lea -64(%r9), %r9
+ cmp %r9, %rbx
+ jb L(mm_large_page_loop_backward)
+ jmp L(mm_len_0_or_more_backward)
+
+END (MEMMOVE)
diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/string/sse2-memset-slm.S
new file mode 100644
index 0000000..bfcafae
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-memset-slm.S
@@ -0,0 +1,173 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+ .section .text.sse2,"ax",@progbits
+ENTRY (MEMSET)
+ movq %rdi, %rax
+#ifdef USE_AS_BZERO_P
+ mov %rsi, %rdx
+ xor %rcx, %rcx
+#else
+ and $0xff, %rsi
+ mov $0x0101010101010101, %rcx
+ imul %rsi, %rcx
+#endif
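+/* The fill byte has now been broadcast to all 8 bytes of %rcx: multiplying
+   by 0x0101010101010101 replicates it, e.g. 0xab becomes 0xabababababababab. */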
+ cmpq $16, %rdx
+ jae L(16bytesormore)
+ testb $8, %dl
+ jnz L(8_15bytes)
+ testb $4, %dl
+ jnz L(4_7bytes)
+ testb $2, %dl
+ jnz L(2_3bytes)
+ testb $1, %dl
+ jz L(return)
+ movb %cl, (%rdi)
+L(return):
+ ret
+
+L(8_15bytes):
+ movq %rcx, (%rdi)
+ movq %rcx, -8(%rdi, %rdx)
+ ret
+
+L(4_7bytes):
+ movl %ecx, (%rdi)
+ movl %ecx, -4(%rdi, %rdx)
+ ret
+
+L(2_3bytes):
+ movw %cx, (%rdi)
+ movw %cx, -2(%rdi, %rdx)
+ ret
+
+ ALIGN (4)
+L(16bytesormore):
+#ifdef USE_AS_BZERO_P
+ pxor %xmm0, %xmm0
+#else
+ movd %rcx, %xmm0
+ pshufd $0, %xmm0, %xmm0
+#endif
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm0, -16(%rdi, %rdx)
+ cmpq $32, %rdx
+ jbe L(32bytesless)
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm0, -32(%rdi, %rdx)
+ cmpq $64, %rdx
+ jbe L(64bytesless)
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm0, 48(%rdi)
+ movdqu %xmm0, -64(%rdi, %rdx)
+ movdqu %xmm0, -48(%rdi, %rdx)
+ cmpq $128, %rdx
+ ja L(128bytesmore)
+L(32bytesless):
+L(64bytesless):
+ ret
+
+ ALIGN (4)
+L(128bytesmore):
+ leaq 64(%rdi), %rcx
+ andq $-64, %rcx
+ movq %rdx, %r8
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ cmpq %rcx, %rdx
+ je L(return)
+
+#ifdef SHARED_CACHE_SIZE
+ cmp $SHARED_CACHE_SIZE, %r8
+#else
+ cmp __x86_64_shared_cache_size(%rip), %r8
+#endif
+ ja L(128bytesmore_nt)
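+/* Fills larger than the shared cache size take the non-temporal path below,
+   so a huge memset does not flush the whole cache; smaller fills stay with
+   ordinary cached stores. */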
+
+ ALIGN (4)
+L(128bytesmore_normal):
+ movdqa %xmm0, (%rcx)
+ movaps %xmm0, 0x10(%rcx)
+ movaps %xmm0, 0x20(%rcx)
+ movaps %xmm0, 0x30(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(128bytesmore_normal)
+ ret
+
+ ALIGN (4)
+L(128bytesmore_nt):
+ movntdq %xmm0, (%rcx)
+ movntdq %xmm0, 0x10(%rcx)
+ movntdq %xmm0, 0x20(%rcx)
+ movntdq %xmm0, 0x30(%rcx)
+ leaq 64(%rcx), %rcx
+ cmpq %rcx, %rdx
+ jne L(128bytesmore_nt)
+ sfence
+ ret
+
+END (MEMSET)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-stpcpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-stpcpy-slm.S
index 9d0a563..0ad2d44 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-stpcpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-stpncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-stpncpy-slm.S
index 9d0a563..3066685 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-stpncpy-slm.S
@@ -1,42 +1,34 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define USE_AS_STPCPY
+#define STRCPY stpncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strcat-slm.S
similarity index 63%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strcat-slm.S
index 9d0a563..dd8207f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strcat-slm.S
@@ -1,42 +1,87 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef STRCAT
+# define STRCAT strcat
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define USE_AS_STRCAT
+
+.text
+ENTRY (STRCAT)
+ mov %rdi, %r9
+#ifdef USE_AS_STRNCAT
+ mov %rdx, %r8
+#endif
+
+#define RETURN jmp L(Strcpy)
+#include "sse2-strlen-slm.S"
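+/* The inlined strlen "returns" through the redefined RETURN above, branching
+   to L(Strcpy) with the length of the destination string in %rax; the copy
+   below then starts at dst + length. */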
+
+#undef RETURN
+#define RETURN ret
+
+L(Strcpy):
+ lea (%r9, %rax), %rdi
+ mov %rsi, %rcx
+ mov %r9, %rax /* save result */
+
+#ifdef USE_AS_STRNCAT
+ test %r8, %r8
+ jz L(ExitZero)
+# define USE_AS_STRNCPY
+#endif
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/string/sse2-strcpy-slm.S
new file mode 100644
index 0000000..3e146bf
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-strcpy-slm.S
@@ -0,0 +1,1921 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+
+# ifndef L
+# define L(label) .L##label
+# endif
+
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+# endif
+
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+# endif
+
+#endif
+
+#define JMPTBL(I, B) I - B
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ lea (%r11, %rcx), %rcx; \
+ jmp *%rcx
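+/* The exit tables below hold 32-bit offsets relative to the table base,
+   keeping the dispatch position-independent: movslq sign-extends the
+   selected entry and the second lea rebases it before the indirect jump. */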
+
+#ifndef USE_AS_STRCAT
+
+# define RETURN ret
+
+.text
+ENTRY (STRCPY)
+# ifdef USE_AS_STRNCPY
+ mov %rdx, %r8
+ test %r8, %r8
+ jz L(ExitZero)
+# endif
+ mov %rsi, %rcx
+# ifndef USE_AS_STPCPY
+ mov %rdi, %rax /* save result */
+# endif
+
+#endif
+ and $63, %rcx
+ cmp $32, %rcx
+ jbe L(SourceStringAlignmentLess32)
+
+ and $-16, %rsi
+ and $15, %rcx
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1
+ pmovmskb %xmm1, %rdx
+ shr %cl, %rdx
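+/* %rsi was rounded down to a 16-byte boundary, so the compare above may have
+   matched bytes that precede the start of the string; shifting the mask
+   right by the misalignment in %cl discards those bogus bits. */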
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ mov $16, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# else
+ mov $17, %r10
+ sub %rcx, %r10
+ cmp %r10, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail)
+
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm0, %rdx
+#ifdef USE_AS_STRNCPY
+ add $16, %r10
+ cmp %r10, %r8
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes)
+
+ movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%rdi)
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both):
+ sub %rcx, %rdi
+#ifdef USE_AS_STRNCPY
+ add %rcx, %r8
+#endif
+ mov $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $48, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm4
+ movdqu %xmm3, (%rdi, %rcx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm1
+ movdqu %xmm4, (%rdi, %rcx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm2
+ movdqu %xmm1, (%rdi, %rcx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movaps 16(%rsi, %rcx), %xmm3
+ movdqu %xmm2, (%rdi, %rcx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %rcx
+#ifdef USE_AS_STRNCPY
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ movdqu %xmm3, (%rdi, %rcx)
+ mov %rsi, %rdx
+ lea 16(%rsi, %rcx), %rsi
+ and $-0x40, %rsi
+ sub %rsi, %rdx
+ sub %rdx, %rdi
+#ifdef USE_AS_STRNCPY
+ lea 128(%r8, %rdx), %r8
+#endif
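+
+/* Each iteration examines 64 bytes at once: the lane-wise pminub of the four
+   16-byte blocks has a zero byte exactly where one of the blocks does, so a
+   single pcmpeqb/pmovmskb pair detects a NUL anywhere in the chunk. */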
+L(Unaligned64Loop):
+ movaps (%rsi), %xmm2
+ movaps %xmm2, %xmm4
+ movaps 16(%rsi), %xmm5
+ movaps 32(%rsi), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 48(%rsi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(Unaligned64Leave)
+
+L(Unaligned64Loop_start):
+ add $64, %rdi
+ add $64, %rsi
+ movdqu %xmm4, -64(%rdi)
+ movaps (%rsi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%rdi)
+ movaps 16(%rsi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%rsi), %xmm3
+ movdqu %xmm6, -32(%rdi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%rdi)
+ movaps 48(%rsi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %rdx
+#ifdef USE_AS_STRNCPY
+ sub $64, %r8
+ jbe L(UnalignedLeaveCase2OrCase3)
+#endif
+ test %rdx, %rdx
+ jz L(Unaligned64Loop_start)
+
+L(Unaligned64Leave):
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %rcx, %rcx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %rdx
+ pmovmskb %xmm1, %rcx
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+ movdqu %xmm6, 32(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 48(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm7, 48(%rdi)
+ add $15, %r8
+ sub %rdx, %r8
+ lea 49(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $48, %rsi
+ add $48, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentLess32):
+ pxor %xmm0, %xmm0
+ movdqu (%rsi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $16, %r8
+# else
+ cmp $17, %r8
+# endif
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ pcmpeqb %xmm2, %xmm0
+ movdqu %xmm1, (%rdi)
+ pmovmskb %xmm0, %rdx
+
+#ifdef USE_AS_STRNCPY
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ cmp $32, %r8
+# else
+ cmp $33, %r8
+# endif
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+#endif
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1)
+
+ and $15, %rcx
+ and $-16, %rsi
+
+ jmp L(Unalign16Both)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+
+#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+ .p2align 4
+L(CopyFrom1To16BytesTail):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1):
+ add $16, %rsi
+ add $16, %rdi
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+#endif
+L(CopyFrom1To16BytesTail1):
+ bsf %rdx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes):
+ bsf %rdx, %rdx
+ add %rcx, %rsi
+ add $16, %rdx
+ sub %rcx, %rdx
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0):
+ bsf %rdx, %rdx
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+# endif
+ movdqu %xmm4, (%rdi)
+ add $63, %r8
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16):
+ bsf %rcx, %rdx
+ movdqu %xmm4, (%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 16(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm5, 16(%rdi)
+ add $47, %r8
+ sub %rdx, %r8
+ lea 17(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $16, %rsi
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32):
+ bsf %rdx, %rdx
+ movdqu %xmm4, (%rdi)
+ movdqu %xmm5, 16(%rdi)
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+# ifdef USE_AS_STPCPY
+ lea 32(%rdi, %rdx), %rax
+# endif
+ movdqu %xmm6, 32(%rdi)
+ add $31, %r8
+ sub %rdx, %r8
+ lea 33(%rdi, %rdx), %rdi
+ jmp L(StrncpyFillTailWithZero)
+#else
+ add $32, %rsi
+ add $32, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+#endif
+
+#ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6):
+ movdqu %xmm6, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4):
+ movdqu %xmm4, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3):
+ movdqu %xmm3, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1):
+ movdqu %xmm1, (%rdi, %rcx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+# endif
+
+ .p2align 4
+L(CopyFrom1To16BytesExit):
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2):
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+L(CopyFrom1To16BytesTail1Case2):
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3):
+ add $16, %r8
+ add %rcx, %rdi
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ add %rcx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3):
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3):
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+#endif
+
+/*------------ End of the labels for copying 1-16 and 1-32 bytes ------------*/
+
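+/* Each Exit<N> label is entered with %rdx holding the NUL index produced by
+   bsf (always far below 256), so %dh is guaranteed zero and doubles as the
+   source of the terminating NUL byte stored below. */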
+ .p2align 4
+L(Exit1):
+ mov %dh, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea (%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $1, %r8
+ lea 1(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $2, %r8
+ lea 2(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ mov (%rsi), %cx
+ mov %cx, (%rdi)
+ mov %dh, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $3, %r8
+ lea 3(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $4, %r8
+ lea 4(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ mov (%rsi), %ecx
+ mov %dh, 4(%rdi)
+ mov %ecx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $5, %r8
+ lea 5(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $6, %r8
+ lea 6(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $7, %r8
+ lea 7(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $8, %r8
+ lea 8(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ mov (%rsi), %rcx
+ mov %dh, 8(%rdi)
+ mov %rcx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $9, %r8
+ lea 9(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $10, %r8
+ lea 10(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $11, %r8
+ lea 11(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $12, %r8
+ lea 12(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $13, %r8
+ lea 13(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $14, %r8
+ lea 14(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $15, %r8
+ lea 15(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $16, %r8
+ lea 16(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit17):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ mov %dh, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $17, %r8
+ lea 17(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $18, %r8
+ lea 18(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $19, %r8
+ lea 19(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $20, %r8
+ lea 20(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dh, 20(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $21, %r8
+ lea 21(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $22, %r8
+ lea 22(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $23, %r8
+ lea 23(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $24, %r8
+ lea 24(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ mov %dh, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $25, %r8
+ lea 25(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $26, %r8
+ lea 26(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $27, %r8
+ lea 27(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $28, %r8
+ lea 28(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $29, %r8
+ lea 29(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $30, %r8
+ lea 30(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $31, %r8
+ lea 31(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+ sub $32, %r8
+ lea 32(%rdi), %rdi
+ jnz L(StrncpyFillTailWithZero)
+#endif
+ RETURN
+
+#ifdef USE_AS_STRNCPY
+
+ .p2align 4
+L(StrncpyExit0):
+#ifdef USE_AS_STPCPY
+ mov %rdi, %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, (%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit1):
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 1(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 1(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit2):
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 2(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 2(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit3):
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 3(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 3(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 4(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 4(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit5):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 5(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 5(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit6):
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 6(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 6(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit7):
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 7(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 7(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 8(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 8(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit9):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 9(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 9(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit10):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 10(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 10(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit11):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 11(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 11(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12):
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 12(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 12(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit13):
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 13(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 13(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit14):
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 14(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 14(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15):
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 15(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 15(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+#ifdef USE_AS_STPCPY
+ lea 16(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 16(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit17):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 17(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 17(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit18):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 18(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 18(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit19):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 19(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 19(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit20):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 20(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 20(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit21):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 21(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 21(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit22):
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 22(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 22(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit23):
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 23(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 23(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit24):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 24(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 24(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit25):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 25(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 25(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit26):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 26(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 26(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit27):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 27(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 27(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit28):
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 28(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 28(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit29):
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 29(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 29(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit30):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 30(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 30(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit31):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 31(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 31(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit32):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 32(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 32(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit33):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 33(%rdi)
+#endif
+ RETURN
+
+#ifndef USE_AS_STRCAT
+
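+/*
+ * Fill0..Fill16 store the final 0..16 padding bytes for strncpy.  %rdx and
+ * %xmm0 are already zero when these stubs are reached, so the overlapping
+ * stores (e.g. Fill3, Fill7) only ever write zero bytes.
+ */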
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ mov %dl, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ mov %dx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ mov %edx, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ mov %edx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ mov %edx, (%rdi)
+ mov %dl, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ mov %edx, (%rdi)
+ mov %dx, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ mov %rdx, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ mov %rdx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ mov %rdx, (%rdi)
+ mov %dl, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ mov %rdx, (%rdi)
+ mov %dx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ mov %rdx, (%rdi)
+ mov %edx, 7(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ mov %rdx, (%rdi)
+ mov %edx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ mov %rdx, (%rdi)
+ mov %rdx, 5(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ mov %rdx, (%rdi)
+ mov %rdx, 6(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movdqu %xmm0, -1(%rdi)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movdqu %xmm0, (%rdi)
+ RETURN
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm2):
+ movdqu %xmm2, (%rdi, %rcx)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit):
+ bsf %rdx, %rdx
+ add $15, %r8
+ add %rcx, %rdi
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ sub %rdx, %r8
+ lea 1(%rdi, %rdx), %rdi
+
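+/*
+ * strncpy must zero the remainder of the destination.  %r8 holds the byte
+ * count still to clear: short tails dispatch through L(FillTable) directly,
+ * longer ones first align %rdi and clear 64 bytes per iteration with movdqa.
+ */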
+ .p2align 4
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %rdx, %rdx
+ sub $16, %r8
+ jbe L(StrncpyFillExit)
+
+ movdqu %xmm0, (%rdi)
+ add $16, %rdi
+
+ mov %rdi, %rsi
+ and $0xf, %rsi
+ sub %rsi, %rdi
+ add %rsi, %r8
+ sub $64, %r8
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ movdqa %xmm0, 32(%rdi)
+ movdqa %xmm0, 48(%rdi)
+ add $64, %rdi
+ sub $64, %r8
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %r8
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm0, 16(%rdi)
+ add $32, %rdi
+ sub $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillLess32):
+ add $16, %r8
+ jl L(StrncpyFillExit)
+ movdqa %xmm0, (%rdi)
+ add $16, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+L(StrncpyFillExit):
+ add $16, %r8
+ BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4)
+
+/* end of ifndef USE_AS_STRCAT */
+#endif
+
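+/*
+ * Leave paths for the unaligned 64-byte copy loop.  %rdx is nonzero when a
+ * terminator was seen in the four buffered chunks (case 2); otherwise the
+ * character count ran out first (case 3).
+ */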
+ .p2align 4
+L(UnalignedLeaveCase2OrCase3):
+ test %rdx, %rdx
+ jnz L(Unaligned64LeaveCase2)
+L(Unaligned64LeaveCase3):
+ lea 64(%r8), %rcx
+ and $-16, %rcx
+ add $48, %r8
+ jl L(CopyFrom1To16BytesCase3)
+ movdqu %xmm4, (%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm5, 16(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm6, 32(%rdi)
+ sub $16, %r8
+ jb L(CopyFrom1To16BytesCase3)
+ movdqu %xmm7, 48(%rdi)
+#ifdef USE_AS_STPCPY
+ lea 64(%rdi), %rax
+#endif
+#ifdef USE_AS_STRCAT
+ xor %ch, %ch
+ movb %ch, 64(%rdi)
+#endif
+ RETURN
+
+ .p2align 4
+L(Unaligned64LeaveCase2):
+ xor %rcx, %rcx
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %rdx
+ add $48, %r8
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm4, (%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm5)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm5, 16(%rdi)
+ add $16, %rcx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %rdx, %rdx
+#ifndef USE_AS_STRCAT
+ jnz L(CopyFrom1To16BytesUnalignedXmm6)
+#else
+ jnz L(CopyFrom1To16Bytes)
+#endif
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %rdx
+ movdqu %xmm6, 32(%rdi)
+ lea 16(%rdi, %rcx), %rdi
+ lea 16(%rsi, %rcx), %rsi
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4)
+
+ .p2align 4
+L(ExitZero):
+#ifndef USE_AS_STRCAT
+ mov %rdi, %rax
+#endif
+ RETURN
+
+#endif
+
+#ifndef USE_AS_STRCAT
+END (STRCPY)
+#else
+END (STRCAT)
+#endif
+ .p2align 4
+ .section .rodata
+L(ExitTable):
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+#ifdef USE_AS_STRNCPY
+L(ExitStrncpyTable):
+ .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable))
+ .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable))
+# ifndef USE_AS_STRCAT
+ .p2align 4
+L(FillTable):
+ .int JMPTBL(L(Fill0), L(FillTable))
+ .int JMPTBL(L(Fill1), L(FillTable))
+ .int JMPTBL(L(Fill2), L(FillTable))
+ .int JMPTBL(L(Fill3), L(FillTable))
+ .int JMPTBL(L(Fill4), L(FillTable))
+ .int JMPTBL(L(Fill5), L(FillTable))
+ .int JMPTBL(L(Fill6), L(FillTable))
+ .int JMPTBL(L(Fill7), L(FillTable))
+ .int JMPTBL(L(Fill8), L(FillTable))
+ .int JMPTBL(L(Fill9), L(FillTable))
+ .int JMPTBL(L(Fill10), L(FillTable))
+ .int JMPTBL(L(Fill11), L(FillTable))
+ .int JMPTBL(L(Fill12), L(FillTable))
+ .int JMPTBL(L(Fill13), L(FillTable))
+ .int JMPTBL(L(Fill14), L(FillTable))
+ .int JMPTBL(L(Fill15), L(FillTable))
+ .int JMPTBL(L(Fill16), L(FillTable))
+# endif
+#endif
diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/string/sse2-strlen-slm.S
new file mode 100644
index 0000000..3772fe7
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-strlen-slm.S
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+#define RETURN ret
+ .section .text.sse2,"ax",@progbits
+ENTRY (STRLEN)
+/* end ifndef USE_AS_STRCAT */
+#endif
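+/*
+ * %rax tracks the current 16-byte-aligned probe position and is only turned
+ * into a length at the exit labels.  When an unaligned 16-byte load would
+ * cross a 64-byte boundary (and so, possibly, a page), the aligned path at
+ * L(next) is taken instead and the bytes before the start are masked off.
+ */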
+ xor %rax, %rax
+ mov %edi, %ecx
+ and $0x3f, %ecx
+ pxor %xmm0, %xmm0
+ cmp $0x30, %ecx
+ ja L(next)
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit_less16)
+ mov %rdi, %rax
+ and $-16, %rax
+ jmp L(align16_start)
+L(next):
+ mov %rdi, %rax
+ and $-16, %rax
+ pcmpeqb (%rax), %xmm0
+ mov $-1, %r10d
+ sub %rax, %rcx
+ shl %cl, %r10d
+ pmovmskb %xmm0, %edx
+ and %r10d, %edx
+ jnz L(exit)
+L(align16_start):
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ pcmpeqb 16(%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $64, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit64)
+
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 80(%rax), %xmm0
+ add $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm1
+ add $16, %rax
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm2
+ add $16, %rax
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ test $0x3f, %rax
+ jz L(align64_loop)
+
+ pcmpeqb 16(%rax), %xmm3
+ add $16, %rax
+ pmovmskb %xmm3, %edx
+ test %edx, %edx
+ jnz L(exit)
+
+ add $16, %rax
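+/*
+ * Main loop: the byte-wise minimum (pminub) of four 16-byte chunks contains
+ * a zero byte iff at least one chunk does, so a single pcmpeqb/pmovmskb per
+ * 64 bytes is enough to detect the terminator.
+ */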
+ .p2align 4
+L(align64_loop):
+ movaps (%rax), %xmm4
+ pminub 16(%rax), %xmm4
+ movaps 32(%rax), %xmm5
+ pminub 48(%rax), %xmm5
+ add $64, %rax
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm0, %xmm5
+ pmovmskb %xmm5, %edx
+ test %edx, %edx
+ jz L(align64_loop)
+
+
+ pcmpeqb -64(%rax), %xmm0
+ sub $80, %rax
+ pmovmskb %xmm0, %edx
+ test %edx, %edx
+ jnz L(exit16)
+
+ pcmpeqb 32(%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ test %edx, %edx
+ jnz L(exit32)
+
+ pcmpeqb 48(%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ test %edx, %edx
+ jnz L(exit48)
+
+ pcmpeqb 64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+ RETURN
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+L(exit_less16):
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ RETURN
+ .p2align 4
+L(exit16):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $16, %rax
+ RETURN
+ .p2align 4
+L(exit32):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $32, %rax
+ RETURN
+ .p2align 4
+L(exit48):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $48, %rax
+ RETURN
+ .p2align 4
+L(exit64):
+ sub %rdi, %rax
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ add $64, %rax
+#ifndef USE_AS_STRCAT
+ RETURN
+
+END (STRLEN)
+#endif
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strncat-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strncat-slm.S
index 9d0a563..6b4a430 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strncat-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCAT
+#define STRCAT strncat
+#include "sse2-strcat-slm.S"
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/sse2-strncpy-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/sse2-strncpy-slm.S
index 9d0a563..594e78f 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/sse2-strncpy-slm.S
@@ -1,42 +1,33 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "sse2-strcpy-slm.S"
diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/string/sse4-memcmp-slm.S
new file mode 100644
index 0000000..8a8b180
--- /dev/null
+++ b/libc/arch-x86_64/string/sse4-memcmp-slm.S
@@ -0,0 +1,1799 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "cache.h"
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define JMPTBL(I, B) (I - B)
+
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), %rcx; \
+ add %r11, %rcx; \
+ jmp *%rcx; \
+ ud2
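+
+/*
+ * Each table entry is the 32-bit offset of its target label relative to the
+ * table base, which keeps the jump tables position independent; the ud2
+ * after the indirect jmp is presumably there to stop straight-line decode
+ * from running into non-code bytes.
+ */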
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY (MEMCMP)
+#ifdef USE_AS_WMEMCMP
+ shl $2, %rdx
+#endif
+ pxor %xmm0, %xmm0
+ cmp $79, %rdx
+ ja L(79bytesormore)
+#ifndef USE_AS_WMEMCMP
+ cmp $1, %rdx
+ je L(firstbyte)
+#endif
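+	/* Point %rsi/%rdi one past the last byte so the L(*bytes) exit stubs
+	   can address the data with negative displacements.  */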
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+#ifndef USE_AS_WMEMCMP
+ ALIGN (4)
+L(firstbyte):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ sub %ecx, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(79bytesormore):
+ movdqu (%rsi), %xmm1
+ movdqu (%rdi), %xmm2
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+ mov %rsi, %rcx
+ and $-16, %rsi
+ add $16, %rsi
+ sub %rsi, %rcx
+
+ sub %rcx, %rdi
+ add %rcx, %rdx
+ test $0xf, %rdi
+ jz L(2aligned)
+
+ cmp $128, %rdx
+ ja L(128bytesormore)
+L(less128bytes):
+ sub $64, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+ jb L(less32bytesin64)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(128bytesormore):
+ cmp $512, %rdx
+ ja L(512bytesormore)
+ cmp $256, %rdx
+ ja L(less512bytes)
+L(less256bytes):
+ sub $128, %rdx
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(less512bytes):
+ sub $256, %rdx
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqu 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqu 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqu 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqu 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqu 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqu 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqu 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqu 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqu 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqu 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqu 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqu 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqu 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqu 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+ jae L(less256bytes)
+
+ cmp $64, %rdx
+ jae L(less128bytes)
+
+ cmp $32, %rdx
+ jb L(less32bytesin256)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormore):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
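+	/* %r8 = 1.5 * DATA_CACHE_SIZE_HALF, i.e. 3/4 of the data cache;
+	   larger lengths take the non-temporal prefetch path.  */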
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+	ja L(L2_L3_cache_unaligned)
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loop):
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+L(L2_L3_cache_unaligned):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_unaligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqu 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqu 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqu 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_unaligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+/*
+ * This path is for machines that are sensitive to unaligned accesses; both
+ * pointers are 16-byte aligned when it is reached.
+ */
+ ALIGN (4)
+L(2aligned):
+ cmp $128, %rdx
+ ja L(128bytesormorein2aligned)
+L(less128bytesin2aligned):
+ sub $64, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+ cmp $32, %rdx
+	jb L(less32bytesin64in2aligned)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin64in2aligned):
+ add $64, %rdi
+ add $64, %rsi
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(128bytesormorein2aligned):
+ cmp $512, %rdx
+ ja L(512bytesormorein2aligned)
+ cmp $256, %rdx
+ ja L(256bytesormorein2aligned)
+L(less256bytesin2aligned):
+ sub $128, %rdx
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ add $128, %rsi
+ add $128, %rdi
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+ jb L(less32bytesin128in2aligned)
+
+ movdqu (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqu 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin128in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(256bytesormorein2aligned):
+
+ sub $256, %rdx
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+
+ movdqa 32(%rdi), %xmm2
+ pxor 32(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(48bytesin256)
+
+ movdqa 48(%rdi), %xmm2
+ pxor 48(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(64bytesin256)
+
+ movdqa 64(%rdi), %xmm2
+ pxor 64(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(80bytesin256)
+
+ movdqa 80(%rdi), %xmm2
+ pxor 80(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(96bytesin256)
+
+ movdqa 96(%rdi), %xmm2
+ pxor 96(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(112bytesin256)
+
+ movdqa 112(%rdi), %xmm2
+ pxor 112(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(128bytesin256)
+
+ movdqa 128(%rdi), %xmm2
+ pxor 128(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(144bytesin256)
+
+ movdqa 144(%rdi), %xmm2
+ pxor 144(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(160bytesin256)
+
+ movdqa 160(%rdi), %xmm2
+ pxor 160(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(176bytesin256)
+
+ movdqa 176(%rdi), %xmm2
+ pxor 176(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(192bytesin256)
+
+ movdqa 192(%rdi), %xmm2
+ pxor 192(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(208bytesin256)
+
+ movdqa 208(%rdi), %xmm2
+ pxor 208(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(224bytesin256)
+
+ movdqa 224(%rdi), %xmm2
+ pxor 224(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(240bytesin256)
+
+ movdqa 240(%rdi), %xmm2
+ pxor 240(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(256bytesin256)
+
+ add $256, %rsi
+ add $256, %rdi
+
+ cmp $128, %rdx
+	jae L(less256bytesin2aligned)
+
+ cmp $64, %rdx
+ jae L(less128bytesin2aligned)
+
+ cmp $32, %rdx
+	jb L(less32bytesin256in2aligned)
+
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(16bytesin256)
+
+ movdqa 16(%rdi), %xmm2
+ pxor 16(%rsi), %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(32bytesin256)
+ sub $32, %rdx
+ add $32, %rdi
+ add $32, %rsi
+L(less32bytesin256in2aligned):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+ ALIGN (4)
+L(512bytesormorein2aligned):
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %r8
+#else
+ mov __x86_64_data_cache_size_half(%rip), %r8
+#endif
+ mov %r8, %r9
+ shr $1, %r8
+ add %r9, %r8
+ cmp %r8, %rdx
+	ja L(L2_L3_cache_aligned)
+
+ sub $64, %rdx
+ ALIGN (4)
+L(64bytesormore_loopin2aligned):
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(64bytesormore_loopin2aligned)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+L(L2_L3_cache_aligned):
+ sub $64, %rdx
+ ALIGN (4)
+L(L2_L3_aligned_128bytes_loop):
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x1c0(%rsi)
+ movdqa (%rdi), %xmm2
+ pxor (%rsi), %xmm2
+ movdqa %xmm2, %xmm1
+
+ movdqa 16(%rdi), %xmm3
+ pxor 16(%rsi), %xmm3
+ por %xmm3, %xmm1
+
+ movdqa 32(%rdi), %xmm4
+ pxor 32(%rsi), %xmm4
+ por %xmm4, %xmm1
+
+ movdqa 48(%rdi), %xmm5
+ pxor 48(%rsi), %xmm5
+ por %xmm5, %xmm1
+
+ ptest %xmm1, %xmm0
+ jnc L(64bytesormore_loop_end)
+ add $64, %rsi
+ add $64, %rdi
+ sub $64, %rdx
+ jae L(L2_L3_aligned_128bytes_loop)
+
+ add $64, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4)
+
+
+ ALIGN (4)
+L(64bytesormore_loop_end):
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm2, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm3, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ ptest %xmm4, %xmm0
+ jnc L(16bytes)
+
+ add $16, %rdi
+ add $16, %rsi
+ jmp L(16bytes)
+
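+/*
+ * A compare above found the difference in bytes [N-16, N) of the window;
+ * each stub advances both pointers by N and falls into L(16bytes), which
+ * re-compares the 16 bytes ending at the updated pointers.
+ */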
+L(256bytesin256):
+ add $256, %rdi
+ add $256, %rsi
+ jmp L(16bytes)
+L(240bytesin256):
+ add $240, %rdi
+ add $240, %rsi
+ jmp L(16bytes)
+L(224bytesin256):
+ add $224, %rdi
+ add $224, %rsi
+ jmp L(16bytes)
+L(208bytesin256):
+ add $208, %rdi
+ add $208, %rsi
+ jmp L(16bytes)
+L(192bytesin256):
+ add $192, %rdi
+ add $192, %rsi
+ jmp L(16bytes)
+L(176bytesin256):
+ add $176, %rdi
+ add $176, %rsi
+ jmp L(16bytes)
+L(160bytesin256):
+ add $160, %rdi
+ add $160, %rsi
+ jmp L(16bytes)
+L(144bytesin256):
+ add $144, %rdi
+ add $144, %rsi
+ jmp L(16bytes)
+L(128bytesin256):
+ add $128, %rdi
+ add $128, %rsi
+ jmp L(16bytes)
+L(112bytesin256):
+ add $112, %rdi
+ add $112, %rsi
+ jmp L(16bytes)
+L(96bytesin256):
+ add $96, %rdi
+ add $96, %rsi
+ jmp L(16bytes)
+L(80bytesin256):
+ add $80, %rdi
+ add $80, %rsi
+ jmp L(16bytes)
+L(64bytesin256):
+ add $64, %rdi
+ add $64, %rsi
+ jmp L(16bytes)
+L(48bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(32bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytesin256):
+ add $16, %rdi
+ add $16, %rsi
+L(16bytes):
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(8bytes):
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(12bytes):
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(4bytes):
+ mov -4(%rsi), %ecx
+ mov -4(%rdi), %eax
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(0bytes):
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* sizes that cannot occur for wmemcmp, whose byte count is a multiple of 4 */
+ ALIGN (4)
+L(65bytes):
+ movdqu -65(%rdi), %xmm1
+ movdqu -65(%rsi), %xmm2
+ mov $-65, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(49bytes):
+ movdqu -49(%rdi), %xmm1
+ movdqu -49(%rsi), %xmm2
+ mov $-49, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(33bytes):
+ movdqu -33(%rdi), %xmm1
+ movdqu -33(%rsi), %xmm2
+ mov $-33, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(17bytes):
+ mov -17(%rdi), %rax
+ mov -17(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(9bytes):
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(13bytes):
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(5bytes):
+ mov -5(%rdi), %eax
+ mov -5(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %edx
+ sub %edx, %eax
+ ret
+
+ ALIGN (4)
+L(66bytes):
+ movdqu -66(%rdi), %xmm1
+ movdqu -66(%rsi), %xmm2
+ mov $-66, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(50bytes):
+ movdqu -50(%rdi), %xmm1
+ movdqu -50(%rsi), %xmm2
+ mov $-50, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(34bytes):
+ movdqu -34(%rdi), %xmm1
+ movdqu -34(%rsi), %xmm2
+ mov $-34, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(18bytes):
+ mov -18(%rdi), %rax
+ mov -18(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(10bytes):
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(14bytes):
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(6bytes):
+ mov -6(%rdi), %eax
+ mov -6(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+L(2bytes):
+ movzwl -2(%rsi), %ecx
+ movzwl -2(%rdi), %eax
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(67bytes):
+ movdqu -67(%rdi), %xmm2
+ movdqu -67(%rsi), %xmm1
+ mov $-67, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(51bytes):
+ movdqu -51(%rdi), %xmm2
+ movdqu -51(%rsi), %xmm1
+ mov $-51, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(35bytes):
+ movdqu -35(%rsi), %xmm1
+ movdqu -35(%rdi), %xmm2
+ mov $-35, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(19bytes):
+ mov -19(%rdi), %rax
+ mov -19(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+L(11bytes):
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(15bytes):
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(7bytes):
+ mov -7(%rdi), %eax
+ mov -7(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(3bytes):
+ movzwl -3(%rdi), %eax
+ movzwl -3(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin2bytes)
+L(1bytes):
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(68bytes):
+ movdqu -68(%rdi), %xmm2
+ movdqu -68(%rsi), %xmm1
+ mov $-68, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(52bytes):
+ movdqu -52(%rdi), %xmm2
+ movdqu -52(%rsi), %xmm1
+ mov $-52, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(36bytes):
+ movdqu -36(%rdi), %xmm2
+ movdqu -36(%rsi), %xmm1
+ mov $-36, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(20bytes):
+ movdqu -20(%rdi), %xmm2
+ movdqu -20(%rsi), %xmm1
+ mov $-20, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* sizes that cannot occur for wmemcmp */
+ ALIGN (4)
+L(69bytes):
+ movdqu -69(%rsi), %xmm1
+ movdqu -69(%rdi), %xmm2
+ mov $-69, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(53bytes):
+ movdqu -53(%rsi), %xmm1
+ movdqu -53(%rdi), %xmm2
+ mov $-53, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(37bytes):
+ movdqu -37(%rsi), %xmm1
+ movdqu -37(%rdi), %xmm2
+ mov $-37, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(21bytes):
+ movdqu -21(%rsi), %xmm1
+ movdqu -21(%rdi), %xmm2
+ mov $-21, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(70bytes):
+ movdqu -70(%rsi), %xmm1
+ movdqu -70(%rdi), %xmm2
+ mov $-70, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(54bytes):
+ movdqu -54(%rsi), %xmm1
+ movdqu -54(%rdi), %xmm2
+ mov $-54, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(38bytes):
+ movdqu -38(%rsi), %xmm1
+ movdqu -38(%rdi), %xmm2
+ mov $-38, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(22bytes):
+ movdqu -22(%rsi), %xmm1
+ movdqu -22(%rdi), %xmm2
+ mov $-22, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(71bytes):
+ movdqu -71(%rsi), %xmm1
+ movdqu -71(%rdi), %xmm2
+ mov $-71, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(55bytes):
+ movdqu -55(%rdi), %xmm2
+ movdqu -55(%rsi), %xmm1
+ mov $-55, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(39bytes):
+ movdqu -39(%rdi), %xmm2
+ movdqu -39(%rsi), %xmm1
+ mov $-39, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(23bytes):
+ movdqu -23(%rdi), %xmm2
+ movdqu -23(%rsi), %xmm1
+ mov $-23, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(72bytes):
+ movdqu -72(%rsi), %xmm1
+ movdqu -72(%rdi), %xmm2
+ mov $-72, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(56bytes):
+ movdqu -56(%rdi), %xmm2
+ movdqu -56(%rsi), %xmm1
+ mov $-56, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(40bytes):
+ movdqu -40(%rdi), %xmm2
+ movdqu -40(%rsi), %xmm1
+ mov $-40, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(24bytes):
+ movdqu -24(%rdi), %xmm2
+ movdqu -24(%rsi), %xmm1
+ mov $-24, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* sizes that cannot occur for wmemcmp */
+ ALIGN (4)
+L(73bytes):
+ movdqu -73(%rsi), %xmm1
+ movdqu -73(%rdi), %xmm2
+ mov $-73, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(57bytes):
+ movdqu -57(%rdi), %xmm2
+ movdqu -57(%rsi), %xmm1
+ mov $-57, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(41bytes):
+ movdqu -41(%rdi), %xmm2
+ movdqu -41(%rsi), %xmm1
+ mov $-41, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(25bytes):
+ movdqu -25(%rdi), %xmm2
+ movdqu -25(%rsi), %xmm1
+ mov $-25, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -9(%rdi), %rax
+ mov -9(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzbl -1(%rdi), %eax
+ movzbl -1(%rsi), %ecx
+ sub %ecx, %eax
+ ret
+
+ ALIGN (4)
+L(74bytes):
+ movdqu -74(%rsi), %xmm1
+ movdqu -74(%rdi), %xmm2
+ mov $-74, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(58bytes):
+ movdqu -58(%rdi), %xmm2
+ movdqu -58(%rsi), %xmm1
+ mov $-58, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(42bytes):
+ movdqu -42(%rdi), %xmm2
+ movdqu -42(%rsi), %xmm1
+ mov $-42, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(26bytes):
+ movdqu -26(%rdi), %xmm2
+ movdqu -26(%rsi), %xmm1
+ mov $-26, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -10(%rdi), %rax
+ mov -10(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ movzwl -2(%rdi), %eax
+ movzwl -2(%rsi), %ecx
+ jmp L(diffin2bytes)
+
+ ALIGN (4)
+L(75bytes):
+ movdqu -75(%rsi), %xmm1
+ movdqu -75(%rdi), %xmm2
+ mov $-75, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(59bytes):
+ movdqu -59(%rdi), %xmm2
+ movdqu -59(%rsi), %xmm1
+ mov $-59, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(43bytes):
+ movdqu -43(%rdi), %xmm2
+ movdqu -43(%rsi), %xmm1
+ mov $-43, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(27bytes):
+ movdqu -27(%rdi), %xmm2
+ movdqu -27(%rsi), %xmm1
+ mov $-27, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -11(%rdi), %rax
+ mov -11(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+#endif
+ ALIGN (4)
+L(76bytes):
+ movdqu -76(%rsi), %xmm1
+ movdqu -76(%rdi), %xmm2
+ mov $-76, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(60bytes):
+ movdqu -60(%rdi), %xmm2
+ movdqu -60(%rsi), %xmm1
+ mov $-60, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(44bytes):
+ movdqu -44(%rdi), %xmm2
+ movdqu -44(%rsi), %xmm1
+ mov $-44, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(28bytes):
+ movdqu -28(%rdi), %xmm2
+ movdqu -28(%rsi), %xmm1
+ mov $-28, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -12(%rdi), %rax
+ mov -12(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -4(%rdi), %eax
+ mov -4(%rsi), %ecx
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+
+#ifndef USE_AS_WMEMCMP
+/* sizes that cannot occur for wmemcmp */
+ ALIGN (4)
+L(77bytes):
+ movdqu -77(%rsi), %xmm1
+ movdqu -77(%rdi), %xmm2
+ mov $-77, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(61bytes):
+ movdqu -61(%rdi), %xmm2
+ movdqu -61(%rsi), %xmm1
+ mov $-61, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(45bytes):
+ movdqu -45(%rdi), %xmm2
+ movdqu -45(%rsi), %xmm1
+ mov $-45, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(29bytes):
+ movdqu -29(%rdi), %xmm2
+ movdqu -29(%rsi), %xmm1
+ mov $-29, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -13(%rdi), %rax
+ mov -13(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(78bytes):
+ movdqu -78(%rsi), %xmm1
+ movdqu -78(%rdi), %xmm2
+ mov $-78, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(62bytes):
+ movdqu -62(%rdi), %xmm2
+ movdqu -62(%rsi), %xmm1
+ mov $-62, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(46bytes):
+ movdqu -46(%rdi), %xmm2
+ movdqu -46(%rsi), %xmm1
+ mov $-46, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(30bytes):
+ movdqu -30(%rdi), %xmm2
+ movdqu -30(%rsi), %xmm1
+ mov $-30, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -14(%rdi), %rax
+ mov -14(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
+ ALIGN (4)
+L(79bytes):
+ movdqu -79(%rsi), %xmm1
+ movdqu -79(%rdi), %xmm2
+ mov $-79, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(63bytes):
+ movdqu -63(%rdi), %xmm2
+ movdqu -63(%rsi), %xmm1
+ mov $-63, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(47bytes):
+ movdqu -47(%rdi), %xmm2
+ movdqu -47(%rsi), %xmm1
+ mov $-47, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(31bytes):
+ movdqu -31(%rdi), %xmm2
+ movdqu -31(%rsi), %xmm1
+ mov $-31, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+ mov -15(%rdi), %rax
+ mov -15(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+#endif
+ ALIGN (4)
+L(64bytes):
+ movdqu -64(%rdi), %xmm2
+ movdqu -64(%rsi), %xmm1
+ mov $-64, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(48bytes):
+ movdqu -48(%rdi), %xmm2
+ movdqu -48(%rsi), %xmm1
+ mov $-48, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+L(32bytes):
+ movdqu -32(%rdi), %xmm2
+ movdqu -32(%rsi), %xmm1
+ mov $-32, %dl
+ pxor %xmm1, %xmm2
+ ptest %xmm2, %xmm0
+ jnc L(less16bytes)
+
+ mov -16(%rdi), %rax
+ mov -16(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+
+ mov -8(%rdi), %rax
+ mov -8(%rsi), %rcx
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ xor %eax, %eax
+ ret
+
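+/*
+ * On entry %dl holds the negated offset, from the end pointers, of the
+ * 16-byte chunk that differed; sign-extending it re-addresses that chunk
+ * for the byte-exact comparison below.
+ */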
+/*
+ * Align to 8 bytes to avoid two taken branches in one 16-byte aligned
+ * code block.
+ */
+ ALIGN (3)
+L(less16bytes):
+ movsbq %dl, %rdx
+ mov (%rsi, %rdx), %rcx
+ mov (%rdi, %rdx), %rax
+ cmp %rax, %rcx
+ jne L(diffin8bytes)
+ mov 8(%rsi, %rdx), %rcx
+ mov 8(%rdi, %rdx), %rax
+L(diffin8bytes):
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ shr $32, %rcx
+ shr $32, %rax
+
+#ifdef USE_AS_WMEMCMP
+/* for wmemcmp */
+ cmp %eax, %ecx
+ jne L(diffin4bytes)
+ xor %eax, %eax
+ ret
+#endif
+
+L(diffin4bytes):
+#ifndef USE_AS_WMEMCMP
+ cmp %cx, %ax
+ jne L(diffin2bytes)
+ shr $16, %ecx
+ shr $16, %eax
+L(diffin2bytes):
+ cmp %cl, %al
+ jne L(end)
+ and $0xffff, %eax
+ and $0xffff, %ecx
+ sub %ecx, %eax
+ ret
+#else
+
+/* for wmemcmp */
+ mov $1, %eax
+ jl L(nequal_bigger)
+ neg %eax
+ ret
+
+ ALIGN (4)
+L(nequal_bigger):
+ ret
+
+L(unreal_case):
+ xor %eax, %eax
+ ret
+#endif
+
+ ALIGN (4)
+L(end):
+ and $0xff, %eax
+ and $0xff, %ecx
+ sub %ecx, %eax
+ ret
+
+END (MEMCMP)
+
+ .section .rodata.sse4.1,"a",@progbits
+ ALIGN (3)
+#ifndef USE_AS_WMEMCMP
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(1bytes), L(table_64bytes))
+ .int JMPTBL (L(2bytes), L(table_64bytes))
+ .int JMPTBL (L(3bytes), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(5bytes), L(table_64bytes))
+ .int JMPTBL (L(6bytes), L(table_64bytes))
+ .int JMPTBL (L(7bytes), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(9bytes), L(table_64bytes))
+ .int JMPTBL (L(10bytes), L(table_64bytes))
+ .int JMPTBL (L(11bytes), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(13bytes), L(table_64bytes))
+ .int JMPTBL (L(14bytes), L(table_64bytes))
+ .int JMPTBL (L(15bytes), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(17bytes), L(table_64bytes))
+ .int JMPTBL (L(18bytes), L(table_64bytes))
+ .int JMPTBL (L(19bytes), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(21bytes), L(table_64bytes))
+ .int JMPTBL (L(22bytes), L(table_64bytes))
+ .int JMPTBL (L(23bytes), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(25bytes), L(table_64bytes))
+ .int JMPTBL (L(26bytes), L(table_64bytes))
+ .int JMPTBL (L(27bytes), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(29bytes), L(table_64bytes))
+ .int JMPTBL (L(30bytes), L(table_64bytes))
+ .int JMPTBL (L(31bytes), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(33bytes), L(table_64bytes))
+ .int JMPTBL (L(34bytes), L(table_64bytes))
+ .int JMPTBL (L(35bytes), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(37bytes), L(table_64bytes))
+ .int JMPTBL (L(38bytes), L(table_64bytes))
+ .int JMPTBL (L(39bytes), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(41bytes), L(table_64bytes))
+ .int JMPTBL (L(42bytes), L(table_64bytes))
+ .int JMPTBL (L(43bytes), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(45bytes), L(table_64bytes))
+ .int JMPTBL (L(46bytes), L(table_64bytes))
+ .int JMPTBL (L(47bytes), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(49bytes), L(table_64bytes))
+ .int JMPTBL (L(50bytes), L(table_64bytes))
+ .int JMPTBL (L(51bytes), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(53bytes), L(table_64bytes))
+ .int JMPTBL (L(54bytes), L(table_64bytes))
+ .int JMPTBL (L(55bytes), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(57bytes), L(table_64bytes))
+ .int JMPTBL (L(58bytes), L(table_64bytes))
+ .int JMPTBL (L(59bytes), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(61bytes), L(table_64bytes))
+ .int JMPTBL (L(62bytes), L(table_64bytes))
+ .int JMPTBL (L(63bytes), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(65bytes), L(table_64bytes))
+ .int JMPTBL (L(66bytes), L(table_64bytes))
+ .int JMPTBL (L(67bytes), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(69bytes), L(table_64bytes))
+ .int JMPTBL (L(70bytes), L(table_64bytes))
+ .int JMPTBL (L(71bytes), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(73bytes), L(table_64bytes))
+ .int JMPTBL (L(74bytes), L(table_64bytes))
+ .int JMPTBL (L(75bytes), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(77bytes), L(table_64bytes))
+ .int JMPTBL (L(78bytes), L(table_64bytes))
+ .int JMPTBL (L(79bytes), L(table_64bytes))
+#else
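+/*
+ * For wmemcmp the count argument is in wchar_t units, so the byte count
+ * handled here is always a multiple of 4; the remaining slots are
+ * unreachable and map to L(unreal_case).
+ */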
+L(table_64bytes):
+ .int JMPTBL (L(0bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(4bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(8bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(12bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(16bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(20bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(24bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(28bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(32bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(36bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(40bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(44bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(48bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(52bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(56bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(60bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(64bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(68bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(72bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(76bytes), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+ .int JMPTBL (L(unreal_case), L(table_64bytes))
+#endif
diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/string/ssse3-strcmp-slm.S
new file mode 100644
index 0000000..0dd8c27
--- /dev/null
+++ b/libc/arch-x86_64/string/ssse3-strcmp-slm.S
@@ -0,0 +1,1925 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+ if the new counter > the old one or is 0. */
+#define UPDATE_STRNCMP_COUNTER \
+ /* calculate left number to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb L(strcmp_exitz); \
+ test %r9, %r9; \
+ je L(strcmp_exitz); \
+ mov %r9, %r11
+
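+/*
+ * Roughly, in C (the names here are illustrative, not from this file):
+ *
+ *   size_t remaining = n - (16 - offset);  // lea -16(%rcx, %r11), %r9
+ *   if (n < remaining || remaining == 0)   // unsigned wrap or count spent
+ *     return 0;                            // -> L(strcmp_exitz)
+ *   n = remaining;
+ */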
+#else
+#define UPDATE_STRNCMP_COUNTER
+#ifndef STRCMP
+#define STRCMP strcmp
+#endif
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+#define RETURN ret
+ .section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+/*
+ * This implementation uses SSE2/SSSE3 instructions to compare up to 16
+ * bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
+ test %rdx, %rdx
+ je L(strcmp_exitz)
+ cmp $1, %rdx
+ je L(Byte0)
+ mov %rdx, %r11
+#endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+ cmp $0x30, %ecx
+ ja L(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja L(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
+ jnz L(less16bytes) /* If not, find different value or null char */
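+	/*
+	 * After the psubb, a byte of %xmm1 is 0xff only where the inputs
+	 * match and the byte is not NUL, so %edx == 0xffff means "all 16
+	 * bytes equal, no terminator seen".
+	 */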
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+	jbe L(strcmp_exitz) /* finish comparison */
+#endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte alignment.
+ * Use relative offset difference between the two to determine which case
+ * below to use.
+ */
+ .p2align 4
+L(crosscache):
+	and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
+	and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je L(ashr_0) /* rsi and rdi relative offset same */
+ ja L(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+L(bigger):
+ lea 15(%rax), %r9
+ sub %rcx, %r9
+ lea L(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9
+ lea (%r10, %r9), %r10
+ jmp *%r10 /* jump to corresponding case */
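+/*
+ * In effect (illustrative C, not in this file):
+ *
+ *   index = 15 + rdi_offset - rsi_offset;  // 0..14 after the swap above
+ *   goto *(table_base + unaligned_table[index]);
+ */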
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+L(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+	 * %edx must equal %r9d if the bytes from the string start to the
+	 * end of this aligned 16-byte block are equal and no null char
+	 * was seen.
+ */
+ jne L(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+	 * Now both strings are aligned on a 16-byte boundary. Loop over the
+	 * strings, checking 32 bytes per iteration.
+ */
+ .p2align 4
+L(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit) /* mismatch or null char seen */
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ jmp L(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+L(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz L(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+	mov $16, %rcx /* index for loads */
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
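+	/*
+	 * A rough C model of this page check (illustrative only):
+	 *
+	 *   long r10 = ((uintptr_t)(rdi + 1) & 0xfff) - 0x1000;
+	 *   for (;;) {
+	 *     r10 += 16;
+	 *     if (r10 > 0) break;  // next 16-byte load would cross a page
+	 *     ... compare the next 16 bytes ...
+	 *   }
+	 */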
+
+ .p2align 4
+L(loop_ashr_1):
+ add $16, %r10
+ jg L(nibble_ashr_1) /* cross page boundary */
+
+L(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_1)
+
+ /*
+	 * The nibble path avoids 16-byte loads that would cross a page
+	 * boundary and might touch unmapped memory.
+ */
+ .p2align 4
+L(nibble_ashr_1):
+	pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
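+	/* only bytes 1..15 of %xmm3 belong to this window; ignore bit 0 */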
+ test $0xfffe, %edx
+	jnz L(ashr_1_exittail) /* null char found */
+
+#ifdef USE_AS_STRNCMP
+ cmp $14, %r11
+ jbe L(ashr_1_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+	sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp L(gobble_ashr_1)
+
+ /*
+	 * Once a null char is found, determine whether there is a string
+	 * mismatch before it.
+ */
+ .p2align 4
+L(ashr_1_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+L(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_2):
+ add $16, %r10
+ jg L(nibble_ashr_2)
+
+L(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_2) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_2)
+
+ .p2align 4
+L(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffc, %edx
+ jnz L(ashr_2_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $13, %r11
+ jbe L(ashr_2_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_2)
+
+ .p2align 4
+L(ashr_2_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+L(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_3):
+ add $16, %r10
+ jg L(nibble_ashr_3)
+
+L(gobble_ashr_3):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_3) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_3)
+
+ .p2align 4
+L(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff8, %edx
+ jnz L(ashr_3_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $12, %r11
+ jbe L(ashr_3_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_3)
+
+ .p2align 4
+L(ashr_3_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+L(ashr_4):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_4):
+ add $16, %r10
+ jg L(nibble_ashr_4)
+
+L(gobble_ashr_4):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_4) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_4)
+
+ .p2align 4
+L(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff0, %edx
+ jnz L(ashr_4_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $11, %r11
+ jbe L(ashr_4_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_4)
+
+ .p2align 4
+L(ashr_4_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+L(ashr_5):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_5):
+ add $16, %r10
+ jg L(nibble_ashr_5)
+
+L(gobble_ashr_5):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_5) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_5)
+
+ .p2align 4
+L(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffe0, %edx
+ jnz L(ashr_5_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $10, %r11
+ jbe L(ashr_5_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_5)
+
+ .p2align 4
+L(ashr_5_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ */
+ .p2align 4
+L(ashr_6):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_6):
+ add $16, %r10
+ jg L(nibble_ashr_6)
+
+L(gobble_ashr_6):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_6) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_6)
+
+ .p2align 4
+L(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffc0, %edx
+ jnz L(ashr_6_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $9, %r11
+ jbe L(ashr_6_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_6)
+
+ .p2align 4
+L(ashr_6_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ */
+ .p2align 4
+L(ashr_7):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $9, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_7):
+ add $16, %r10
+ jg L(nibble_ashr_7)
+
+L(gobble_ashr_7):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_7) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_7)
+
+ .p2align 4
+L(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff80, %edx
+ jnz L(ashr_7_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $8, %r11
+ jbe L(ashr_7_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_7)
+
+ .p2align 4
+L(ashr_7_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $7, %xmm0
+ psrldq $7, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ */
+ .p2align 4
+L(ashr_8):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $8, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_8):
+ add $16, %r10
+ jg L(nibble_ashr_8)
+
+L(gobble_ashr_8):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_8) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_8)
+
+ .p2align 4
+L(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff00, %edx
+ jnz L(ashr_8_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $7, %r11
+ jbe L(ashr_8_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_8)
+
+ .p2align 4
+L(ashr_8_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $8, %xmm0
+ psrldq $8, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ */
+ .p2align 4
+L(ashr_9):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $7, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_9):
+ add $16, %r10
+ jg L(nibble_ashr_9)
+
+L(gobble_ashr_9):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_9) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* store for next cycle */
+ jmp L(loop_ashr_9)
+
+ .p2align 4
+L(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfe00, %edx
+ jnz L(ashr_9_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $6, %r11
+ jbe L(ashr_9_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_9)
+
+ .p2align 4
+L(ashr_9_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $9, %xmm0
+ psrldq $9, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ */
+ .p2align 4
+L(ashr_10):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $6, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_10):
+ add $16, %r10
+ jg L(nibble_ashr_10)
+
+L(gobble_ashr_10):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_10) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_10)
+
+ .p2align 4
+L(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfc00, %edx
+ jnz L(ashr_10_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $5, %r11
+ jbe L(ashr_10_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_10)
+
+ .p2align 4
+L(ashr_10_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $10, %xmm0
+ psrldq $10, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ */
+ .p2align 4
+L(ashr_11):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $5, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_11):
+ add $16, %r10
+ jg L(nibble_ashr_11)
+
+L(gobble_ashr_11):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_11) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_11)
+
+ .p2align 4
+L(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf800, %edx
+ jnz L(ashr_11_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $4, %r11
+ jbe L(ashr_11_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_11)
+
+ .p2align 4
+L(ashr_11_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $11, %xmm0
+ psrldq $11, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ */
+ .p2align 4
+L(ashr_12):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $4, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_12):
+ add $16, %r10
+ jg L(nibble_ashr_12)
+
+L(gobble_ashr_12):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_12) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_12)
+
+ .p2align 4
+L(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf000, %edx
+ jnz L(ashr_12_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $3, %r11
+ jbe L(ashr_12_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_12)
+
+ .p2align 4
+L(ashr_12_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $12, %xmm0
+ psrldq $12, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ */
+ .p2align 4
+L(ashr_13):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $3, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_13):
+ add $16, %r10
+ jg L(nibble_ashr_13)
+
+L(gobble_ashr_13):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_13) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_13)
+
+ .p2align 4
+L(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xe000, %edx
+ jnz L(ashr_13_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $2, %r11
+ jbe L(ashr_13_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_13)
+
+ .p2align 4
+L(ashr_13_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $13, %xmm0
+ psrldq $13, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ */
+ .p2align 4
+L(ashr_14):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $2, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_14):
+ add $16, %r10
+ jg L(nibble_ashr_14)
+
+L(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_14)
+
+ .p2align 4
+L(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx
+ jnz L(ashr_14_exittail)
+
+#ifdef USE_AS_STRNCMP
+ cmp $1, %r11
+ jbe L(ashr_14_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_14)
+
+ .p2align 4
+L(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0
+ psrldq $14, %xmm3
+ jmp L(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+L(ashr_15):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $1, %xmm2
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz L(less32bytes)
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+	 * Set up %r10 so that we can detect when the next 16-byte load
+	 * would cross a page boundary: once %r10 goes positive, we fall
+	 * back to the byte-wise "nibble" path.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+L(loop_ashr_15):
+ add $16, %r10
+ jg L(nibble_ashr_15)
+
+L(gobble_ashr_15):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg L(nibble_ashr_15) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz L(exit)
+
+#ifdef USE_AS_STRNCMP
+ sub $16, %r11
+ jbe L(strcmp_exitz)
+#endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp L(loop_ashr_15)
+
+ .p2align 4
+L(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0x8000, %edx
+ jnz L(ashr_15_exittail)
+
+#ifdef USE_AS_STRNCMP
+ test %r11, %r11
+ je L(ashr_15_exittail)
+#endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp L(gobble_ashr_15)
+
+ .p2align 4
+L(ashr_15_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $15, %xmm3
+ psrldq $15, %xmm0
+
+ .p2align 4
+L(aftertail):
+ pcmpeqb %xmm3, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ not %edx
+
+ .p2align 4
+L(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+L(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz L(ret)
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+L(ret):
+L(less16bytes):
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+#ifdef USE_AS_STRNCMP
+ sub %rdx, %r11
+ jbe L(strcmp_exitz)
+#endif
+ movzbl (%rsi, %rdx), %ecx
+ movzbl (%rdi, %rdx), %eax
+
+ sub %ecx, %eax
+ ret
+
+L(strcmp_exitz):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(Byte0):
+ movzx (%rsi), %ecx
+ movzx (%rdi), %eax
+
+ sub %ecx, %eax
+ ret
+END (STRCMP)
+
+ .section .rodata,"a",@progbits
+ .p2align 3
+L(unaligned_table):
+ .int L(ashr_1) - L(unaligned_table)
+ .int L(ashr_2) - L(unaligned_table)
+ .int L(ashr_3) - L(unaligned_table)
+ .int L(ashr_4) - L(unaligned_table)
+ .int L(ashr_5) - L(unaligned_table)
+ .int L(ashr_6) - L(unaligned_table)
+ .int L(ashr_7) - L(unaligned_table)
+ .int L(ashr_8) - L(unaligned_table)
+ .int L(ashr_9) - L(unaligned_table)
+ .int L(ashr_10) - L(unaligned_table)
+ .int L(ashr_11) - L(unaligned_table)
+ .int L(ashr_12) - L(unaligned_table)
+ .int L(ashr_13) - L(unaligned_table)
+ .int L(ashr_14) - L(unaligned_table)
+ .int L(ashr_15) - L(unaligned_table)
+ .int L(ashr_0) - L(unaligned_table)
diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86_64/string/ssse3-strncmp-slm.S
similarity index 73%
copy from libc/arch-x86/string/cache.h
copy to libc/arch-x86_64/string/ssse3-strncmp-slm.S
index 9d0a563..0e40775 100644
--- a/libc/arch-x86/string/cache.h
+++ b/libc/arch-x86_64/string/ssse3-strncmp-slm.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,15 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(__slm__)
-/* Values are optimized for Silvermont */
-#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */
-#else
-/* Values are optimized for Atom */
-#define SHARED_CACHE_SIZE (512*1024) /* Atom L2 Cache */
-#define DATA_CACHE_SIZE (24*1024) /* Atom L1 Data Cache */
-#endif
-
-#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
-#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2)
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
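+/*
+ * Defining USE_AS_STRNCMP enables the length-counter paths
+ * (UPDATE_STRNCMP_COUNTER and the %r11 checks) in the included file.
+ */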
+#include "ssse3-strcmp-slm.S"
diff --git a/libc/arch-x86_64/x86_64.mk b/libc/arch-x86_64/x86_64.mk
index 9bce065..2bcf432 100644
--- a/libc/arch-x86_64/x86_64.mk
+++ b/libc/arch-x86_64/x86_64.mk
@@ -3,11 +3,7 @@
libc_common_src_files_x86_64 := \
bionic/index.cpp \
bionic/memchr.c \
- bionic/memcmp.c \
- bionic/memcpy.cpp \
- bionic/memmove.c \
bionic/memrchr.c \
- bionic/memset.c \
bionic/strchr.cpp \
bionic/strnlen.c \
bionic/strrchr.cpp \
@@ -18,16 +14,8 @@
upstream-freebsd/lib/libc/string/wcslen.c \
upstream-freebsd/lib/libc/string/wcsrchr.c \
upstream-freebsd/lib/libc/string/wmemcmp.c \
- upstream-openbsd/lib/libc/string/bcopy.c \
- upstream-openbsd/lib/libc/string/strcat.c \
- upstream-openbsd/lib/libc/string/strcmp.c \
- upstream-openbsd/lib/libc/string/strcpy.c \
upstream-openbsd/lib/libc/string/strlcat.c \
upstream-openbsd/lib/libc/string/strlcpy.c \
- upstream-openbsd/lib/libc/string/strlen.c \
- upstream-openbsd/lib/libc/string/strncat.c \
- upstream-openbsd/lib/libc/string/strncmp.c \
- upstream-openbsd/lib/libc/string/strncpy.c \
# Fortify implementations of libc functions.
libc_common_src_files_x86_64 += \
@@ -53,6 +41,23 @@
arch-x86_64/bionic/vfork.S \
bionic/__memcmp16.cpp \
+libc_bionic_src_files_x86_64 += \
+ arch-x86_64/string/sse2-bcopy-slm.S \
+ arch-x86_64/string/sse2-bzero-slm.S \
+ arch-x86_64/string/sse2-memcpy-slm.S \
+ arch-x86_64/string/sse2-memmove-slm.S \
+ arch-x86_64/string/sse2-memset-slm.S \
+ arch-x86_64/string/sse2-stpcpy-slm.S \
+ arch-x86_64/string/sse2-stpncpy-slm.S \
+ arch-x86_64/string/sse2-strcat-slm.S \
+ arch-x86_64/string/sse2-strcpy-slm.S \
+ arch-x86_64/string/sse2-strlen-slm.S \
+ arch-x86_64/string/sse2-strncat-slm.S \
+ arch-x86_64/string/sse2-strncpy-slm.S \
+ arch-x86_64/string/sse4-memcmp-slm.S \
+ arch-x86_64/string/ssse3-strcmp-slm.S \
+ arch-x86_64/string/ssse3-strncmp-slm.S \
+
libc_crt_target_cflags_x86_64 += \
-m64 \
-I$(LOCAL_PATH)/arch-x86_64/include
diff --git a/libc/bionic/wchar.cpp b/libc/bionic/wchar.cpp
index b46ad49..5da882f 100644
--- a/libc/bionic/wchar.cpp
+++ b/libc/bionic/wchar.cpp
@@ -32,23 +32,69 @@
#include <wchar.h>
//
-// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a 12-byte mbstate_t
-// so we're backwards-compatible with our LP32 ABI where mbstate_t was only 4 bytes. An additional
-// advantage of this is that callers who don't supply their own mbstate_t won't be accessing shared
-// state.
+// This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
+// 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
+// mbstate_t was only 4 bytes.
//
-// We also implement the POSIX interface directly rather than being accessed via function pointers.
+// The state is the UTF-8 sequence. We only support sequences of at most 4
+// bytes, so the LP32 mbstate_t already has enough space (of the 4 available
+// bytes we only need 3, since we never need to store the entire sequence in
+// the intermediate state).
+//
+// The C standard leaves the conversion state undefined after a bad conversion.
+// To avoid unexpected failures due to the possible use of the internal private
+// state we always reset the conversion state when encountering illegal
+// sequences.
+//
+// We also implement the POSIX interface directly rather than being accessed via
+// function pointers.
//
#define ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
#define ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
-int mbsinit(const mbstate_t*) {
- // We have no state, so we're always in the initial state.
- return 1;
+static size_t mbstate_bytes_so_far(const mbstate_t* ps) {
+ return
+ (ps->__seq[2] != 0) ? 3 :
+ (ps->__seq[1] != 0) ? 2 :
+ (ps->__seq[0] != 0) ? 1 : 0;
}
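+
+// A zero byte can never occur inside a multi-byte UTF-8 sequence, so the
+// first zero in __seq reliably marks how many bytes have been seen.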
-size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t*) {
+static void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
+ ps->__seq[i] = static_cast<uint8_t>(byte);
+}
+
+static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
+ return ps->__seq[n];
+}
+
+static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) {
+ errno = _errno;
+ *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+ return ERR_ILLEGAL_SEQUENCE;
+}
+
+static size_t reset_and_return(int _return, mbstate_t* ps) {
+ *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0;
+ return _return;
+}
+
+
+int mbsinit(const mbstate_t* ps) {
+ return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0));
+}
+
+size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
+ static mbstate_t __private_state;
+ mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
+ // We should never get to a state which has all 4 bytes of the sequence set.
+ // Full state verification is done when decoding the sequence (after we have
+ // all the bytes).
+ if (mbstate_get_byte(state, 3) != 0) {
+ return reset_and_return_illegal(EINVAL, state);
+ }
+
if (s == NULL) {
s = "";
n = 1;
@@ -59,8 +105,8 @@
return 0;
}
- int ch;
- if (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0) {
+ uint8_t ch;
+ if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) {
// Fast path for plain ASCII characters.
if (pwc != NULL) {
*pwc = ch;
@@ -82,7 +128,9 @@
// between character codes and their multibyte representations.
wchar_t lower_bound;
- ch = static_cast<uint8_t>(*s);
+ // The first byte in the state (if any) tells the length.
+ size_t bytes_so_far = mbstate_bytes_so_far(state);
+ ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s);
if ((ch & 0x80) == 0) {
mask = 0x7f;
length = 1;
@@ -101,106 +149,144 @@
lower_bound = 0x10000;
} else {
// Malformed input; input is not UTF-8. See RFC 3629.
- errno = EILSEQ;
- return ERR_ILLEGAL_SEQUENCE;
+ return reset_and_return_illegal(EILSEQ, state);
+ }
+
+ // Fill in the state.
+ size_t bytes_wanted = length - bytes_so_far;
+ size_t i;
+ for (i = 0; i < MIN(bytes_wanted, n); i++) {
+ if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) {
+ // Malformed input; bad characters in the middle of a character.
+ return reset_and_return_illegal(EILSEQ, state);
+ }
+ mbstate_set_byte(state, bytes_so_far + i, *s++);
+ }
+ if (i < bytes_wanted) {
+ return ERR_INCOMPLETE_SEQUENCE;
}
// Decode the octet sequence representing the character in chunks
// of 6 bits, most significant first.
- wchar_t wch = static_cast<uint8_t>(*s++) & mask;
- size_t i;
- for (i = 1; i < MIN(length, n); i++) {
- if ((*s & 0xc0) != 0x80) {
- // Malformed input; bad characters in the middle of a character.
- errno = EILSEQ;
- return ERR_ILLEGAL_SEQUENCE;
- }
+ wchar_t wch = mbstate_get_byte(state, 0) & mask;
+ for (i = 1; i < length; i++) {
wch <<= 6;
- wch |= *s++ & 0x3f;
+ wch |= mbstate_get_byte(state, i) & 0x3f;
}
- if (i < length) {
- return ERR_INCOMPLETE_SEQUENCE;
- }
+
if (wch < lower_bound) {
// Malformed input; redundant encoding.
- errno = EILSEQ;
- return ERR_ILLEGAL_SEQUENCE;
+ return reset_and_return_illegal(EILSEQ, state);
}
if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) {
// Malformed input; invalid code points.
- errno = EILSEQ;
- return ERR_ILLEGAL_SEQUENCE;
+ return reset_and_return_illegal(EILSEQ, state);
}
if (pwc != NULL) {
*pwc = wch;
}
- return (wch == L'\0' ? 0 : length);
+ return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state);
}
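
A minimal sketch of the byte-at-a-time decoding this enables (not part of the
patch; it mirrors the tests added to wchar_test.cpp below):

    #include <string.h>
    #include <wchar.h>

    static void decode_incrementally() {
      wchar_t out;
      mbstate_t ps;
      memset(&ps, 0, sizeof(ps));
      // U+20AC (the euro sign) is 0xe2 0x82 0xac in UTF-8.
      mbrtowc(&out, "\xe2", 1, &ps);  // (size_t)-2: incomplete, byte stashed in ps.
      mbrtowc(&out, "\x82", 1, &ps);  // (size_t)-2: still incomplete.
      mbrtowc(&out, "\xac", 1, &ps);  // 1: out == 0x20ac, ps back in its initial state.
    }
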
size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
+ static mbstate_t __private_state;
+ mbstate_t* state = (ps == NULL) ? &__private_state : ps;
size_t i, o, r;
if (dst == NULL) {
+ /*
+ * The fast path in the loop below is not safe if an ASCII
+ * character appears as anything but the first byte of a
+ * multibyte sequence. Check now to avoid doing it in the loop.
+ */
+ if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
+ && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
+ return reset_and_return_illegal(EILSEQ, state);
+ }
for (i = o = 0; i < nmc; i += r, o++) {
if (static_cast<uint8_t>((*src)[i]) < 0x80) {
// Fast path for plain ASCII characters.
if ((*src)[i] == '\0') {
- return o;
+ return reset_and_return(o, state);
}
r = 1;
} else {
- r = mbrtowc(NULL, *src + i, nmc - i, ps);
+ r = mbrtowc(NULL, *src + i, nmc - i, state);
if (r == ERR_ILLEGAL_SEQUENCE) {
- return r;
+ return reset_and_return_illegal(EILSEQ, state);
}
if (r == ERR_INCOMPLETE_SEQUENCE) {
- return o;
+ return reset_and_return_illegal(EILSEQ, state);
}
if (r == 0) {
- return o;
+ return reset_and_return(o, state);
}
}
}
- return o;
+ return reset_and_return(o, state);
}
+ /*
+ * The fast path in the loop below is not safe if an ASCII
+ * character appears as anything but the first byte of a
+ * multibyte sequence. Check now to avoid doing it in the loop.
+ */
+ if ((nmc > 0) && (mbstate_bytes_so_far(state) > 0)
+ && (static_cast<uint8_t>((*src)[0]) < 0x80)) {
+ return reset_and_return_illegal(EILSEQ, state);
+ }
for (i = o = 0; i < nmc && o < len; i += r, o++) {
if (static_cast<uint8_t>((*src)[i]) < 0x80) {
// Fast path for plain ASCII characters.
dst[o] = (*src)[i];
if ((*src)[i] == '\0') {
*src = NULL;
- return o;
+ return reset_and_return(o, state);
}
r = 1;
} else {
- r = mbrtowc(dst + o, *src + i, nmc - i, ps);
+ r = mbrtowc(dst + o, *src + i, nmc - i, state);
if (r == ERR_ILLEGAL_SEQUENCE) {
*src += i;
- return r;
+ return reset_and_return_illegal(EILSEQ, state);
}
if (r == ERR_INCOMPLETE_SEQUENCE) {
*src += nmc;
- return o;
+ return reset_and_return(o, state);
}
if (r == 0) {
*src = NULL;
- return o;
+ return reset_and_return(o, state);
}
}
}
*src += i;
- return o;
+ return reset_and_return(o, state);
}
size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
}
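
At the string level the same state threading looks like this sketch (the n and
src values assume the fixed NUL fast path above):

    #include <string.h>
    #include <wchar.h>

    static void decode_string() {
      wchar_t out[4];
      mbstate_t ps;
      memset(&ps, 0, sizeof(ps));
      const char* src = "A" "\xc2\xa2" "\xe2\x82\xac";  // 'A', U+00A2, U+20AC.
      size_t n = mbsrtowcs(out, &src, 4, &ps);
      // n == 3 and src == NULL: the terminating NUL was reached and converted.
      (void) n;
    }
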
-size_t wcrtomb(char* s, wchar_t wc, mbstate_t*) {
+size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
+ static mbstate_t __private_state;
+ mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
if (s == NULL) {
- // Reset to initial shift state (no-op).
- return 1;
+ // Equivalent to wcrtomb(buf, L'\0', ps).
+ return reset_and_return(1, state);
+ }
+
+ // POSIX states that if wc is a null wide character, a null byte shall be
+ // stored, preceded by any shift sequence needed to restore the initial shift
+ // state. Since shift states are not supported, only the null byte is stored.
+ if (wc == L'\0') {
+ *s = '\0';
+ return reset_and_return(1, state);
+ }
+
+ if (!mbsinit(state)) {
+ return reset_and_return_illegal(EILSEQ, state);
}
if ((wc & ~0x7f) == 0) {
@@ -246,6 +332,13 @@
}
size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
+ static mbstate_t __private_state;
+ mbstate_t* state = (ps == NULL) ? &__private_state : ps;
+
+ if (!mbsinit(state)) {
+ return reset_and_return_illegal(EILSEQ, state);
+ }
+
char buf[MB_LEN_MAX];
size_t i, o, r;
if (dst == NULL) {
@@ -258,7 +351,7 @@
}
r = 1;
} else {
- r = wcrtomb(buf, wc, ps);
+ r = wcrtomb(buf, wc, state);
if (r == ERR_ILLEGAL_SEQUENCE) {
return r;
}
@@ -279,14 +372,14 @@
r = 1;
} else if (len - o >= sizeof(buf)) {
// Enough space to translate in-place.
- r = wcrtomb(dst + o, wc, ps);
+ r = wcrtomb(dst + o, wc, state);
if (r == ERR_ILLEGAL_SEQUENCE) {
*src += i;
return r;
}
} else {
// May not be enough space; use temp buffer.
- r = wcrtomb(buf, wc, ps);
+ r = wcrtomb(buf, wc, state);
if (r == ERR_ILLEGAL_SEQUENCE) {
*src += i;
return r;
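
The wcrtomb(3) rules above reduce to two cases, sketched here after the new
wchar_test.cpp cases (not part of the patch):

    #include <errno.h>
    #include <limits.h>
    #include <string.h>
    #include <wchar.h>

    static void wcrtomb_reset_rules() {
      char out[MB_LEN_MAX];
      mbstate_t ps;

      // Encoding into a non-initial state is an error (and resets the state).
      memset(&ps, 0, sizeof(ps));
      mbrtowc(NULL, "\xc2", 1, &ps);  // (size_t)-2: partial sequence stashed.
      wcrtomb(out, 0x00a2, &ps);      // (size_t)-1 with errno == EILSEQ.

      // A NULL buffer (or encoding L'\0') resets the shift state.
      memset(&ps, 0, sizeof(ps));
      mbrtowc(NULL, "\xc2", 1, &ps);
      wcrtomb(NULL, 0x00a2, &ps);     // Returns 1; mbsinit(&ps) is now true.
    }
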
diff --git a/libc/include/dirent.h b/libc/include/dirent.h
index bfe4ea4..71eb2e7 100644
--- a/libc/include/dirent.h
+++ b/libc/include/dirent.h
@@ -56,6 +56,8 @@
struct dirent { __DIRENT64_BODY };
struct dirent64 { __DIRENT64_BODY };
+#undef __DIRENT64_BODY
+
#define d_fileno d_ino
typedef struct DIR DIR;
diff --git a/libc/include/sched.h b/libc/include/sched.h
index 68115bb..e43b6cc 100644
--- a/libc/include/sched.h
+++ b/libc/include/sched.h
@@ -59,10 +59,10 @@
extern int sched_getcpu(void);
extern int setns(int, int);
-#ifdef __LP32__
-#define CPU_SETSIZE 32
-#else
+#ifdef __LP64__
#define CPU_SETSIZE 1024
+#else
+#define CPU_SETSIZE 32
#endif
#define __CPU_BITTYPE unsigned long int /* mandated by the kernel */
diff --git a/libc/include/sys/stat.h b/libc/include/sys/stat.h
index e62e76d..c0c168b 100644
--- a/libc/include/sys/stat.h
+++ b/libc/include/sys/stat.h
@@ -130,6 +130,8 @@
struct stat { __STAT64_BODY };
struct stat64 { __STAT64_BODY };
+#undef __STAT64_BODY
+
#define st_atimensec st_atime_nsec
#define st_mtimensec st_mtime_nsec
#define st_ctimensec st_ctime_nsec
diff --git a/libc/include/sys/vfs.h b/libc/include/sys/vfs.h
index cd6044d..5358ffb 100644
--- a/libc/include/sys/vfs.h
+++ b/libc/include/sys/vfs.h
@@ -107,6 +107,8 @@
struct statfs { __STATFS64_BODY };
struct statfs64 { __STATFS64_BODY };
+#undef __STATFS64_BODY
+
/* Declare that we have the f_namelen, f_frsize, and f_flags fields. */
#define _STATFS_F_NAMELEN
#define _STATFS_F_FRSIZE
diff --git a/libc/include/wchar.h b/libc/include/wchar.h
index fe2fe07..4ac468d 100644
--- a/libc/include/wchar.h
+++ b/libc/include/wchar.h
@@ -41,11 +41,9 @@
typedef __WINT_TYPE__ wint_t;
typedef struct {
-#ifdef __LP32__
- int dummy;
-#else
- // 8 bytes should be enough to support at least UTF-8
- char __reserved[8];
+ uint8_t __seq[4];
+#ifdef __LP64__
+ char __reserved[4];
#endif
} mbstate_t;
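
The net ABI effect: mbstate_t stays 4 bytes on LP32 and grows to 8 bytes on
LP64. A compile-time check along these lines (ours, not in the patch) would pin
the assumption down:

    #include <wchar.h>

    #ifdef __LP64__
    static_assert(sizeof(mbstate_t) == 8, "mbstate_t must stay 8 bytes on LP64");
    #else
    static_assert(sizeof(mbstate_t) == 4, "mbstate_t must stay 4 bytes for the LP32 ABI");
    #endif
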
diff --git a/linker/Android.mk b/linker/Android.mk
index f0e6c13..d2bcfaf 100644
--- a/linker/Android.mk
+++ b/linker/Android.mk
@@ -6,6 +6,7 @@
debugger.cpp \
dlfcn.cpp \
linker.cpp \
+ linker_allocator.cpp \
linker_environ.cpp \
linker_phdr.cpp \
rt.cpp \
@@ -67,3 +68,5 @@
LOCAL_INTERMEDIATE_TARGETS :=
include $(LOCAL_PATH)/linker_executable.mk
endif
+
+include $(call first-makefiles-under,$(LOCAL_PATH))
diff --git a/linker/linker.cpp b/linker/linker.cpp
old mode 100755
new mode 100644
index b61e041..df53a84
--- a/linker/linker.cpp
+++ b/linker/linker.cpp
@@ -48,6 +48,7 @@
#include "linker_debug.h"
#include "linker_environ.h"
#include "linker_phdr.h"
+#include "linker_allocator.h"
/* >>> IMPORTANT NOTE - READ ME BEFORE MODIFYING <<<
*
@@ -69,14 +70,8 @@
// We can't use malloc(3) in the dynamic linker. We use a linked list of anonymous
// maps, each a single page in size. The pages are broken up into as many struct soinfo
-// objects as will fit, and they're all threaded together on a free list.
-#define SOINFO_PER_POOL ((PAGE_SIZE - sizeof(soinfo_pool_t*)) / sizeof(soinfo))
-struct soinfo_pool_t {
- soinfo_pool_t* next;
- soinfo info[SOINFO_PER_POOL];
-};
-static struct soinfo_pool_t* gSoInfoPools = NULL;
-static soinfo* gSoInfoFreeList = NULL;
+// objects as will fit.
+static LinkerAllocator<soinfo> gSoInfoAllocator;
static soinfo* solist = &libdl_info;
static soinfo* sonext = &libdl_info;
@@ -269,56 +264,13 @@
rtld_db_dlactivity();
}
-static bool ensure_free_list_non_empty() {
- if (gSoInfoFreeList != NULL) {
- return true;
- }
-
- // Allocate a new pool.
- soinfo_pool_t* pool = reinterpret_cast<soinfo_pool_t*>(mmap(NULL, sizeof(*pool),
- PROT_READ|PROT_WRITE,
- MAP_PRIVATE|MAP_ANONYMOUS, 0, 0));
- if (pool == MAP_FAILED) {
- return false;
- }
-
- // Add the pool to our list of pools.
- pool->next = gSoInfoPools;
- gSoInfoPools = pool;
-
- // Chain the entries in the new pool onto the free list.
- gSoInfoFreeList = &pool->info[0];
- soinfo* next = NULL;
- for (int i = SOINFO_PER_POOL - 1; i >= 0; --i) {
- pool->info[i].next = next;
- next = &pool->info[i];
- }
-
- return true;
-}
-
-static void set_soinfo_pool_protection(int protection) {
- for (soinfo_pool_t* p = gSoInfoPools; p != NULL; p = p->next) {
- if (mprotect(p, sizeof(*p), protection) == -1) {
- abort(); // Can't happen.
- }
- }
-}
-
static soinfo* soinfo_alloc(const char* name) {
if (strlen(name) >= SOINFO_NAME_LEN) {
DL_ERR("library name \"%s\" too long", name);
return NULL;
}
- if (!ensure_free_list_non_empty()) {
- DL_ERR("out of memory when loading \"%s\"", name);
- return NULL;
- }
-
- // Take the head element off the free list.
- soinfo* si = gSoInfoFreeList;
- gSoInfoFreeList = gSoInfoFreeList->next;
+ soinfo* si = gSoInfoAllocator.alloc();
// Initialize the new element.
memset(si, 0, sizeof(soinfo));
@@ -357,8 +309,8 @@
if (si == sonext) {
sonext = prev;
}
- si->next = gSoInfoFreeList;
- gSoInfoFreeList = si;
+
+ gSoInfoAllocator.free(si);
}
@@ -794,8 +746,8 @@
munmap(reinterpret_cast<void*>(si->base), si->size);
notify_gdb_of_unload(si);
- soinfo_free(si);
si->ref_count = 0;
+ soinfo_free(si);
} else {
si->ref_count--;
TRACE("not unloading '%s', decrementing ref_count to %zd", si->name, si->ref_count);
@@ -822,19 +774,19 @@
DL_ERR("invalid extended flags to android_dlopen_ext: %x", extinfo->flags);
return NULL;
}
- set_soinfo_pool_protection(PROT_READ | PROT_WRITE);
+ gSoInfoAllocator.protect_all(PROT_READ | PROT_WRITE);
soinfo* si = find_library(name, extinfo);
if (si != NULL) {
si->CallConstructors();
}
- set_soinfo_pool_protection(PROT_READ);
+ gSoInfoAllocator.protect_all(PROT_READ);
return si;
}
int do_dlclose(soinfo* si) {
- set_soinfo_pool_protection(PROT_READ | PROT_WRITE);
+ gSoInfoAllocator.protect_all(PROT_READ | PROT_WRITE);
int result = soinfo_unload(si);
- set_soinfo_pool_protection(PROT_READ);
+ gSoInfoAllocator.protect_all(PROT_READ);
return result;
}
@@ -1382,7 +1334,7 @@
// The function may have called dlopen(3) or dlclose(3), so we need to ensure our data structures
// are still writable. This happens with our debug malloc (see http://b/7941716).
- set_soinfo_pool_protection(PROT_READ | PROT_WRITE);
+ gSoInfoAllocator.protect_all(PROT_READ | PROT_WRITE);
}
void soinfo::CallPreInitConstructors() {
@@ -1933,6 +1885,11 @@
ldpreload_env = linker_env_get("LD_PRELOAD");
}
+ // The linker does not run constructors for its own
+ // global variables, so we need to initialize
+ // the allocator explicitly.
+ gSoInfoAllocator.init();
+
INFO("[ android linker & debugger ]");
soinfo* si = soinfo_alloc(args.argv[0]);
@@ -2150,7 +2107,7 @@
args.abort_message_ptr = &gAbortMessage;
ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr);
- set_soinfo_pool_protection(PROT_READ);
+ gSoInfoAllocator.protect_all(PROT_READ);
// Return the address that the calling assembly stub should jump to.
return start_address;
diff --git a/linker/linker_allocator.cpp b/linker/linker_allocator.cpp
new file mode 100644
index 0000000..805844fc
--- /dev/null
+++ b/linker/linker_allocator.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "linker_allocator.h"
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+struct LinkerAllocatorPage {
+ LinkerAllocatorPage* next;
+ uint8_t bytes[PAGE_SIZE-sizeof(LinkerAllocatorPage*)];
+};
+
+struct FreeBlockInfo {
+ void* next_block;
+ size_t num_free_blocks;
+};
+
+LinkerBlockAllocator::LinkerBlockAllocator()
+ : block_size_(0),
+ page_list_(nullptr),
+ free_block_list_(nullptr)
+{}
+
+void LinkerBlockAllocator::init(size_t block_size) {
+ block_size_ = block_size < sizeof(FreeBlockInfo) ? sizeof(FreeBlockInfo) : block_size;
+}
+
+void* LinkerBlockAllocator::alloc() {
+ if (free_block_list_ == nullptr) {
+ create_new_page();
+ } else {
+ protect_page(free_block_list_, PROT_READ | PROT_WRITE);
+ }
+
+ FreeBlockInfo* block_info = reinterpret_cast<FreeBlockInfo*>(free_block_list_);
+ if (block_info->num_free_blocks > 1) {
+ FreeBlockInfo* next_block_info = reinterpret_cast<FreeBlockInfo*>(
+ reinterpret_cast<char*>(free_block_list_) + block_size_);
+ next_block_info->next_block = block_info->next_block;
+ next_block_info->num_free_blocks = block_info->num_free_blocks - 1;
+ free_block_list_ = next_block_info;
+ } else {
+ free_block_list_ = block_info->next_block;
+ }
+
+ block_info->next_block = nullptr;
+ block_info->num_free_blocks = 0;
+
+ return block_info;
+}
+
+void LinkerBlockAllocator::free(void* block) {
+ if (block == nullptr) {
+ return;
+ }
+
+ LinkerAllocatorPage* page = find_page(block);
+
+ if (page == nullptr) {
+ abort();
+ }
+
+ ssize_t offset = reinterpret_cast<uint8_t*>(block) - page->bytes;
+
+ if (offset % block_size_ != 0) {
+ abort();
+ }
+
+ FreeBlockInfo* block_info = reinterpret_cast<FreeBlockInfo*>(block);
+
+ protect_page(block_info, PROT_READ | PROT_WRITE);
+ block_info->next_block = free_block_list_;
+ block_info->num_free_blocks = 1;
+ protect_page(block_info, PROT_READ);
+
+ free_block_list_ = block_info;
+}
+
+void LinkerBlockAllocator::protect_all(int prot) {
+ for (LinkerAllocatorPage* page = page_list_; page != nullptr; page = page->next) {
+ if (mprotect(page, PAGE_SIZE, prot) == -1) {
+ abort();
+ }
+ }
+}
+
+void LinkerBlockAllocator::protect_page(void* block, int prot) {
+ LinkerAllocatorPage* page = find_page(block);
+ if (page == nullptr || mprotect(page, PAGE_SIZE, prot) == -1) {
+ abort();
+ }
+}
+
+void LinkerBlockAllocator::create_new_page() {
+ LinkerAllocatorPage* page = reinterpret_cast<LinkerAllocatorPage*>(mmap(nullptr, PAGE_SIZE,
+ PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0));
+ if (page == MAP_FAILED) {
+ abort(); // oom
+ }
+
+ FreeBlockInfo* first_block = reinterpret_cast<FreeBlockInfo*>(page->bytes);
+ first_block->next_block = free_block_list_;
+ first_block->num_free_blocks = (PAGE_SIZE - sizeof(LinkerAllocatorPage*))/block_size_;
+
+ free_block_list_ = first_block;
+
+ page->next = page_list_;
+ page_list_ = page;
+}
+
+LinkerAllocatorPage* LinkerBlockAllocator::find_page(void* block) {
+ if (block == nullptr) {
+ abort();
+ }
+
+ LinkerAllocatorPage* page = page_list_;
+ while (page != nullptr) {
+ // Recompute the bounds for each page; hoisting this out of the loop
+ // would only ever test against the first page.
+ const uint8_t* page_ptr = reinterpret_cast<const uint8_t*>(page);
+ if (block >= (page_ptr + sizeof(page->next)) && block < (page_ptr + PAGE_SIZE)) {
+ return page;
+ }
+
+ page = page->next;
+ }
+
+ abort();
+}
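
To make create_new_page()'s bookkeeping concrete, a back-of-the-envelope check
(illustrative numbers only; 16 is sizeof(FreeBlockInfo) on LP64):

    #include <stdio.h>
    #include <unistd.h>

    int main() {
      size_t page_size = sysconf(_SC_PAGE_SIZE);  // Typically 4096.
      size_t header = sizeof(void*);              // LinkerAllocatorPage::next.
      size_t block_size = 16;
      // The first free block doubles as the FreeBlockInfo describing the whole
      // run, so a single free-list entry covers every block in a fresh page.
      printf("blocks per page: %zu\n", (page_size - header) / block_size);
      return 0;
    }
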
diff --git a/linker/linker_allocator.h b/linker/linker_allocator.h
new file mode 100644
index 0000000..e5b63c5
--- /dev/null
+++ b/linker/linker_allocator.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2014 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LINKER_ALLOCATOR_H
+#define __LINKER_ALLOCATOR_H
+
+#include <stdlib.h>
+#include <limits.h>
+#include "private/bionic_macros.h"
+
+struct LinkerAllocatorPage;
+
+/*
+ * This class is a non-template version of LinkerAllocator.
+ * It keeps the code inside the .cpp file by keeping the
+ * interface template-free.
+ *
+ * Please use LinkerAllocator<type> where possible (i.e. everywhere).
+ */
+class LinkerBlockAllocator {
+ public:
+ LinkerBlockAllocator();
+
+ void init(size_t block_size);
+ void* alloc();
+ void free(void* block);
+ void protect_page(void* block, int prot);
+ void protect_all(int prot);
+
+ private:
+ void create_new_page();
+ LinkerAllocatorPage* find_page(void* block);
+
+ size_t block_size_;
+ LinkerAllocatorPage* page_list_;
+ void* free_block_list_;
+
+ DISALLOW_COPY_AND_ASSIGN(LinkerBlockAllocator);
+};
+
+/*
+ * A simple allocator for the dynamic linker. An allocator allocates instances
+ * of a single fixed-size type. Allocations are backed by page-sized private
+ * anonymous mmaps.
+ */
+template<typename T>
+class LinkerAllocator {
+ public:
+ LinkerAllocator() : block_allocator_() {}
+ void init() { block_allocator_.init(sizeof(T)); }
+ T* alloc() { return reinterpret_cast<T*>(block_allocator_.alloc()); }
+ void free(T* t) { block_allocator_.free(t); }
+ void protect_page(T* t, int prot) { block_allocator_.protect_page(t, prot); }
+ void protect_all(int prot) { block_allocator_.protect_all(prot); }
+ private:
+ LinkerBlockAllocator block_allocator_;
+ DISALLOW_COPY_AND_ASSIGN(LinkerAllocator);
+};
+#endif // __LINKER_ALLOCATOR_H
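
Putting it together, client code follows the pattern below (a sketch; Foo is a
stand-in type, and the real client is gSoInfoAllocator in linker.cpp):

    #include <sys/mman.h>
    #include "linker_allocator.h"

    struct Foo { int value; };

    static LinkerAllocator<Foo> gFooAllocator;

    void example() {
      // The linker never runs constructors for its globals: init() is explicit.
      gFooAllocator.init();
      Foo* foo = gFooAllocator.alloc();  // Note: blocks are not zero-initialized.
      foo->value = 42;
      gFooAllocator.protect_all(PROT_READ);               // Lock pages between updates.
      gFooAllocator.protect_all(PROT_READ | PROT_WRITE);  // Reopen before mutating.
      gFooAllocator.free(foo);
    }
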
diff --git a/linker/tests/Android.mk b/linker/tests/Android.mk
new file mode 100644
index 0000000..600fe69
--- /dev/null
+++ b/linker/tests/Android.mk
@@ -0,0 +1,38 @@
+#
+# Copyright (C) 2012 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+ifneq ($(BUILD_TINY_ANDROID),true)
+
+LOCAL_PATH:= $(call my-dir)
+
+include $(CLEAR_VARS)
+LOCAL_MULTILIB := both
+LOCAL_MODULE := linker-unit-tests
+LOCAL_MODULE_STEM_32 := $(LOCAL_MODULE)32
+LOCAL_MODULE_STEM_64 := $(LOCAL_MODULE)64
+
+LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk
+
+LOCAL_CFLAGS += -g -Wall -Wextra -Werror -std=gnu++11
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/../../libc/
+
+LOCAL_SRC_FILES := \
+ linker_allocator_test.cpp \
+ ../linker_allocator.cpp
+
+include $(BUILD_NATIVE_TEST)
+
+endif # !BUILD_TINY_ANDROID
diff --git a/linker/tests/linker_allocator_test.cpp b/linker/tests/linker_allocator_test.cpp
new file mode 100644
index 0000000..ccbdce6
--- /dev/null
+++ b/linker/tests/linker_allocator_test.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include <gtest/gtest.h>
+
+#include "../linker_allocator.h"
+
+#include <unistd.h>
+
+namespace {
+
+struct test_struct_nominal {
+ void* pointer;
+ ssize_t value;
+};
+
+/*
+ * This struct's size is below the allocator's minimum block size,
+ * which is 2*sizeof(void*) (i.e. sizeof(FreeBlockInfo)).
+ */
+struct test_struct_small {
+ char dummy_str[5];
+};
+
+/*
+ * A 1009-byte struct (1009 is prime, so blocks never tile a page evenly).
+ */
+struct test_struct_larger {
+ char dummy_str[1009];
+};
+
+static size_t kPageSize = sysconf(_SC_PAGE_SIZE);
+}  // namespace
+
+TEST(linker_allocator, test_nominal) {
+ LinkerAllocator<test_struct_nominal> allocator;
+ allocator.init();
+
+ test_struct_nominal* ptr1 = allocator.alloc();
+ ASSERT_TRUE(ptr1 != nullptr);
+ test_struct_nominal* ptr2 = allocator.alloc();
+ ASSERT_TRUE(ptr2 != nullptr);
+ // They should be next to each other within the same page.
+ ASSERT_EQ(ptr1+1, ptr2);
+
+ ptr1->value = 42;
+
+ allocator.protect_page(ptr1, PROT_READ);
+
+ allocator.free(ptr1);
+ allocator.free(ptr2);
+}
+
+TEST(linker_allocator, test_small) {
+ LinkerAllocator<test_struct_small> allocator;
+ allocator.init();
+
+ char* ptr1 = reinterpret_cast<char*>(allocator.alloc());
+ char* ptr2 = reinterpret_cast<char*>(allocator.alloc());
+
+ ASSERT_TRUE(ptr1 != nullptr);
+ ASSERT_TRUE(ptr2 != nullptr);
+ ASSERT_EQ(ptr1+2*sizeof(void*), ptr2);
+}
+
+TEST(linker_allocator, test_larger) {
+ LinkerAllocator<test_struct_larger> allocator;
+ allocator.init();
+
+ test_struct_larger* ptr1 = allocator.alloc();
+ test_struct_larger* ptr2 = allocator.alloc();
+
+ ASSERT_TRUE(ptr1 != nullptr);
+ ASSERT_TRUE(ptr2 != nullptr);
+
+ ASSERT_EQ(ptr1+1, ptr2);
+
+ allocator.protect_page(ptr2, PROT_READ);
+
+ // Let's allocate until we reach the next page.
+ size_t n = kPageSize/sizeof(test_struct_larger) + 1 - 2;
+
+ for (size_t i=0; i<n; ++i) {
+ ASSERT_TRUE(allocator.alloc() != nullptr);
+ }
+}
+
+static void protect_one_page() {
+ LinkerAllocator<test_struct_larger> allocator;
+ allocator.init();
+
+ // Number of allocations needed to reach the end of the first page.
+ size_t n = kPageSize/sizeof(test_struct_larger) - 1;
+ test_struct_larger* page1_ptr = allocator.alloc();
+
+ for (size_t i=0; i<n; ++i) {
+ allocator.alloc();
+ }
+
+ test_struct_larger* page2_ptr = allocator.alloc();
+
+ allocator.protect_page(page2_ptr, PROT_READ);
+
+ // Check that we still have access to page1.
+ page1_ptr->dummy_str[17] = 52;
+
+ fprintf(stderr, "trying to access protected page");
+
+ // this should result in segmentation fault
+ page2_ptr->dummy_str[12] = 3;
+}
+
+static void protect_all() {
+ LinkerAllocator<test_struct_larger> allocator;
+ allocator.init();
+
+ // Number of allocations needed to reach the end of the first page.
+ size_t n = kPageSize/sizeof(test_struct_larger) - 1;
+ test_struct_larger* page1_ptr = allocator.alloc();
+
+ for (size_t i=0; i<n; ++i) {
+ allocator.alloc();
+ }
+
+ test_struct_larger* page2_ptr = allocator.alloc();
+ allocator.protect_all(PROT_READ);
+ allocator.protect_all(PROT_READ | PROT_WRITE);
+ // Check that both pages are writable again.
+ page2_ptr->dummy_str[23] = 27;
+ page1_ptr->dummy_str[13] = 11;
+
+ allocator.protect_all(PROT_READ);
+ fprintf(stderr, "trying to access protected page");
+
+ // this should result in segmentation fault
+ page1_ptr->dummy_str[11] = 7;
+}
+
+TEST(linker_allocator, test_protect) {
+ testing::FLAGS_gtest_death_test_style = "threadsafe";
+ ASSERT_EXIT(protect_one_page(), testing::KilledBySignal(SIGSEGV), "trying to access protected page");
+ ASSERT_EXIT(protect_all(), testing::KilledBySignal(SIGSEGV), "trying to access protected page");
+}
+
diff --git a/tests/stdio_test.cpp b/tests/stdio_test.cpp
index 8f6ee2b..4725350 100644
--- a/tests/stdio_test.cpp
+++ b/tests/stdio_test.cpp
@@ -24,6 +24,9 @@
#include <sys/stat.h>
#include <unistd.h>
#include <wchar.h>
+#include <locale.h>
+
+#include "TemporaryFile.h"
TEST(stdio, tmpfile_fileno_fprintf_rewind_fgets) {
FILE* fp = tmpfile();
@@ -479,3 +482,127 @@
EXPECT_EQ(EBADF, errno);
#endif
}
+
+// Tests that we only get a consistent and correct fpos_t when using the f*pos
+// functions (i.e. the position never points inside a multibyte character).
+TEST(stdio, consistent_fpos_t) {
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+ uselocale(LC_GLOBAL_LOCALE);
+
+ FILE* fp = tmpfile();
+ ASSERT_TRUE(fp != NULL);
+
+ wchar_t mb_one_bytes = L'h';
+ wchar_t mb_two_bytes = 0x00a2;
+ wchar_t mb_three_bytes = 0x20ac;
+ wchar_t mb_four_bytes = 0x24b62;
+
+ // Write to file.
+ ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fputwc(mb_one_bytes, fp)));
+ ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fputwc(mb_two_bytes, fp)));
+ ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fputwc(mb_three_bytes, fp)));
+ ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fputwc(mb_four_bytes, fp)));
+
+ rewind(fp);
+
+ // Record each character position.
+ fpos_t pos1;
+ fpos_t pos2;
+ fpos_t pos3;
+ fpos_t pos4;
+ fpos_t pos5;
+ EXPECT_EQ(0, fgetpos(fp, &pos1));
+ ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ EXPECT_EQ(0, fgetpos(fp, &pos2));
+ ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ EXPECT_EQ(0, fgetpos(fp, &pos3));
+ ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ EXPECT_EQ(0, fgetpos(fp, &pos4));
+ ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ EXPECT_EQ(0, fgetpos(fp, &pos5));
+
+#ifdef __BIONIC__
+ // Bionic's fpos_t is just an alias for off_t. This is inherited from OpenBSD
+ // upstream. Glibc differs by storing the mbstate_t inside its fpos_t. In
+ // Bionic (and upstream OpenBSD) the mbstate_t is stored inside the FILE
+ // structure.
+ ASSERT_EQ(0, static_cast<off_t>(pos1));
+ ASSERT_EQ(1, static_cast<off_t>(pos2));
+ ASSERT_EQ(3, static_cast<off_t>(pos3));
+ ASSERT_EQ(6, static_cast<off_t>(pos4));
+ ASSERT_EQ(10, static_cast<off_t>(pos5));
+#endif
+
+ // Exercise back and forth movements of the position.
+ ASSERT_EQ(0, fsetpos(fp, &pos2));
+ ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ ASSERT_EQ(0, fsetpos(fp, &pos1));
+ ASSERT_EQ(mb_one_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ ASSERT_EQ(0, fsetpos(fp, &pos4));
+ ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ ASSERT_EQ(0, fsetpos(fp, &pos3));
+ ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fgetwc(fp)));
+ ASSERT_EQ(0, fsetpos(fp, &pos5));
+ ASSERT_EQ(WEOF, fgetwc(fp));
+
+ fclose(fp);
+}
+
+// Exercise the interaction between fpos and seek.
+TEST(stdio, fpos_t_and_seek) {
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+ uselocale(LC_GLOBAL_LOCALE);
+
+ // For glibc we need to close and re-open the file in order for fseek to work
+ // after using setlocale(LC_CTYPE, "C.UTF-8") and fputwc.
+ // TODO: find out if this is expected or a bug in glibc.
+ TemporaryFile tf;
+ FILE* fp = fdopen(tf.fd, "w+");
+ ASSERT_TRUE(fp != NULL);
+
+ wchar_t mb_two_bytes = 0x00a2;
+ wchar_t mb_three_bytes = 0x20ac;
+ wchar_t mb_four_bytes = 0x24b62;
+
+ // Write to file.
+ ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fputwc(mb_two_bytes, fp)));
+ ASSERT_EQ(mb_three_bytes, static_cast<wchar_t>(fputwc(mb_three_bytes, fp)));
+ ASSERT_EQ(mb_four_bytes, static_cast<wchar_t>(fputwc(mb_four_bytes, fp)));
+
+ fflush(fp);
+ fclose(fp);
+
+ fp = fopen(tf.filename, "r");
+ ASSERT_TRUE(fp != NULL);
+
+ // Store a valid position.
+ fpos_t mb_two_bytes_pos;
+ ASSERT_EQ(0, fgetpos(fp, &mb_two_bytes_pos));
+
+ // Move inside mb_four_bytes with fseek.
+ long offset_inside_mb = 6;
+ ASSERT_EQ(0, fseek(fp, offset_inside_mb, SEEK_SET));
+
+ // Store the "inside multi byte" position.
+ fpos_t pos_inside_mb;
+ ASSERT_EQ(0, fgetpos(fp, &pos_inside_mb));
+ #ifdef __BIONIC__
+ ASSERT_EQ(offset_inside_mb, static_cast<off_t>(pos_inside_mb));
+ #endif
+
+ // Reading from within a multibyte character should produce an error.
+ ASSERT_EQ(WEOF, fgetwc(fp));
+ ASSERT_EQ(EILSEQ, errno);
+
+ // Reverting to a valid position should work.
+ ASSERT_EQ(0, fsetpos(fp, &mb_two_bytes_pos));
+ ASSERT_EQ(mb_two_bytes, static_cast<wchar_t>(fgetwc(fp)));
+
+ // Moving within a multibyte character with fsetpos should work, but reading
+ // should produce an error.
+ ASSERT_EQ(0, fsetpos(fp, &pos_inside_mb));
+ ASSERT_EQ(WEOF, fgetwc(fp));
+ ASSERT_EQ(EILSEQ, errno);
+
+ fclose(fp);
+}
diff --git a/tests/wchar_test.cpp b/tests/wchar_test.cpp
index 0d15f21..30d7bff 100644
--- a/tests/wchar_test.cpp
+++ b/tests/wchar_test.cpp
@@ -87,6 +87,29 @@
EXPECT_EQ(EILSEQ, errno);
}
+TEST(wchar, wcrtomb_start_state) {
+ char out[MB_LEN_MAX];
+ mbstate_t ps;
+
+ // Any non-initial state is invalid when calling wcrtomb.
+ memset(&ps, 0, sizeof(ps));
+ EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+ EXPECT_EQ(static_cast<size_t>(-1), wcrtomb(out, 0x00a2, &ps));
+ EXPECT_EQ(EILSEQ, errno);
+
+ // If the first argument to wcrtomb is NULL or the second is L'\0', the shift
+ // state should be reset.
+ memset(&ps, 0, sizeof(ps));
+ EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+ EXPECT_EQ(1U, wcrtomb(NULL, 0x00a2, &ps));
+ EXPECT_TRUE(mbsinit(&ps));
+
+ memset(&ps, 0, sizeof(ps));
+ EXPECT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xf0\xa4", 1, &ps));
+ EXPECT_EQ(1U, wcrtomb(out, L'\0', &ps));
+ EXPECT_TRUE(mbsinit(&ps));
+}
+
TEST(wchar, wcstombs_wcrtombs) {
const wchar_t chars[] = { L'h', L'e', L'l', L'l', L'o', 0 };
const wchar_t bad_chars[] = { L'h', L'i', static_cast<wchar_t>(0xffffffff), 0 };
@@ -184,6 +207,14 @@
EXPECT_EQ(EILSEQ, errno);
bytes[3] = 0;
EXPECT_STREQ("hix", bytes);
+
+ // Any non-initial state is invalid when calling wcsrtombs.
+ mbstate_t ps;
+ src = chars;
+ memset(&ps, 0, sizeof(ps));
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(NULL, "\xc2", 1, &ps));
+ EXPECT_EQ(static_cast<size_t>(-1), wcsrtombs(NULL, &src, 0, &ps));
+ EXPECT_EQ(EILSEQ, errno);
}
TEST(wchar, limits) {
@@ -267,6 +298,83 @@
ASSERT_EQ(EILSEQ, errno);
}
+void test_mbrtowc_incomplete(mbstate_t* ps) {
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+ uselocale(LC_GLOBAL_LOCALE);
+
+ wchar_t out;
+ // 2-byte UTF-8.
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, ps));
+ ASSERT_EQ(1U, mbrtowc(&out, "\xa2" "cdef", 5, ps));
+ ASSERT_EQ(0x00a2, out);
+ ASSERT_TRUE(mbsinit(ps));
+ // 3-byte UTF-8.
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xe2", 1, ps));
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\x82", 1, ps));
+ ASSERT_EQ(1U, mbrtowc(&out, "\xac" "def", 4, ps));
+ ASSERT_EQ(0x20ac, out);
+ ASSERT_TRUE(mbsinit(ps));
+ // 4-byte UTF-8.
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xf0", 1, ps));
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xa4\xad", 2, ps));
+ ASSERT_EQ(1U, mbrtowc(&out, "\xa2" "ef", 3, ps));
+ ASSERT_EQ(0x24b62, out);
+ ASSERT_TRUE(mbsinit(ps));
+
+ // Invalid 2-byte sequence: 0x20 is not a continuation byte.
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, ps));
+ ASSERT_EQ(static_cast<size_t>(-1), mbrtowc(&out, "\x20" "cdef", 5, ps));
+ ASSERT_EQ(EILSEQ, errno);
+}
+
+TEST(wchar, mbrtowc_incomplete) {
+ mbstate_t ps;
+ memset(&ps, 0, sizeof(ps));
+
+ test_mbrtowc_incomplete(&ps);
+ test_mbrtowc_incomplete(NULL);
+}
+
+void test_mbsrtowcs(mbstate_t* ps) {
+ wchar_t out[4];
+
+ const char* valid = "A" "\xc2\xa2" "\xe2\x82\xac" "\xf0\xa4\xad\xa2" "ef";
+ ASSERT_EQ(4U, mbsrtowcs(out, &valid, 4, ps));
+ ASSERT_EQ(L'A', out[0]);
+ ASSERT_EQ(0x00a2, out[1]);
+ ASSERT_EQ(0x20ac, out[2]);
+ ASSERT_EQ(0x24b62, out[3]);
+ ASSERT_EQ('e', *valid);
+
+ const char* invalid = "A" "\xc2\x20" "ef";
+ ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(out, &invalid, 4, ps));
+ EXPECT_EQ(EILSEQ, errno);
+ ASSERT_EQ('\xc2', *invalid);
+
+ const char* incomplete = "A" "\xc2";
+ ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(out, &incomplete, 2, ps));
+ EXPECT_EQ(EILSEQ, errno);
+ ASSERT_EQ('\xc2', *incomplete);
+}
+
+TEST(wchar, mbsrtowcs) {
+ ASSERT_STREQ("C.UTF-8", setlocale(LC_CTYPE, "C.UTF-8"));
+ uselocale(LC_GLOBAL_LOCALE);
+
+ mbstate_t ps;
+ memset(&ps, 0, sizeof(ps));
+ test_mbsrtowcs(&ps);
+ test_mbsrtowcs(NULL);
+
+ // Invalid multibyte continuation.
+ const char* invalid = "\x20";
+ wchar_t out;
+ ASSERT_EQ(static_cast<size_t>(-2), mbrtowc(&out, "\xc2", 1, &ps));
+ ASSERT_EQ(static_cast<size_t>(-1), mbsrtowcs(&out, &invalid, 1, &ps));
+ EXPECT_EQ(EILSEQ, errno);
+ ASSERT_EQ('\x20', *invalid);
+}
+
TEST(wchar, wcstod) {
ASSERT_DOUBLE_EQ(1.23, wcstod(L"1.23", NULL));
}