bionic/x86: Optimize string routines
Add optimized implementations, tuned for Atom, of:
strcpy, strcat,
strncpy, strncat, strlcpy, strlcat,
memchr, memrchr, strchr, strrchr, index,
strnlen, strlen, wcslen, wmemcmp, wcscmp,
wcschr, wcsrchr, wcscpy, wcscat
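
The SSE2/SSSE3 wrapper files for bcopy, bzero, memcpy, memmove and
memset are removed: cache_wrapper.S becomes cache.h and is included
directly, and sse2-memset5-atom.S becomes sse2-memset-atom.S with its
entry point parameterized through a MEMSET macro so that bzero can
reuse the same code (sse2-bzero-atom.S). index and strnlen are built
as thin includes of sse2-strchr-atom.S and sse2-strlen-atom.S, and
sse2-strlen-atom.S gains USE_AS_STRNLEN and USE_AS_STRCAT support.
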
Change-Id: I82b29132edf9a2e144e0bb3ee4ff5217df8d2a6d
Signed-off-by: Liubov Dmitrieva <liubov.dmitrieva@intel.com>
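---
Reviewer note (below the fold, ignored by git am): a minimal sanity
check for the byte-scanning routines touched by this patch. This is
an illustrative sketch, not part of the change; it assumes a hosted C
environment (glibc or bionic) where memrchr() is available (declared
under _GNU_SOURCE on glibc).

#define _GNU_SOURCE     /* memrchr() is a GNU extension on glibc */
#include <assert.h>
#include <string.h>

int main(void) {
    char buf[128];
    memset(buf, 'a', sizeof(buf));
    buf[127] = '\0';          /* NUL-terminate for the str* routines  */
    buf[63]  = 'b';           /* lone match right at a 64-byte border */

    assert(memchr(buf, 'b', sizeof(buf)) == &buf[63]);
    assert(memrchr(buf, 'b', sizeof(buf)) == &buf[63]);
    assert(strchr(buf, 'b') == &buf[63]);
    assert(strrchr(buf, 'b') == &buf[63]);
    assert(memchr(buf, 'z', sizeof(buf)) == NULL);
    assert(strlen(buf) == 127);
    assert(strnlen(buf, 10) == 10);   /* limit shorter than string */
    return 0;
}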
diff --git a/libc/arch-x86/string/bcopy_wrapper.S b/libc/arch-x86/string/bcopy_wrapper.S
deleted file mode 100644
index fa8774c..0000000
--- a/libc/arch-x86/string/bcopy_wrapper.S
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY bcopy
-# define USE_AS_MEMMOVE
-# define USE_AS_BCOPY
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "bcopy.S"
-
-#endif
diff --git a/libc/arch-x86/string/bzero_wrapper.S b/libc/arch-x86/string/bzero_wrapper.S
deleted file mode 100644
index aa1bb9c..0000000
--- a/libc/arch-x86/string/bzero_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define USE_AS_BZERO
-# define sse2_memset5_atom bzero
-# include "sse2-memset5-atom.S"
-
-#else
-
-# include "bzero.S"
-
-#endif
diff --git a/libc/arch-x86/string/cache_wrapper.S b/libc/arch-x86/string/cache.h
similarity index 100%
rename from libc/arch-x86/string/cache_wrapper.S
rename to libc/arch-x86/string/cache.h
diff --git a/libc/arch-x86/string/memcpy_wrapper.S b/libc/arch-x86/string/memcpy_wrapper.S
deleted file mode 100644
index 7e765ea..0000000
--- a/libc/arch-x86/string/memcpy_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY memcpy
-# define USE_AS_MEMMOVE
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "memcpy.S"
-
-#endif
diff --git a/libc/arch-x86/string/memmove_wrapper.S b/libc/arch-x86/string/memmove_wrapper.S
deleted file mode 100644
index 7e83e27..0000000
--- a/libc/arch-x86/string/memmove_wrapper.S
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define MEMCPY memmove
-# define USE_AS_MEMMOVE
-# include "ssse3-memcpy5.S"
-
-#else
-
-# include "memmove.S"
-
-#endif
diff --git a/libc/arch-x86/string/memset_wrapper.S b/libc/arch-x86/string/memset_wrapper.S
deleted file mode 100644
index d037a50..0000000
--- a/libc/arch-x86/string/memset_wrapper.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# include "cache_wrapper.S"
-# undef __i686
-# define sse2_memset5_atom memset
-# include "sse2-memset5-atom.S"
-
-#else
-
-# include "memset.S"
-
-#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-bzero-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-bzero-atom.S
index fa0c672..0ddc499 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-bzero-atom.S
@@ -28,13 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define USE_AS_BZERO
+#define MEMSET bzero
+#include "sse2-memset-atom.S"
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-index-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-index-atom.S
index fa0c672..d51e1d4 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-index-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,5 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define strchr index
+#include "sse2-strchr-atom.S"
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/string/sse2-memchr-atom.S
new file mode 100644
index 0000000..013af9b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-memchr-atom.S
@@ -0,0 +1,556 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define ENTRANCE PUSH (%edi);
+#define PARMS 8
+#define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+
+#define STR1 PARMS
+#define STR2 STR1+4
+#define LEN STR2+4
+
+ .text
+ENTRY (memchr)
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+ test %edx, %edx
+ jz L(return_null)
+
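+/* Broadcast the byte to search for into all 16 bytes of xmm1:
+   two punpcklbw's widen it to a dword, pshufd splats the dword. */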
+ punpcklbw %xmm1, %xmm1
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+ cmp $48, %ecx
+ ja L(crosscache)
+
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case2_prolog)
+
+ sub $16, %edx
+ jbe L(return_null)
+ lea 16(%edi), %edi
+ and $15, %ecx
+ and $-16, %edi
+ add %ecx, %edx
+ sub $64, %edx
+ jbe L(exit_loop)
+ jmp L(loop_prolog)
+
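+/* The first 16 bytes would cross a 64-byte boundary: load from the
+   16-byte-aligned address below the start and shift the match mask
+   right to discard bytes that precede the buffer. */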
+ .p2align 4
+L(crosscache):
+ and $15, %ecx
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ sar %cl, %eax
+ test %eax, %eax
+
+ jnz L(match_case2_prolog1)
+ lea -16(%edx), %edx
+ add %ecx, %edx
+ jle L(return_null)
+ lea 16(%edi), %edi
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ .p2align 4
+L(loop_prolog):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 48(%edi), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ lea 64(%edi), %edi
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 48(%edi), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(match_case1)
+
+ lea 64(%edi), %edi
+ mov %edi, %ecx
+ and $-64, %edi
+ and $63, %ecx
+ add %ecx, %edx
+
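+/* Main loop: check 64 bytes per iteration.  pmaxub folds the four
+   compare results so a single pmovmskb detects a match anywhere. */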
+ .p2align 4
+L(align64_loop):
+ sub $64, %edx
+ jbe L(exit_loop)
+ movdqa (%edi), %xmm0
+ movdqa 16(%edi), %xmm2
+ movdqa 32(%edi), %xmm3
+ movdqa 48(%edi), %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ add $64, %edi
+ pmovmskb %xmm4, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ sub $64, %edi
+
+ pmovmskb %xmm0, %eax
+ xor %ecx, %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ pmovmskb %xmm2, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ lea 16(%ecx), %ecx
+ test %eax, %eax
+ jnz L(match_case1)
+
+ pcmpeqb 48(%edi), %xmm1
+ pmovmskb %xmm1, %eax
+ lea 16(%ecx), %ecx
+
+ .p2align 4
+L(match_case1):
+ add %ecx, %edi
+ test %al, %al
+ jz L(match_case1_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case1_8)
+ test $0x01, %al
+ jnz L(exit_case1_1)
+ test $0x02, %al
+ jnz L(exit_case1_2)
+ test $0x04, %al
+ jnz L(exit_case1_3)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_8):
+ test $0x10, %al
+ jnz L(exit_case1_5)
+ test $0x20, %al
+ jnz L(exit_case1_6)
+ test $0x40, %al
+ jnz L(exit_case1_7)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case1_high_8)
+ test $0x01, %ah
+ jnz L(exit_case1_9)
+ test $0x02, %ah
+ jnz L(exit_case1_10)
+ test $0x04, %ah
+ jnz L(exit_case1_11)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_high_8):
+ test $0x10, %ah
+ jnz L(exit_case1_13)
+ test $0x20, %ah
+ jnz L(exit_case1_14)
+ test $0x40, %ah
+ jnz L(exit_case1_15)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ xor %ecx, %ecx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ movdqa 16(%edi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $32, %edx
+ jbe L(return_null)
+
+ movdqa 32(%edi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb 48(%edi), %xmm1
+ lea 16(%ecx), %ecx
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(match_case2)
+
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_1):
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_2):
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_3):
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_5):
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_6):
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_7):
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_9):
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_10):
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_11):
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_13):
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_14):
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case1_15):
+ lea 14(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2):
+ sub %ecx, %edx
+L(match_case2_prolog1):
+ add %ecx, %edi
+L(match_case2_prolog):
+ test %al, %al
+ jz L(match_case2_high)
+ mov %al, %cl
+ and $15, %cl
+ jz L(match_case2_8)
+ test $0x01, %al
+ jnz L(exit_case2_1)
+ test $0x02, %al
+ jnz L(exit_case2_2)
+ test $0x04, %al
+ jnz L(exit_case2_3)
+ sub $4, %edx
+ jb L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_8):
+ test $0x10, %al
+ jnz L(exit_case2_5)
+ test $0x20, %al
+ jnz L(exit_case2_6)
+ test $0x40, %al
+ jnz L(exit_case2_7)
+ sub $8, %edx
+ jb L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high):
+ mov %ah, %ch
+ and $15, %ch
+ jz L(match_case2_high_8)
+ test $0x01, %ah
+ jnz L(exit_case2_9)
+ test $0x02, %ah
+ jnz L(exit_case2_10)
+ test $0x04, %ah
+ jnz L(exit_case2_11)
+ sub $12, %edx
+ jb L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_high_8):
+ test $0x10, %ah
+ jnz L(exit_case2_13)
+ test $0x20, %ah
+ jnz L(exit_case2_14)
+ test $0x40, %ah
+ jnz L(exit_case2_15)
+ sub $16, %edx
+ jb L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_1):
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_2):
+ sub $2, %edx
+ jb L(return_null)
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_3):
+ sub $3, %edx
+ jb L(return_null)
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_5):
+ sub $5, %edx
+ jb L(return_null)
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_6):
+ sub $6, %edx
+ jb L(return_null)
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_7):
+ sub $7, %edx
+ jb L(return_null)
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_9):
+ sub $9, %edx
+ jb L(return_null)
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_10):
+ sub $10, %edx
+ jb L(return_null)
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_11):
+ sub $11, %edx
+ jb L(return_null)
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_13):
+ sub $13, %edx
+ jb L(return_null)
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_14):
+ sub $14, %edx
+ jb L(return_null)
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(exit_case2_15):
+ sub $15, %edx
+ jb L(return_null)
+ lea 14(%edi), %eax
+ RETURN
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+END (memchr)
diff --git a/libc/arch-x86/string/sse2-memrchr-atom.S b/libc/arch-x86/string/sse2-memrchr-atom.S
new file mode 100644
index 0000000..1aa1a1a
--- /dev/null
+++ b/libc/arch-x86/string/sse2-memrchr-atom.S
@@ -0,0 +1,778 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 4
+#define STR1 PARMS
+#define STR2 STR1+4
+#define LEN STR2+4
+
+ .text
+ENTRY (memrchr)
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+ mov LEN(%esp), %edx
+
+ test %edx, %edx
+ jz L(return_null)
+ sub $16, %edx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ add %edx, %ecx
+ punpcklbw %xmm1, %xmm1
+
+ movdqu (%ecx), %xmm0
+ pshufd $0, %xmm1, %xmm1
+ pcmpeqb %xmm1, %xmm0
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ mov %ecx, %eax
+ and $15, %eax
+ jz L(loop_prolog)
+
+ add $16, %ecx
+ add $16, %edx
+ and $-16, %ecx
+ sub %eax, %edx
+
+ .p2align 4
+/* The loop starts on an aligned string. */
+L(loop_prolog):
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ mov %ecx, %eax
+ and $63, %eax
+ test %eax, %eax
+ jz L(align64_loop)
+
+ add $64, %ecx
+ add $64, %edx
+ and $-64, %ecx
+ sub %eax, %edx
+
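+/* Main loop: walk backwards 64 bytes at a time, folding the four
+   compare results with pmaxub so one pmovmskb detects any match. */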
+ .p2align 4
+L(align64_loop):
+ sub $64, %ecx
+ sub $64, %edx
+ jbe L(exit_loop)
+
+ movdqa (%ecx), %xmm0
+ movdqa 16(%ecx), %xmm2
+ movdqa 32(%ecx), %xmm3
+ movdqa 48(%ecx), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%ecx), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %edx
+ cmp $32, %edx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%ecx), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%ecx), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %edx
+ jbe L(return_null)
+
+ pcmpeqb (%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%ecx), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %edx
+ jbe L(return_null)
+
+ pcmpeqb 32(%ecx), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches16):
+ lea 16(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32):
+ lea 32(%ecx), %ecx
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48):
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch):
+ test %ah, %ah
+ jnz L(exit_dispatch_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(exit_dispatch_8)
+ test $0x08, %al
+ jnz L(exit_4)
+ test $0x04, %al
+ jnz L(exit_3)
+ test $0x02, %al
+ jnz L(exit_2)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_8):
+ test $0x80, %al
+ jnz L(exit_8)
+ test $0x40, %al
+ jnz L(exit_7)
+ test $0x20, %al
+ jnz L(exit_6)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(exit_dispatch_high_8)
+ test $0x08, %ah
+ jnz L(exit_12)
+ test $0x04, %ah
+ jnz L(exit_11)
+ test $0x02, %ah
+ jnz L(exit_10)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_high_8):
+ test $0x80, %ah
+ jnz L(exit_16)
+ test $0x40, %ah
+ jnz L(exit_15)
+ test $0x20, %ah
+ jnz L(exit_14)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_2):
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_3):
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_4):
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_6):
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_7):
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_8):
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_10):
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_11):
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_12):
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_14):
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_15):
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_16):
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ lea -64(%edx), %edx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ lea -48(%edx), %edx
+ lea 16(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ lea -32(%edx), %edx
+ lea 32(%ecx), %ecx
+
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ lea -16(%edx), %edx
+ lea 48(%ecx), %ecx
+
+ .p2align 4
+L(exit_dispatch_1):
+ test %ah, %ah
+ jnz L(exit_dispatch_1_high)
+ mov %al, %ah
+ and $15 << 4, %ah
+ jnz L(exit_dispatch_1_8)
+ test $0x08, %al
+ jnz L(exit_1_4)
+ test $0x04, %al
+ jnz L(exit_1_3)
+ test $0x02, %al
+ jnz L(exit_1_2)
+
+ add $0, %edx
+ jl L(return_null)
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_8):
+ test $0x80, %al
+ jnz L(exit_1_8)
+ test $0x40, %al
+ jnz L(exit_1_7)
+ test $0x20, %al
+ jnz L(exit_1_6)
+
+ add $4, %edx
+ jl L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high):
+ mov %ah, %al
+ and $15 << 4, %al
+ jnz L(exit_dispatch_1_high_8)
+ test $0x08, %ah
+ jnz L(exit_1_12)
+ test $0x04, %ah
+ jnz L(exit_1_11)
+ test $0x02, %ah
+ jnz L(exit_1_10)
+
+ add $8, %edx
+ jl L(return_null)
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_dispatch_1_high_8):
+ test $0x80, %ah
+ jnz L(exit_1_16)
+ test $0x40, %ah
+ jnz L(exit_1_15)
+ test $0x20, %ah
+ jnz L(exit_1_14)
+
+ add $12, %edx
+ jl L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_2):
+ add $1, %edx
+ jl L(return_null)
+ lea 1(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_3):
+ add $2, %edx
+ jl L(return_null)
+ lea 2(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_4):
+ add $3, %edx
+ jl L(return_null)
+ lea 3(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_6):
+ add $5, %edx
+ jl L(return_null)
+ lea 5(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_7):
+ add $6, %edx
+ jl L(return_null)
+ lea 6(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_8):
+ add $7, %edx
+ jl L(return_null)
+ lea 7(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_10):
+ add $9, %edx
+ jl L(return_null)
+ lea 9(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_11):
+ add $10, %edx
+ jl L(return_null)
+ lea 10(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_12):
+ add $11, %edx
+ jl L(return_null)
+ lea 11(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_14):
+ add $13, %edx
+ jl L(return_null)
+ lea 13(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_15):
+ add $14, %edx
+ jl L(return_null)
+ lea 14(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit_1_16):
+ add $15, %edx
+ jl L(return_null)
+ lea 15(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%eax), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ mov %eax, %ecx
+ pmovmskb %xmm1, %eax
+
+ and %edx, %eax
+ test %eax, %eax
+ jnz L(exit_dispatch)
+
+ xor %eax, %eax
+ ret
+
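+/* Length < 16: mask the compare result down to the valid bytes
+   before dispatching on it. */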
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ add $16, %edx
+ punpcklbw %xmm1, %xmm1
+
+ mov %ecx, %eax
+ pshufd $0, %xmm1, %xmm1
+
+ and $15, %ecx
+ jz L(length_less16_offset0)
+
+ PUSH (%edi)
+
+ mov %cl, %dh
+ add %dl, %dh
+ and $-16, %eax
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ sar %cl, %edi
+ add %ecx, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%eax), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %edi
+
+ mov %cl, %ch
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %edi
+
+ test %edi, %edi
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edi
+
+ mov %ch, %cl
+ sar %cl, %edi
+ test %edi, %edi
+ jz L(ret_null)
+
+ bsr %edi, %edi
+ add %edi, %eax
+ xor %ch, %ch
+ add %ecx, %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %edi, %edi
+ lea 16(%eax, %edi), %eax
+ POP (%edi)
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ret_null):
+ xor %eax, %eax
+ POP (%edi)
+ ret
+
+END (memrchr)
diff --git a/libc/arch-x86/string/sse2-memset5-atom.S b/libc/arch-x86/string/sse2-memset-atom.S
similarity index 99%
rename from libc/arch-x86/string/sse2-memset5-atom.S
rename to libc/arch-x86/string/sse2-memset-atom.S
index 557c019..a54bf51 100644
--- a/libc/arch-x86/string/sse2-memset5-atom.S
+++ b/libc/arch-x86/string/sse2-memset-atom.S
@@ -28,6 +28,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include "cache.h"
+#undef __i686
+
#ifndef L
# define L(label) .L##label
#endif
@@ -136,9 +139,13 @@
jmp *TABLE(,%ecx,4)
#endif
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
.section .text.sse2,"ax",@progbits
ALIGN (4)
-ENTRY (sse2_memset5_atom)
+ENTRY (MEMSET)
ENTRANCE
movl LEN(%esp), %ecx
@@ -911,4 +918,4 @@
SETRTNVAL
RETURN_END
-END (sse2_memset5_atom)
+END (MEMSET)
diff --git a/libc/arch-x86/string/sse2-strchr-atom.S b/libc/arch-x86/string/sse2-strchr-atom.S
new file mode 100644
index 0000000..e325181
--- /dev/null
+++ b/libc/arch-x86/string/sse2-strchr-atom.S
@@ -0,0 +1,391 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 8
+#define ENTRANCE PUSH(%edi)
+#define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+
+
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (strchr)
+
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $15, %ecx
+ pshufd $0, %xmm1, %xmm1
+ je L(loop)
+
+/* Handle unaligned string. */
+ and $-16, %edi
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+ add %ecx, %edi
+ test %edx, %edx
+ jz L(match_case1)
+ jmp L(match_case2)
+
+ .p2align 4
+L(unaligned_no_match):
+ test %edx, %edx
+ jne L(return_null)
+
+ pxor %xmm2, %xmm2
+ add $16, %edi
+
+ .p2align 4
+/* The loop starts on an aligned string. */
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+ test %edx, %edx
+ jnz L(return_null)
+ add $16, %edi
+ jmp L(loop)
+
+L(matches):
+ /* There is a match. First find where NULL is. */
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
+ test %al, %al
+	jz L(match_high_case2)
+
+ mov %al, %cl
+ and $15, %cl
+ jnz L(match_case2_4)
+
+ mov %dl, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x10, %dl
+ jnz L(return_null)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x20, %dl
+ jnz L(return_null)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x40, %dl
+ jnz L(return_null)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_4):
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x01, %dl
+ jnz L(return_null)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x02, %dl
+ jnz L(return_null)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x04, %dl
+ jnz L(return_null)
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+
+ mov %ah, %cl
+ and $15, %cl
+ jnz L(match_case2_12)
+
+ mov %dh, %ch
+ and $15, %ch
+ jnz L(return_null)
+
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x10, %dh
+ jnz L(return_null)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x20, %dh
+ jnz L(return_null)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x40, %dh
+ jnz L(return_null)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case2_12):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x01, %dh
+ jnz L(return_null)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x02, %dh
+ jnz L(return_null)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x04, %dh
+ jnz L(return_null)
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+	jz L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ lea 7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ lea 15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit1):
+ lea (%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea 1(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea 2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea 3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit5):
+ lea 4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea 5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea 6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit9):
+ lea 8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea 9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea 10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea 11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit13):
+ lea 12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea 13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea 14(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+END (strchr)
diff --git a/libc/arch-x86/string/sse2-strlen-atom.S b/libc/arch-x86/string/sse2-strlen-atom.S
index 8911868..81768fb 100644
--- a/libc/arch-x86/string/sse2-strlen-atom.S
+++ b/libc/arch-x86/string/sse2-strlen-atom.S
@@ -1,71 +1,112 @@
-#define STRLEN sse2_strlen_atom
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
-#ifndef L
-# define L(label) .L##label
-#endif
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
-#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
-#endif
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
-#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
-#endif
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
-#ifndef cfi_rel_offset
-# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
-#endif
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
-#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
-#endif
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
-#ifndef cfi_adjust_cfa_offset
-# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
-#endif
+#ifndef USE_AS_STRCAT
-#ifndef cfi_remember_state
-# define cfi_remember_state .cfi_remember_state
-#endif
+# ifndef STRLEN
+# define STRLEN strlen
+# endif
-#ifndef cfi_restore_state
-# define cfi_restore_state .cfi_restore_state
-#endif
+# ifndef L
+# define L(label) .L##label
+# endif
-#ifndef ENTRY
-# define ENTRY(name) \
- .type name, @function; \
- .globl name; \
- .p2align 4; \
-name: \
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+/* A callee-saved register is required only for strnlen. */
+
+# ifdef USE_AS_STRNLEN
+# ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+# endif
+
+# ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+# endif
+
+# ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+# endif
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
cfi_startproc
-#endif
+# endif
-#ifndef END
-# define END(name) \
- cfi_endproc; \
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
.size name, .-name
-#endif
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+# define PARMS 4
+# define STR PARMS
+# define RETURN ret
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+# ifdef USE_AS_STRNLEN
+# define LEN PARMS + 8
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
-#define PARMS 4
-#define STR PARMS
-#define ENTRANCE
-#define RETURN ret
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+# undef RETURN
+# define RETURN POP (%edi); ret; CFI_PUSH(%edi);
+# endif
.text
ENTRY (STRLEN)
- ENTRANCE
mov STR(%esp), %edx
+# ifdef USE_AS_STRNLEN
+ PUSH (%edi)
+ movl LEN(%esp), %edi
+ sub $4, %edi
+ jbe L(len_less4_prolog)
+# endif
+#endif
xor %eax, %eax
cmpb $0, (%edx)
jz L(exit_tail0)
@@ -75,6 +116,12 @@
jz L(exit_tail2)
cmpb $0, 3(%edx)
jz L(exit_tail3)
+
+#ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less8_prolog)
+#endif
+
cmpb $0, 4(%edx)
jz L(exit_tail4)
cmpb $0, 5(%edx)
@@ -83,6 +130,12 @@
jz L(exit_tail6)
cmpb $0, 7(%edx)
jz L(exit_tail7)
+
+#ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less12_prolog)
+#endif
+
cmpb $0, 8(%edx)
jz L(exit_tail8)
cmpb $0, 9(%edx)
@@ -91,6 +144,12 @@
jz L(exit_tail10)
cmpb $0, 11(%edx)
jz L(exit_tail11)
+
+#ifdef USE_AS_STRNLEN
+ sub $4, %edi
+ jbe L(len_less16_prolog)
+#endif
+
cmpb $0, 12(%edx)
jz L(exit_tail12)
cmpb $0, 13(%edx)
@@ -99,211 +158,531 @@
jz L(exit_tail14)
cmpb $0, 15(%edx)
jz L(exit_tail15)
+
pxor %xmm0, %xmm0
- mov %edx, %eax
- mov %edx, %ecx
+ lea 16(%edx), %eax
+ mov %eax, %ecx
and $-16, %eax
- add $16, %ecx
- add $16, %eax
+
+#ifdef USE_AS_STRNLEN
+ and $15, %edx
+ add %edx, %edi
+ sub $64, %edi
+ jbe L(len_less64)
+#endif
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
-
pcmpeqb (%eax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm3
pmovmskb %xmm3, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
+#ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+#endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm2
pmovmskb %xmm2, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm3
pmovmskb %xmm3, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
+#ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+#endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm2
pmovmskb %xmm2, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm3
pmovmskb %xmm3, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
+#ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+#endif
+
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm1
pmovmskb %xmm1, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm2
pmovmskb %xmm2, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
pcmpeqb (%eax), %xmm3
pmovmskb %xmm3, %edx
- test %edx, %edx
lea 16(%eax), %eax
+ test %edx, %edx
jnz L(exit)
+#ifdef USE_AS_STRNLEN
+ mov %eax, %edx
+ and $63, %edx
+ add %edx, %edi
+#endif
+
and $-0x40, %eax
- PUSH (%esi)
- PUSH (%edi)
- PUSH (%ebx)
- PUSH (%ebp)
- xor %ebp, %ebp
-L(aligned_64):
- pcmpeqb (%eax), %xmm0
- pcmpeqb 16(%eax), %xmm1
- pcmpeqb 32(%eax), %xmm2
- pcmpeqb 48(%eax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %esi
- pmovmskb %xmm2, %edi
- pmovmskb %xmm3, %ebx
- or %edx, %ebp
- or %esi, %ebp
- or %edi, %ebp
- or %ebx, %ebp
+
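+/* Main loop: pminub folds four 16-byte chunks into one; a byte of
+   the result is zero iff one of the chunks has a NUL there, so a
+   single pcmpeqb against zero (xmm3) tests all 64 bytes. */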
+ .p2align 4
+L(aligned_64_loop):
+#ifdef USE_AS_STRNLEN
+ sub $64, %edi
+ jbe L(len_less64)
+#endif
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqb %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
lea 64(%eax), %eax
- jz L(aligned_64)
-L(48leave):
test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %esi, %esi
- jnz L(aligned_64_exit_32)
- test %edi, %edi
- jnz L(aligned_64_exit_48)
- mov %ebx, %edx
- lea (%eax), %eax
- jmp L(aligned_64_exit)
-L(aligned_64_exit_48):
- lea -16(%eax), %eax
- mov %edi, %edx
- jmp L(aligned_64_exit)
-L(aligned_64_exit_32):
- lea -32(%eax), %eax
- mov %esi, %edx
- jmp L(aligned_64_exit)
-L(aligned_64_exit_16):
- lea -48(%eax), %eax
-L(aligned_64_exit):
- POP (%ebp)
- POP (%ebx)
- POP (%edi)
- POP (%esi)
+ jz L(aligned_64_loop)
+
+ pcmpeqb -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 48(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
L(exit):
sub %ecx, %eax
test %dl, %dl
jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_8)
test $0x01, %dl
jnz L(exit_tail0)
-
test $0x02, %dl
jnz L(exit_tail1)
-
test $0x04, %dl
jnz L(exit_tail2)
+ add $3, %eax
+ RETURN
- test $0x08, %dl
- jnz L(exit_tail3)
-
+ .p2align 4
+L(exit_8):
test $0x10, %dl
jnz L(exit_tail4)
-
test $0x20, %dl
jnz L(exit_tail5)
-
test $0x40, %dl
jnz L(exit_tail6)
add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_high_8)
+ test $0x01, %dh
+ jnz L(exit_tail8)
+ test $0x02, %dh
+ jnz L(exit_tail9)
+ test $0x04, %dh
+ jnz L(exit_tail10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_high_8):
+ test $0x10, %dh
+ jnz L(exit_tail12)
+ test $0x20, %dh
+ jnz L(exit_tail13)
+ test $0x40, %dh
+ jnz L(exit_tail14)
+ add $15, %eax
L(exit_tail0):
RETURN
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
+#ifdef USE_AS_STRNLEN
- test $0x02, %dh
- jnz L(exit_tail1)
+ .p2align 4
+L(len_less64):
+ pxor %xmm0, %xmm0
+ add $64, %edi
- test $0x04, %dh
- jnz L(exit_tail2)
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
- test $0x08, %dh
- jnz L(exit_tail3)
+ sub $16, %edi
+ jbe L(return_start_len)
- test $0x10, %dh
- jnz L(exit_tail4)
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
- test $0x20, %dh
- jnz L(exit_tail5)
+ sub $16, %edi
+ jbe L(return_start_len)
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+ sub $16, %edi
+ jbe L(return_start_len)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(strnlen_exit)
+
+#ifndef USE_AS_STRLCAT
+ movl LEN(%esp), %eax
RETURN
+#else
+ jmp L(return_start_len)
+#endif
+
+ .p2align 4
+L(strnlen_exit):
+ sub %ecx, %eax
+
+ test %dl, %dl
+ jz L(strnlen_exit_high)
+ mov %dl, %cl
+ and $15, %cl
+ jz L(strnlen_exit_8)
+ test $0x01, %dl
+ jnz L(exit_tail0)
+ test $0x02, %dl
+ jnz L(strnlen_exit_tail1)
+ test $0x04, %dl
+ jnz L(strnlen_exit_tail2)
+ sub $4, %edi
+ jb L(return_start_len)
+ lea 3(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_8):
+ test $0x10, %dl
+ jnz L(strnlen_exit_tail4)
+ test $0x20, %dl
+ jnz L(strnlen_exit_tail5)
+ test $0x40, %dl
+ jnz L(strnlen_exit_tail6)
+ sub $8, %edi
+ jb L(return_start_len)
+ lea 7(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(strnlen_exit_high_8)
+ test $0x01, %dh
+ jnz L(strnlen_exit_tail8)
+ test $0x02, %dh
+ jnz L(strnlen_exit_tail9)
+ test $0x04, %dh
+ jnz L(strnlen_exit_tail10)
+ sub $12, %edi
+ jb L(return_start_len)
+ lea 11(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_high_8):
+ test $0x10, %dh
+ jnz L(strnlen_exit_tail12)
+ test $0x20, %dh
+ jnz L(strnlen_exit_tail13)
+ test $0x40, %dh
+ jnz L(strnlen_exit_tail14)
+ sub $16, %edi
+ jb L(return_start_len)
+ lea 15(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail1):
+ sub $2, %edi
+ jb L(return_start_len)
+ lea 1(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail2):
+ sub $3, %edi
+ jb L(return_start_len)
+ lea 2(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail4):
+ sub $5, %edi
+ jb L(return_start_len)
+ lea 4(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail5):
+ sub $6, %edi
+ jb L(return_start_len)
+ lea 5(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail6):
+ sub $7, %edi
+ jb L(return_start_len)
+ lea 6(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail8):
+ sub $9, %edi
+ jb L(return_start_len)
+ lea 8(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail9):
+ sub $10, %edi
+ jb L(return_start_len)
+ lea 9(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail10):
+ sub $11, %edi
+ jb L(return_start_len)
+ lea 10(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail12):
+ sub $13, %edi
+ jb L(return_start_len)
+ lea 12(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail13):
+ sub $14, %edi
+ jb L(return_start_len)
+ lea 13(%eax), %eax
+ RETURN
+
+ .p2align 4
+L(strnlen_exit_tail14):
+ sub $15, %edi
+ jb L(return_start_len)
+ lea 14(%eax), %eax
+ RETURN
+
+#ifndef USE_AS_STRLCAT
+ .p2align 4
+L(return_start_len):
+ movl LEN(%esp), %eax
+ RETURN
+#endif
+
+/* Prolog-only paths: byte-wise scans for length limits below 16. */
+
+ .p2align 4
+L(len_less4_prolog):
+ xor %eax, %eax
+
+ add $4, %edi
+ jz L(exit_tail0)
+
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmp $1, %edi
+ je L(exit_tail1)
+
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmp $2, %edi
+ je L(exit_tail2)
+
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmp $3, %edi
+ je L(exit_tail3)
+
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+ mov %edi, %eax
+ RETURN
+
+ .p2align 4
+L(len_less8_prolog):
+ add $4, %edi
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmp $1, %edi
+ je L(exit_tail5)
+
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmp $2, %edi
+ je L(exit_tail6)
+
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmp $3, %edi
+ je L(exit_tail7)
+
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+ mov $8, %eax
+ RETURN
+
+
+ .p2align 4
+L(len_less12_prolog):
+ add $4, %edi
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmp $1, %edi
+ je L(exit_tail9)
+
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmp $2, %edi
+ je L(exit_tail10)
+
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmp $3, %edi
+ je L(exit_tail11)
+
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+ mov $12, %eax
+ RETURN
+
+ .p2align 4
+L(len_less16_prolog):
+ add $4, %edi
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmp $1, %edi
+ je L(exit_tail13)
+
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmp $2, %edi
+ je L(exit_tail14)
+
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmp $3, %edi
+ je L(exit_tail15)
+
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+ mov $16, %eax
+ RETURN
+#endif
.p2align 4
L(exit_tail1):
@@ -364,6 +743,7 @@
L(exit_tail15):
add $15, %eax
- ret
-
+#ifndef USE_AS_STRCAT
+ RETURN
END (STRLEN)
+#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/sse2-strnlen-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/sse2-strnlen-atom.S
index fa0c672..1f89b4e 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/sse2-strnlen-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define USE_AS_STRNLEN 1
+#define STRLEN strnlen
+#include "sse2-strlen-atom.S"
diff --git a/libc/arch-x86/string/sse2-strrchr-atom.S b/libc/arch-x86/string/sse2-strrchr-atom.S
new file mode 100644
index 0000000..da3dc3b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-strrchr-atom.S
@@ -0,0 +1,753 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 8
+#define ENTRANCE PUSH(%edi);
+#define RETURN POP (%edi); ret; CFI_PUSH (%edi);
+
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (strrchr)
+
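+/* Scan 16 bytes at a time; %ebx and %esi remember the match mask and the
+   address just past the most recent block containing the character, so the
+   last occurrence can be recovered once the terminator is found. */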
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ pxor %xmm2, %xmm2
+ mov %ecx, %edi
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+ /* ECX has OFFSET. */
+ and $63, %ecx
+ pshufd $0, %xmm1, %xmm1
+ cmp $48, %ecx
+ ja L(crosscache)
+
+/* unaligned string. */
+ movdqu (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm1, %xmm0
+ /* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+ /* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ /* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ xor %ebx, %ebx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+ CFI_POP (%ebx)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+ PUSH (%ebx)
+
+ mov %eax, %ebx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm0
+ pcmpeqb %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+L(matches):
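+/* %ecx is the combined null/match mask. If no match bit is set, only the
+   terminator was seen, so resolve from the last saved match. */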
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %ebx, %ebx
+ jz L(return_null_1)
+ mov %ebx, %eax
+ mov %esi, %edi
+
+ POP (%ebx)
+ POP (%esi)
+
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%ebx)
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+ mov %eax, %ebx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
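+/* Terminator in this block: keep only the match bits at or before the
+   null byte; if none remain, fall back to the previously saved block. */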
+ test %cl, %cl
+ jz L(find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(find_zero_8)
+ test $0x01, %cl
+ jnz L(FindZeroExit1)
+ test $0x02, %cl
+ jnz L(FindZeroExit2)
+ test $0x04, %cl
+ jnz L(FindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_8):
+ test $0x10, %cl
+ jnz L(FindZeroExit5)
+ test $0x20, %cl
+ jnz L(FindZeroExit6)
+ test $0x40, %cl
+ jnz L(FindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(find_zero_high_8)
+ test $0x01, %ch
+ jnz L(FindZeroExit9)
+ test $0x02, %ch
+ jnz L(FindZeroExit10)
+ test $0x04, %ch
+ jnz L(FindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_high_8):
+ test $0x10, %ch
+ jnz L(FindZeroExit13)
+ test $0x20, %ch
+ jnz L(FindZeroExit14)
+ test $0x40, %ch
+ jnz L(FindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit1):
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+ jmp L(match_case1)
+
+ CFI_PUSH (%ebx)
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(FindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jz L(return_value)
+
+ POP (%ebx)
+ POP (%esi)
+
+ .p2align 4
+L(match_case1):
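+/* %eax holds the match mask of the winning 16-byte block; the highest set
+   bit gives the last occurrence, addressed relative to %edi-16. */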
+ test %ah, %ah
+ jnz L(match_case1_high)
+ mov %al, %dl
+ and $15 << 4, %dl
+ jnz L(match_case1_8)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x02, %al
+ jnz L(Exit2)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_8):
+ test $0x80, %al
+ jnz L(Exit8)
+ test $0x40, %al
+ jnz L(Exit7)
+ test $0x20, %al
+ jnz L(Exit6)
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_high):
+ mov %ah, %dh
+ and $15 << 4, %dh
+ jnz L(match_case1_high_8)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x02, %ah
+ jnz L(Exit10)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_case1_high_8):
+ test $0x80, %ah
+ jnz L(Exit16)
+ test $0x40, %ah
+ jnz L(Exit15)
+ test $0x20, %ah
+ jnz L(Exit14)
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit2):
+ lea -15(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ lea -14(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ lea -13(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit6):
+ lea -11(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit7):
+ lea -10(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit8):
+ lea -9(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit10):
+ lea -7(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit11):
+ lea -6(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ lea -5(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit14):
+ lea -3(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit15):
+ lea -2(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(Exit16):
+ lea -1(%edi), %eax
+ RETURN
+
+/* Return NULL. */
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
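+/* The terminator already appears in the first block scanned: a match is
+   valid only if it is not past the null byte. */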
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_high)
+ mov %cl, %dl
+ and $15, %dl
+ jz L(prolog_find_zero_8)
+ test $0x01, %cl
+ jnz L(PrologFindZeroExit1)
+ test $0x02, %cl
+ jnz L(PrologFindZeroExit2)
+ test $0x04, %cl
+ jnz L(PrologFindZeroExit3)
+ and $1 << 4 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_8):
+ test $0x10, %cl
+ jnz L(PrologFindZeroExit5)
+ test $0x20, %cl
+ jnz L(PrologFindZeroExit6)
+ test $0x40, %cl
+ jnz L(PrologFindZeroExit7)
+ and $1 << 8 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high):
+ mov %ch, %dh
+ and $15, %dh
+ jz L(prolog_find_zero_high_8)
+ test $0x01, %ch
+ jnz L(PrologFindZeroExit9)
+ test $0x02, %ch
+ jnz L(PrologFindZeroExit10)
+ test $0x04, %ch
+ jnz L(PrologFindZeroExit11)
+ and $1 << 12 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_high_8):
+ test $0x10, %ch
+ jnz L(PrologFindZeroExit13)
+ test $0x20, %ch
+ jnz L(PrologFindZeroExit14)
+ test $0x40, %ch
+ jnz L(PrologFindZeroExit15)
+ and $1 << 16 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit1):
+ and $1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit2):
+ and $1 << 2 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit3):
+ and $1 << 3 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit5):
+ and $1 << 5 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit6):
+ and $1 << 6 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit7):
+ and $1 << 7 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit9):
+ and $1 << 9 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit10):
+ and $1 << 10 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit11):
+ and $1 << 11 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit13):
+ and $1 << 13 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit14):
+ and $1 << 14 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(PrologFindZeroExit15):
+ and $1 << 15 - 1, %eax
+ jnz L(match_case1)
+ xor %eax, %eax
+ RETURN
+
+END (strrchr)
diff --git a/libc/arch-x86/string/sse2-wcschr-atom.S b/libc/arch-x86/string/sse2-wcschr-atom.S
new file mode 100644
index 0000000..729302b
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcschr-atom.S
@@ -0,0 +1,267 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 4
+
+
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (wcschr)
+
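+/* Compare four wide characters (16 bytes) per step; the target character
+   is replicated into all four dword lanes of %xmm1. */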
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %eax
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+ and $63, %eax
+ cmp $48, %eax
+ ja L(cross_cache)
+
+ movdqu (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ and $-16, %ecx
+ jmp L(loop)
+
+ .p2align 4
+L(cross_cache):
+ PUSH (%edi)
+ mov %ecx, %edi
+ mov %eax, %ecx
+ and $-16, %edi
+ and $15, %ecx
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+
+ sarl %cl, %edx
+ sarl %cl, %eax
+ test %eax, %eax
+ jz L(unaligned_no_match)
+
+ add %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jz L(match_case1)
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(unaligned_no_match):
+ mov %edi, %ecx
+ POP (%edi)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ pxor %xmm2, %xmm2
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ add $16, %ecx
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jnz L(matches)
+ add $16, %ecx
+
+ movdqa (%ecx), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %edx
+ pmovmskb %xmm0, %eax
+ or %eax, %edx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %edx
+ test %eax, %eax
+ jz L(return_null)
+ test %edx, %edx
+ jz L(match_case1)
+
+ .p2align 4
+L(match_case2):
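+/* Both a match and a terminator occur in this block: the match counts only
+   if no null wide character precedes it. */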
+ test %al, %al
+ jz L(match_high_case2)
+ test $15, %al
+ jnz L(match_case2_4)
+ test $15, %dl
+ jnz L(return_null)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_4):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(match_high_case2):
+ test %dl, %dl
+ jnz L(return_null)
+ test $15, %ah
+ jnz L(match_case2_12)
+ test $15, %dh
+ jnz L(return_null)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case2_12):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_case1):
+ test %al, %al
+ jz L(match_high_case1)
+
+ test $0x01, %al
+ jnz L(exit0)
+ lea 4(%ecx), %eax
+ ret
+
+ .p2align 4
+L(match_high_case1):
+ test $0x01, %ah
+ jnz L(exit3)
+ lea 12(%ecx), %eax
+ ret
+
+ .p2align 4
+L(exit0):
+ mov %ecx, %eax
+ ret
+
+ .p2align 4
+L(exit3):
+ lea 8(%ecx), %eax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ ret
+
+END (wcschr)
diff --git a/libc/arch-x86/string/sse2-wcscmp-atom.S b/libc/arch-x86/string/sse2-wcscmp-atom.S
new file mode 100644
index 0000000..8867d28
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcscmp-atom.S
@@ -0,0 +1,1062 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define ENTRANCE PUSH(%esi); PUSH(%edi)
+#define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi);
+#define PARMS 4
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (wcscmp)
+/*
+ * This implementation uses SSE2 to compare up to 16 bytes (four wide
+ * characters) at a time.
+ */
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %eax
+
+ mov (%eax), %ecx
+ cmp %ecx, (%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 4(%eax), %ecx
+ cmp %ecx, 4(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 8(%eax), %ecx
+ cmp %ecx, 8(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ mov 12(%eax), %ecx
+ cmp %ecx, 12(%edx)
+ jne L(neq)
+ test %ecx, %ecx
+ jz L(eq)
+
+ ENTRANCE
+ add $16, %eax
+ add $16, %edx
+
+ mov %eax, %esi
+ mov %edx, %edi
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
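+/* Dispatch on each pointer's offset within its 64-byte cache line; the
+   L(continue_A_B) variants pick load strategies that keep 16-byte reads
+   from crossing a cache-line boundary where the alignment allows. */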
+ mov %al, %ch
+ mov %dl, %cl
+ and $63, %eax /* esi alignment in cache line */
+ and $63, %edx /* edi alignment in cache line */
+ and $15, %cl
+ jz L(continue_00)
+ cmp $16, %edx
+ jb L(continue_0)
+ cmp $32, %edx
+ jb L(continue_16)
+ cmp $48, %edx
+ jb L(continue_32)
+
+L(continue_48):
+ and $15, %ch
+ jz L(continue_48_00)
+ cmp $16, %eax
+ jb L(continue_0_48)
+ cmp $32, %eax
+ jb L(continue_16_48)
+ cmp $48, %eax
+ jb L(continue_32_48)
+
+ .p2align 4
+L(continue_48_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_48)
+
+L(continue_0):
+ and $15, %ch
+ jz L(continue_0_00)
+ cmp $16, %eax
+ jb L(continue_0_0)
+ cmp $32, %eax
+ jb L(continue_0_16)
+ cmp $48, %eax
+ jb L(continue_0_32)
+
+ .p2align 4
+L(continue_0_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_0_48)
+
+ .p2align 4
+L(continue_00):
+ and $15, %ch
+ jz L(continue_00_00)
+ cmp $16, %eax
+ jb L(continue_00_0)
+ cmp $32, %eax
+ jb L(continue_00_16)
+ cmp $48, %eax
+ jb L(continue_00_32)
+
+ .p2align 4
+L(continue_00_48):
+ pcmpeqd (%edi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_32):
+ and $15, %ch
+ jz L(continue_32_00)
+ cmp $16, %eax
+ jb L(continue_0_32)
+ cmp $32, %eax
+ jb L(continue_16_32)
+ cmp $48, %eax
+ jb L(continue_32_32)
+
+ .p2align 4
+L(continue_32_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_16):
+ and $15, %ch
+ jz L(continue_16_00)
+ cmp $16, %eax
+ jb L(continue_0_16)
+ cmp $32, %eax
+ jb L(continue_16_16)
+ cmp $48, %eax
+ jb L(continue_16_32)
+
+ .p2align 4
+L(continue_16_48):
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 48(%edi), %xmm1
+ movdqu 48(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_00_00):
+ movdqa (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqa 16(%edi), %xmm3
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqa 32(%edi), %xmm5
+ pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm5 /* packed sub of comparison results*/
+ pmovmskb %xmm5, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqa 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_00_00)
+
+ .p2align 4
+L(continue_00_32):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_16):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_0):
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_48_00):
+ pcmpeqd (%esi), %xmm0
+ mov (%edi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%esi), %eax
+ jne L(nequal)
+
+ mov 4(%edi), %eax
+ cmp 4(%esi), %eax
+ jne L(nequal)
+
+ mov 8(%edi), %eax
+ cmp 8(%esi), %eax
+ jne L(nequal)
+
+ mov 12(%edi), %eax
+ cmp 12(%esi), %eax
+ jne L(nequal)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %esi
+ add $64, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_16_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_0_00):
+ movdqu (%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_16_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_0):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm3
+ movdqu 16(%esi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%edi), %xmm1
+ movdqu 32(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %esi
+ add $48, %edi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_16):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%edi), %xmm1
+ movdqu 16(%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %esi
+ add $32, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_0_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_16_32):
+ movdqu (%edi), %xmm1
+ movdqu (%esi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %esi
+ add $16, %edi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(less4_double_words1):
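+/* A terminator lies within these 16 bytes: fall back to dword-by-dword
+   comparison up to it. */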
+ cmp (%esi), %eax
+ jne L(nequal)
+ test %eax, %eax
+ jz L(equal)
+
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(less4_double_words):
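+/* %edx is nonzero: its byte groups locate which of the four dwords just
+   compared mismatched or contained the terminator. */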
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov (%esi), %ecx
+ cmp %ecx, (%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word):
+ mov 4(%esi), %ecx
+ cmp %ecx, 4(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov 8(%esi), %ecx
+ cmp %ecx, 8(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov 12(%esi), %ecx
+ cmp %ecx, 12(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_16):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_16)
+ and $15, %dl
+ jz L(second_double_word_16)
+ mov 16(%esi), %ecx
+ cmp %ecx, 16(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_16):
+ mov 20(%esi), %ecx
+ cmp %ecx, 20(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_16):
+ and $15, %dh
+ jz L(fourth_double_word_16)
+ mov 24(%esi), %ecx
+ cmp %ecx, 24(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_16):
+ mov 28(%esi), %ecx
+ cmp %ecx, 28(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_32):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_32)
+ and $15, %dl
+ jz L(second_double_word_32)
+ mov 32(%esi), %ecx
+ cmp %ecx, 32(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_32):
+ mov 36(%esi), %ecx
+ cmp %ecx, 36(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_32):
+ and $15, %dh
+ jz L(fourth_double_word_32)
+ mov 40(%esi), %ecx
+ cmp %ecx, 40(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_32):
+ mov 44(%esi), %ecx
+ cmp %ecx, 44(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(less4_double_words_48):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_48)
+ and $15, %dl
+ jz L(second_double_word_48)
+ mov 48(%esi), %ecx
+ cmp %ecx, 48(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(second_double_word_48):
+ mov 52(%esi), %ecx
+ cmp %ecx, 52(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(next_two_double_words_48):
+ and $15, %dh
+ jz L(fourth_double_word_48)
+ mov 56(%esi), %ecx
+ cmp %ecx, 56(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(fourth_double_word_48):
+ mov 60(%esi), %ecx
+ cmp %ecx, 60(%edi)
+ jne L(nequal)
+ RETURN
+
+ .p2align 4
+L(nequal):
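+/* Flags from the preceding cmp are still live: return 1 if the differing
+   wide character of s1 is greater, otherwise -1. */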
+ mov $1, %eax
+ jg L(return)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(return):
+ RETURN
+
+ .p2align 4
+L(equal):
+ xorl %eax, %eax
+ RETURN
+
+ CFI_POP (%edi)
+ CFI_POP (%esi)
+
+ .p2align 4
+L(neq):
+ mov $1, %eax
+ jg L(neq_bigger)
+ neg %eax
+
+L(neq_bigger):
+ ret
+
+ .p2align 4
+L(eq):
+ xorl %eax, %eax
+ ret
+
+END (wcscmp)
+
diff --git a/libc/arch-x86/string/sse2-wcslen-atom.S b/libc/arch-x86/string/sse2-wcslen-atom.S
new file mode 100644
index 0000000..6a6ad51
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcslen-atom.S
@@ -0,0 +1,306 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_WCSCAT
+
+# ifndef L
+# define L(label) .L##label
+# endif
+
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+# endif
+
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+# endif
+
+# define PARMS 4
+# define STR PARMS
+# define RETURN ret
+
+ .text
+ENTRY (wcslen)
+ mov STR(%esp), %edx
+#endif
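+/* Check the first eight wide characters individually before switching to
+   16-byte SSE2 blocks. */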
+ cmp $0, (%edx)
+ jz L(exit_tail0)
+ cmp $0, 4(%edx)
+ jz L(exit_tail1)
+ cmp $0, 8(%edx)
+ jz L(exit_tail2)
+ cmp $0, 12(%edx)
+ jz L(exit_tail3)
+ cmp $0, 16(%edx)
+ jz L(exit_tail4)
+ cmp $0, 20(%edx)
+ jz L(exit_tail5)
+ cmp $0, 24(%edx)
+ jz L(exit_tail6)
+ cmp $0, 28(%edx)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%edx), %eax
+ lea -16(%eax), %ecx
+ and $-16, %eax
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
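+/* %xmm3 is all-zero here (the last pcmpeqd above found no null), so the
+   pminub folding of four 16-byte blocks lets one pcmpeqd flag a candidate
+   null dword per 64 bytes, re-checked block by block below. */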
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(aligned_64_loop)
+
+ pcmpeqd -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 48(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ jmp L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
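+/* %eax - %ecx is the byte offset of the matching block; shift by 2 to
+   convert bytes into a wide-character count. */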
+ sub %ecx, %eax
+ shr $2, %eax
+ test %dl, %dl
+ jz L(exit_high)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_1)
+ RETURN
+
+ .p2align 4
+L(exit_high):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_3)
+ add $2, %eax
+ RETURN
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ RETURN
+
+ .p2align 4
+L(exit_3):
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail0):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail1):
+ mov $1, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail2):
+ mov $2, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail3):
+ mov $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail4):
+ mov $4, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail5):
+ mov $5, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail6):
+ mov $6, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail7):
+ mov $7, %eax
+#ifndef USE_AS_WCSCAT
+ RETURN
+
+END (wcslen)
+#endif
diff --git a/libc/arch-x86/string/sse2-wcsrchr-atom.S b/libc/arch-x86/string/sse2-wcsrchr-atom.S
new file mode 100644
index 0000000..e30779d
--- /dev/null
+++ b/libc/arch-x86/string/sse2-wcsrchr-atom.S
@@ -0,0 +1,402 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 8
+#define ENTRANCE PUSH(%edi);
+#define RETURN POP(%edi); ret; CFI_PUSH(%edi);
+
+#define STR1 PARMS
+#define STR2 STR1+4
+
+ .text
+ENTRY (wcsrchr)
+
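+/* Same scheme as strrchr, four wide chars per 16-byte block: %edx and %esi
+   remember the match mask and position of the most recent matching block
+   until the terminator is found. */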
+ ENTRANCE
+ mov STR1(%esp), %ecx
+ movd STR2(%esp), %xmm1
+
+ mov %ecx, %edi
+ punpckldq %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ punpckldq %xmm1, %xmm1
+
+/* ECX has OFFSET. */
+ and $63, %ecx
+ cmp $48, %ecx
+ ja L(crosscache)
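+/* A 16-byte unaligned load starting in the last 16 bytes of a 64-byte
+   line would cross the line (and possibly a page) boundary, so take
+   the aligned-load-plus-shift path instead. */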
+
+/* The whole first 16 bytes can be read with one unaligned load. */
+ movdqu (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm2, %ecx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match1)
+
+ test %ecx, %ecx
+ jnz L(return_null)
+
+ and $-16, %edi
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match1):
+ test %ecx, %ecx
+ jnz L(prolog_find_zero_1)
+
+ PUSH (%esi)
+
+/* Save current match */
+ mov %eax, %edx
+ mov %edi, %esi
+ and $-16, %edi
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+ and $15, %ecx
+ and $-16, %edi
+ pxor %xmm3, %xmm3
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm3
+ pcmpeqd %xmm1, %xmm0
+/* Find where NULL is. */
+ pmovmskb %xmm3, %edx
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ shr %cl, %edx
+ shr %cl, %eax
+ add $16, %edi
+
+ test %eax, %eax
+ jnz L(unaligned_match)
+
+ test %edx, %edx
+ jnz L(return_null)
+
+ PUSH (%esi)
+
+ xor %edx, %edx
+ jmp L(loop)
+
+ CFI_POP (%esi)
+
+ .p2align 4
+L(unaligned_match):
+ test %edx, %edx
+ jnz L(prolog_find_zero)
+
+ PUSH (%esi)
+
+ mov %eax, %edx
+ lea (%edi, %ecx), %esi
+
+/* Loop start on aligned string. */
+ .p2align 4
+L(loop):
+ movdqa (%edi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm0, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm3
+ pcmpeqd %xmm3, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm4
+ pcmpeqd %xmm4, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm4
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm4, %eax
+ or %eax, %ecx
+ jnz L(matches)
+
+ movdqa (%edi), %xmm5
+ pcmpeqd %xmm5, %xmm2
+ add $16, %edi
+ pcmpeqd %xmm1, %xmm5
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm5, %eax
+ or %eax, %ecx
+ jz L(loop)
+
+ .p2align 4
+L(matches):
+ test %eax, %eax
+ jnz L(match)
+L(return_value):
+ test %edx, %edx
+ jz L(return_null_1)
+ mov %edx, %eax
+ mov %esi, %edi
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(return_null_1):
+ POP (%esi)
+
+ xor %eax, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match):
+ pmovmskb %xmm2, %ecx
+ test %ecx, %ecx
+ jnz L(find_zero)
+/* save match info */
+ mov %eax, %edx
+ mov %edi, %esi
+ jmp L(loop)
+
+ .p2align 4
+L(find_zero):
+ test %cl, %cl
+ jz L(find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_value)
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(find_zero_in_fourth_wchar):
+
+ POP (%esi)
+
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
+ .p2align 4
+L(match_second_wchar):
+ lea -12(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_or_fourth_wchar):
+ test $15 << 4, %ah
+ jnz L(match_fourth_wchar)
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_third_wchar):
+ lea -8(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(match_fourth_wchar):
+ lea -4(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(return_null):
+ xor %eax, %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero):
+ add %ecx, %edi
+ mov %edx, %ecx
+L(prolog_find_zero_1):
+ test %cl, %cl
+ jz L(prolog_find_zero_in_third_or_fourth_wchar)
+ test $15, %cl
+ jz L(prolog_find_zero_in_second_wchar)
+ and $1, %eax
+ jz L(return_null)
+
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_second_wchar):
+ and $1 << 5 - 1, %eax
+ jz L(return_null)
+
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+ test $15, %ch
+ jz L(prolog_find_zero_in_fourth_wchar)
+ and $1 << 9 - 1, %eax
+ jz L(return_null)
+
+ test %ah, %ah
+ jnz L(match_third_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+ .p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+ test %ah, %ah
+ jnz L(match_third_or_fourth_wchar)
+ test $15 << 4, %al
+ jnz L(match_second_wchar)
+ lea -16(%edi), %eax
+ RETURN
+
+END (wcsrchr)
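Reviewer note: the SSE2 code above scans 16 bytes (four wchars) at a time, remembering the last match until the terminator is found. Its contract as a plain C sketch (wcsrchr_ref is an illustrative name); note the terminating L'\0' counts as part of the string, so searching for L'\0' yields a pointer to the terminator:

    #include <stddef.h>
    #include <wchar.h>

    /* Last occurrence of wc in s; the terminator is part of s. */
    static wchar_t *wcsrchr_ref(const wchar_t *s, wchar_t wc)
    {
        const wchar_t *last = NULL;
        for (;; ++s) {
            if (*s == wc)
                last = s;
            if (*s == L'\0')
                return (wchar_t *)last;
        }
    }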
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-bcopy-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-bcopy-atom.S
index fa0c672..e4b791a 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-bcopy-atom.S
@@ -28,13 +28,8 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define MEMCPY bcopy
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#include "ssse3-memcpy-atom.S"
diff --git a/libc/arch-x86/string/ssse3-memcmp3-new.S b/libc/arch-x86/string/ssse3-memcmp-atom.S
similarity index 85%
rename from libc/arch-x86/string/ssse3-memcmp3-new.S
rename to libc/arch-x86/string/ssse3-memcmp-atom.S
index 5ad8791..30e3173 100644
--- a/libc/arch-x86/string/ssse3-memcmp3-new.S
+++ b/libc/arch-x86/string/ssse3-memcmp-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2010, 2011 Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,24 +28,16 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#ifndef MEMCMP
-# define MEMCMP ssse3_memcmp3_new
-#endif
-
#ifndef L
# define L(label) .L##label
#endif
-#ifndef ALIGN
-# define ALIGN(n) .p2align n
-#endif
-
#ifndef cfi_startproc
-# define cfi_startproc .cfi_startproc
+# define cfi_startproc .cfi_startproc
#endif
#ifndef cfi_endproc
-# define cfi_endproc .cfi_endproc
+# define cfi_endproc .cfi_endproc
#endif
#ifndef cfi_rel_offset
@@ -53,7 +45,7 @@
#endif
#ifndef cfi_restore
-# define cfi_restore(reg) .cfi_restore reg
+# define cfi_restore(reg) .cfi_restore reg
#endif
#ifndef cfi_adjust_cfa_offset
@@ -61,35 +53,39 @@
#endif
#ifndef cfi_remember_state
-# define cfi_remember_state .cfi_remember_state
+# define cfi_remember_state .cfi_remember_state
#endif
#ifndef cfi_restore_state
-# define cfi_restore_state .cfi_restore_state
+# define cfi_restore_state .cfi_restore_state
#endif
#ifndef ENTRY
-# define ENTRY(name) \
- .type name, @function; \
- .globl name; \
- .p2align 4; \
-name: \
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
cfi_startproc
#endif
#ifndef END
-# define END(name) \
- cfi_endproc; \
+# define END(name) \
+ cfi_endproc; \
.size name, .-name
#endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
- cfi_rel_offset (REG, 0)
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
- cfi_restore (REG)
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)
@@ -101,22 +97,39 @@
#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret
#define RETURN RETURN_END; cfi_restore_state; cfi_remember_state
- .section .text.ssse3,"ax",@progbits
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+   memcmp has to use UNSIGNED comparison for elements.
+*/
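+
+/* Illustration: memcmp must treat 0x80 > 0x7f (unsigned bytes), while
+   wmemcmp must treat an element such as 0x80000000 as negative, i.e.
+   less than 1; hence the jg (signed) branches in the wmemcmp paths
+   below versus ja (unsigned) in the memcmp epilogue. */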
+
+ .text
ENTRY (MEMCMP)
movl LEN(%esp), %ecx
+
+#ifdef USE_AS_WMEMCMP
+ shl $2, %ecx
+ jz L(zero)
+#endif
+
movl BLK1(%esp), %eax
cmp $48, %ecx
movl BLK2(%esp), %edx
jae L(48bytesormore)
+
+#ifndef USE_AS_WMEMCMP
cmp $1, %ecx
jbe L(less1bytes)
- PUSH (%ebx)
+#endif
+
+ PUSH (%ebx)
add %ecx, %edx
add %ecx, %eax
jmp L(less48bytes)
- CFI_POP (%ebx)
- ALIGN (4)
+ CFI_POP (%ebx)
+
+#ifndef USE_AS_WMEMCMP
+ .p2align 4
L(less1bytes):
jb L(zero)
movb (%eax), %cl
@@ -127,29 +140,30 @@
neg %eax
L(1bytesend):
ret
+#endif
- ALIGN (4)
+ .p2align 4
L(zero):
- mov $0, %eax
+ xor %eax, %eax
ret
- ALIGN (4)
+ .p2align 4
L(48bytesormore):
- PUSH (%ebx)
- PUSH (%esi)
- PUSH (%edi)
+ PUSH (%ebx)
+ PUSH (%esi)
+ PUSH (%edi)
cfi_remember_state
- movdqu (%eax), %xmm3
- movdqu (%edx), %xmm0
+ movdqu (%eax), %xmm3
+ movdqu (%edx), %xmm0
movl %eax, %edi
movl %edx, %esi
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
lea 16(%edi), %edi
- sub $0xffff, %edx
+ sub $0xffff, %edx
lea 16(%esi), %esi
- jnz L(less16bytes)
+ jnz L(less16bytes)
mov %edi, %edx
and $0xf, %edx
xor %edx, %edi
@@ -160,6 +174,7 @@
jz L(shr_0)
xor %edx, %esi
+#ifndef USE_AS_WMEMCMP
cmp $8, %edx
jae L(next_unaligned_table)
cmp $0, %edx
@@ -178,7 +193,7 @@
je L(shr_6)
jmp L(shr_7)
- ALIGN (4)
+ .p2align 2
L(next_unaligned_table):
cmp $8, %edx
je L(shr_8)
@@ -195,8 +210,17 @@
cmp $14, %edx
je L(shr_14)
jmp L(shr_15)
+#else
+ cmp $0, %edx
+ je L(shr_0)
+ cmp $4, %edx
+ je L(shr_4)
+ cmp $8, %edx
+ je L(shr_8)
+ jmp L(shr_12)
+#endif
- ALIGN (4)
+ .p2align 4
L(shr_0):
cmp $80, %ecx
jae L(shr_0_gobble)
@@ -215,13 +239,13 @@
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_0_gobble):
lea -48(%ecx), %ecx
movdqa (%esi), %xmm0
@@ -261,13 +285,14 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea (%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -291,13 +316,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_1_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -344,13 +369,14 @@
lea (%ecx, %edi,1), %eax
lea 1(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -374,13 +400,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_2_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -427,13 +453,13 @@
lea (%ecx, %edi,1), %eax
lea 2(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -457,13 +483,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_3_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -510,13 +536,14 @@
lea (%ecx, %edi,1), %eax
lea 3(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -540,13 +567,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_4_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -593,13 +620,14 @@
lea (%ecx, %edi,1), %eax
lea 4(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -623,13 +651,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_5_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -676,13 +704,13 @@
lea (%ecx, %edi,1), %eax
lea 5(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -706,13 +734,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_6_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -759,13 +787,13 @@
lea (%ecx, %edi,1), %eax
lea 6(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -789,13 +817,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_7_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -842,13 +870,14 @@
lea (%ecx, %edi,1), %eax
lea 7(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -872,13 +901,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_8_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -925,13 +954,14 @@
lea (%ecx, %edi,1), %eax
lea 8(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -955,13 +985,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_9_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1008,13 +1038,13 @@
lea (%ecx, %edi,1), %eax
lea 9(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1038,13 +1068,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_10_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1091,13 +1121,13 @@
lea (%ecx, %edi,1), %eax
lea 10(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1121,13 +1151,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_11_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1174,13 +1204,14 @@
lea (%ecx, %edi,1), %eax
lea 11(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1204,13 +1235,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_12_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1257,13 +1288,14 @@
lea (%ecx, %edi,1), %eax
lea 12(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#ifndef USE_AS_WMEMCMP
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1287,13 +1319,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_13_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1340,13 +1372,13 @@
lea (%ecx, %edi,1), %eax
lea 13(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1370,13 +1402,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_14_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1423,13 +1455,13 @@
lea (%ecx, %edi,1), %eax
lea 14(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15):
cmp $80, %ecx
lea -48(%ecx), %ecx
@@ -1453,13 +1485,13 @@
jnz L(exit)
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shr_15_gobble):
sub $32, %ecx
movdqa 16(%esi), %xmm0
@@ -1506,13 +1538,14 @@
lea (%ecx, %edi,1), %eax
lea 15(%ecx, %esi,1), %edx
- POP (%edi)
- POP (%esi)
+ POP (%edi)
+ POP (%esi)
jmp L(less48bytes)
+#endif
cfi_restore_state
cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(exit):
pmovmskb %xmm1, %ebx
sub $0xffff, %ebx
@@ -1520,9 +1553,12 @@
lea -16(%esi), %esi
lea -16(%edi), %edi
mov %ebx, %edx
+
L(first16bytes):
add %eax, %esi
L(less16bytes):
+
+#ifndef USE_AS_WMEMCMP
test %dl, %dl
jz L(next_24_bytes)
@@ -1547,61 +1583,61 @@
test $0x40, %dl
jnz L(Byte22)
L(Byte23):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte16):
- movzbl -16(%edi), %eax
- movzbl -16(%esi), %edx
+ movzbl -16(%edi), %eax
+ movzbl -16(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte17):
- movzbl -15(%edi), %eax
- movzbl -15(%esi), %edx
+ movzbl -15(%edi), %eax
+ movzbl -15(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte18):
- movzbl -14(%edi), %eax
- movzbl -14(%esi), %edx
+ movzbl -14(%edi), %eax
+ movzbl -14(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte19):
- movzbl -13(%edi), %eax
- movzbl -13(%esi), %edx
+ movzbl -13(%edi), %eax
+ movzbl -13(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte20):
- movzbl -12(%edi), %eax
- movzbl -12(%esi), %edx
+ movzbl -12(%edi), %eax
+ movzbl -12(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte21):
- movzbl -11(%edi), %eax
- movzbl -11(%esi), %edx
+ movzbl -11(%edi), %eax
+ movzbl -11(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(Byte22):
- movzbl -10(%edi), %eax
- movzbl -10(%esi), %edx
+ movzbl -10(%edi), %eax
+ movzbl -10(%esi), %edx
sub %edx, %eax
RETURN
- ALIGN (4)
+ .p2align 4
L(next_24_bytes):
lea 8(%edi), %edi
lea 8(%esi), %esi
@@ -1626,20 +1662,70 @@
test $0x40, %dh
jnz L(Byte22)
- ALIGN (4)
+ .p2align 4
L(Byte31):
- movzbl -9(%edi), %eax
- movzbl -9(%esi), %edx
+ movzbl -9(%edi), %eax
+ movzbl -9(%esi), %edx
sub %edx, %eax
RETURN_END
+#else
+
+/* special for wmemcmp */
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov -16(%edi), %ecx
+ cmp -16(%esi), %ecx
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+
+ .p2align 4
+L(second_double_word):
+ mov -12(%edi), %ecx
+ cmp -12(%esi), %ecx
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov -8(%edi), %ecx
+ cmp -8(%esi), %ecx
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(fourth_double_word):
+ mov -4(%edi), %ecx
+ cmp -4(%esi), %ecx
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+ RETURN
+
+ .p2align 4
+L(nequal_bigger):
+ RETURN_END
+#endif
+
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(more8bytes):
cmp $16, %ecx
jae L(more16bytes)
cmp $8, %ecx
je L(8bytes)
+#ifndef USE_AS_WMEMCMP
cmp $9, %ecx
je L(9bytes)
cmp $10, %ecx
@@ -1653,13 +1739,17 @@
cmp $14, %ecx
je L(14bytes)
jmp L(15bytes)
+#else
+ jmp L(12bytes)
+#endif
- ALIGN (4)
+ .p2align 4
L(more16bytes):
cmp $24, %ecx
jae L(more24bytes)
cmp $16, %ecx
je L(16bytes)
+#ifndef USE_AS_WMEMCMP
cmp $17, %ecx
je L(17bytes)
cmp $18, %ecx
@@ -1673,13 +1763,17 @@
cmp $22, %ecx
je L(22bytes)
jmp L(23bytes)
+#else
+ jmp L(20bytes)
+#endif
- ALIGN (4)
+ .p2align 4
L(more24bytes):
cmp $32, %ecx
jae L(more32bytes)
cmp $24, %ecx
je L(24bytes)
+#ifndef USE_AS_WMEMCMP
cmp $25, %ecx
je L(25bytes)
cmp $26, %ecx
@@ -1693,13 +1787,17 @@
cmp $30, %ecx
je L(30bytes)
jmp L(31bytes)
+#else
+ jmp L(28bytes)
+#endif
- ALIGN (4)
+ .p2align 4
L(more32bytes):
cmp $40, %ecx
jae L(more40bytes)
cmp $32, %ecx
je L(32bytes)
+#ifndef USE_AS_WMEMCMP
cmp $33, %ecx
je L(33bytes)
cmp $34, %ecx
@@ -1713,11 +1811,35 @@
cmp $38, %ecx
je L(38bytes)
jmp L(39bytes)
+#else
+ jmp L(36bytes)
+#endif
- ALIGN (4)
+ .p2align 4
+L(less48bytes):
+ cmp $8, %ecx
+ jae L(more8bytes)
+#ifndef USE_AS_WMEMCMP
+ cmp $2, %ecx
+ je L(2bytes)
+ cmp $3, %ecx
+ je L(3bytes)
+ cmp $4, %ecx
+ je L(4bytes)
+ cmp $5, %ecx
+ je L(5bytes)
+ cmp $6, %ecx
+ je L(6bytes)
+ jmp L(7bytes)
+#else
+ jmp L(4bytes)
+#endif
+
+ .p2align 4
L(more40bytes):
cmp $40, %ecx
je L(40bytes)
+#ifndef USE_AS_WMEMCMP
cmp $41, %ecx
je L(41bytes)
cmp $42, %ecx
@@ -1732,24 +1854,7 @@
je L(46bytes)
jmp L(47bytes)
- ALIGN (4)
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-
-
- ALIGN (4)
+ .p2align 4
L(44bytes):
mov -44(%eax), %ecx
mov -44(%edx), %ebx
@@ -1806,11 +1911,64 @@
cmp %ebx, %ecx
mov $0, %eax
jne L(find_diff)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
+#else
+ .p2align 4
+L(44bytes):
+ mov -44(%eax), %ecx
+ cmp -44(%edx), %ecx
+ jne L(find_diff)
+L(40bytes):
+ mov -40(%eax), %ecx
+ cmp -40(%edx), %ecx
+ jne L(find_diff)
+L(36bytes):
+ mov -36(%eax), %ecx
+ cmp -36(%edx), %ecx
+ jne L(find_diff)
+L(32bytes):
+ mov -32(%eax), %ecx
+ cmp -32(%edx), %ecx
+ jne L(find_diff)
+L(28bytes):
+ mov -28(%eax), %ecx
+ cmp -28(%edx), %ecx
+ jne L(find_diff)
+L(24bytes):
+ mov -24(%eax), %ecx
+ cmp -24(%edx), %ecx
+ jne L(find_diff)
+L(20bytes):
+ mov -20(%eax), %ecx
+ cmp -20(%edx), %ecx
+ jne L(find_diff)
+L(16bytes):
+ mov -16(%eax), %ecx
+ cmp -16(%edx), %ecx
+ jne L(find_diff)
+L(12bytes):
+ mov -12(%eax), %ecx
+ cmp -12(%edx), %ecx
+ jne L(find_diff)
+L(8bytes):
+ mov -8(%eax), %ecx
+ cmp -8(%edx), %ecx
+ jne L(find_diff)
+L(4bytes):
+ mov -4(%eax), %ecx
+ xor %eax, %eax
+ cmp -4(%edx), %ecx
+ jne L(find_diff)
+ POP (%ebx)
+ ret
+ CFI_PUSH (%ebx)
+#endif
- ALIGN (4)
+#ifndef USE_AS_WMEMCMP
+
+ .p2align 4
L(45bytes):
mov -45(%eax), %ecx
mov -45(%edx), %ebx
@@ -1870,11 +2028,11 @@
cmp -1(%edx), %cl
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(46bytes):
mov -46(%eax), %ecx
mov -46(%edx), %ebx
@@ -1938,11 +2096,11 @@
cmp %bh, %ch
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(47bytes):
movl -47(%eax), %ecx
movl -47(%edx), %ebx
@@ -2009,11 +2167,11 @@
cmpb -1(%edx), %al
mov $0, %eax
jne L(end)
- POP (%ebx)
+ POP (%ebx)
ret
CFI_PUSH (%ebx)
- ALIGN (4)
+ .p2align 4
L(find_diff):
cmpb %bl, %cl
jne L(end)
@@ -2024,12 +2182,29 @@
cmp %bl, %cl
jne L(end)
cmp %bx, %cx
+
+ .p2align 4
L(end):
- POP (%ebx)
+ POP (%ebx)
mov $1, %eax
ja L(bigger)
neg %eax
L(bigger):
ret
+#else
+/* for wmemcmp */
+ .p2align 4
+L(find_diff):
+ POP (%ebx)
+ mov $1, %eax
+ jg L(find_diff_bigger)
+ neg %eax
+ ret
+
+ .p2align 4
+L(find_diff_bigger):
+ ret
+
+#endif
END (MEMCMP)
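Reviewer note: the wmemcmp paths above compare 4-byte elements as signed values and collapse any difference to +1/-1 through the jg/neg pattern, instead of returning a byte difference as the memcmp paths do. A compact C model of that tail (assumes a 32-bit signed wchar_t, as on this target; wmemcmp_tail is an illustrative name):

    #include <wchar.h>

    /* Signed element compare; result collapsed to -1, 0 or +1. */
    static int wmemcmp_tail(const wchar_t *a, const wchar_t *b, size_t n)
    {
        size_t i;
        for (i = 0; i < n; ++i) {
            if (a[i] != b[i])
                return a[i] > b[i] ? 1 : -1;
        }
        return 0;
    }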
diff --git a/libc/arch-x86/string/ssse3-memcpy5.S b/libc/arch-x86/string/ssse3-memcpy-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-memcpy5.S
rename to libc/arch-x86/string/ssse3-memcpy-atom.S
index b0612a6..1080a38 100644
--- a/libc/arch-x86/string/ssse3-memcpy5.S
+++ b/libc/arch-x86/string/ssse3-memcpy-atom.S
@@ -28,8 +28,11 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include "cache.h"
+#undef __i686
+
#ifndef MEMCPY
-# define MEMCPY ssse3_memcpy5
+# define MEMCPY memcpy
#endif
#ifndef L
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-memmove-atom.S
similarity index 92%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-memmove-atom.S
index fa0c672..be85596 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-memmove-atom.S
@@ -28,13 +28,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define MEMCPY memmove
+#define USE_AS_MEMMOVE
+#include "ssse3-memcpy-atom.S"
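Reviewer note: USE_AS_MEMMOVE makes the shared memcpy template safe for overlapping buffers by choosing the copy direction from the pointer order. Schematically (a sketch of the idea only, not the template's actual control flow):

    #include <stddef.h>

    /* Copy direction chosen so an overlapping region is never
       clobbered before it is read. */
    static void memmove_sketch(unsigned char *d, const unsigned char *s, size_t n)
    {
        if (d < s)
            while (n--) *d++ = *s++;   /* forward: dst starts below src */
        else
            while (n--) d[n] = s[n];   /* backward: dst starts above src */
    }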
diff --git a/libc/arch-x86/string/ssse3-strcat-atom.S b/libc/arch-x86/string/ssse3-strcat-atom.S
new file mode 100644
index 0000000..d9b6129
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strcat-atom.S
@@ -0,0 +1,620 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef cfi_remember_state
+# define cfi_remember_state .cfi_remember_state
+#endif
+
+#ifndef cfi_restore_state
+# define cfi_restore_state .cfi_restore_state
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifndef STRCAT
+# define STRCAT strcat
+#endif
+
+#define PARMS 4
+#define STR1 PARMS+4
+#define STR2 STR1+4
+
+#ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+#endif
+
+#define USE_AS_STRCAT
+
+ .section .text.ssse3,"ax",@progbits
+ENTRY (STRCAT)
+ PUSH (%edi)
+ mov STR1(%esp), %edi
+ mov %edi, %edx
+
+#define RETURN jmp L(StrcpyAtom)
+#include "sse2-strlen-atom.S"
+
+L(StrcpyAtom):
+ mov STR2(%esp), %ecx
+ lea (%edi, %eax), %edx
+#ifdef USE_AS_STRNCAT
+ PUSH (%ebx)
+ mov LEN(%esp), %ebx
+ test %ebx, %ebx
+ jz L(StrncatExit0)
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+#endif
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmpb $0, 7(%ecx)
+ jz L(Exit8)
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+#ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+#endif
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmpb $0, 14(%ecx)
+ jz L(Exit15)
+ cmpb $0, 15(%ecx)
+ jz L(Exit16)
+#ifdef USE_AS_STRNCAT
+ cmp $16, %ebx
+ je L(StrncatExit16)
+
+# define RETURN1 POP (%ebx); POP (%edi); ret; \
+ CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# define USE_AS_STRNCPY
+#else
+# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi)
+#endif
+#include "ssse3-strcpy-atom.S"
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit1):
+ movb %bh, 1(%edx)
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit2):
+ movb %bh, 2(%edx)
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit3):
+ movb %bh, 3(%edx)
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit4):
+ movb %bh, 4(%edx)
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit5):
+ movb %bh, 5(%edx)
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit6):
+ movb %bh, 6(%edx)
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit7):
+ movb %bh, 7(%edx)
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit8):
+ movb %bh, 8(%edx)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit9):
+ movb %bh, 9(%edx)
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit10):
+ movb %bh, 10(%edx)
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit11):
+ movb %bh, 11(%edx)
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit12):
+ movb %bh, 12(%edx)
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit13):
+ movb %bh, 13(%edx)
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit14):
+ movb %bh, 14(%edx)
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit15):
+ movb %bh, 15(%edx)
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit16):
+ movb %bh, 16(%edx)
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+#ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ lea (%esi, %edx), %esi
+ lea -9(%ebx), %edx
+ and $1<<7, %dh
+ or %al, %dh
+ lea (%esi), %edx
+ POP (%esi)
+ jz L(ExitHighCase2)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
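+/* Branchless fix-up: cmpb $1 sets CF exactly when the byte at (%eax)
+   is zero; sbb $-1 then computes %eax + 1 - CF, advancing past that
+   byte only when it is not already the terminator. */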
+ xor %cl, %cl
+ movb %cl, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase2):
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ CFI_PUSH(%esi)
+
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHighCase3)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb %bh, 8(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHighCase3):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ cmp $15, %ebx
+ je L(StrncatExit15)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+ movb %bh, 16(%edx)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncatExit0):
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $9, %ebx
+ je L(StrncatExit9)
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmp $10, %ebx
+ je L(StrncatExit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmp $11, %ebx
+ je L(StrncatExit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmp $12, %ebx
+ je L(StrncatExit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmp $13, %ebx
+ je L(StrncatExit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmp $14, %ebx
+ je L(StrncatExit14)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmp $1, %ebx
+ je L(StrncatExit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmp $2, %ebx
+ je L(StrncatExit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmp $3, %ebx
+ je L(StrncatExit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmp $4, %ebx
+ je L(StrncatExit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmp $5, %ebx
+ je L(StrncatExit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmp $6, %ebx
+ je L(StrncatExit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmp $7, %ebx
+ je L(StrncatExit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+ movb %bh, (%eax)
+ movl %edi, %eax
+ RETURN1
+
+#endif
+END (STRCAT)
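Reviewer note: ssse3-strcat-atom.S is assembled from two templates: it inlines sse2-strlen-atom.S with RETURN redefined to `jmp L(StrcpyAtom)`, so the length scan falls straight through into the copy loop pulled in from ssse3-strcpy-atom.S. The composition it implements, in C (strcat_ref is an illustrative name):

    #include <string.h>

    /* Append src at dst's terminator and return dst. */
    static char *strcat_ref(char *dst, const char *src)
    {
        strcpy(dst + strlen(dst), src);
        return dst;
    }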
diff --git a/libc/arch-x86/string/ssse3-strcmp-latest.S b/libc/arch-x86/string/ssse3-strcmp-atom.S
similarity index 99%
rename from libc/arch-x86/string/ssse3-strcmp-latest.S
rename to libc/arch-x86/string/ssse3-strcmp-atom.S
index 673ba57..1275379 100644
--- a/libc/arch-x86/string/ssse3-strcmp-latest.S
+++ b/libc/arch-x86/string/ssse3-strcmp-atom.S
@@ -107,8 +107,12 @@
sub %esi, %ebp
#endif
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
.section .text.ssse3,"ax",@progbits
-ENTRY (ssse3_strcmp_latest)
+ENTRY (STRCMP)
#ifdef USE_AS_STRNCMP
PUSH (%ebp)
#endif
@@ -2271,4 +2275,4 @@
ret
#endif
-END (ssse3_strcmp_latest)
+END (STRCMP)
diff --git a/libc/arch-x86/string/ssse3-strcpy-atom.S b/libc/arch-x86/string/ssse3-strcpy-atom.S
new file mode 100644
index 0000000..30254ca
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strcpy-atom.S
@@ -0,0 +1,3955 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_STRCAT
+
+# ifndef L
+# define L(label) .L##label
+# endif
+
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+# ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+# endif
+
+# ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+# endif
+
+# ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+# endif
+
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY strcpy
+# endif
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 8
+# define ENTRANCE PUSH (%ebx)
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx);
+# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+# else
+# define PARMS 4
+# define ENTRANCE
+# define RETURN ret
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifdef USE_AS_STPCPY
+# define SAVE_RESULT(n) lea n(%edx), %eax
+# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax
+# else
+# define SAVE_RESULT(n) movl %edi, %eax
+# define SAVE_RESULT_TAIL(n) movl %edx, %eax
+# endif
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+/* In this code the following instructions are used for copying:
+     movb   - 1 byte
+     movw   - 2 bytes
+     movl   - 4 bytes
+     movlpd - 8 bytes
+     movaps - 16 bytes - requires 16-byte alignment
+     of source and destination addresses.
+*/
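+
+/* movaps faults on a misaligned address, so the code first copies 16
+   bytes with movlpd, rounds the destination up to a 16-byte boundary,
+   and only then enters the movaps loops. */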
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+# ifdef USE_AS_STRNCPY
+ movl LEN(%esp), %ebx
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+# endif
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+# ifdef USE_AS_STRNCPY
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+# endif
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRLCPY
+ cmp $16, %ebx
+ je L(ExitTail16)
+# endif
+ cmpb $0, 15(%ecx)
+ jz L(ExitTail16)
+
+# if defined USE_AS_STRNCPY && defined USE_AS_STRLCPY
+ cmp $16, %ebx
+ je L(StrlcpyExitTail16)
+# endif
+
+ PUSH (%edi)
+# ifndef USE_AS_STRLCPY
+ mov %edx, %edi
+# else
+ mov %ecx, %edi
+# endif
+#endif
+ PUSH (%esi)
+#ifdef USE_AS_STRNCPY
+ mov %ecx, %esi
+ sub $16, %ebx
+ and $0xf, %esi
+
+/* ebx = len - 16 + (source offset within 16 bytes) */
+
+ add %esi, %ebx
+#endif
+ lea 16(%ecx), %esi
+ and $-16, %esi
+ pxor %xmm0, %xmm0
+ movlpd (%ecx), %xmm1
+ movlpd %xmm1, (%edx)
+
+ pcmpeqb (%esi), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm1, 8(%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+#ifdef USE_AS_STRNCPY
+ add %eax, %esi
+ lea -1(%esi), %esi
+ and $1<<31, %esi
+ test %esi, %esi
+ jnz L(ContinueCopy)
+ lea 16(%ebx), %ebx
+
+L(ContinueCopy):
+#endif
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+/* case: ecx_offset == edx_offset */
+
+ jz L(Align16Both)
+
+ cmp $8, %eax
+ jae L(ShlHigh8)
+ cmp $1, %eax
+ je L(Shl1)
+ cmp $2, %eax
+ je L(Shl2)
+ cmp $3, %eax
+ je L(Shl3)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $5, %eax
+ je L(Shl5)
+ cmp $6, %eax
+ je L(Shl6)
+ jmp L(Shl7)
+
+L(ShlHigh8):
+ je L(Shl8)
+ cmp $9, %eax
+ je L(Shl9)
+ cmp $10, %eax
+ je L(Shl10)
+ cmp $11, %eax
+ je L(Shl11)
+ cmp $12, %eax
+ je L(Shl12)
+ cmp $13, %eax
+ je L(Shl13)
+ cmp $14, %eax
+ je L(Shl14)
+ jmp L(Shl15)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ lea 112(%ebx, %eax), %ebx
+#endif
+ mov $-0x40, %esi
+
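+/* Main loop: 64 bytes per iteration. pminub folds the four 16-byte
+   blocks, so a single pcmpeqb/pmovmskb detects a zero byte anywhere
+   in the line before the stores are committed. */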
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqb %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeaveCase2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+#ifdef USE_AS_STRNCPY
+ lea 48(%ebx), %ebx
+#endif
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+#ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+#endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+#ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+#endif
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqb %xmm7, %xmm0
+#ifdef USE_AS_STRNCPY
+ lea -16(%ebx), %ebx
+#endif
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl1):
+ movaps -1(%ecx), %xmm1
+ movaps 15(%ecx), %xmm2
+L(Shl1Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit1Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl1LoopExit)
+
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 31(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -15(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -1(%ecx), %xmm1
+
+L(Shl1LoopStart):
+ movaps 15(%ecx), %xmm2
+ movaps 31(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 47(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 63(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl1Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave1)
+#endif
+ palignr $1, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl1LoopStart)
+
+L(Shl1LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+ mov $15, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl2):
+ movaps -2(%ecx), %xmm1
+ movaps 14(%ecx), %xmm2
+L(Shl2Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit2Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl2LoopExit)
+
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 30(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -14(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -2(%ecx), %xmm1
+
+L(Shl2LoopStart):
+ movaps 14(%ecx), %xmm2
+ movaps 30(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 46(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 62(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl2Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave2)
+#endif
+ palignr $2, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl2LoopStart)
+
+L(Shl2LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl3):
+ movaps -3(%ecx), %xmm1
+ movaps 13(%ecx), %xmm2
+L(Shl3Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit3Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl3LoopExit)
+
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 29(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -13(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -3(%ecx), %xmm1
+
+L(Shl3LoopStart):
+ movaps 13(%ecx), %xmm2
+ movaps 29(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 45(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 61(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl3Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave3)
+#endif
+ palignr $3, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl3LoopStart)
+
+L(Shl3LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit4Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl4Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave4)
+#endif
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl5):
+ movaps -5(%ecx), %xmm1
+ movaps 11(%ecx), %xmm2
+L(Shl5Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit5Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl5LoopExit)
+
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 27(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -11(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -5(%ecx), %xmm1
+
+L(Shl5LoopStart):
+ movaps 11(%ecx), %xmm2
+ movaps 27(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 43(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 59(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ palignr $5, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl5Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave5)
+#endif
+ palignr $5, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl5LoopStart)
+
+L(Shl5LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl6):
+ movaps -6(%ecx), %xmm1
+ movaps 10(%ecx), %xmm2
+L(Shl6Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit6Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl6LoopExit)
+
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 26(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -10(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -6(%ecx), %xmm1
+
+L(Shl6LoopStart):
+ movaps 10(%ecx), %xmm2
+ movaps 26(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 42(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 58(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ palignr $6, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl6Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave6)
+#endif
+ palignr $6, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl6LoopStart)
+
+L(Shl6LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl7):
+ movaps -7(%ecx), %xmm1
+ movaps 9(%ecx), %xmm2
+L(Shl7Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit7Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl7LoopExit)
+
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 25(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -9(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -7(%ecx), %xmm1
+
+L(Shl7LoopStart):
+ movaps 9(%ecx), %xmm2
+ movaps 25(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 41(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 57(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ palignr $7, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl7Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave7)
+#endif
+ palignr $7, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl7LoopStart)
+
+L(Shl7LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit8Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl8Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave8)
+#endif
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl9):
+ movaps -9(%ecx), %xmm1
+ movaps 7(%ecx), %xmm2
+L(Shl9Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit9Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl9LoopExit)
+
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 23(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -7(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -9(%ecx), %xmm1
+
+L(Shl9LoopStart):
+ movaps 7(%ecx), %xmm2
+ movaps 23(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 39(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 55(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ palignr $9, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl9Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave9)
+#endif
+ palignr $9, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl9LoopStart)
+
+L(Shl9LoopExit):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $7, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl10):
+ movaps -10(%ecx), %xmm1
+ movaps 6(%ecx), %xmm2
+L(Shl10Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit10Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl10LoopExit)
+
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 22(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -6(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -10(%ecx), %xmm1
+
+L(Shl10LoopStart):
+ movaps 6(%ecx), %xmm2
+ movaps 22(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 38(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 54(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ palignr $10, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl10Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave10)
+#endif
+ palignr $10, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl10LoopStart)
+
+L(Shl10LoopExit):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $6, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl11):
+ movaps -11(%ecx), %xmm1
+ movaps 5(%ecx), %xmm2
+L(Shl11Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit11Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl11LoopExit)
+
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 21(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -5(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -11(%ecx), %xmm1
+
+L(Shl11LoopStart):
+ movaps 5(%ecx), %xmm2
+ movaps 21(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 37(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 53(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ palignr $11, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl11Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave11)
+#endif
+ palignr $11, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl11LoopStart)
+
+L(Shl11LoopExit):
+ movlpd -3(%ecx), %xmm0
+ movlpd %xmm0, -3(%edx)
+ mov $5, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit12Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ palignr $12, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl12Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave12)
+#endif
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl13):
+ movaps -13(%ecx), %xmm1
+ movaps 3(%ecx), %xmm2
+L(Shl13Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit13Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl13LoopExit)
+
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 19(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -3(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -13(%ecx), %xmm1
+
+L(Shl13LoopStart):
+ movaps 3(%ecx), %xmm2
+ movaps 19(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 35(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 51(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ palignr $13, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl13Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave13)
+#endif
+ palignr $13, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl13LoopStart)
+
+L(Shl13LoopExit):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl14):
+ movaps -14(%ecx), %xmm1
+ movaps 2(%ecx), %xmm2
+L(Shl14Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit14Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl14LoopExit)
+
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 18(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -2(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -14(%ecx), %xmm1
+
+L(Shl14LoopStart):
+ movaps 2(%ecx), %xmm2
+ movaps 18(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 34(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 50(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ palignr $14, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl14Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave14)
+#endif
+ palignr $14, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl14LoopStart)
+
+L(Shl14LoopExit):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ jmp L(CopyFrom1To16Bytes)
+
+ .p2align 4
+L(Shl15):
+ movaps -15(%ecx), %xmm1
+ movaps 1(%ecx), %xmm2
+L(Shl15Start):
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm3, %xmm1
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ movaps %xmm3, %xmm1
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+
+ pcmpeqb %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+#ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ jbe L(StrncpyExit15Case2OrCase3)
+#endif
+ test %eax, %eax
+ jnz L(Shl15LoopExit)
+
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 17(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -1(%ecx), %ecx
+ sub %eax, %edx
+#ifdef USE_AS_STRNCPY
+ add %eax, %ebx
+#endif
+ movaps -15(%ecx), %xmm1
+
+L(Shl15LoopStart):
+ movaps 1(%ecx), %xmm2
+ movaps 17(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 33(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 49(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqb %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ palignr $15, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl15Start)
+#ifdef USE_AS_STRNCPY
+ sub $64, %ebx
+ jbe L(StrncpyLeave15)
+#endif
+ palignr $15, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl15LoopStart)
+
+L(Shl15LoopExit):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
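+/* For strcat/strlcpy the dispatcher below is compiled out and
+   L(CopyFrom1To16Bytes) is presumably supplied by the including
+   wrapper, hence the conditional jump; otherwise the label follows
+   immediately and control simply falls through.  */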
+#if defined USE_AS_STRCAT || defined USE_AS_STRLCPY
+ jmp L(CopyFrom1To16Bytes)
+#endif
+
+
+#if !defined USE_AS_STRCAT && !defined USE_AS_STRLCPY
+
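+/* Exit dispatcher: %eax holds the pmovmskb NUL mask for the 16
+   source bytes at %esi(%ecx), and %esi is the size of the prefix of
+   this chunk that has already been stored.  Advance both pointers
+   past that prefix, then let the lowest set bit of %eax pick the
+   L(ExitN) that copies the final N bytes, terminator included.  */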
+ .p2align 4
+L(CopyFrom1To16Bytes):
+# ifdef USE_AS_STRNCPY
+ add $16, %ebx
+# endif
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
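+ /* The stpcpy flavour must return a pointer to the NUL when one
+ was written, dst+n otherwise: cmpb sets CF only for a zero
+ byte, so sbb $-1 bumps %eax by one except in that case. */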
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+
+ .p2align 4
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+
+ .p2align 4
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+# ifdef USE_AS_STRNCPY
+
+ CFI_PUSH(%esi)
+
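+/* Same dispatch, but limited by the remaining count: the NUL mask in
+   %al/%ah and the count in %ebx are checked in step, and whichever
+   limit is hit first selects the L(ExitN) for the final bytes.  */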
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ add %esi, %edx
+
+ POP (%esi)
+
+ test %al, %al
+ jz L(ExitHighCase2)
+
+ cmp $8, %ebx
+ ja L(CopyFrom1To16BytesLess8)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(Exit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(Exit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(Exit7)
+ jmp L(Exit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $8, %ebx
+ jbe L(CopyFrom1To16BytesLess8Case3)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(Exit11)
+ test $0x08, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(Exit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(Exit15)
+ jmp L(Exit16)
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+ cmp $4, %ebx
+ ja L(ExitHigh4Case3)
+
+ cmp $1, %ebx
+ je L(Exit1)
+ cmp $2, %ebx
+ je L(Exit2)
+ cmp $3, %ebx
+ je L(Exit3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (4)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4Case3):
+ cmp $5, %ebx
+ je L(Exit5)
+ cmp $6, %ebx
+ je L(Exit6)
+ cmp $7, %ebx
+ je L(Exit7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT (8)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8Case3):
+ cmp $12, %ebx
+ ja L(ExitHigh12Case3)
+
+ cmp $9, %ebx
+ je L(Exit9)
+ cmp $10, %ebx
+ je L(Exit10)
+ cmp $11, %ebx
+ je L(Exit11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (12)
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12Case3):
+ cmp $13, %ebx
+ je L(Exit13)
+ cmp $14, %ebx
+ je L(Exit14)
+ cmp $15, %ebx
+ je L(Exit15)
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+ SAVE_RESULT (16)
+ RETURN1
+
+# endif
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+CFI_POP (%edi)
+
+# ifdef USE_AS_STRNCPY
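+/* Zero-fill tails for strncpy.  %ecx points just past the bytes
+   already written, %ebx is the number of padding bytes still owed,
+   and %edx/%xmm0 have been cleared; L(Fill1)..L(Fill16) store the
+   final 1..16 zero bytes, using overlapping writes where needed.  */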
+ .p2align 4
+L(Fill0):
+ RETURN
+
+ .p2align 4
+L(Fill1):
+ movb %dl, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill2):
+ movw %dx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill3):
+ movw %dx, (%ecx)
+ movb %dl, 2(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill4):
+ movl %edx, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill5):
+ movl %edx, (%ecx)
+ movb %dl, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill6):
+ movl %edx, (%ecx)
+ movw %dx, 4(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill7):
+ movl %edx, (%ecx)
+ movl %edx, 3(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill8):
+ movlpd %xmm0, (%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill9):
+ movlpd %xmm0, (%ecx)
+ movb %dl, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill10):
+ movlpd %xmm0, (%ecx)
+ movw %dx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill11):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill12):
+ movlpd %xmm0, (%ecx)
+ movl %edx, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill13):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 5(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill14):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 6(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill15):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 7(%ecx)
+ RETURN
+
+ .p2align 4
+L(Fill16):
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+ RETURN
+
+ .p2align 4
+L(StrncpyFillExit1):
+ lea 16(%ebx), %ebx
+L(FillFrom1To16Bytes):
+ test %ebx, %ebx
+ jz L(Fill0)
+ cmp $16, %ebx
+ je L(Fill16)
+ cmp $8, %ebx
+ je L(Fill8)
+ jg L(FillMore8)
+ cmp $4, %ebx
+ je L(Fill4)
+ jg L(FillMore4)
+ cmp $2, %ebx
+ jl L(Fill1)
+ je L(Fill2)
+ jg L(Fill3)
+L(FillMore8): /* but less than 16 */
+ cmp $12, %ebx
+ je L(Fill12)
+ jl L(FillLess12)
+ cmp $14, %ebx
+ jl L(Fill13)
+ je L(Fill14)
+ jg L(Fill15)
+L(FillMore4): /* but less than 8 */
+ cmp $6, %ebx
+ jl L(Fill5)
+ je L(Fill6)
+ jg L(Fill7)
+L(FillLess12): /* but more than 8 */
+ cmp $10, %ebx
+ jl L(Fill9)
+ je L(Fill10)
+ jmp L(Fill11)
+
+ CFI_PUSH(%edi)
+
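+/* Bulk zero fill: write 16 unaligned zero bytes, round the advanced
+   pointer down to a 16-byte boundary (crediting the overlap back to
+   %ebx), clear 64 bytes per movdqa iteration, and finish through the
+   ladder above.  */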
+ .p2align 4
+L(StrncpyFillTailWithZero1):
+ POP (%edi)
+L(StrncpyFillTailWithZero):
+ pxor %xmm0, %xmm0
+ xor %edx, %edx
+ sub $16, %ebx
+ jbe L(StrncpyFillExit1)
+
+ movlpd %xmm0, (%ecx)
+ movlpd %xmm0, 8(%ecx)
+
+ lea 16(%ecx), %ecx
+
+ mov %ecx, %edx
+ and $0xf, %edx
+ sub %edx, %ecx
+ add %edx, %ebx
+ xor %edx, %edx
+ sub $64, %ebx
+ jb L(StrncpyFillLess64)
+
+L(StrncpyFillLoopMovdqa):
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ movdqa %xmm0, 32(%ecx)
+ movdqa %xmm0, 48(%ecx)
+ lea 64(%ecx), %ecx
+ sub $64, %ebx
+ jae L(StrncpyFillLoopMovdqa)
+
+L(StrncpyFillLess64):
+ add $32, %ebx
+ jl L(StrncpyFillLess32)
+ movdqa %xmm0, (%ecx)
+ movdqa %xmm0, 16(%ecx)
+ lea 32(%ecx), %ecx
+ sub $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+
+L(StrncpyFillLess32):
+ add $16, %ebx
+ jl L(StrncpyFillExit1)
+ movdqa %xmm0, (%ecx)
+ lea 16(%ecx), %ecx
+ jmp L(FillFrom1To16Bytes)
+# endif
+
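+/* L(ExitTailN): taken when the string, or for strncpy the count,
+   ends within the first 16 bytes.  Copy N bytes with the narrowest
+   loads available, then pad with zeroes if part of the count
+   remains.  */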
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+ SAVE_RESULT_TAIL (0)
+# ifdef USE_AS_STRNCPY
+ sub $1, %ebx
+ lea 1(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ SAVE_RESULT_TAIL (1)
+# ifdef USE_AS_STRNCPY
+ sub $2, %ebx
+ lea 2(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+ SAVE_RESULT_TAIL (2)
+# ifdef USE_AS_STRNCPY
+ sub $3, %ebx
+ lea 3(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+ SAVE_RESULT_TAIL (4)
+# ifdef USE_AS_STRNCPY
+ sub $5, %ebx
+ lea 5(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+ SAVE_RESULT_TAIL (5)
+# ifdef USE_AS_STRNCPY
+ sub $6, %ebx
+ lea 6(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+ SAVE_RESULT_TAIL (6)
+# ifdef USE_AS_STRNCPY
+ sub $7, %ebx
+ lea 7(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ SAVE_RESULT_TAIL (7)
+# ifdef USE_AS_STRNCPY
+ sub $8, %ebx
+ lea 8(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movb 8(%ecx), %al
+ movlpd %xmm0, (%edx)
+ movb %al, 8(%edx)
+ SAVE_RESULT_TAIL (8)
+# ifdef USE_AS_STRNCPY
+ sub $9, %ebx
+ lea 9(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movw 8(%ecx), %ax
+ movlpd %xmm0, (%edx)
+ movw %ax, 8(%edx)
+ SAVE_RESULT_TAIL (9)
+# ifdef USE_AS_STRNCPY
+ sub $10, %ebx
+ lea 10(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 7(%edx)
+ SAVE_RESULT_TAIL (10)
+# ifdef USE_AS_STRNCPY
+ sub $11, %ebx
+ lea 11(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ SAVE_RESULT_TAIL (12)
+# ifdef USE_AS_STRNCPY
+ sub $13, %ebx
+ lea 13(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ SAVE_RESULT_TAIL (13)
+# ifdef USE_AS_STRNCPY
+ sub $14, %ebx
+ lea 14(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ SAVE_RESULT_TAIL (14)
+# ifdef USE_AS_STRNCPY
+ sub $15, %ebx
+ lea 15(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# endif
+ RETURN
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ SAVE_RESULT_TAIL (15)
+# ifdef USE_AS_STRNCPY
+ sub $16, %ebx
+ lea 16(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN
+#endif
+
+#ifdef USE_AS_STRNCPY
+# ifndef USE_AS_STRCAT
+ CFI_PUSH (%esi)
+ CFI_PUSH (%edi)
+# endif
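+/* Count-limited endings for the aligned 64-byte loop.  In case 2 a
+   NUL was also seen in the current block, so the NUL mask and the
+   remaining count race for the exit; in case 3 the count ran out
+   first and exactly %ebx further bytes are copied.  */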
+ .p2align 4
+L(StrncpyLeaveCase2OrCase3):
+ test %eax, %eax
+ jnz L(Aligned64LeaveCase2)
+
+L(Aligned64LeaveCase3):
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase3)
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase3)
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(Aligned64LeaveCase2):
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ add $48, %ebx
+ jle L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm6, -32(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+ jmp L(CopyFrom1To16BytesCase2)
+
+/*--------------------------------------------------*/
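+/* L(StrncpyExitNCase2OrCase3): the count expired during the N-byte
+   shift prologue.  Store the 16-N leading bytes of the current chunk
+   (with overlapping writes where convenient), record that length in
+   %esi, and let the case 2/3 dispatcher finish the job.  */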
+ .p2align 4
+L(StrncpyExit1Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+ mov $15, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit2Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
+ mov $14, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit3Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
+ mov $13, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit4Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ mov $12, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit5Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
+ mov $11, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit6Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
+ mov $10, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit7Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
+ mov $9, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit8Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $8, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit9Case2OrCase3):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ mov $7, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit10Case2OrCase3):
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
+ mov $6, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit11Case2OrCase3):
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
+ mov $5, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit12Case2OrCase3):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit13Case2OrCase3):
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
+ mov $3, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit14Case2OrCase3):
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
+ mov $2, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
+ .p2align 4
+L(StrncpyExit15Case2OrCase3):
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
+ mov $1, %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+ jmp L(CopyFrom1To16BytesCase3)
+
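+/* L(StrncpyLeaveN): the count expired inside the 64-byte software
+   pipeline.  %ebx plus 48 says how many of the four pending 16-byte
+   stores still fit; flush those, then redo the last up-to-16 bytes
+   with unaligned (or split) loads and drop into case 3.  */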
+L(StrncpyLeave1):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit1)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 31(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit1)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit1):
+ lea 15(%edx, %esi), %edx
+ lea 15(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave2):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit2)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 30(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit2)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit2):
+ lea 14(%edx, %esi), %edx
+ lea 14(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave3):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit3)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 29(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit3)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit3):
+ lea 13(%edx, %esi), %edx
+ lea 13(%ecx, %esi), %ecx
+ movdqu -16(%ecx), %xmm0
+ xor %esi, %esi
+ movdqu %xmm0, -16(%edx)
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave4):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit4)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit4)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit4):
+ lea 12(%edx, %esi), %edx
+ lea 12(%ecx, %esi), %ecx
+ movlpd -12(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -12(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave5):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit5)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 27(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit5)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit5):
+ lea 11(%edx, %esi), %edx
+ lea 11(%ecx, %esi), %ecx
+ movlpd -11(%ecx), %xmm0
+ movl -4(%ecx), %eax
+ movlpd %xmm0, -11(%edx)
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave6):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit6)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 26(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit6)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit6):
+ lea 10(%edx, %esi), %edx
+ lea 10(%ecx, %esi), %ecx
+
+ movlpd -10(%ecx), %xmm0
+ movw -2(%ecx), %ax
+ movlpd %xmm0, -10(%edx)
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave7):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit7)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 25(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit7)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit7):
+ lea 9(%edx, %esi), %edx
+ lea 9(%ecx, %esi), %ecx
+
+ movlpd -9(%ecx), %xmm0
+ movb -1(%ecx), %ah
+ movlpd %xmm0, -9(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave8):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit8)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit8)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit8):
+ lea 8(%edx, %esi), %edx
+ lea 8(%ecx, %esi), %ecx
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave9):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit9)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 23(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit9)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit9):
+ lea 7(%edx, %esi), %edx
+ lea 7(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave10):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit10)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 22(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit10)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit10):
+ lea 6(%edx, %esi), %edx
+ lea 6(%ecx, %esi), %ecx
+
+ movlpd -8(%ecx), %xmm0
+ movlpd %xmm0, -8(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave11):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit11)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 21(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit11)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit11):
+ lea 5(%edx, %esi), %edx
+ lea 5(%ecx, %esi), %ecx
+ movl -5(%ecx), %esi
+ movb -1(%ecx), %ah
+ movl %esi, -5(%edx)
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave12):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit12)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit12)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit12):
+ lea 4(%edx, %esi), %edx
+ lea 4(%ecx, %esi), %ecx
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave13):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit13)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 19(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit13)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit13):
+ lea 3(%edx, %esi), %edx
+ lea 3(%ecx, %esi), %ecx
+
+ movl -4(%ecx), %eax
+ movl %eax, -4(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave14):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit14)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 18(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit14)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit14):
+ lea 2(%edx, %esi), %edx
+ lea 2(%ecx, %esi), %ecx
+ movw -2(%ecx), %ax
+ movw %ax, -2(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+
+L(StrncpyLeave15):
+ movaps %xmm2, %xmm3
+ add $48, %ebx
+ jle L(StrncpyExit15)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 17(%ecx), %xmm2
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, 16(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm4, 32(%edx)
+ lea 16(%esi), %esi
+ sub $16, %ebx
+ jbe L(StrncpyExit15)
+ movaps %xmm5, 48(%edx)
+ lea 16(%esi), %esi
+ lea -16(%ebx), %ebx
+L(StrncpyExit15):
+ lea 1(%edx, %esi), %edx
+ lea 1(%ecx, %esi), %ecx
+ movb -1(%ecx), %ah
+ movb %ah, -1(%edx)
+ xor %esi, %esi
+ jmp L(CopyFrom1To16BytesCase3)
+#endif
+
+#if !defined USE_AS_STRCAT && !defined USE_AS_STRLCPY
+# ifdef USE_AS_STRNCPY
+ CFI_POP (%esi)
+ CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail0):
+ movl %edx, %eax
+ RETURN
+
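+/* Entry paths for short strncpy counts (%ebx below 16): each source
+   byte is tested for NUL before the next may be touched, so nothing
+   is ever read past the terminator.  */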
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $12, %ebx
+ jbe L(StrncpyExit12Bytes)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+ cmp $13, %ebx
+ je L(ExitTail13)
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $14, %ebx
+ je L(ExitTail14)
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ movlpd (%ecx), %xmm0
+ movlpd 7(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 7(%edx)
+# ifdef USE_AS_STPCPY
+ lea 14(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit12Bytes):
+ cmp $9, %ebx
+ je L(ExitTail9)
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $10, %ebx
+ je L(ExitTail10)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $11, %ebx
+ je L(ExitTail11)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT_TAIL (11)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $4, %ebx
+ jbe L(StrncpyExit4Bytes)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+
+ cmp $5, %ebx
+ je L(ExitTail5)
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $6, %ebx
+ je L(ExitTail6)
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $7, %ebx
+ je L(ExitTail7)
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+# ifdef USE_AS_STPCPY
+ lea 7(%edx), %eax
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# else
+ movl %edx, %eax
+# endif
+ RETURN
+
+ .p2align 4
+L(StrncpyExit4Bytes):
+ test %ebx, %ebx
+ jz L(ExitTail0)
+ cmp $1, %ebx
+ je L(ExitTail1)
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $2, %ebx
+ je L(ExitTail2)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $3, %ebx
+ je L(ExitTail3)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT_TAIL (3)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+ RETURN
+# endif
+
+END (STRCPY)
+#endif
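A note on the USE_AS_STPCPY tails above: `cmpb $1, (%eax)` sets the carry flag exactly when the byte at %eax is zero, and `sbb $-1, %eax` then computes %eax + 1 - CF, so the returned pointer steps past the last stored byte only when that byte was not the terminating NUL. A minimal C sketch of the same branchless adjustment (the function name is illustrative, not part of the patch):

    /* Branchless model of "cmpb $1, (%eax); sbb $-1, %eax":
     * leave p on the byte if it is the NUL, else step past it. */
    static char *stp_adjust(char *p)
    {
        return p + (*p != '\0');
    }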
diff --git a/libc/arch-x86/string/ssse3-strlcat-atom.S b/libc/arch-x86/string/ssse3-strlcat-atom.S
new file mode 100644
index 0000000..daaf254
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strlcat-atom.S
@@ -0,0 +1,1225 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* Optimized strlcat with SSSE3 */
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+#define L(label) .L##Prolog_##label
+
+#define DST 4
+#define SRC DST+8
+#define LEN SRC+4
+
+ .text
+ENTRY (strlcat)
+ mov DST(%esp), %edx
+ PUSH (%ebx)
+ mov LEN(%esp), %ebx
+ sub $4, %ebx
+ jbe L(len_less4_prolog)
+
+#define RETURN jmp L(StrcpyStep)
+#define edi ebx
+
+#define USE_AS_STRNLEN
+#define USE_AS_STRCAT
+#define USE_AS_STRLCAT
+
+#include "sse2-strlen-atom.S"
+
+ .p2align 4
+L(StrcpyStep):
+
+#undef edi
+#undef L
+#define L(label) .L##label
+#undef RETURN
+#define RETURN POP (%ebx); ret; CFI_PUSH (%ebx);
+#define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
+
+ movl SRC(%esp), %ecx
+ movl LEN(%esp), %ebx
+
+ cmp %eax, %ebx
+ je L(CalculateLengthOfSrcProlog)
+ sub %eax, %ebx
+
+ test %ebx, %ebx
+ jz L(CalculateLengthOfSrcProlog)
+
+ mov DST + 4(%esp), %edx
+
+ PUSH (%edi)
+ add %eax, %edx
+ mov %ecx, %edi
+ sub %eax, %edi
+
+ cmp $8, %ebx
+ jbe L(StrncpyExit8Bytes)
+
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmpb $0, 7(%ecx)
+ jz L(Exit8)
+ cmp $16, %ebx
+ jb L(StrncpyExit15Bytes)
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmpb $0, 14(%ecx)
+ jz L(Exit15)
+ cmpb $0, 15(%ecx)
+ jz L(Exit16)
+ cmp $16, %ebx
+ je L(StrlcpyExit16)
+
+#define USE_AS_STRNCPY
+#include "ssse3-strcpy-atom.S"
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ lea 3(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ lea 7(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ lea 11(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+
+ lea 15(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ add %esi, %edx
+
+ POP (%esi)
+
+ test %al, %al
+ jz L(ExitHighCase2)
+
+ cmp $8, %ebx
+ ja L(CopyFrom1To16BytesLess8)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(StrlcpyExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(StrlcpyExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(StrlcpyExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(StrlcpyExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(StrlcpyExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(StrlcpyExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(StrlcpyExit7)
+ test $0x80, %al
+ jnz L(Exit8)
+ jmp L(StrlcpyExit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $8, %ebx
+ jbe L(CopyFrom1To16BytesLess8Case3)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(StrlcpyExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(StrlcpyExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(StrlcpyExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(StrlcpyExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(StrlcpyExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(StrlcpyExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(StrlcpyExit15)
+ test $0x80, %ah
+ jnz L(Exit16)
+ jmp L(StrlcpyExit16)
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+ cmp $4, %ebx
+ ja L(ExitHigh4Case3)
+
+ cmp $1, %ebx
+ je L(StrlcpyExit1)
+ cmp $2, %ebx
+ je L(StrlcpyExit2)
+ cmp $3, %ebx
+ je L(StrlcpyExit3)
+L(StrlcpyExit4):
+ movb %bh, 3(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ lea 4(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh4Case3):
+ cmp $5, %ebx
+ je L(StrlcpyExit5)
+ cmp $6, %ebx
+ je L(StrlcpyExit6)
+ cmp $7, %ebx
+ je L(StrlcpyExit7)
+L(StrlcpyExit8):
+ movb %bh, 7(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ lea 8(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh8Case3):
+ cmp $12, %ebx
+ ja L(ExitHigh12Case3)
+
+ cmp $9, %ebx
+ je L(StrlcpyExit9)
+ cmp $10, %ebx
+ je L(StrlcpyExit10)
+ cmp $11, %ebx
+ je L(StrlcpyExit11)
+L(StrlcpyExit12):
+ movb %bh, 11(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ lea 12(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh12Case3):
+ cmp $13, %ebx
+ je L(StrlcpyExit13)
+ cmp $14, %ebx
+ je L(StrlcpyExit14)
+ cmp $15, %ebx
+ je L(StrlcpyExit15)
+L(StrlcpyExit16):
+ movb %bh, 15(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ lea 16(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(StrlcpyExit1):
+ movb %bh, (%edx)
+
+ lea 1(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ mov %ecx, %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit2):
+ movb %bh, 1(%edx)
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ lea 2(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ lea 1(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit3):
+ movb %bh, 2(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ lea 3(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ lea 2(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit5):
+ movb %bh, 4(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ lea 5(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ lea 4(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit6):
+ movb %bh, 5(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ lea 6(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ lea 5(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit7):
+ movb %bh, 6(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ lea 7(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ lea 6(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit9):
+ movb %bh, 8(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ lea 9(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ lea 8(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit10):
+ movb %bh, 9(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ lea 10(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ lea 9(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit11):
+ movb %bh, 10(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ lea 11(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ lea 10(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit13):
+ movb %bh, 12(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ lea 13(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ lea 12(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit14):
+ movb %bh, 13(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ lea 14(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ lea 13(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit15):
+ movb %bh, 14(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ lea 15(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ lea 14(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $12, %ebx
+ ja L(StrncpyExit15Bytes1)
+
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+ cmp $9, %ebx
+ je L(StrlcpyExit9)
+
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmp $10, %ebx
+ je L(StrlcpyExit10)
+
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmp $11, %ebx
+ je L(StrlcpyExit11)
+
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+ jmp L(StrlcpyExit12)
+
+ .p2align 4
+L(StrncpyExit15Bytes1):
+ cmpb $0, 8(%ecx)
+ jz L(Exit9)
+ cmpb $0, 9(%ecx)
+ jz L(Exit10)
+ cmpb $0, 10(%ecx)
+ jz L(Exit11)
+ cmpb $0, 11(%ecx)
+ jz L(Exit12)
+
+ cmpb $0, 12(%ecx)
+ jz L(Exit13)
+ cmp $13, %ebx
+ je L(StrlcpyExit13)
+
+ cmpb $0, 13(%ecx)
+ jz L(Exit14)
+ cmp $14, %ebx
+ je L(StrlcpyExit14)
+
+ cmpb $0, 14(%ecx)
+ jz L(Exit15)
+ jmp L(StrlcpyExit15)
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $4, %ebx
+ ja L(StrncpyExit8Bytes1)
+
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmp $1, %ebx
+ je L(StrlcpyExit1)
+
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmp $2, %ebx
+ je L(StrlcpyExit2)
+
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmp $3, %ebx
+ je L(StrlcpyExit3)
+
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+ jmp L(StrlcpyExit4)
+
+ .p2align 4
+L(StrncpyExit8Bytes1):
+ cmpb $0, (%ecx)
+ jz L(Exit1)
+ cmpb $0, 1(%ecx)
+ jz L(Exit2)
+ cmpb $0, 2(%ecx)
+ jz L(Exit3)
+ cmpb $0, 3(%ecx)
+ jz L(Exit4)
+
+ cmpb $0, 4(%ecx)
+ jz L(Exit5)
+ cmp $5, %ebx
+ je L(StrlcpyExit5)
+
+ cmpb $0, 5(%ecx)
+ jz L(Exit6)
+ cmp $6, %ebx
+ je L(StrlcpyExit6)
+
+ cmpb $0, 6(%ecx)
+ jz L(Exit7)
+ cmp $7, %ebx
+ je L(StrlcpyExit7)
+
+ cmpb $0, 7(%ecx)
+ jz L(Exit8)
+ jmp L(StrlcpyExit8)
+
+ CFI_POP (%edi)
+
+ .p2align 4
+L(Prolog_return_start_len):
+ movl LEN(%esp), %ebx
+ movl SRC(%esp), %ecx
+L(CalculateLengthOfSrcProlog):
+ mov %ecx, %edx
+ sub %ebx, %ecx
+
+ .p2align 4
+L(CalculateLengthOfSrc):
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+
+ pxor %xmm0, %xmm0
+ lea 16(%edx), %eax
+ add $16, %ecx
+ and $-16, %eax
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqb %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(aligned_64_loop)
+
+ pcmpeqb -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 48(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+
+ .p2align 4
+L(exit):
+ sub %ecx, %eax
+ test %dl, %dl
+ jz L(exit_more_8)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_more_4)
+ test $0x01, %dl
+ jnz L(exit_0)
+ test $0x02, %dl
+ jnz L(exit_1)
+ test $0x04, %dl
+ jnz L(exit_2)
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_4):
+ test $0x10, %dl
+ jnz L(exit_4)
+ test $0x20, %dl
+ jnz L(exit_5)
+ test $0x40, %dl
+ jnz L(exit_6)
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_8):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_more_12)
+ test $0x01, %dh
+ jnz L(exit_8)
+ test $0x02, %dh
+ jnz L(exit_9)
+ test $0x04, %dh
+ jnz L(exit_10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_12):
+ test $0x10, %dh
+ jnz L(exit_12)
+ test $0x20, %dh
+ jnz L(exit_13)
+ test $0x40, %dh
+ jnz L(exit_14)
+ add $15, %eax
+L(exit_0):
+ RETURN
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ RETURN
+
+L(exit_2):
+ add $2, %eax
+ RETURN
+
+L(exit_3):
+ add $3, %eax
+ RETURN
+
+L(exit_4):
+ add $4, %eax
+ RETURN
+
+L(exit_5):
+ add $5, %eax
+ RETURN
+
+L(exit_6):
+ add $6, %eax
+ RETURN
+
+L(exit_7):
+ add $7, %eax
+ RETURN
+
+L(exit_8):
+ add $8, %eax
+ RETURN
+
+L(exit_9):
+ add $9, %eax
+ RETURN
+
+L(exit_10):
+ add $10, %eax
+ RETURN
+
+L(exit_11):
+ add $11, %eax
+ RETURN
+
+L(exit_12):
+ add $12, %eax
+ RETURN
+
+L(exit_13):
+ add $13, %eax
+ RETURN
+
+L(exit_14):
+ add $14, %eax
+ RETURN
+
+L(exit_15):
+ add $15, %eax
+ RETURN
+
+L(exit_tail0):
+ mov %edx, %eax
+ sub %ecx, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail1):
+ lea 1(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail2):
+ lea 2(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail3):
+ lea 3(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail4):
+ lea 4(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail5):
+ lea 5(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail6):
+ lea 6(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail7):
+ lea 7(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail8):
+ lea 8(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail9):
+ lea 9(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail10):
+ lea 10(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail11):
+ lea 11(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail12):
+ lea 12(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail13):
+ lea 13(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail14):
+ lea 14(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail15):
+ lea 15(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+END (strlcat)
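L(CalculateLengthOfSrc) above (and its twin in ssse3-strlcpy-atom.S below) measures the remaining source with two SSE2 idioms: pcmpeqb against a zero register turns every NUL lane into 0xff and pmovmskb packs the lane signs into a 16-bit mask, while L(aligned_64_loop) first folds four 16-byte blocks with pminub, since the byte-wise unsigned minimum is zero iff some input byte is zero. A rough C intrinsics model of both steps, assuming aligned pointers and GCC/Clang builtins (names are illustrative, not the shipped implementation):

    #include <emmintrin.h>  /* SSE2: _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_min_epu8 */

    /* Offset of the first NUL in one aligned 16-byte block, or -1. */
    static int nul_offset16(const char *p)
    {
        __m128i chunk = _mm_load_si128((const __m128i *)p);
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
        return mask ? __builtin_ctz(mask) : -1;
    }

    /* The L(aligned_64_loop) trick: one pcmpeqb/pmovmskb pair tests a
     * whole 64-byte line after folding it with pminub. */
    static int has_nul64(const char *p)
    {
        const __m128i *v = (const __m128i *)p;
        __m128i m = _mm_min_epu8(_mm_min_epu8(_mm_load_si128(v),
                                              _mm_load_si128(v + 1)),
                                 _mm_min_epu8(_mm_load_si128(v + 2),
                                              _mm_load_si128(v + 3)));
        return _mm_movemask_epi8(_mm_cmpeq_epi8(m, _mm_setzero_si128())) != 0;
    }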
diff --git a/libc/arch-x86/string/ssse3-strlcpy-atom.S b/libc/arch-x86/string/ssse3-strlcpy-atom.S
new file mode 100644
index 0000000..cdb17cc
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-strlcpy-atom.S
@@ -0,0 +1,1403 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define USE_AS_STRNCPY
+#define STRCPY strlcpy
+#define STRLEN strlcpy
+#define USE_AS_STRLCPY
+#include "ssse3-strcpy-atom.S"
+
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ test $0x04, %al
+ jnz L(Exit3)
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ lea 3(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
+ test $0x10, %al
+ jnz L(Exit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ test $0x40, %al
+ jnz L(Exit7)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ lea 7(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ lea 11(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
+ test $0x10, %ah
+ jnz L(Exit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+L(Exit16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+
+ lea 15(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2):
+ add $16, %ebx
+ add %esi, %ecx
+ add %esi, %edx
+
+ POP (%esi)
+
+ test %al, %al
+ jz L(ExitHighCase2)
+
+ cmp $8, %ebx
+ ja L(CopyFrom1To16BytesLess8)
+
+ test $0x01, %al
+ jnz L(Exit1)
+ cmp $1, %ebx
+ je L(StrlcpyExit1)
+ test $0x02, %al
+ jnz L(Exit2)
+ cmp $2, %ebx
+ je L(StrlcpyExit2)
+ test $0x04, %al
+ jnz L(Exit3)
+ cmp $3, %ebx
+ je L(StrlcpyExit3)
+ test $0x08, %al
+ jnz L(Exit4)
+ cmp $4, %ebx
+ je L(StrlcpyExit4)
+ test $0x10, %al
+ jnz L(Exit5)
+ cmp $5, %ebx
+ je L(StrlcpyExit5)
+ test $0x20, %al
+ jnz L(Exit6)
+ cmp $6, %ebx
+ je L(StrlcpyExit6)
+ test $0x40, %al
+ jnz L(Exit7)
+ cmp $7, %ebx
+ je L(StrlcpyExit7)
+ test $0x80, %al
+ jnz L(Exit8)
+ jmp L(StrlcpyExit8)
+
+ .p2align 4
+L(ExitHighCase2):
+ cmp $8, %ebx
+ jbe L(CopyFrom1To16BytesLess8Case3)
+
+ test $0x01, %ah
+ jnz L(Exit9)
+ cmp $9, %ebx
+ je L(StrlcpyExit9)
+ test $0x02, %ah
+ jnz L(Exit10)
+ cmp $10, %ebx
+ je L(StrlcpyExit10)
+ test $0x04, %ah
+ jnz L(Exit11)
+ cmp $11, %ebx
+ je L(StrlcpyExit11)
+ test $0x8, %ah
+ jnz L(Exit12)
+ cmp $12, %ebx
+ je L(StrlcpyExit12)
+ test $0x10, %ah
+ jnz L(Exit13)
+ cmp $13, %ebx
+ je L(StrlcpyExit13)
+ test $0x20, %ah
+ jnz L(Exit14)
+ cmp $14, %ebx
+ je L(StrlcpyExit14)
+ test $0x40, %ah
+ jnz L(Exit15)
+ cmp $15, %ebx
+ je L(StrlcpyExit15)
+ test $0x80, %ah
+ jnz L(Exit16)
+ jmp L(StrlcpyExit16)
+
+ CFI_PUSH(%esi)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3):
+ test %eax, %eax
+ jnz L(CopyFrom1To16BytesCase2)
+
+ .p2align 4
+L(CopyFrom1To16BytesCase3):
+ add $16, %ebx
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+
+ cmp $8, %ebx
+ ja L(ExitHigh8Case3)
+
+L(CopyFrom1To16BytesLess8Case3):
+ cmp $4, %ebx
+ ja L(ExitHigh4Case3)
+
+ cmp $1, %ebx
+ je L(StrlcpyExit1)
+ cmp $2, %ebx
+ je L(StrlcpyExit2)
+ cmp $3, %ebx
+ je L(StrlcpyExit3)
+L(StrlcpyExit4):
+ movb %bh, 3(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ lea 4(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh4Case3):
+ cmp $5, %ebx
+ je L(StrlcpyExit5)
+ cmp $6, %ebx
+ je L(StrlcpyExit6)
+ cmp $7, %ebx
+ je L(StrlcpyExit7)
+L(StrlcpyExit8):
+ movb %bh, 7(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ lea 8(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh8Case3):
+ cmp $12, %ebx
+ ja L(ExitHigh12Case3)
+
+ cmp $9, %ebx
+ je L(StrlcpyExit9)
+ cmp $10, %ebx
+ je L(StrlcpyExit10)
+ cmp $11, %ebx
+ je L(StrlcpyExit11)
+L(StrlcpyExit12):
+ movb %bh, 11(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ lea 12(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(ExitHigh12Case3):
+ cmp $13, %ebx
+ je L(StrlcpyExit13)
+ cmp $14, %ebx
+ je L(StrlcpyExit14)
+ cmp $15, %ebx
+ je L(StrlcpyExit15)
+L(StrlcpyExit16):
+ movb %bh, 15(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ lea 16(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(StrlcpyExit1):
+ movb %bh, (%edx)
+
+ lea 1(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ mov %ecx, %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit2):
+ movb %bh, 1(%edx)
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ lea 2(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ lea 1(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit3):
+ movb %bh, 2(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ lea 3(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ lea 2(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit5):
+ movb %bh, 4(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ lea 5(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ lea 4(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit6):
+ movb %bh, 5(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ lea 6(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ lea 5(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit7):
+ movb %bh, 6(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ lea 7(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ lea 6(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit9):
+ movb %bh, 8(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ lea 9(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ lea 8(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit10):
+ movb %bh, 9(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ lea 10(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ lea 9(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit11):
+ movb %bh, 10(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ lea 11(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ lea 10(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit13):
+ movb %bh, 12(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ lea 13(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ lea 12(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit14):
+ movb %bh, 13(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ lea 14(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ lea 13(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ .p2align 4
+L(StrlcpyExit15):
+ movb %bh, 14(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ lea 15(%ecx), %edx
+ mov %edi, %ecx
+ POP (%edi)
+ jmp L(CalculateLengthOfSrc)
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(Exit15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ lea 14(%ecx), %eax
+ sub %edi, %eax
+ RETURN1
+
+ CFI_POP (%edi)
+
+ .p2align 4
+L(StrlcpyExit0):
+ movl $0, %eax
+ RETURN
+
+ .p2align 4
+L(StrncpyExit15Bytes):
+ cmp $12, %ebx
+ ja L(StrncpyExit15Bytes1)
+
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmp $9, %ebx
+ je L(StrlcpyExitTail9)
+
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmp $10, %ebx
+ je L(StrlcpyExitTail10)
+
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmp $11, %ebx
+ je L(StrlcpyExitTail11)
+
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+
+ movb %bh, 11(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ lea 12(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(StrncpyExit15Bytes1):
+ cmpb $0, 8(%ecx)
+ jz L(ExitTail9)
+ cmpb $0, 9(%ecx)
+ jz L(ExitTail10)
+ cmpb $0, 10(%ecx)
+ jz L(ExitTail11)
+ cmpb $0, 11(%ecx)
+ jz L(ExitTail12)
+
+ cmpb $0, 12(%ecx)
+ jz L(ExitTail13)
+ cmp $13, %ebx
+ je L(StrlcpyExitTail13)
+
+ cmpb $0, 13(%ecx)
+ jz L(ExitTail14)
+ cmp $14, %ebx
+ je L(StrlcpyExitTail14)
+
+ cmpb $0, 14(%ecx)
+ jz L(ExitTail15)
+
+ movb %bh, 14(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ lea 15(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(StrncpyExit8Bytes):
+ cmp $4, %ebx
+ ja L(StrncpyExit8Bytes1)
+
+ test %ebx, %ebx
+ jz L(StrlcpyExitTail0)
+
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmp $1, %ebx
+ je L(StrlcpyExitTail1)
+
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmp $2, %ebx
+ je L(StrlcpyExitTail2)
+
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmp $3, %ebx
+ je L(StrlcpyExitTail3)
+
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+
+ movb %bh, 3(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ lea 4(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(StrncpyExit8Bytes1):
+ cmpb $0, (%ecx)
+ jz L(ExitTail1)
+ cmpb $0, 1(%ecx)
+ jz L(ExitTail2)
+ cmpb $0, 2(%ecx)
+ jz L(ExitTail3)
+ cmpb $0, 3(%ecx)
+ jz L(ExitTail4)
+
+ cmpb $0, 4(%ecx)
+ jz L(ExitTail5)
+ cmp $5, %ebx
+ je L(StrlcpyExitTail5)
+
+ cmpb $0, 5(%ecx)
+ jz L(ExitTail6)
+ cmp $6, %ebx
+ je L(StrlcpyExitTail6)
+
+ cmpb $0, 6(%ecx)
+ jz L(ExitTail7)
+ cmp $7, %ebx
+ je L(StrlcpyExitTail7)
+
+ cmpb $0, 7(%ecx)
+ jz L(ExitTail8)
+
+ movb %bh, 7(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ lea 8(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(StrlcpyExitTail0):
+ mov %ecx, %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(StrlcpyExitTail1):
+ movb %bh, (%edx)
+
+ lea 1(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail1):
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ mov $0, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail2):
+ movb %bh, 1(%edx)
+ movb (%ecx), %al
+ movb %al, (%edx)
+
+ lea 2(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail2):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ mov $1, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail3):
+ movb %bh, 2(%edx)
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+
+ lea 3(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail3):
+ movw (%ecx), %ax
+ movw %ax, (%edx)
+ movb 2(%ecx), %al
+ movb %al, 2(%edx)
+
+ mov $2, %eax
+ RETURN
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ mov $3, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail5):
+ movb %bh, 4(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+
+ lea 5(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail5):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ mov $4, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail6):
+ movb %bh, 5(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movb 4(%ecx), %al
+ movb %al, 4(%edx)
+
+ lea 6(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail6):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ mov $5, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail7):
+ movb %bh, 6(%edx)
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movw 4(%ecx), %ax
+ movw %ax, 4(%edx)
+
+ lea 7(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail7):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl 3(%ecx), %eax
+ movl %eax, 3(%edx)
+
+ mov $6, %eax
+ RETURN
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ mov $7, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail9):
+ movb %bh, 8(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+
+ lea 9(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail9):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ mov $8, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail10):
+ movb %bh, 9(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movb 8(%ecx), %al
+ movb %al, 8(%edx)
+
+ lea 10(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail10):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ mov $9, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail11):
+ movb %bh, 10(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movw 8(%ecx), %ax
+ movw %ax, 8(%edx)
+
+ lea 11(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail11):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 7(%ecx), %eax
+ movl %eax, 7(%edx)
+
+ mov $10, %eax
+ RETURN
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ mov $11, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail13):
+ movb %bh, 12(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+
+ lea 13(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail13):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ mov $12, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail14):
+ movb %bh, 13(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 5(%ecx), %xmm0
+ movlpd %xmm0, 5(%edx)
+
+ lea 14(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail14):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 6(%ecx), %xmm0
+ movlpd %xmm0, 6(%edx)
+
+ mov $13, %eax
+ RETURN
+
+ .p2align 4
+L(ExitTail15):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ mov $14, %eax
+ RETURN
+
+ .p2align 4
+L(StrlcpyExitTail16):
+ movb %bh, 15(%edx)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
+
+ lea 16(%ecx), %edx
+ jmp L(CalculateLengthOfSrc)
+
+ .p2align 4
+L(ExitTail16):
+ movlpd (%ecx), %xmm0
+ movlpd 8(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+
+ mov $15, %eax
+ RETURN
+
+ .p2align 4
+L(CalculateLengthOfSrc):
+ cmpb $0, (%edx)
+ jz L(exit_tail0)
+ cmpb $0, 1(%edx)
+ jz L(exit_tail1)
+ cmpb $0, 2(%edx)
+ jz L(exit_tail2)
+ cmpb $0, 3(%edx)
+ jz L(exit_tail3)
+
+ cmpb $0, 4(%edx)
+ jz L(exit_tail4)
+ cmpb $0, 5(%edx)
+ jz L(exit_tail5)
+ cmpb $0, 6(%edx)
+ jz L(exit_tail6)
+ cmpb $0, 7(%edx)
+ jz L(exit_tail7)
+
+ cmpb $0, 8(%edx)
+ jz L(exit_tail8)
+ cmpb $0, 9(%edx)
+ jz L(exit_tail9)
+ cmpb $0, 10(%edx)
+ jz L(exit_tail10)
+ cmpb $0, 11(%edx)
+ jz L(exit_tail11)
+
+ cmpb $0, 12(%edx)
+ jz L(exit_tail12)
+ cmpb $0, 13(%edx)
+ jz L(exit_tail13)
+ cmpb $0, 14(%edx)
+ jz L(exit_tail14)
+ cmpb $0, 15(%edx)
+ jz L(exit_tail15)
+
+ pxor %xmm0, %xmm0
+ lea 16(%edx), %eax
+ add $16, %ecx
+ and $-16, %eax
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm0
+ pmovmskb %xmm0, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm1
+ pmovmskb %xmm1, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm2
+ pmovmskb %xmm2, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb (%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 16(%eax), %eax
+ test %edx, %edx
+ jnz L(exit)
+
+ and $-0x40, %eax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%eax), %xmm0
+ movaps 16(%eax), %xmm1
+ movaps 32(%eax), %xmm2
+ movaps 48(%eax), %xmm6
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqb %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ lea 64(%eax), %eax
+ test %edx, %edx
+ jz L(aligned_64_loop)
+
+ pcmpeqb -64(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea 48(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb -32(%eax), %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqb %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ lea -16(%ecx), %ecx
+
+ .p2align 4
+L(exit):
+ sub %ecx, %eax
+ test %dl, %dl
+ jz L(exit_more_8)
+
+ mov %dl, %cl
+ and $15, %cl
+ jz L(exit_more_4)
+ test $0x01, %dl
+ jnz L(exit_0)
+ test $0x02, %dl
+ jnz L(exit_1)
+ test $0x04, %dl
+ jnz L(exit_2)
+ add $3, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_4):
+ test $0x10, %dl
+ jnz L(exit_4)
+ test $0x20, %dl
+ jnz L(exit_5)
+ test $0x40, %dl
+ jnz L(exit_6)
+ add $7, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_8):
+ mov %dh, %ch
+ and $15, %ch
+ jz L(exit_more_12)
+ test $0x01, %dh
+ jnz L(exit_8)
+ test $0x02, %dh
+ jnz L(exit_9)
+ test $0x04, %dh
+ jnz L(exit_10)
+ add $11, %eax
+ RETURN
+
+ .p2align 4
+L(exit_more_12):
+ test $0x10, %dh
+ jnz L(exit_12)
+ test $0x20, %dh
+ jnz L(exit_13)
+ test $0x40, %dh
+ jnz L(exit_14)
+ add $15, %eax
+L(exit_0):
+ RETURN
+
+ .p2align 4
+L(exit_1):
+ add $1, %eax
+ RETURN
+
+L(exit_2):
+ add $2, %eax
+ RETURN
+
+L(exit_3):
+ add $3, %eax
+ RETURN
+
+L(exit_4):
+ add $4, %eax
+ RETURN
+
+L(exit_5):
+ add $5, %eax
+ RETURN
+
+L(exit_6):
+ add $6, %eax
+ RETURN
+
+L(exit_7):
+ add $7, %eax
+ RETURN
+
+L(exit_8):
+ add $8, %eax
+ RETURN
+
+L(exit_9):
+ add $9, %eax
+ RETURN
+
+L(exit_10):
+ add $10, %eax
+ RETURN
+
+L(exit_11):
+ add $11, %eax
+ RETURN
+
+L(exit_12):
+ add $12, %eax
+ RETURN
+
+L(exit_13):
+ add $13, %eax
+ RETURN
+
+L(exit_14):
+ add $14, %eax
+ RETURN
+
+L(exit_15):
+ add $15, %eax
+ RETURN
+
+L(exit_tail0):
+ mov %edx, %eax
+ sub %ecx, %eax
+ RETURN
+
+ .p2align 4
+L(exit_tail1):
+ lea 1(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail2):
+ lea 2(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail3):
+ lea 3(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail4):
+ lea 4(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail5):
+ lea 5(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail6):
+ lea 6(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail7):
+ lea 7(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail8):
+ lea 8(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail9):
+ lea 9(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail10):
+ lea 10(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail11):
+ lea 11(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail12):
+ lea 12(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail13):
+ lea 13(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail14):
+ lea 14(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+L(exit_tail15):
+ lea 15(%edx), %eax
+ sub %ecx, %eax
+ RETURN
+
+END (STRCPY)
+
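The StrlcpyExit*/StrlcpyExitTail* paths above implement the strlcpy contract: copy at most size-1 bytes, always NUL-terminate when size is nonzero, and return strlen(src) so the caller can detect truncation, which is why every truncating exit falls through to L(CalculateLengthOfSrc). The stored terminator comes from %bh, which is known to be zero at those points because %ebx holds a residual count of at most 16. A plain C reference for the same contract, as a sketch only:

    #include <string.h>

    /* Reference semantics for the asm above: truncate to size-1 bytes,
     * terminate, and report the full source length. */
    static size_t strlcpy_ref(char *dst, const char *src, size_t size)
    {
        size_t srclen = strlen(src);
        if (size != 0) {
            size_t n = srclen < size - 1 ? srclen : size - 1;
            memcpy(dst, src, n);
            dst[n] = '\0';
        }
        return srclen;
    }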
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncat-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-strncat-atom.S
index fa0c672..5618771 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncat-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
+#define STRCAT strncat
+#define USE_AS_STRNCAT
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#include "ssse3-strcat-atom.S"
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncmp-atom.S
similarity index 92%
rename from libc/arch-x86/string/memcmp_wrapper.S
rename to libc/arch-x86/string/ssse3-strncmp-atom.S
index fa0c672..4762d7e 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncmp-atom.S
@@ -28,13 +28,8 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
+#include "ssse3-strcmp-atom.S"
-#else
-
-# include "memcmp.S"
-
-#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-strncpy-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-strncpy-atom.S
index fa0c672..0948b6d 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-strncpy-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "ssse3-strcpy-atom.S"
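The three wrapper files above follow one build pattern: define the exported name plus behavior flags, then include a single parameterized implementation (ssse3-strcat-atom.S, ssse3-strcmp-atom.S, ssse3-strcpy-atom.S). The same shape in C, with hypothetical names and deliberately simplified copy semantics, just to show the mechanism:

    #include <stddef.h>

    /* One macro-parameterized body stands in for the shared .S file;
     * each expansion plays the role of one wrapper. Illustrative only:
     * real strncpy also zero-pads, which this sketch omits. */
    #define MAKE_COPY(name, counted)                              \
        char *name(char *dst, const char *src, size_t n) {        \
            size_t i = 0;                                         \
            for (; src[i] != '\0' && (!(counted) || i < n); i++)  \
                dst[i] = src[i];                                  \
            if (!(counted) || i < n)                              \
                dst[i] = '\0';                                    \
            return dst;                                           \
        }

    MAKE_COPY(copy_unbounded, 0)  /* like defining STRCPY without flags */
    MAKE_COPY(copy_bounded, 1)    /* like defining USE_AS_STRNCPY */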
diff --git a/libc/arch-x86/string/ssse3-wcscat-atom.S b/libc/arch-x86/string/ssse3-wcscat-atom.S
new file mode 100644
index 0000000..17b0843
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-wcscat-atom.S
@@ -0,0 +1,114 @@
+/*
+Copyright (c) 2011 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+#endif
+
+#ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+#endif
+
+#ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define PARMS 4
+#define STR1 PARMS+4
+#define STR2 STR1+4
+
+#define USE_AS_WCSCAT
+
+.text
+ENTRY (wcscat)
+ PUSH (%edi)
+ mov STR1(%esp), %edi
+ mov %edi, %edx
+
+#define RETURN jmp L(WcscpyAtom)
+#include "sse2-wcslen-atom.S"
+
+L(WcscpyAtom):
+ shl $2, %eax
+ mov STR2(%esp), %ecx
+ lea (%edi, %eax), %edx
+
+ cmp $0, (%ecx)
+ jz L(Exit4)
+ cmp $0, 4(%ecx)
+ jz L(Exit8)
+ cmp $0, 8(%ecx)
+ jz L(Exit12)
+ cmp $0, 12(%ecx)
+ jz L(Exit16)
+
+#undef RETURN
+#define RETURN POP(%edi); ret; CFI_PUSH(%edi)
+#include "ssse3-wcscpy-atom.S"
+
+END (wcscat)
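wcscat above reuses the wcscpy body that follows, and that body handles a source 4, 8, or 12 bytes past 16-byte alignment with one unrolled path each (L(Shl4), L(Shl8), L(Shl12)): two aligned loads are fused with palignr so every load and store stays aligned, and because the shift count must be an immediate there is a separate path per misalignment. A compact intrinsics model, assuming SSSE3 and an illustrative name, with SHIFT standing in for the 4/8/12 immediate:

    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 */

    #define SHIFT 4  /* the L(Shl4) case; 8 and 12 are analogous */

    /* Bytes [SHIFT..15] of *lo followed by bytes [0..SHIFT-1] of *hi:
     * one 16-byte result drawn from an unaligned source position using
     * only aligned loads. */
    static __m128i fuse_aligned(const __m128i *lo, const __m128i *hi)
    {
        return _mm_alignr_epi8(_mm_load_si128(hi), _mm_load_si128(lo), SHIFT);
    }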
diff --git a/libc/arch-x86/string/ssse3-wcscpy-atom.S b/libc/arch-x86/string/ssse3-wcscpy-atom.S
new file mode 100644
index 0000000..8ba84bc
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-wcscpy-atom.S
@@ -0,0 +1,652 @@
+/*
+Copyright (c) 2011, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef USE_AS_WCSCAT
+
+# ifndef L
+# define L(label) .L##label
+# endif
+
+# ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+# endif
+
+# ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+# endif
+
+# ifndef cfi_rel_offset
+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
+# endif
+
+# ifndef cfi_restore
+# define cfi_restore(reg) .cfi_restore reg
+# endif
+
+# ifndef cfi_adjust_cfa_offset
+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
+# endif
+
+# ifndef ENTRY
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+# endif
+
+# ifndef END
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+# endif
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# define PARMS 4
+# define RETURN POP (%edi); ret; CFI_PUSH (%edi)
+
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+.text
+ENTRY (wcscpy)
+ mov STR1(%esp), %edx
+ mov STR2(%esp), %ecx
+
+ cmp $0, (%ecx)
+ jz L(ExitTail4)
+ cmp $0, 4(%ecx)
+ jz L(ExitTail8)
+ cmp $0, 8(%ecx)
+ jz L(ExitTail12)
+ cmp $0, 12(%ecx)
+ jz L(ExitTail16)
+
+ PUSH (%edi)
+ mov %edx, %edi
+#endif
+ PUSH (%esi)
+ lea 16(%ecx), %esi
+
+ and $-16, %esi
+
+ pxor %xmm0, %xmm0
+ pcmpeqd (%esi), %xmm0
+ movdqu (%ecx), %xmm1
+ movdqu %xmm1, (%edx)
+
+ pmovmskb %xmm0, %eax
+ sub %ecx, %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov %edx, %eax
+ lea 16(%edx), %edx
+ and $-16, %edx
+ sub %edx, %eax
+
+ sub %eax, %ecx
+ mov %ecx, %eax
+ and $0xf, %eax
+ mov $0, %esi
+
+ jz L(Align16Both)
+ cmp $4, %eax
+ je L(Shl4)
+ cmp $8, %eax
+ je L(Shl8)
+ jmp L(Shl12)
+
+L(Align16Both):
+ movaps (%ecx), %xmm1
+ movaps 16(%ecx), %xmm2
+ movaps %xmm1, (%edx)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm4
+ movaps %xmm3, (%edx, %esi)
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm1
+ movaps %xmm4, (%edx, %esi)
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm2
+ movaps %xmm1, (%edx, %esi)
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps 16(%ecx, %esi), %xmm3
+ movaps %xmm2, (%edx, %esi)
+ pcmpeqd %xmm3, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm3, (%edx, %esi)
+ mov %ecx, %eax
+ lea 16(%ecx, %esi), %ecx
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ sub %eax, %edx
+
+ mov $-0x40, %esi
+
+L(Aligned64Loop):
+ movaps (%ecx), %xmm2
+ movaps 32(%ecx), %xmm3
+ movaps %xmm2, %xmm4
+ movaps 16(%ecx), %xmm5
+ movaps %xmm3, %xmm6
+ movaps 48(%ecx), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ lea 64(%edx), %edx
+ pcmpeqd %xmm0, %xmm3
+ lea 64(%ecx), %ecx
+ pmovmskb %xmm3, %eax
+
+ test %eax, %eax
+ jnz L(Aligned64Leave)
+ movaps %xmm4, -64(%edx)
+ movaps %xmm5, -48(%edx)
+ movaps %xmm6, -32(%edx)
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+L(Aligned64Leave):
+ pcmpeqd %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm5, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm4, -64(%edx)
+ lea 16(%esi), %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ pcmpeqd %xmm6, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm5, -48(%edx)
+ lea 16(%esi), %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ movaps %xmm6, -32(%edx)
+ pcmpeqd %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ lea 16(%esi), %esi
+ test %eax, %eax
+ jnz L(CopyFrom1To16Bytes)
+
+ mov $-0x40, %esi
+ movaps %xmm7, -16(%edx)
+ jmp L(Aligned64Loop)
+
+ .p2align 4
+L(Shl4):
+ movaps -4(%ecx), %xmm1
+ movaps 12(%ecx), %xmm2
+L(Shl4Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 28(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl4LoopExit)
+
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 28(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -12(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -4(%ecx), %xmm1
+
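+/* Steady state for the shift-4 case: fetch four aligned vectors ahead,
+   test them for a zero dword exactly as in the aligned loop, then
+   combine adjacent vectors with palignr and store 64 aligned bytes. */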
+L(Shl4LoopStart):
+ movaps 12(%ecx), %xmm2
+ movaps 28(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 44(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 60(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl4Start)
+
+ palignr $4, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl4LoopStart)
+
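+/* Terminator inside the next shifted window: the 12 bytes up to the
+   source's alignment boundary are known zero-free, so copy them
+   directly and let the shared exit logic finish the final dwords. */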
+L(Shl4LoopExit):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
+ POP (%esi)
+ add $12, %edx
+ add $12, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
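+/* The early return above popped %esi; CFI_PUSH re-emits only the unwind
+   annotations (no actual push) for the code below, which still runs
+   with %esi saved. */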
+ CFI_PUSH (%esi)
+
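+/* Same scheme as Shl4, with a relative shift of 8 bytes. */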
+ .p2align 4
+L(Shl8):
+ movaps -8(%ecx), %xmm1
+ movaps 8(%ecx), %xmm2
+L(Shl8Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 24(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl8LoopExit)
+
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 24(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -8(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+ movaps 8(%ecx), %xmm2
+ movaps 24(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 40(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 56(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl8Start)
+
+ palignr $8, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ POP (%esi)
+ add $8, %edx
+ add $8, %ecx
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ CFI_PUSH (%esi)
+
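+/* Same scheme as Shl4, with a relative shift of 12 bytes. */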
+ .p2align 4
+L(Shl12):
+ movaps -12(%ecx), %xmm1
+ movaps 4(%ecx), %xmm2
+L(Shl12Start):
+ pcmpeqd %xmm2, %xmm0
+ pmovmskb %xmm0, %eax
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm1
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+ movaps %xmm2, %xmm3
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps 20(%ecx), %xmm2
+
+ pcmpeqd %xmm2, %xmm0
+ lea 16(%edx), %edx
+ pmovmskb %xmm0, %eax
+ lea 16(%ecx), %ecx
+
+ test %eax, %eax
+ jnz L(Shl12LoopExit)
+
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, (%edx)
+ lea 20(%ecx), %ecx
+ lea 16(%edx), %edx
+
+ mov %ecx, %eax
+ and $-0x40, %ecx
+ sub %ecx, %eax
+ lea -4(%ecx), %ecx
+ sub %eax, %edx
+
+ movaps -12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+ movaps 4(%ecx), %xmm2
+ movaps 20(%ecx), %xmm3
+ movaps %xmm3, %xmm6
+ movaps 36(%ecx), %xmm4
+ movaps %xmm4, %xmm7
+ movaps 52(%ecx), %xmm5
+ pminub %xmm2, %xmm6
+ pminub %xmm5, %xmm7
+ pminub %xmm6, %xmm7
+ pcmpeqd %xmm0, %xmm7
+ pmovmskb %xmm7, %eax
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ palignr $12, %xmm3, %xmm4
+ test %eax, %eax
+ jnz L(Shl12Start)
+
+ palignr $12, %xmm2, %xmm3
+ lea 64(%ecx), %ecx
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm7, %xmm1
+ movaps %xmm5, 48(%edx)
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ jmp L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+ movl (%ecx), %esi
+ movl %esi, (%edx)
+ mov $4, %esi
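+/* Fall through: %esi = 4 accounts for the dword just copied. */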
+
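+/* Shared tail: adding %esi points %ecx/%edx at the 16-byte chunk that
+   holds the terminator, and %eax still carries the pmovmskb mask.  A
+   set bit 0, 4, 8 or 12 means the zero is in dword 0, 1, 2 or 3,
+   selecting a 4-, 8-, 12- or 16-byte final copy. */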
+ .p2align 4
+L(CopyFrom1To16Bytes):
+ add %esi, %edx
+ add %esi, %ecx
+
+ POP (%esi)
+ test %al, %al
+ jz L(ExitHigh)
+ test $0x01, %al
+ jnz L(Exit4)
+L(Exit8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(ExitHigh):
+ test $0x01, %ah
+ jnz L(Exit12)
+L(Exit16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edi, %eax
+ RETURN
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edi, %eax
+ RETURN
+
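+/* The ExitTail paths are reached before %edi is pushed: drop its CFI
+   record and return %edx, the untouched destination, directly. */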
+CFI_POP (%edi)
+
+ .p2align 4
+L(ExitTail4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail8):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail12):
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movl 8(%ecx), %eax
+ movl %eax, 8(%edx)
+ movl %edx, %eax
+ ret
+
+ .p2align 4
+L(ExitTail16):
+ movdqu (%ecx), %xmm0
+ movdqu %xmm0, (%edx)
+ movl %edx, %eax
+ ret
+
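+/* With USE_AS_WCSCAT defined the includer (presumably the wcscat
+   implementation) provides its own END directive. */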
+#ifndef USE_AS_WCSCAT
+END (wcscpy)
+#endif
diff --git a/libc/arch-x86/string/memcmp_wrapper.S b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
similarity index 90%
copy from libc/arch-x86/string/memcmp_wrapper.S
copy to libc/arch-x86/string/ssse3-wmemcmp-atom.S
index fa0c672..c146b04 100644
--- a/libc/arch-x86/string/memcmp_wrapper.S
+++ b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
@@ -1,5 +1,5 @@
/*
-Copyright (c) 2010, Intel Corporation
+Copyright (c) 2011, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,13 +28,6 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
-#if defined(USE_SSSE3)
-
-# define MEMCMP memcmp
-# include "ssse3-memcmp3-new.S"
-
-#else
-
-# include "memcmp.S"
-
-#endif
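+/* The new file builds wmemcmp out of the SSSE3 memcmp code:
+   USE_AS_WMEMCMP switches it to 4-byte comparisons with the length
+   taken in wide chars. */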
+#define MEMCMP wmemcmp
+#define USE_AS_WMEMCMP 1
+#include "ssse3-memcmp-atom.S"
diff --git a/libc/arch-x86/string/strcmp_wrapper.S b/libc/arch-x86/string/strcmp_wrapper.S
deleted file mode 100644
index 20f3064..0000000
--- a/libc/arch-x86/string/strcmp_wrapper.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# define ssse3_strcmp_latest strcmp
-# include "ssse3-strcmp-latest.S"
-
-#else
-
-# include "strcmp.S"
-
-#endif
diff --git a/libc/arch-x86/string/strlen_wrapper.S b/libc/arch-x86/string/strlen_wrapper.S
deleted file mode 100644
index e62786b..0000000
--- a/libc/arch-x86/string/strlen_wrapper.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSE2)
-
-# define sse2_strlen_atom strlen
-# include "sse2-strlen-atom.S"
-
-#else
-
-# include "strlen.S"
-
-#endif
diff --git a/libc/arch-x86/string/strncmp_wrapper.S b/libc/arch-x86/string/strncmp_wrapper.S
deleted file mode 100644
index 191d755..0000000
--- a/libc/arch-x86/string/strncmp_wrapper.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
-Copyright (c) 2010, Intel Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice,
- * this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
-
- * Neither the name of Intel Corporation nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
-ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-#if defined(USE_SSSE3)
-
-# define USE_AS_STRNCMP
-# define ssse3_strcmp_latest strncmp
-# include "ssse3-strcmp-latest.S"
-
-#else
-
-# include "strncmp.S"
-
-#endif
-