Add 64-bit slm optimized strlcpy and strlcat.
Change-Id: Ic948934d91c83bbfdfd00c05ee8b14952e012549
Signed-off-by: Varvara Rainchik <varvara.rainchik@intel.com>
diff --git a/libc/arch-x86_64/string/sse2-strlcpy-slm.S b/libc/arch-x86_64/string/sse2-strlcpy-slm.S
new file mode 100755
index 0000000..9d4b52f
--- /dev/null
+++ b/libc/arch-x86_64/string/sse2-strlcpy-slm.S
@@ -0,0 +1,1062 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+
+ * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef L /* defined only when no earlier definition of L exists */
+# define L(label) .L##label
+#endif /* L: L(x) pastes to .Lx, a .L-prefixed (non-exported) local label */
+
+#ifndef cfi_startproc /* map the bare name onto the assembler directive unless already defined */
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc /* same for the closing directive */
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY /* ENTRY(name): typed, global, 16-byte aligned label that opens a CFI region */
+# define ENTRY(name) \
+ .type name, @function; \
+ .globl name; \
+ .p2align 4; \
+name: \
+ cfi_startproc
+#endif /* ENTRY */
+
+#ifndef END /* END(name): close the CFI region and set the symbol size to the bytes emitted since name */
+# define END(name) \
+ cfi_endproc; \
+ .size name, .-name
+#endif /* END */
+
+
+#ifndef STRLCPY /* symbol name of the routine; strlcpy unless STRLCPY is already defined */
+# define STRLCPY strlcpy
+#endif /* STRLCPY */
+
+#define JMPTBL(I, B) I - B /* table entry: offset of label I from table base B */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; /* r11 = table base, RIP-relative */ \
+ movslq (%r11, INDEX, SCALE), %rcx; /* rcx = sign-extended 32-bit entry */ \
+ lea (%r11, %rcx), %rcx; /* rcx = base + offset; overwrites rcx, flags untouched */ \
+ jmp *%rcx
+
+#define RETURN /* common epilogue: result = rax + r9 */ \
+ add %r9, %rax; \
+ ret
+
+.text
+ENTRY (STRLCPY) /* size_t strlcpy(char *dst, const char *src, size_t size): rdi = dst, rsi = src, rdx = size; the USE_AS_STRLCAT build appends to dst instead */
+ xor %eax, %eax /* rax = running length; a 32-bit xor zeroes the whole register */
+ xor %r9d, %r9d /* r9 = length carried separately; RETURN adds it to rax */
+ mov %rdx, %r8 /* r8 = size, the bytes still available in dst */
+ test %r8, %r8
+ jz L(CalculateSrcLen) /* size == 0: store nothing, only measure src */
+
+#ifdef USE_AS_STRLCAT /* strlcat only: find the NUL of dst within size bytes before copying src */
+ xor %rcx, %rcx
+ pxor %xmm0, %xmm0
+/* NOTE(review): the two movdqu loads below read 16 bytes of dst each before size is compared - confirm callers can tolerate reads past size */
+ movdqu (%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx /* rdx = bitmask of NUL bytes in dst[0..15] */
+
+ cmp $17, %r8
+ jb L(SizeEndCase1) /* size <= 16 */
+ test %rdx, %rdx
+ jnz L(StringEndCase1)
+
+ add $16, %rax
+ movdqu 16(%rdi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx /* rdx = bitmask of NUL bytes in dst[16..31] */
+
+ cmp $33, %r8
+ jb L(SizeEndCase1) /* size <= 32 */
+ test %rdx, %rdx
+ jnz L(StringEndCase1)
+
+ mov %rdi, %rcx
+ and $15, %rcx /* rcx = dst misalignment */
+ and $-16, %rdi /* continue with aligned loads */
+
+ add %rcx, %r8
+ sub $16, %r8 /* r8 = size counted from aligned dst + 16 */
+
+L(DstLenLoop):
+ movdqa (%rdi, %rax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ sub $16, %r8
+ jbe L(SizeEndCase2) /* size ends inside this block */
+ test %rdx, %rdx
+ jnz L(StringEndCase2)
+ add $16, %rax
+ jmp L(DstLenLoop)
+
+L(StringEndCase2): /* NUL found in the loop with size reaching past this block */
+ add $16, %r8
+ bsf %rdx, %rdx
+ sub %rdx, %r8 /* r8 = size left counted from the NUL of dst */
+ add %rdx, %rax
+ sub %rcx, %r9 /* CopySrcString adds rax, giving r9 = rax - rcx */
+ add %rax, %rdi /* rdi = address of the NUL in dst */
+ jmp L(CopySrcString) /* NOTE(review): skips the (src & 63) <= 32 test made on the fall-through path - confirm the unaligned src loads there are safe */
+
+L(SizeEndCase1): /* size ends within the 16 bytes just examined (unaligned part) */
+ test %rdx, %rdx
+ jz L(SizeEnd)
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ cmp %r8, %rax
+ jb L(StringEnd) /* NUL lies before size */
+L(SizeEnd): /* no NUL within size: result is size plus the length of src */
+ mov %r8, %r9
+ jmp L(CalculateSrcLenCase1)
+
+L(SizeEndCase2): /* size ends within the aligned block just examined */
+ add $16, %r8 /* r8 = bytes of this block still inside size */
+ test %rdx, %rdx
+ jz L(StringEndCase4)
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(StringEndCase3) /* NUL lies before size */
+L(StringEndCase4): /* no NUL within size */
+ add %r8, %rax
+ sub %rcx, %rax
+ mov %rax, %r9
+ jmp L(CalculateSrcLenCase1)
+
+L(StringEndCase3):
+ add %rdx, %rax
+ sub %rcx, %r9
+ add %rax, %rdi
+ sub %rdx, %r8
+ jmp L(CopySrcString) /* NOTE(review): same bypass of the src alignment test as in StringEndCase2 */
+
+L(StringEndCase1): /* NUL found in one of the two unaligned 16-byte probes */
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ sub %rcx, %rax /* rcx is still 0 on this path */
+L(StringEnd):
+ add %rax, %rdi /* rdi = address of the NUL in dst */
+ sub %rax, %r8 /* r8 = size left counted from that NUL */
+#endif
+
+ mov %rsi, %rcx
+ and $63, %rcx /* rcx = offset of src within its 64-byte line */
+ cmp $32, %rcx
+ jbe L(CopySrcString) /* offset <= 32: 32 bytes starting at src stay inside that line */
+
+ and $-16, %rsi /* otherwise probe src through aligned 16-byte blocks */
+ and $15, %rcx /* rcx = src misalignment within the block */
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%rsi), %xmm1
+ pmovmskb %xmm1, %rdx
+ shr %cl, %rdx /* discard mask bits of bytes that precede src */
+ mov $16, %r10
+ sub %rcx, %r10 /* r10 = bytes from src to the end of the first block */
+ cmp %r10, %r8
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3) /* size ends inside the first block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail) /* NUL inside the first block */
+
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm0, %rdx
+ add $16, %r10 /* r10 = bytes from src to the end of the second block */
+ cmp %r10, %r8
+ jbe L(CopyFrom1To32BytesCase2OrCase3) /* size ends inside the second block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes) /* NUL inside the second block */
+
+ movdqu (%rsi, %rcx), %xmm1 /* first 16 bytes of src, now known to hold no NUL */
+ movdqu %xmm1, (%rdi)
+#ifdef USE_AS_STRLCAT
+ add %rax, %r9 /* move the dst length into r9; LoopStart reloads rax */
+#endif
+ jmp L(LoopStart)
+
+ .p2align 4
+L(CopySrcString): /* probes src with unaligned 16-byte loads at src and src + 16 */
+#ifdef USE_AS_STRLCAT
+ add %rax, %r9 /* move the dst length into r9 */
+ xor %rax, %rax /* rax now counts src bytes */
+#endif
+ pxor %xmm0, %xmm0
+ movdqu (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx /* rdx = bitmask of NUL bytes in src[0..15] */
+
+ cmp $17, %r8
+ jb L(CopyFrom1To16BytesTail1Case2OrCase3) /* size <= 16 */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1)
+
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm2, %xmm0
+ movdqu %xmm1, (%rdi) /* the first 16 bytes hold no NUL: store them */
+ pmovmskb %xmm0, %rdx /* rdx = bitmask of NUL bytes in src[16..31] */
+ add $16, %rax
+
+ cmp $33, %r8
+ jb L(CopyFrom1To32Bytes1Case2OrCase3) /* size <= 32 */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32Bytes1)
+
+ mov %rsi, %rcx
+ and $15, %rcx /* rcx = src misalignment */
+ and $-16, %rsi /* switch to aligned loads for the main loop */
+
+L(LoopStart): /* in: rsi = aligned src, rcx = src misalignment, first 16 bytes of src already stored */
+ sub %rcx, %rdi /* bias dst so that (rdi, rax) pairs with (rsi, rax) */
+ add %rcx, %r8
+ sub $16, %r8 /* r8 = size counted from aligned src + 16 */
+ mov $16, %rax
+
+L(16Loop):
+ movdqa (%rsi, %rax), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 is all zero on entry to every iteration */
+ pmovmskb %xmm0, %rdx
+ sub $16, %r8
+ jbe L(CopyFrom1To16BytesCase2OrCase3) /* size ends inside this block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesXmmExit) /* NUL inside this block */
+ movdqu %xmm1, (%rdi, %rax)
+ add $16, %rax
+ jmp L(16Loop)
+
+/*------End of main part with loops---------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16Bytes): /* no jump to this label is visible in this file */
+ add %rcx, %rdi
+ add %rcx, %rsi
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTail): /* NUL in the first aligned block of the unaligned-head path */
+ add %rcx, %rsi /* restore the original src pointer */
+ bsf %rdx, %rdx /* rdx = offset of the NUL from src */
+ add %rdx, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1): /* NUL in src[16..31]; src[0..15] is already stored */
+ add $16, %rsi
+ add $16, %rdi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1): /* NUL within the 16 bytes at rsi */
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes): /* NUL in the second aligned block of the unaligned-head path */
+ bsf %rdx, %rdx
+ add %rcx, %rsi /* restore the original src pointer */
+ add $16, %rdx
+ sub %rcx, %rdx /* rdx = offset of the NUL from src */
+ add %rdx, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit): /* in: rdx = offset of the NUL from rsi, already known to be below r8 */
+ add %rdx, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* main loop: the block holds a NUL and size also ends inside it */
+ add $16, %r8 /* r8 = bytes of this block still inside size */
+ add %rax, %rdi
+ add %rax, %rsi
+ bsf %rdx, %rdx
+ sub %rcx, %rax /* rax = src bytes before this block */
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit) /* the NUL fits: copy through it */
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4) /* truncate: Exit stub for r8 bytes */
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2): /* unaligned head, second block: NUL present and size ends here */
+ add %rcx, %rsi /* restore the original src pointer */
+ bsf %rdx, %rdx
+ add $16, %rdx
+ sub %rcx, %rdx /* rdx = offset of the NUL from src */
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+L(CopyFrom1To16BytesTailCase2): /* unaligned head, first block: NUL present and size ends here */
+ add %rcx, %rsi /* restore the original src pointer */
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTail1Case2): /* CopySrcString probes: NUL present and size ends here */
+ bsf %rdx, %rdx
+ cmp %r8, %rdx
+ jb L(CopyFrom1To16BytesExit)
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3): /* main loop: size ends inside the current block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesCase2) /* a NUL is present as well */
+ add $16, %r8 /* r8 = bytes of this block still inside size */
+ add %rax, %rdi
+ add %rax, %rsi
+ add %r8, %rax
+ sub %rcx, %rax /* rax = src bytes accounted for, measured from src */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3): /* unaligned head: size ends inside the second block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To32BytesCase2)
+ add %rcx, %rsi /* restore the original src pointer */
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3): /* unaligned head: size ends inside the first block */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ add %rcx, %rsi /* restore the original src pointer */
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3): /* CopySrcString: size ends in src[16..31]; src[0..15] already stored */
+ add $16, %rdi
+ add $16, %rsi
+ sub $16, %r8
+L(CopyFrom1To16BytesTail1Case2OrCase3): /* size ends within the 16 bytes at rsi */
+ test %rdx, %rdx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ add %r8, %rax
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %r8, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesXmmExit): /* main loop: NUL in the current block, size reaches past the block */
+ bsf %rdx, %rdx
+ add %rax, %rdi
+ add %rax, %rsi
+ add %rdx, %rax
+ sub %rcx, %rax /* rax = offset of the NUL measured from src */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStringTailTable), %rdx, 4)
+
+/*------------End of the labels that handle copying 1-16 bytes and 1-32 bytes----*/
+
+
+ .p2align 4
+L(Exit0): /* ExitTable stubs, indexed by r8 = bytes left in dst; index 0 stores nothing and returns rax + r9 */
+ RETURN
+
+ .p2align 4
+L(Exit1): /* Exit N: copy N - 1 bytes, store NUL at dst[N - 1], then measure the rest of src */
+ movb $0, (%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit2):
+ movb (%rsi), %dl /* low-byte scratch register, matching the StringTail stubs */
+ movb %dl, (%rdi)
+ movb $0, 1(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit3):
+ movw (%rsi), %dx
+ movw %dx, (%rdi)
+ movb $0, 2(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit4):
+ movw (%rsi), %cx
+ movb 2(%rsi), %dl
+ movw %cx, (%rdi)
+ movb %dl, 2(%rdi)
+ movb $0, 3(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit5):
+ movl (%rsi), %edx
+ movl %edx, (%rdi)
+ movb $0, 4(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit6):
+ movl (%rsi), %ecx
+ movb 4(%rsi), %dl
+ movl %ecx, (%rdi)
+ movb %dl, 4(%rdi)
+ movb $0, 5(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit7):
+ movl (%rsi), %ecx
+ movw 4(%rsi), %dx
+ movl %ecx, (%rdi)
+ movw %dx, 4(%rdi)
+ movb $0, 6(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit8): /* 7 bytes through two overlapping 4-byte moves */
+ movl (%rsi), %ecx
+ movl 3(%rsi), %edx
+ movl %ecx, (%rdi)
+ movl %edx, 3(%rdi)
+ movb $0, 7(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit9): /* Exit N (ExitTable index N = bytes left in dst): copy N - 1 bytes, store NUL at dst[N - 1], then measure the rest of src */
+ movq (%rsi), %rdx
+ movq %rdx, (%rdi)
+ movb $0, 8(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit10):
+ movq (%rsi), %rcx
+ movb 8(%rsi), %dl /* low-byte scratch register, matching the StringTail stubs */
+ movq %rcx, (%rdi)
+ movb %dl, 8(%rdi)
+ movb $0, 9(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit11):
+ movq (%rsi), %rcx
+ movw 8(%rsi), %dx
+ movq %rcx, (%rdi)
+ movw %dx, 8(%rdi)
+ movb $0, 10(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit12): /* 11 bytes: 8-byte move plus an overlapping 4-byte move at offset 7 */
+ movq (%rsi), %rcx
+ movl 7(%rsi), %edx
+ movq %rcx, (%rdi)
+ movl %edx, 7(%rdi)
+ movb $0, 11(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit13):
+ movq (%rsi), %rcx
+ movl 8(%rsi), %edx
+ movq %rcx, (%rdi)
+ movl %edx, 8(%rdi)
+ movb $0, 12(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit14): /* 13 to 15 bytes: two overlapping 8-byte moves */
+ movq (%rsi), %rcx
+ movq 5(%rsi), %rdx
+ movq %rcx, (%rdi)
+ movq %rdx, 5(%rdi)
+ movb $0, 13(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit15):
+ movq (%rsi), %rcx
+ movq 6(%rsi), %rdx
+ movq %rcx, (%rdi)
+ movq %rdx, 6(%rdi)
+ movb $0, 14(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit16):
+ movq (%rsi), %rcx
+ movq 7(%rsi), %rdx
+ movq %rcx, (%rdi)
+ movq %rdx, 7(%rdi)
+ movb $0, 15(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit17): /* Exit N (ExitTable index N = bytes left in dst): copy N - 1 bytes, store NUL at dst[N - 1], then measure the rest of src */
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ movb $0, 16(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit18):
+ movdqu (%rsi), %xmm0
+ movb 16(%rsi), %dl /* low-byte scratch register, matching the StringTail stubs */
+ movdqu %xmm0, (%rdi)
+ movb %dl, 16(%rdi)
+ movb $0, 17(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit19):
+ movdqu (%rsi), %xmm0
+ movw 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ movw %cx, 16(%rdi)
+ movb $0, 18(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit20): /* 19 bytes: 16-byte move plus an overlapping 4-byte move at offset 15 */
+ movdqu (%rsi), %xmm0
+ movl 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ movl %ecx, 15(%rdi)
+ movb $0, 19(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit21):
+ movdqu (%rsi), %xmm0
+ movl 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ movl %ecx, 16(%rdi)
+ movb $0, 20(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit22):
+ movdqu (%rsi), %xmm0
+ movl 16(%rsi), %ecx
+ movb 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ movl %ecx, 16(%rdi)
+ movb %dl, 20(%rdi)
+ movb $0, 21(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit23): /* 22 and 23 bytes: 16-byte move plus an overlapping 8-byte move */
+ movdqu (%rsi), %xmm0
+ movq 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ movq %rcx, 14(%rdi)
+ movb $0, 22(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit24):
+ movdqu (%rsi), %xmm0
+ movq 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ movq %rcx, 15(%rdi)
+ movb $0, 23(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit25): /* Exit N (ExitTable index N = bytes left in dst): copy N - 1 bytes, store NUL at dst[N - 1], then measure the rest of src */
+ movdqu (%rsi), %xmm0
+ movq 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ movq %rcx, 16(%rdi)
+ movb $0, 24(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit26):
+ movdqu (%rsi), %xmm0
+ movq 16(%rsi), %rcx
+ movb 24(%rsi), %dl /* low-byte scratch register, matching the StringTail stubs */
+ movdqu %xmm0, (%rdi)
+ movq %rcx, 16(%rdi)
+ movb %dl, 24(%rdi) /* explicit byte suffix like the other Exit stubs */
+ movb $0, 25(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit27):
+ movdqu (%rsi), %xmm0
+ movq 16(%rsi), %rdx
+ movw 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ movq %rdx, 16(%rdi)
+ movw %cx, 24(%rdi)
+ movb $0, 26(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit28): /* 27 bytes: the 4-byte move at offset 23 overlaps the 8-byte move */
+ movdqu (%rsi), %xmm0
+ movq 16(%rsi), %rdx
+ movl 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ movq %rdx, 16(%rdi)
+ movl %ecx, 23(%rdi)
+ movb $0, 27(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit29):
+ movdqu (%rsi), %xmm0
+ movq 16(%rsi), %rdx
+ movl 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ movq %rdx, 16(%rdi)
+ movl %ecx, 24(%rdi)
+ movb $0, 28(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit30): /* 29 to 31 bytes: two overlapping 16-byte moves */
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+ movb $0, 29(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit31):
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+ movb $0, 30(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(Exit32):
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+ movb $0, 31(%rdi)
+ jmp L(CalculateSrcLen)
+
+ .p2align 4
+L(StringTail0): /* StringTail K (table index K = offset of the NUL from rsi): copy K + 1 bytes, NUL included, and return */
+ mov (%rsi), %dl
+ mov %dl, (%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail1): /* 2 bytes */
+ mov (%rsi), %dx
+ mov %dx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail2): /* 3 bytes */
+ mov (%rsi), %cx
+ mov 2(%rsi), %dl
+ mov %cx, (%rdi)
+ mov %dl, 2(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail3): /* 4 bytes */
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail4): /* 5 bytes */
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dl
+ mov %ecx, (%rdi)
+ mov %dl, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail5): /* 6 bytes */
+ mov (%rsi), %ecx
+ mov 4(%rsi), %dx
+ mov %ecx, (%rdi)
+ mov %dx, 4(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail6): /* 7 bytes through two overlapping 4-byte moves */
+ mov (%rsi), %ecx
+ mov 3(%rsi), %edx
+ mov %ecx, (%rdi)
+ mov %edx, 3(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail7): /* 8 bytes */
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail8): /* StringTail K (table index K = offset of the NUL from rsi): copy K + 1 bytes, NUL included, and return */
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dl
+ mov %rcx, (%rdi)
+ mov %dl, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail9): /* 10 bytes */
+ mov (%rsi), %rcx
+ mov 8(%rsi), %dx
+ mov %rcx, (%rdi)
+ mov %dx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail10): /* 11 bytes: the 4-byte move at offset 7 overlaps the 8-byte move */
+ mov (%rsi), %rcx
+ mov 7(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 7(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail11): /* 12 bytes */
+ mov (%rsi), %rcx
+ mov 8(%rsi), %edx
+ mov %rcx, (%rdi)
+ mov %edx, 8(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail12): /* 13 bytes: two overlapping 8-byte moves */
+ mov (%rsi), %rcx
+ mov 5(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 5(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail13): /* 14 bytes */
+ mov (%rsi), %rcx
+ mov 6(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 6(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail14): /* 15 bytes */
+ mov (%rsi), %rcx
+ mov 7(%rsi), %rdx
+ mov %rcx, (%rdi)
+ mov %rdx, 7(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail15): /* 16 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail16): /* StringTail K (table index K = offset of the NUL from rsi): copy K + 1 bytes, NUL included, and return */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %cl, 16(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail17): /* 18 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %cx, 16(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail18): /* 19 bytes: the 4-byte move at offset 15 overlaps the 16-byte move */
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 15(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail19): /* 20 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail20): /* 21 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %ecx
+ mov 20(%rsi), %dl
+ movdqu %xmm0, (%rdi)
+ mov %ecx, 16(%rdi)
+ mov %dl, 20(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail21): /* 22 bytes: the 8-byte move at offset 14 overlaps the 16-byte move */
+ movdqu (%rsi), %xmm0
+ mov 14(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 14(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail22): /* 23 bytes */
+ movdqu (%rsi), %xmm0
+ mov 15(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 15(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail23): /* 24 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rcx
+ movdqu %xmm0, (%rdi)
+ mov %rcx, 16(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail24): /* StringTail K (table index K = offset of the NUL from rsi): copy K + 1 bytes, NUL included, and return */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cl, 24(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail25): /* 26 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %cx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %cx, 24(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail26): /* 27 bytes: the 4-byte move at offset 23 overlaps the 8-byte move */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 23(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 23(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail27): /* 28 bytes */
+ movdqu (%rsi), %xmm0
+ mov 16(%rsi), %rdx
+ mov 24(%rsi), %ecx
+ movdqu %xmm0, (%rdi)
+ mov %rdx, 16(%rdi)
+ mov %ecx, 24(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail28): /* 29 bytes: two overlapping 16-byte moves */
+ movdqu (%rsi), %xmm0
+ movdqu 13(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 13(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail29): /* 30 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu 14(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 14(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail30): /* 31 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu 15(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 15(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail31): /* 32 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail32): /* StringTail K (table index K = offset of the NUL from rsi): copy K + 1 bytes, NUL included; here 33 bytes */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cl
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cl, 32(%rdi)
+ RETURN
+
+ .p2align 4
+L(StringTail33): /* NUL at offset 33: 34 bytes must be copied */
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm2
+ mov 32(%rsi), %cx /* bytes 32 and 33; a single byte here would drop the terminator */
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ mov %cx, 32(%rdi)
+ RETURN
+
+ .p2align 4
+L(CalculateSrcLenCase1): /* strlcat entry: nothing can be appended; r9 already holds the amount to add to the src length */
+ xor %r8, %r8
+ xor %rax, %rax
+L(CalculateSrcLen): /* in: rsi + r8 = first src byte not yet scanned, rax + r9 = length so far; returns rax = total length */
+ pxor %xmm0, %xmm0
+ add %r8, %rsi
+ mov %rsi, %rcx
+ and $15, %rcx /* rcx = misalignment of the scan start */
+ and $-16, %rsi /* probe aligned 16-byte blocks only, so no load can extend into the following page */
+ pcmpeqb (%rsi), %xmm0
+ pmovmskb %xmm0, %rdx
+ shr %cl, %rdx /* drop NUL hits located before the scan start ... */
+ shl %cl, %rdx /* ... while keeping bit positions relative to the aligned block */
+ test %rdx, %rdx /* explicit test: a shift by cl == 0 leaves the flags unchanged */
+ jnz L(SrcLenLoopEnd)
+ pxor %xmm0, %xmm0 /* forget hits from before the start; the loop relies on xmm0 == 0 */
+ add %rax, %r9
+ mov $16, %rax
+L(SrcLenLoop):
+ movdqa (%rsi, %rax), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %rdx
+ test %rdx, %rdx
+ jnz L(SrcLenLoopEnd)
+ add $16, %rax
+ jmp L(SrcLenLoop)
+
+ .p2align 4
+L(SrcLenLoopEnd): /* rdx = NUL mask of the block at rsi + (rax, or 0 for the first probe) */
+ bsf %rdx, %rdx
+ add %rdx, %rax
+ sub %rcx, %rax /* convert from the aligned block base back to the scan start */
+ RETURN
+
+END (STRLCPY)
+
+ .section .rodata
+ .p2align 4 /* placed after the section switch so that it aligns the table rather than padding .text */
+L(ExitTable): /* entry N = offset of Exit N from the table base; indexed by r8 (bytes left in dst) */
+ .int JMPTBL(L(Exit0), L(ExitTable))
+ .int JMPTBL(L(Exit1), L(ExitTable))
+ .int JMPTBL(L(Exit2), L(ExitTable))
+ .int JMPTBL(L(Exit3), L(ExitTable))
+ .int JMPTBL(L(Exit4), L(ExitTable))
+ .int JMPTBL(L(Exit5), L(ExitTable))
+ .int JMPTBL(L(Exit6), L(ExitTable))
+ .int JMPTBL(L(Exit7), L(ExitTable))
+ .int JMPTBL(L(Exit8), L(ExitTable))
+ .int JMPTBL(L(Exit9), L(ExitTable))
+ .int JMPTBL(L(Exit10), L(ExitTable))
+ .int JMPTBL(L(Exit11), L(ExitTable))
+ .int JMPTBL(L(Exit12), L(ExitTable))
+ .int JMPTBL(L(Exit13), L(ExitTable))
+ .int JMPTBL(L(Exit14), L(ExitTable))
+ .int JMPTBL(L(Exit15), L(ExitTable))
+ .int JMPTBL(L(Exit16), L(ExitTable))
+ .int JMPTBL(L(Exit17), L(ExitTable))
+ .int JMPTBL(L(Exit18), L(ExitTable))
+ .int JMPTBL(L(Exit19), L(ExitTable))
+ .int JMPTBL(L(Exit20), L(ExitTable))
+ .int JMPTBL(L(Exit21), L(ExitTable))
+ .int JMPTBL(L(Exit22), L(ExitTable))
+ .int JMPTBL(L(Exit23), L(ExitTable))
+ .int JMPTBL(L(Exit24), L(ExitTable))
+ .int JMPTBL(L(Exit25), L(ExitTable))
+ .int JMPTBL(L(Exit26), L(ExitTable))
+ .int JMPTBL(L(Exit27), L(ExitTable))
+ .int JMPTBL(L(Exit28), L(ExitTable))
+ .int JMPTBL(L(Exit29), L(ExitTable))
+ .int JMPTBL(L(Exit30), L(ExitTable))
+ .int JMPTBL(L(Exit31), L(ExitTable))
+ .int JMPTBL(L(Exit32), L(ExitTable))
+L(ExitStringTailTable): /* entry K = offset of StringTail K from the table base; indexed by rdx (offset of the NUL from rsi) */
+ .int JMPTBL(L(StringTail0), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail1), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail2), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail3), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail4), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail5), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail6), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail7), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail8), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail9), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail10), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail11), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail12), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail13), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail14), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail15), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail16), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail17), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail18), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail19), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail20), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail21), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail22), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail23), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail24), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail25), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail26), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail27), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail28), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail29), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail30), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail31), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail32), L(ExitStringTailTable))
+ .int JMPTBL(L(StringTail33), L(ExitStringTailTable))