Add ssse3 implementation of __memcmp16.

__memcmp16 was missing in x86. Also added C-version for backward
compatibility. Added bionic test for __memcmp16 and for wmemcmp.

Change-Id: I33718441e7ee343cdb021d91dbeaf9ce2d4d7eb4
Signed-off-by: Alexander Ivchenko <alexander.ivchenko@intel.com>
diff --git a/libc/arch-x86/string/ssse3-memcmp-atom.S b/libc/arch-x86/string/ssse3-memcmp-atom.S
index 30e3173..0387084 100644
--- a/libc/arch-x86/string/ssse3-memcmp-atom.S
+++ b/libc/arch-x86/string/ssse3-memcmp-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2010, 2011 Intel Corporation
+Copyright (c) 2010, 2011, 2012, 2013 Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -106,9 +106,12 @@
 ENTRY (MEMCMP)
 	movl	LEN(%esp), %ecx
 
-#ifdef USE_AS_WMEMCMP
+#ifdef USE_WCHAR
 	shl	$2, %ecx
 	jz	L(zero)
+#elif defined USE_UTF16
+	shl	$1, %ecx
+	jz	L(zero)
 #endif
 
 	movl	BLK1(%esp), %eax
@@ -116,7 +119,7 @@
 	movl	BLK2(%esp), %edx
 	jae	L(48bytesormore)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$1, %ecx
 	jbe	L(less1bytes)
 #endif
@@ -128,7 +131,7 @@
 
 	CFI_POP	(%ebx)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	.p2align 4
 L(less1bytes):
 	jb	L(zero)
@@ -174,7 +177,7 @@
 	jz	L(shr_0)
 	xor	%edx, %esi
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$8, %edx
 	jae	L(next_unaligned_table)
 	cmp	$0, %edx
@@ -210,7 +213,7 @@
 	cmp	$14, %edx
 	je	L(shr_14)
 	jmp	L(shr_15)
-#else
+#elif defined(USE_WCHAR)
 	cmp	$0, %edx
 	je	L(shr_0)
 	cmp	$4, %edx
@@ -218,6 +221,22 @@
 	cmp	$8, %edx
 	je	L(shr_8)
 	jmp	L(shr_12)
+#elif defined(USE_UTF16)
+	cmp	$0, %edx
+	je	L(shr_0)
+	cmp	$2, %edx
+	je	L(shr_2)
+	cmp	$4, %edx
+	je	L(shr_4)
+	cmp	$6, %edx
+	je	L(shr_6)
+	cmp	$8, %edx
+	je	L(shr_8)
+	cmp	$10, %edx
+	je	L(shr_10)
+	cmp	$12, %edx
+	je	L(shr_12)
+	jmp	L(shr_14)
 #endif
 
 	.p2align 4
@@ -289,7 +308,7 @@
 	POP	(%esi)
 	jmp	L(less48bytes)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -372,8 +391,10 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
 
+#if !defined(USE_WCHAR)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -456,7 +477,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -624,7 +647,7 @@
 	POP	(%esi)
 	jmp	L(less48bytes)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -707,7 +730,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -790,7 +815,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -958,7 +985,7 @@
 	POP	(%esi)
 	jmp	L(less48bytes)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1041,7 +1068,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1124,7 +1153,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1292,7 +1323,7 @@
 	POP	(%esi)
 	jmp	L(less48bytes)
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1375,7 +1406,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1458,7 +1491,9 @@
 	POP	(%edi)
 	POP	(%esi)
 	jmp	L(less48bytes)
+#endif
 
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cfi_restore_state
 	cfi_remember_state
 	.p2align 4
@@ -1558,7 +1593,7 @@
 	add	%eax, %esi
 L(less16bytes):
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	test	%dl, %dl
 	jz	L(next_24_bytes)
 
@@ -1668,7 +1703,7 @@
 	movzbl	-9(%esi), %edx
 	sub	%edx, %eax
 	RETURN_END
-#else
+#elif defined(USE_AS_WMEMCMP)
 
 /* special for wmemcmp */
 	test	%dl, %dl
@@ -1682,7 +1717,6 @@
 	neg	%eax
 	RETURN
 
-
 	.p2align 4
 L(second_double_word):
 	mov	-12(%edi), %ecx
@@ -1691,7 +1725,7 @@
 	jg	L(nequal_bigger)
 	neg	%eax
 	RETURN
-	
+
 	.p2align 4
 L(next_two_double_words):
 	and	$15, %dh
@@ -1715,6 +1749,79 @@
 	.p2align 4
 L(nequal_bigger):
 	RETURN_END
+
+#elif defined(USE_AS_MEMCMP16)
+
+/* special for __memcmp16 */
+	test	%dl, %dl
+	jz	L(next_four_words)
+	test	$15, %dl
+	jz	L(second_two_words)
+	test	$3, %dl
+	jz	L(second_word)
+	movzwl	-16(%edi), %eax
+	movzwl	-16(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(second_word):
+	movzwl	-14(%edi), %eax
+	movzwl	-14(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(second_two_words):
+	test	$63, %dl
+	jz	L(fourth_word)
+	movzwl	-12(%edi), %eax
+	movzwl	-12(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(fourth_word):
+	movzwl	-10(%edi), %eax
+	movzwl	-10(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(next_four_words):
+	test	$15, %dh
+	jz	L(fourth_two_words)
+	test	$3, %dh
+	jz	L(sixth_word)
+	movzwl	-8(%edi), %eax
+	movzwl	-8(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(sixth_word):
+	movzwl	-6(%edi), %eax
+	movzwl	-6(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(fourth_two_words):
+	test	$63, %dh
+	jz	L(eighth_word)
+	movzwl	-4(%edi), %eax
+	movzwl	-4(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+
+	.p2align 4
+L(eighth_word):
+	movzwl	-2(%edi), %eax
+	movzwl	-2(%esi), %ebx
+	subl	%ebx, %eax
+	RETURN
+#else
+# error Unreachable preprocessor case
 #endif
 
 	CFI_PUSH (%ebx)
@@ -1725,7 +1832,7 @@
 	jae	L(more16bytes)
 	cmp	$8, %ecx
 	je	L(8bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$9, %ecx
 	je	L(9bytes)
 	cmp	$10, %ecx
@@ -1739,8 +1846,16 @@
 	cmp	$14, %ecx
 	je	L(14bytes)
 	jmp	L(15bytes)
-#else
+#elif defined(USE_WCHAR) && !defined(USE_UTF16)
 	jmp	L(12bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$10, %ecx
+	je	L(10bytes)
+	cmp	$12, %ecx
+	je	L(12bytes)
+	jmp	L(14bytes)
+#else
+# error Unreachable preprocessor case
 #endif
 
 	.p2align 4
@@ -1749,7 +1864,7 @@
 	jae	L(more24bytes)
 	cmp	$16, %ecx
 	je	L(16bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$17, %ecx
 	je	L(17bytes)
 	cmp	$18, %ecx
@@ -1763,8 +1878,16 @@
 	cmp	$22, %ecx
 	je	L(22bytes)
 	jmp	L(23bytes)
-#else
+#elif defined(USE_WCHAR) && !defined(USE_UTF16)
 	jmp	L(20bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$18, %ecx
+	je	L(18bytes)
+	cmp	$20, %ecx
+	je	L(20bytes)
+	jmp	L(22bytes)
+#else
+# error Unreachable preprocessor case
 #endif
 
 	.p2align 4
@@ -1773,7 +1896,7 @@
 	jae	L(more32bytes)
 	cmp	$24, %ecx
 	je	L(24bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$25, %ecx
 	je	L(25bytes)
 	cmp	$26, %ecx
@@ -1787,8 +1910,16 @@
 	cmp	$30, %ecx
 	je	L(30bytes)
 	jmp	L(31bytes)
-#else
+#elif defined(USE_WCHAR) && !defined(USE_UTF16)
 	jmp	L(28bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$26, %ecx
+	je	L(26bytes)
+	cmp	$28, %ecx
+	je	L(28bytes)
+	jmp	L(30bytes)
+#else
+# error Unreachable preprocessor case
 #endif
 
 	.p2align 4
@@ -1797,7 +1928,7 @@
 	jae	L(more40bytes)
 	cmp	$32, %ecx
 	je	L(32bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$33, %ecx
 	je	L(33bytes)
 	cmp	$34, %ecx
@@ -1811,15 +1942,23 @@
 	cmp	$38, %ecx
 	je	L(38bytes)
 	jmp	L(39bytes)
-#else
+#elif defined(USE_WCHAR) && !defined(USE_UTF16)
 	jmp	L(36bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$34, %ecx
+	je	L(34bytes)
+	cmp	$36, %ecx
+	je	L(36bytes)
+	jmp	L(38bytes)
+#else
+# error Unreachable preprocessor case
 #endif
 
 	.p2align 4
 L(less48bytes):
 	cmp	$8, %ecx
 	jae	L(more8bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$2, %ecx
 	je	L(2bytes)
 	cmp	$3, %ecx
@@ -1831,15 +1970,23 @@
 	cmp	$6, %ecx
 	je	L(6bytes)
 	jmp	L(7bytes)
-#else
+#elif defined(USE_WCHAR) && !defined(USE_UTF16)
 	jmp	L(4bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$2, %ecx
+	je	L(2bytes)
+	cmp	$4, %ecx
+	je	L(4bytes)
+	jmp	L(6bytes)
+#else
+# error Unreachable preprocessor case
 #endif
 
 	.p2align 4
 L(more40bytes):
 	cmp	$40, %ecx
 	je	L(40bytes)
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_WCHAR) && !defined(USE_UTF16)
 	cmp	$41, %ecx
 	je	L(41bytes)
 	cmp	$42, %ecx
@@ -1853,7 +2000,15 @@
 	cmp	$46, %ecx
 	je	L(46bytes)
 	jmp	L(47bytes)
+#elif defined(USE_UTF16) && !defined(USE_WCHAR)
+	cmp	$42, %ecx
+	je	L(42bytes)
+	cmp	$44, %ecx
+	je	L(44bytes)
+	jmp	L(46bytes)
+#endif
 
+#if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
 	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
@@ -1914,7 +2069,8 @@
 	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
-#else
+#elif defined(USE_AS_WMEMCMP)
+
 	.p2align 4
 L(44bytes):
 	mov	-44(%eax), %ecx
@@ -1964,9 +2120,131 @@
 	POP	(%ebx)
 	ret
 	CFI_PUSH (%ebx)
+#elif defined USE_AS_MEMCMP16
+
+	.p2align 4
+L(46bytes):
+	movzwl	-46(%eax), %ecx
+	movzwl	-46(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(44bytes):
+	movzwl	-44(%eax), %ecx
+	movzwl	-44(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(42bytes):
+	movzwl	-42(%eax), %ecx
+	movzwl	-42(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(40bytes):
+	movzwl	-40(%eax), %ecx
+	movzwl	-40(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(38bytes):
+	movzwl	-38(%eax), %ecx
+	movzwl	-38(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(36bytes):
+	movzwl	-36(%eax), %ecx
+	movzwl	-36(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(34bytes):
+	movzwl	-34(%eax), %ecx
+	movzwl	-34(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(32bytes):
+	movzwl	-32(%eax), %ecx
+	movzwl	-32(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(30bytes):
+	movzwl	-30(%eax), %ecx
+	movzwl	-30(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(28bytes):
+	movzwl	-28(%eax), %ecx
+	movzwl	-28(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(26bytes):
+	movzwl	-26(%eax), %ecx
+	movzwl	-26(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(24bytes):
+	movzwl	-24(%eax), %ecx
+	movzwl	-24(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(22bytes):
+	movzwl	-22(%eax), %ecx
+	movzwl	-22(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(20bytes):
+	movzwl	-20(%eax), %ecx
+	movzwl	-20(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(18bytes):
+	movzwl	-18(%eax), %ecx
+	movzwl	-18(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(16bytes):
+	movzwl	-16(%eax), %ecx
+	movzwl	-16(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(14bytes):
+	movzwl	-14(%eax), %ecx
+	movzwl	-14(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(12bytes):
+	movzwl	-12(%eax), %ecx
+	movzwl	-12(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(10bytes):
+	movzwl	-10(%eax), %ecx
+	movzwl	-10(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(8bytes):
+	movzwl	-8(%eax), %ecx
+	movzwl	-8(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(6bytes):
+	movzwl	-6(%eax), %ecx
+	movzwl	-6(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(4bytes):
+	movzwl	-4(%eax), %ecx
+	movzwl	-4(%edx), %ebx
+	subl	%ebx, %ecx
+	jne	L(memcmp16_exit)
+L(2bytes):
+	movzwl	-2(%eax), %eax
+	movzwl	-2(%edx), %ebx
+	subl	%ebx, %eax
+	POP	(%ebx)
+	ret
+	CFI_PUSH (%ebx)
+#else
+# error Unreachable preprocessor case
 #endif
 
-#ifndef USE_AS_WMEMCMP
+#if !defined(USE_AS_WMEMCMP) && !defined(USE_AS_MEMCMP16)
 
 	.p2align 4
 L(45bytes):
@@ -2191,9 +2469,8 @@
 	neg	%eax
 L(bigger):
 	ret
-#else
+#elif defined(USE_AS_WMEMCMP)
 
-/* for wmemcmp */
 	.p2align 4
 L(find_diff):
 	POP	(%ebx)
@@ -2206,5 +2483,14 @@
 L(find_diff_bigger):
 	ret
 
+#elif defined(USE_AS_MEMCMP16)
+
+	.p2align 4
+L(memcmp16_exit):
+	POP	(%ebx)
+	mov	%ecx, %eax
+	ret
+#else
+# error Unreachable preprocessor case
 #endif
 END (MEMCMP)
diff --git a/libc/arch-x86/string/ssse3-memcmp16-atom.S b/libc/arch-x86/string/ssse3-memcmp16-atom.S
new file mode 100644
index 0000000..1be8f3d
--- /dev/null
+++ b/libc/arch-x86/string/ssse3-memcmp16-atom.S
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2013, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#define MEMCMP  __memcmp16
+
+/* int __memcmp16(const unsigned short *ptr1, const unsigned short *ptr2, size_t n); */
+
+#define USE_UTF16
+#define USE_AS_MEMCMP16 1
+#include "ssse3-memcmp-atom.S"
diff --git a/libc/arch-x86/string/ssse3-wmemcmp-atom.S b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
index c146b04..2c3fa02 100644
--- a/libc/arch-x86/string/ssse3-wmemcmp-atom.S
+++ b/libc/arch-x86/string/ssse3-wmemcmp-atom.S
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2011, Intel Corporation
+Copyright (c) 2011, 2012, 2013 Intel Corporation
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -29,5 +29,7 @@
 */
 
 #define MEMCMP  wmemcmp
+
+#define USE_WCHAR
 #define USE_AS_WMEMCMP 1
 #include "ssse3-memcmp-atom.S"
diff --git a/libc/arch-x86/x86.mk b/libc/arch-x86/x86.mk
index 3924ce3..19c1402 100644
--- a/libc/arch-x86/x86.mk
+++ b/libc/arch-x86/x86.mk
@@ -27,6 +27,7 @@
 	arch-x86/string/ssse3-strcpy-atom.S \
 	arch-x86/string/ssse3-memcmp-atom.S \
 	arch-x86/string/ssse3-wmemcmp-atom.S \
+	arch-x86/string/ssse3-memcmp16-atom.S \
 	arch-x86/string/ssse3-wcscat-atom.S \
 	arch-x86/string/ssse3-wcscpy-atom.S
 else
@@ -38,6 +39,7 @@
 	arch-x86/string/strncmp.S \
 	arch-x86/string/strcat.S \
 	arch-x86/string/memcmp.S \
+	string/memcmp16.c \
 	string/strcpy.c \
 	string/strncat.c \
 	string/strncpy.c \
diff --git a/libc/string/memcmp16.c b/libc/string/memcmp16.c
new file mode 100644
index 0000000..1267722
--- /dev/null
+++ b/libc/string/memcmp16.c
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2013 Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <stddef.h>
+
+/* Unoptimised version of __memcmp16 */
+int __memcmp16(const unsigned short *ptr1, const unsigned short *ptr2, size_t n)
+{
+  size_t i;
+
+  for (i = 0; i < n; i++) {
+    if (*ptr1 != *ptr2)
+      return *ptr1 - *ptr2;
+    ptr1++;
+    ptr2++;
+  }
+  return 0;
+}
diff --git a/tests/string_test.cpp b/tests/string_test.cpp
index 91d1904..ef43f5d 100644
--- a/tests/string_test.cpp
+++ b/tests/string_test.cpp
@@ -131,14 +131,15 @@
 // expected result and then run function and compare what we got.
 // These tests contributed by Intel Corporation.
 // TODO: make these tests more intention-revealing and less random.
+template<class Character>
 struct StringTestState {
   StringTestState(size_t MAX_LEN) : MAX_LEN(MAX_LEN) {
     int max_alignment = 64;
 
     // TODO: fix the tests to not sometimes use twice their specified "MAX_LEN".
-    glob_ptr = reinterpret_cast<char*>(valloc(2 * MAX_LEN + max_alignment));
-    glob_ptr1 = reinterpret_cast<char*>(valloc(2 * MAX_LEN + max_alignment));
-    glob_ptr2 = reinterpret_cast<char*>(valloc(2 * MAX_LEN + max_alignment));
+    glob_ptr = reinterpret_cast<Character*>(valloc(2 * sizeof(Character) * MAX_LEN + max_alignment));
+    glob_ptr1 = reinterpret_cast<Character*>(valloc(2 * sizeof(Character) * MAX_LEN + max_alignment));
+    glob_ptr2 = reinterpret_cast<Character*>(valloc(2 * sizeof(Character) * MAX_LEN + max_alignment));
 
     InitLenArray();
 
@@ -163,12 +164,12 @@
   }
 
   const size_t MAX_LEN;
-  char *ptr, *ptr1, *ptr2;
+  Character *ptr, *ptr1, *ptr2;
   size_t n;
   int len[ITER + 1];
 
  private:
-  char *glob_ptr, *glob_ptr1, *glob_ptr2;
+  Character *glob_ptr, *glob_ptr1, *glob_ptr2;
 
   // Calculate input lengths and fill state.len with them.
   // Test small lengths with more density than big ones. Manually push
@@ -188,7 +189,7 @@
 };
 
 TEST(string, strcat) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -380,7 +381,7 @@
 TEST(string, strchr) {
   int seek_char = random() & 255;
 
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -411,7 +412,7 @@
 }
 
 TEST(string, strcmp) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -448,7 +449,7 @@
 }
 
 TEST(string, strcpy) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t j = 0; j < POS_ITER; j++) {
     state.NewIteration();
 
@@ -476,7 +477,7 @@
 
 #if __BIONIC__
 TEST(string, strlcat) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 0; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -505,7 +506,7 @@
 
 #if __BIONIC__
 TEST(string, strlcpy) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t j = 0; j < POS_ITER; j++) {
     state.NewIteration();
 
@@ -539,7 +540,7 @@
 #endif
 
 TEST(string, strncat) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -565,7 +566,7 @@
 }
 
 TEST(string, strncmp) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -602,7 +603,7 @@
 }
 
 TEST(string, strncpy) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t j = 0; j < ITER; j++) {
     state.NewIteration();
 
@@ -630,7 +631,7 @@
 
 TEST(string, strrchr) {
   int seek_char = random() & 255;
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 1; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -662,7 +663,7 @@
 
 TEST(string, memchr) {
   int seek_char = random() & 255;
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 0; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -685,7 +686,7 @@
 
 TEST(string, memrchr) {
   int seek_char = random() & 255;
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 0; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -707,7 +708,7 @@
 }
 
 TEST(string, memcmp) {
-  StringTestState state(SMALL);
+  StringTestState<char> state(SMALL);
   for (size_t i = 0; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -728,8 +729,61 @@
   }
 }
 
+#if defined(__BIONIC__)
+extern "C" int __memcmp16(const unsigned short *ptr1, const unsigned short *ptr2, size_t n);
+
+TEST(string, __memcmp16) {
+  StringTestState<unsigned short> state(SMALL);
+
+  for (size_t i = 0; i < state.n; i++) {
+    for (size_t j = 0; j < POS_ITER; j++) {
+      state.NewIteration();
+
+      unsigned short mask = 0xffff;
+      unsigned short c1 = rand() & mask;
+      unsigned short c2 = rand() & mask;
+
+      std::fill(state.ptr1, state.ptr1 + state.MAX_LEN, c1);
+      std::fill(state.ptr2, state.ptr2 + state.MAX_LEN, c1);
+
+      int pos = (state.len[i] == 0) ? 0 : (random() % state.len[i]);
+      state.ptr2[pos] = c2;
+
+      int expected = (static_cast<unsigned short>(c1) - static_cast<unsigned short>(c2));
+      int actual = __memcmp16(state.ptr1, state.ptr2, (size_t) state.MAX_LEN);
+
+      ASSERT_EQ(expected, actual);
+    }
+  }
+}
+#endif
+
+TEST(string, wmemcmp) {
+  StringTestState<wchar_t> state(SMALL);
+
+  for (size_t i = 0; i < state.n; i++) {
+    for (size_t j = 0; j < POS_ITER; j++) {
+      state.NewIteration();
+
+      long long mask = ((long long) 1 << 8 * sizeof(wchar_t)) - 1;
+      int c1 = rand() & mask;
+      int c2 = rand() & mask;
+      wmemset(state.ptr1, c1, state.MAX_LEN);
+      wmemset(state.ptr2, c1, state.MAX_LEN);
+
+      int pos = (state.len[i] == 0) ? 0 : (random() % state.len[i]);
+      state.ptr2[pos] = c2;
+
+      int expected = (static_cast<int>(c1) - static_cast<int>(c2));
+      int actual = wmemcmp(state.ptr1, state.ptr2, (size_t) state.MAX_LEN);
+
+      ASSERT_EQ(signum(expected), signum(actual));
+    }
+  }
+}
+
 TEST(string, memcpy) {
-  StringTestState state(LARGE);
+  StringTestState<char> state(LARGE);
   int rand = random() & 255;
   for (size_t i = 0; i < state.n - 1; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
@@ -751,7 +805,7 @@
 }
 
 TEST(string, memset) {
-  StringTestState state(LARGE);
+  StringTestState<char> state(LARGE);
   char ch = random () & 255;
   for (size_t i = 0; i < state.n - 1; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
@@ -773,7 +827,7 @@
 }
 
 TEST(string, memmove) {
-  StringTestState state(LARGE);
+  StringTestState<char> state(LARGE);
   for (size_t i = 0; i < state.n - 1; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -794,7 +848,7 @@
 }
 
 TEST(string, bcopy) {
-  StringTestState state(LARGE);
+  StringTestState<char> state(LARGE);
   for (size_t i = 0; i < state.n; i++) {
     for (size_t j = 0; j < POS_ITER; j++) {
       state.NewIteration();
@@ -813,7 +867,7 @@
 }
 
 TEST(string, bzero) {
-  StringTestState state(LARGE);
+  StringTestState<char> state(LARGE);
   for (size_t j = 0; j < ITER; j++) {
     state.NewIteration();