MIPS support for libc.

Change-Id: I2864dea04b3faf2d919165dcaa600af5b16c41c8
Signed-off-by: Chris Dearman <chris@mips.com>
Signed-off-by: Raghu Gandham <raghu@mips.com>
diff --git a/libc/arch-mips/string/memcpy.S b/libc/arch-mips/string/memcpy.S
new file mode 100644
index 0000000..aabdfcf
--- /dev/null
+++ b/libc/arch-mips/string/memcpy.S
@@ -0,0 +1,423 @@
+/*
+ * Copyright (c) 2009
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/************************************************************************
+ *
+ *  memcpy.S
+ *  Version: "043009"
+ *
+ ************************************************************************/
+
+
+/************************************************************************
+ *  Include files
+ ************************************************************************/
+
+#include "machine/asm.h"
+
+
+/* 
+ * This routine could be optimized for MIPS64. The current code only
+ * uses MIPS32 instructions.
+ */	
+#if defined(__MIPSEB__)
+#  define LWHI	lwl		/* high part is left in big-endian	*/
+#  define SWHI	swl		/* high part is left in big-endian	*/
+#  define LWLO	lwr		/* low part is right in big-endian	*/
+#  define SWLO	swr		/* low part is right in big-endian	*/
+#endif
+
+#if defined(__MIPSEL__)
+#  define LWHI	lwr		/* high part is right in little-endian	*/
+#  define SWHI	swr		/* high part is right in little-endian	*/
+#  define LWLO	lwl		/* low part is left in little-endian	*/
+#  define SWLO	swl		/* low part is left in little-endian	*/
+#endif
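+
+/*
+ * Together, the pair "LWHI t,0(a1); LWLO t,3(a1)" loads one unaligned
+ * 32-bit word starting at a1, regardless of endianness; SWHI/SWLO do
+ * the same for stores.
+ */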
+
+LEAF(memcpy,0)
+
+	.set	noreorder
+	.set	noat
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle overlap, some parts of Android,
+ * such as Skia, rely on such usage, so we fall back to memmove in that case.
+ */
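+/*
+ * Roughly, as a C sketch (illustration only, assuming 32-bit pointers
+ * treated as integers):
+ *
+ *     int diff = (int)dst - (int)src;
+ *     int mask = diff >> 31;                    // 0 or -1 (sign mask)
+ *     if ((unsigned)((diff ^ mask) - mask) < n) // |dst - src| < n
+ *         return memmove(dst, src, n);
+ *
+ * The subu/sra/xor/subu sequence below is exactly this branch-free abs().
+ */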
+	subu	t0,a0,a1
+	sra	AT,t0,31
+	xor	t1,t0,AT
+	subu	t0,t1,AT
+	sltu	AT,t0,a2
+	beq	AT,zero,.Lmemcpy
+	 la	t9,memmove
+	jr	t9
+	 nop
+.Lmemcpy:
+	slti	AT,a2,8
+	bne	AT,zero,.Llast8
+	 move	v0,a0	# memcpy returns the dst pointer
+
+# Test if the src and dst are word-aligned, or can be made word-aligned
+	xor	t8,a1,a0
+	andi	t8,t8,0x3		# t8 is a0/a1 word-displacement
+
+	bne	t8,zero,.Lunaligned
+	 negu	a3,a0
+
+	andi	a3,a3,0x3	# we need to copy a3 bytes to make a0/a1 aligned
+	beq	a3,zero,.Lchk16w # when a3=0 then the dst (a0) is word-aligned
+	 subu	a2,a2,a3	# now a2 is the remaining bytes count
+
+	LWHI	t8,0(a1)
+	addu	a1,a1,a3
+	SWHI	t8,0(a0)
+	addu	a0,a0,a3
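+# (the LWHI/SWHI pair above copies exactly the a3 leading bytes needed
+#  to bring the pointers to a word boundary)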
+
+# Now the dst/src are mutually word-aligned with word-aligned addresses
+.Lchk16w:
+	andi	t8,a2,0x3f	# any whole 64-byte chunks?
+				# t8 is the byte count after 64-byte chunks
+
+	beq	a2,t8,.Lchk8w	# if a2==t8, no 64-byte chunks
+				# There will be at most 1 32-byte chunk after it
+	 subu	a3,a2,t8	# subtract from a2 the remainder
+                                # Here a3 counts bytes in 16w chunks
+	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks
+
+	addu	t0,a0,a2	# t0 is the "past the end" address
+
+# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+# the "t0-32" address
+# This means: for x=128 the last "safe" a0 address is "t0-160"
+# Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+# In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit
+	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address
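+# (derivation: "pref 30,128(a0)" prepares the 32-byte line at a0+128, so
+#  we need a0+128+32 <= t0, i.e. a0 <= t0-160, which is the t9 limit)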
+
+	pref    0,0(a1)		# bring the first line of src, addr 0
+	pref    0,32(a1)	# bring the second line of src, addr 32
+	pref    0,64(a1)	# bring the third line of src, addr 64
+	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+	sgtu	v1,a0,t9
+	bgtz	v1,.Lloop16w	# skip "pref 30,64(a0)" for too short arrays
+	 nop
+# otherwise, start with using pref30
+	pref	30,64(a0)
+.Lloop16w:
+	pref	0,96(a1)
+	lw	t0,0(a1)
+	bgtz	v1,.Lskip_pref30_96	# skip "pref 30,96(a0)"
+	 lw	t1,4(a1)
+	pref    30,96(a0)   # continue setting up the dest, addr 96
+.Lskip_pref30_96:
+	lw	t2,8(a1)
+	lw	t3,12(a1)
+	lw	t4,16(a1)
+	lw	t5,20(a1)
+	lw	t6,24(a1)
+	lw	t7,28(a1)
+	pref	0,128(a1)	# bring the next line of src, addr 128
+
+	sw	t0,0(a0)
+	sw	t1,4(a0)
+	sw	t2,8(a0)
+	sw	t3,12(a0)
+	sw	t4,16(a0)
+	sw	t5,20(a0)
+	sw	t6,24(a0)
+	sw	t7,28(a0)
+
+	lw	t0,32(a1)
+	bgtz	v1,.Lskip_pref30_128	# skip "pref 30,128(a0)"
+	 lw	t1,36(a1)
+	pref    30,128(a0)   # continue setting up the dest, addr 128
+.Lskip_pref30_128:
+	lw	t2,40(a1)
+	lw	t3,44(a1)
+	lw	t4,48(a1)
+	lw	t5,52(a1)
+	lw	t6,56(a1)
+	lw	t7,60(a1)
+	pref	0,160(a1)	# bring the next line of src, addr 160
+
+	sw	t0,32(a0)
+	sw	t1,36(a0)
+	sw	t2,40(a0)
+	sw	t3,44(a0)
+	sw	t4,48(a0)
+	sw	t5,52(a0)
+	sw	t6,56(a0)
+	sw	t7,60(a0)
+
+	addiu	a0,a0,64	# adding 64 to dest
+	sgtu	v1,a0,t9
+	bne	a0,a3,.Lloop16w
+	 addiu	a1,a1,64	# adding 64 to src
+	move	a2,t8
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+.Lchk8w:
+	pref	0,0(a1)
+	andi	t8,a2,0x1f	# is there a 32-byte chunk?
+				# t8 is the remainder count past 32 bytes
+	beq	a2,t8,.Lchk1w	# when a2=t8, no 32-byte chunk
+	 nop
+
+	lw	t0,0(a1)
+	lw	t1,4(a1)
+	lw	t2,8(a1)
+	lw	t3,12(a1)
+	lw	t4,16(a1)
+	lw	t5,20(a1)
+	lw	t6,24(a1)
+	lw	t7,28(a1)
+	addiu	a1,a1,32
+
+	sw	t0,0(a0)
+	sw	t1,4(a0)
+	sw	t2,8(a0)
+	sw	t3,12(a0)
+	sw	t4,16(a0)
+	sw	t5,20(a0)
+	sw	t6,24(a0)
+	sw	t7,28(a0)
+	addiu	a0,a0,32
+
+.Lchk1w:
+	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
+	beq	a2,t8,.Llast8
+	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
+	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+.LwordCopy_loop:
+	lw	t3,0(a1)	# the first t3 may be equal to t0 ... optimize?
+	addiu	a1,a1,4
+	addiu	a0,a0,4
+	bne	a0,a3,.LwordCopy_loop
+	 sw	t3,-4(a0)
+
+# For the last (<8) bytes
+.Llast8:
+	blez	a2,.Lleave
+	 addu	a3,a0,a2	# a3 is the last dst address
+.Llast8loop:
+	lb	v1,0(a1)
+	addiu	a1,a1,1
+	addiu	a0,a0,1
+	bne	a0,a3,.Llast8loop
+	 sb	v1,-1(a0)
+
+.Lleave:
+	j	ra
+	 nop
+
+#
+# UNALIGNED case
+#
+
+.Lunaligned:
+	# got here with a3 = -a0 (from the "negu a3,a0" above)
+	andi	a3,a3,0x3	# test if the a0 is word aligned
+	beqz	a3,.Lua_chk16w
+	 subu	a2,a2,a3	# bytes left after initial a3 bytes
+
+	LWHI	v1,0(a1)
+	LWLO	v1,3(a1)
+	addu	a1,a1,a3	# a3 may be here 1, 2 or 3
+	SWHI	v1,0(a0)
+	addu	a0,a0,a3	# below the dst will be word aligned (NOTE1)
+
+.Lua_chk16w:
+	andi	t8,a2,0x3f	# any whole 64-byte chunks?
+				# t8 is the byte count after 64-byte chunks
+	beq	a2,t8,.Lua_chk8w # if a2==t8, no 64-byte chunks
+				# There will be at most 1 32-byte chunk after it
+	 subu	a3,a2,t8	# subtract from a2 the remainder
+                                # Here a3 counts bytes in 16w chunks
+	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks
+
+	addu	t0,a0,a2	# t0 is the "past the end" address
+
+	subu	t9,t0,160	# t9 is the "last safe pref 30,128(a0)" address
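+# (t9 is derived exactly as in the aligned path above)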
+
+	pref    0,0(a1)		# bring the first line of src, addr 0
+	pref    0,32(a1)	# bring the second line of src, addr 32
+	pref    0,64(a1)	# bring the third line of src, addr 64
+	pref	30,32(a0)	# safe, as we have at least 64 bytes ahead
+# In case the a0 > t9 don't use "pref 30" at all
+	sgtu	v1,a0,t9
+	bgtz	v1,.Lua_loop16w	# skip "pref 30,64(a0)" for too short arrays
+	 nop
+# otherwise, start with using pref30
+	pref	30,64(a0)
+.Lua_loop16w:
+	pref	0,96(a1)
+	LWHI	t0,0(a1)
+	LWLO	t0,3(a1)
+	LWHI	t1,4(a1)
+	bgtz	v1,.Lua_skip_pref30_96
+	 LWLO	t1,7(a1)
+	pref    30,96(a0)   # continue setting up the dest, addr 96
+.Lua_skip_pref30_96:
+	LWHI	t2,8(a1)
+	LWLO	t2,11(a1)
+	LWHI	t3,12(a1)
+	LWLO	t3,15(a1)
+	LWHI	t4,16(a1)
+	LWLO	t4,19(a1)
+	LWHI	t5,20(a1)
+	LWLO	t5,23(a1)
+	LWHI	t6,24(a1)
+	LWLO	t6,27(a1)
+	LWHI	t7,28(a1)
+	LWLO	t7,31(a1)
+	pref	0,128(a1)	# bring the next line of src, addr 128
+
+	sw	t0,0(a0)
+	sw	t1,4(a0)
+	sw	t2,8(a0)
+	sw	t3,12(a0)
+	sw	t4,16(a0)
+	sw	t5,20(a0)
+	sw	t6,24(a0)
+	sw	t7,28(a0)
+
+	LWHI	t0,32(a1)
+	LWLO	t0,35(a1)
+	LWHI	t1,36(a1)
+	bgtz	v1,.Lua_skip_pref30_128
+	 LWLO	t1,39(a1)
+	pref    30,128(a0)   # continue setting up the dest, addr 128
+.Lua_skip_pref30_128:
+	LWHI	t2,40(a1)
+	LWLO	t2,43(a1)
+	LWHI	t3,44(a1)
+	LWLO	t3,47(a1)
+	LWHI	t4,48(a1)
+	LWLO	t4,51(a1)
+	LWHI	t5,52(a1)
+	LWLO	t5,55(a1)
+	LWHI	t6,56(a1)
+	LWLO	t6,59(a1)
+	LWHI	t7,60(a1)
+	LWLO	t7,63(a1)
+	pref	0,160(a1)	# bring the next line of src, addr 160
+
+	sw	t0,32(a0)
+	sw	t1,36(a0)
+	sw	t2,40(a0)
+	sw	t3,44(a0)
+	sw	t4,48(a0)
+	sw	t5,52(a0)
+	sw	t6,56(a0)
+	sw	t7,60(a0)
+
+	addiu	a0,a0,64	# adding 64 to dest
+	sgtu	v1,a0,t9
+	bne	a0,a3,.Lua_loop16w
+	 addiu	a1,a1,64	# adding 64 to src
+	move	a2,t8
+
+# Here we have src and dest word-aligned but less than 64-bytes to go
+
+.Lua_chk8w:
+	pref	0,0(a1)
+	andi	t8,a2,0x1f	# is there a 32-byte chunk?
+				# t8 is the remainder count
+	beq	a2,t8,.Lua_chk1w # when a2=t8, no 32-byte chunk
+	 nop
+
+	LWHI	t0,0(a1)
+	LWLO	t0,3(a1)
+	LWHI	t1,4(a1)
+	LWLO	t1,7(a1)
+	LWHI	t2,8(a1)
+	LWLO	t2,11(a1)
+	LWHI	t3,12(a1)
+	LWLO	t3,15(a1)
+	LWHI	t4,16(a1)
+	LWLO	t4,19(a1)
+	LWHI	t5,20(a1)
+	LWLO	t5,23(a1)
+	LWHI	t6,24(a1)
+	LWLO	t6,27(a1)
+	LWHI	t7,28(a1)
+	LWLO	t7,31(a1)
+	addiu	a1,a1,32
+
+	sw	t0,0(a0)
+	sw	t1,4(a0)
+	sw	t2,8(a0)
+	sw	t3,12(a0)
+	sw	t4,16(a0)
+	sw	t5,20(a0)
+	sw	t6,24(a0)
+	sw	t7,28(a0)
+	addiu	a0,a0,32
+
+.Lua_chk1w:
+	andi	a2,t8,0x3	# now a2 is the remainder past 1w chunks
+	beq	a2,t8,.Lua_smallCopy
+	 subu	a3,t8,a2	# a3 is count of bytes in 1w chunks
+	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+.Lua_wordCopy_loop:
+	LWHI	v1,0(a1)
+	LWLO	v1,3(a1)
+	addiu	a1,a1,4
+	addiu	a0,a0,4		# note: dst=a0 is word aligned here, see NOTE1
+	bne	a0,a3,.Lua_wordCopy_loop
+	 sw	v1,-4(a0)
+
+# Now less than 4 bytes (value in a2) left to copy
+.Lua_smallCopy:
+	beqz	a2,.Lleave
+	 addu	a3,a0,a2	# a3 is the last dst address
+.Lua_smallCopy_loop:
+	lb	v1,0(a1)
+	addiu	a1,a1,1
+	addiu	a0,a0,1
+	bne	a0,a3,.Lua_smallCopy_loop
+	 sb	v1,-1(a0)
+
+	j	ra
+	 nop
+
+	.set	at
+	.set	reorder
+
+END(memcpy)
+
+
+/************************************************************************
+ *  Implementation : Static functions
+ ************************************************************************/
diff --git a/libc/arch-mips/string/memset.S b/libc/arch-mips/string/memset.S
new file mode 100644
index 0000000..a1c5055
--- /dev/null
+++ b/libc/arch-mips/string/memset.S
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2009
+ *      MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/************************************************************************
+ *
+ *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
+ *  Version: "043009"
+ *
+ ************************************************************************/
+
+
+/************************************************************************
+ *  Include files
+ ************************************************************************/
+
+#include "machine/asm.h"
+
+/* 
+ * This routine could be optimized for MIPS64. The current code only
+ * uses MIPS32 instructions.
+ */	
+
+#if defined(__MIPSEB__)
+#  define SWHI	swl		/* high part is left in big-endian	*/
+#  define SWLO	swr		/* low part is right in big-endian	*/
+#endif
+
+#if defined(__MIPSEL__)
+#  define SWHI	swr		/* high part is right in little-endian	*/
+#  define SWLO	swl		/* low part is left in little-endian	*/
+#endif
+
+#if !(defined(XGPROF) || defined(XPROF))
+#undef SETUP_GP
+#define SETUP_GP
+#endif
+
+#ifdef NDEBUG
+#define DBG #
+#else
+#define DBG
+#endif
+
+/*
+ * void _memset16(uint16_t* dst, uint16_t value, size_t size);
+ */
+
+LEAF(_memset16,0)
+	.set noreorder
+DBG	/* Check parameters */
+DBG	andi	t0,a0,1			# a0 must be halfword aligned
+DBG	tne	t0,zero
+DBG	andi	t2,a2,1			# a2 must be even
+DBG	tne	t2,zero
+
+#ifdef FIXARGS
+	# ensure count is even
+#if (__mips==32) && (__mips_isa_rev>=2)
+	ins	a2,zero,0,1
+#else
+	ori	a2,1
+	xori	a2,1
+#endif
+#endif
+
+#if (__mips==32) && (__mips_isa_rev>=2)
+	ins	a1,a1,16,16
+#else
+	andi	a1,0xffff
+	sll	t3,a1,16
+	or	a1,t3
+#endif
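+	# a1 now holds the fill halfword replicated into both halves of the
+	# word (e.g. 0x1234 -> 0x12341234), so whole words can be stored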
+
+	beqz	a2,.Ldone
+	 andi	t1,a0,2
+	beqz	t1,.Lalignok
+	 addu	t0,a0,a2		# t0 is the "past the end" address
+	sh	a1,0(a0)		# store one halfword to get aligned
+	addu	a0,2
+	subu	a2,2
+.Lalignok:
+	slti	t1,a2,4			# .Laligned for 4 or more bytes
+	beqz	t1,.Laligned
+	 sne	t1,a2,2			# one more halfword?
+	bnez	t1,.Ldone
+	 nop
+	sh	a1,0(a0)
+.Ldone:
+	j	ra
+	 nop
+	.set reorder
+END(_memset16)
+
+/*
+ * void _memset32(uint32_t* dst, uint32_t value, size_t size);
+ */
+
+LEAF(_memset32,0)
+	.set noreorder
+DBG	/* Check parameters */
+DBG	andi	t0,a0,3			# a0 must be word aligned
+DBG	tne	t0,zero
+DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
+DBG	tne	t2,zero
+
+#ifdef FIXARGS
+	# ensure count is a multiple of 4
+#if (__mips==32) && (__mips_isa_rev>=2)
+	ins	a2,zero,0,2
+#else
+	ori	a2,3
+	xori	a2,3
+#endif
+#endif
+
+	bnez	a2,.Laligned		# any work to do?
+	 addu	t0,a0,a2		# t0 is the "past the end" address
+
+	j	ra
+	 nop
+	.set reorder
+END(_memset32)
+
+LEAF(memset,0)
+
+	.set	noreorder
+	.set	noat
+
+	addu	t0,a0,a2		# t0 is the "past the end" address
+	slti	AT,a2,4			# is a2 less than 4?
+	bne	AT,zero,.Llast4		# if yes, go to last4
+	 move	v0,a0			# memset returns the dst pointer
+
+	beq	a1,zero,.Lset0
+	 subu	v1,zero,a0
+
+	# smear byte into 32 bit word
+#if (__mips==32) && (__mips_isa_rev>=2)
+	ins     a1, a1, 8, 8        # Replicate fill byte into half-word.
+	ins     a1, a1, 16, 16      # Replicate fill byte into word.
+#else
+	and	a1,0xff
+	sll	AT,a1,8
+	or	a1,AT
+	sll	AT,a1,16
+	or	a1,AT
+#endif
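+	# a1 now holds the fill byte in all four lanes (e.g. 0x5a -> 0x5a5a5a5a)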
+
+.Lset0:
+	andi	v1,v1,0x3		# word-unaligned address?
+	beq	v1,zero,.Laligned	# v1 is the unalignment count
+	 subu	a2,a2,v1
+	SWHI	a1,0(a0)
+	addu	a0,a0,v1
+
+# Here we have the "word-aligned" a0 (until the "last4")
+.Laligned:
+	andi	t8,a2,0x3f	# any 64-byte chunks?
+				# t8 is the byte count past 64-byte chunks
+	beq	a2,t8,.Lchk8w	# when a2==t8, no 64-byte chunks
+				# There will be at most 1 32-byte chunk then
+	 subu	a3,a2,t8	# subtract from a2 the remainder
+				# Here a3 counts bytes in 16w chunks
+	addu	a3,a0,a3	# Now a3 is the final dst after 64-byte chunks
+
+# Find out whether there are any 64-byte chunks after which at least
+# 96 bytes will still be left. The value "96" is the buffer needed for the
+# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
+# incrementing "a0" by 64.
+# For "a2" below 160 there is no such "pref 30 safe" 64-byte chunk.
+#
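+# (worked example: for a2 = 200, t7 = 104, t6 = 40, t5 = 64, so exactly
+#  one 64-byte chunk is "pref 30 safe"; the remaining 136 bytes go through
+#  the nopref30 loop and the tail code)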
+	sltiu	v1,a2,160
+	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
+	 subu	t7,a2,96	# subtract "pref 30 unsafe" region
+		# below we have at least 1 64-byte chunk which is "pref 30 safe"
+	andi	t6,t7,0x3f	# t6 is the remainder past "64-byte safe" chunks
+	subu	t5,t7,t6	# subtract from t7 the remainder
+				# Here t5 counts bytes in 16w "safe" chunks
+	addu	t4,a0,t5	# Now t4 is the dst after 64-byte "safe" chunks
+
+# Don't use "pref 30,0(a0)" when a0 is in the "middle" of a cache line
+#	pref	30,0(a0)
+# Here we are in the region, where it is safe to use "pref 30,64(a0)"
+.Lloop16w:
+	addiu	a0,a0,64
+	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
+	sw	a1,-64(a0)
+	sw	a1,-60(a0)
+	sw	a1,-56(a0)
+	sw	a1,-52(a0)
+	sw	a1,-48(a0)
+	sw	a1,-44(a0)
+	sw	a1,-40(a0)
+	sw	a1,-36(a0)
+	nop
+	nop			# the extra nop instructions help to balance
+	nop			# cycles needed for "store" + "fill" + "evict";
+	nop			# a 64-byte store needs 8 fill and 8 evict
+	nop			# cycles, i.e. at least 32 instructions
+	nop
+	nop
+	pref	30,0(a0)	# continue setting up the dest, addr 64-0
+	sw	a1,-32(a0)
+	sw	a1,-28(a0)
+	sw	a1,-24(a0)
+	sw	a1,-20(a0)
+	sw	a1,-16(a0)
+	sw	a1,-12(a0)
+	sw	a1,-8(a0)
+	sw	a1,-4(a0)
+	nop
+	nop
+	nop
+	nop			# NOTE: using 14 nops instead of 12 gives
+	nop			# better results for "fast" memory
+	nop
+	bne	a0,t4,.Lloop16w
+	 nop
+
+	beq	a0,a3,.Lchk8w	# maybe no more 64-byte chunks?
+	 nop			# this delay slot goes unused ...
+
+.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
+	addiu	a0,a0,64
+	sw	a1,-64(a0)
+	sw	a1,-60(a0)
+	sw	a1,-56(a0)
+	sw	a1,-52(a0)
+	sw	a1,-48(a0)
+	sw	a1,-44(a0)
+	sw	a1,-40(a0)
+	sw	a1,-36(a0)
+	sw	a1,-32(a0)
+	sw	a1,-28(a0)
+	sw	a1,-24(a0)
+	sw	a1,-20(a0)
+	sw	a1,-16(a0)
+	sw	a1,-12(a0)
+	sw	a1,-8(a0)
+	bne	a0,a3,.Lloop16w_nopref30
+	 sw	a1,-4(a0)
+
+.Lchk8w:		# t8 here is the byte count past 64-byte chunks
+
+	andi	t7,t8,0x1f	# is there a 32-byte chunk?
+				# t7 is the remainder count past 32 bytes
+	beq	t8,t7,.Lchk1w	# when t8==t7, no 32-byte chunk
+	 move	a2,t7
+
+	sw	a1,0(a0)
+	sw	a1,4(a0)
+	sw	a1,8(a0)
+	sw	a1,12(a0)
+	sw	a1,16(a0)
+	sw	a1,20(a0)
+	sw	a1,24(a0)
+	sw	a1,28(a0)
+	addiu	a0,a0,32
+
+.Lchk1w:
+	andi	t8,a2,0x3	# now t8 is the remainder past 1w chunks
+	beq	a2,t8,.Llast4aligned
+	 subu	a3,a2,t8	# a3 is the count of bytes in 1w chunks
+	addu	a3,a0,a3	# now a3 is the dst address past the 1w chunks
+
+# copying in words (4-byte chunks)
+.LwordCopy_loop:
+	addiu	a0,a0,4
+	bne	a0,a3,.LwordCopy_loop
+	 sw	a1,-4(a0)
+
+# store last 0-3 bytes
+# this will repeat the last store if the memset finishes on a word boundary
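+# (the SWLO below, at t0-1, writes from the word boundary at or below
+#  t0-1 up through t0-1, i.e. the final 1..4 bytes; that is why a
+#  word-aligned end simply repeats the last word's bytes)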
+.Llast4aligned:
+	j	ra
+	 SWLO	a1,-1(t0)
+
+.Llast4:
+	beq	a0,t0,.Llast4e
+.Llast4l:
+	 addiu	a0,a0,1
+	bne	a0,t0,.Llast4l
+	 sb	a1,-1(a0)
+.Llast4e:
+	j	ra
+	 nop
+
+	.set	at
+	.set	reorder
+
+END(memset)
+
+
+/************************************************************************
+ *  Implementation : Static functions
+ ************************************************************************/
+
diff --git a/libc/arch-mips/string/mips-string-ops.h b/libc/arch-mips/string/mips-string-ops.h
new file mode 100644
index 0000000..50f7e3a
--- /dev/null
+++ b/libc/arch-mips/string/mips-string-ops.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2010 MIPS Technologies, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer
+ *        in the documentation and/or other materials provided with
+ *        the distribution.
+ *      * Neither the name of MIPS Technologies Inc. nor the names of its
+ *        contributors may be used to endorse or promote products derived
+ *        from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __MIPS_STRING_OPS_H
+#define __MIPS_STRING_OPS_H
+    /* This definition of the byte bitfields uses the
+       assumption that the layout of the bitfields is
+       equivalent to the layout in memory.  Generally,
+       for the MIPS ABIs, this is true. If you compile
+       the strcmp.c file with -DSMOKE_TEST_NEW_STRCMP,
+       this assumption will be tested.
+
+       Also, regardless of char signedness, ANSI C dictates that
+       strcmp() treats each character as unsigned char.  For
+       strlen and the like, signedness doesn't matter.
+
+       Also, this code assumes that there are 8 bits per 'char'.  */
+
+#if __mips64
+typedef struct bits
+{
+  unsigned B0:8, B1:8, B2:8, B3:8, B4:8, B5:8, B6:8, B7:8;
+} bits_t;
+#else
+typedef struct bits
+{
+  unsigned B0:8, B1:8, B2:8, B3:8;
+} bits_t;
+#endif
+
+#ifndef _ULW
+    /* MIPS GCC has no unaligned-access builtins, so this code forces
+       the compiler to treat the pointer access as unaligned.  */
+struct ulw
+{
+  unsigned b;
+} __attribute__ ((packed));
+
+#define _ULW(__x) (((struct ulw *) ((char *)(&__x)))->b)
+#endif
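+
+/* For example (illustrative only): with a possibly-unaligned
+   "unsigned *p", "x = _ULW (*p)" makes the compiler emit an
+   unaligned-safe load (typically lwl/lwr on MIPS32) instead of a
+   plain lw.  */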
+
+/* This union assumes that small structures can be in registers.  If
+   not, then memory accesses will be done - not optimal, but ok.  */
+typedef union
+{
+  unsigned v;
+  bits_t b;
+} bitfields_t;
+
+#ifndef detect_zero
+/* __mips_dsp, __mips_dspr2, and __mips64 are predefined by
+   the compiler, based on command line options.  */
+#if (__mips_dsp || __mips_dspr2) && !__mips64
+#define __mips_using_dsp 1
+
+/* DSP 4-lane (8 unsigned bits per lane) subtract-and-saturate
+ * intrinsic operation. How this works:
+ *     Given a 4-byte string of "ABC\0", subtract it as
+ *     an unsigned integer from 0x01010101:
+ *	   0x01010101
+ *       - 0x41424300
+ *        -----------
+ *         0xbfbebe01 <-- answer without saturation
+ *	   0x00000001 <-- answer with saturation
+ * When this 4-lane vector is treated as an unsigned int value,
+ * a non-zero answer indicates the presence of a zero byte in the
+ * original 4-byte argument.  */
+
+typedef signed char v4i8 __attribute__ ((vector_size (4)));
+
+#define detect_zero(__x,__y,__01s,__80s)\
+       ((unsigned) __builtin_mips_subu_s_qb((v4i8) __01s,(v4i8) __x))
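+    /* note: __y and __80s are accepted but unused in this DSP variant;
+       the arguments keep the signature identical to the generic
+       version below.  */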
+
+    /* sets all 4 lanes to requested byte.  */
+#define set_byte_lanes(__x) ((unsigned) __builtin_mips_repl_qb(__x))
+
+    /* sets all 4 lanes to 0x01.  */
+#define def_and_set_01(__x) unsigned __x = (unsigned) __builtin_mips_repl_qb(0x01)
+
+    /* sets all 4 lanes to 0x80. Not needed when subu_s.qb used. */
+#define def_and_set_80(__x) /* do nothing */
+
+#else
+    /* this version, originally published in the '80s, uses
+       a reverse-carry-set like determination of the zero byte.
+       The steps are, for __x = 0x31ff0001:
+       __x - _01s = 0x30fdff00
+       ~__x = 0xce00fffe
+       ((__x - _01s) & ~__x) = 0x0000ff00
+       (...) & _80s = 0x00008000 <- flags the zero byte
+       Some implementations naively assume that characters are
+       always 7-bit unsigned ASCII. With that assumption, the
+       "& ~__x" is usually discarded. Since character strings
+       are 8-bit, the AND is needed to catch the case of
+       a false positive when the byte is 0x80. */
+
+#define detect_zero(__x,__y,_01s,_80s)\
+	((unsigned) (((__x) - _01s) & ~(__x)) & _80s)
+
+#if __mips64
+#define def_and_set_80(__x) unsigned __x =  0x8080808080808080ul
+#define def_and_set_01(__x)  unsigned __x = 0x0101010101010101ul
+#else
+#define def_and_set_80(__x) unsigned __x = 0x80808080ul
+#define def_and_set_01(__x) unsigned __x = 0x01010101ul
+#endif
+
+#endif
+#endif
+
+/* dealing with 'void *' conversions without using extra variables. */
+#define get_byte(__x,__idx) (((unsigned char *) (__x))[__idx])
+#define set_byte(__x,__idx,__fill) ((unsigned char *) (__x))[__idx] = (__fill)
+#define get_word(__x,__idx) (((unsigned *) (__x))[__idx])
+#define set_word(__x,__idx,__fill) ((unsigned *) (__x))[__idx] = (__fill)
+#define inc_ptr_as(__type,__x,__inc) __x = (void *) (((__type) __x) + (__inc))
+#define cvt_ptr_to(__type,__x) ((__type) (__x))
+
+#endif
diff --git a/libc/arch-mips/string/mips_strlen.c b/libc/arch-mips/string/mips_strlen.c
new file mode 100644
index 0000000..9fb7e6a
--- /dev/null
+++ b/libc/arch-mips/string/mips_strlen.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2010 MIPS Technologies, Inc.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *      * Redistributions of source code must retain the above copyright
+ *        notice, this list of conditions and the following disclaimer.
+ *      * Redistributions in binary form must reproduce the above copyright
+ *        notice, this list of conditions and the following disclaimer
+ *        in the documentation and/or other materials provided with
+ *        the distribution.
+ *      * Neither the name of MIPS Technologies Inc. nor the names of its
+ *        contributors may be used to endorse or promote products derived
+ *        from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "mips-string-ops.h"
+
+#define do_strlen_word(__av) {\
+    if (detect_zero(x,x,_01s,_80s)) break;\
+    x = __av;\
+    cnt += sizeof (unsigned);\
+    }
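+
+/* Note the software pipelining: each invocation tests the word loaded
+   by the previous one while fetching the next, so the load latency of
+   __av overlaps the zero test on x.  */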
+
+#define do_strlen_byte(__x) {\
+  if ((bx.b.B##__x) == 0) break;\
+  ++cnt;\
+  }
+
+#if SMOKE_TEST_MIPS_STRLEN
+#define strlen my_strlen
+#endif
+
+int
+strlen (const void *_a)
+{
+  int cnt = 0;
+  unsigned x;
+
+  /* align the string to a word boundary so we can scan it a word at a time.  */
+  if ((cvt_ptr_to (unsigned, _a) & (sizeof (unsigned) - 1)) != 0)
+    {
+      if ((cvt_ptr_to (unsigned, _a) & 1) != 0)
+	{
+	  if (get_byte (_a, 0) == 0)
+	    return cnt;
+	  /* advance one byte to an even address; the 2-byte check below
+	     handles the rest. */
+	  inc_ptr_as (char *, _a, 1);
+	  ++cnt;
+	}
+      if ((cvt_ptr_to (unsigned, _a) & 2) != 0)
+	{
+	  if (get_byte (_a, 0) == 0)
+	    return cnt + 0;
+	  if (get_byte (_a, 1) == 0)
+	    return cnt + 1;
+	  inc_ptr_as (char *, _a, 2);
+	  cnt += 2;
+	}
+    }
+
+#if __mips64
+#error strlen: mips64 check for 4-byte alignment not implemented.
+#endif
+
+  if (1)
+    {
+      def_and_set_01 (_01s);
+      def_and_set_80 (_80s);
+
+      /* However advantageous it would be for performance, this code
+         cannot pre-load the following word, nor prefetch the next line
+         at the start of the loop, since the string may end at the end
+         of a page with the following page unmapped. There are tests in
+         the suite to catch any attempt to read beyond the current
+         word. */
+      x = get_word (_a, 0);
+      while (1)
+	{
+	  /* doing 8 words should cover most strings.  */
+	  do_strlen_word (get_word (_a, 1));
+	  do_strlen_word (get_word (_a, 2));
+	  do_strlen_word (get_word (_a, 3));
+	  do_strlen_word (get_word (_a, 4));
+	  do_strlen_word (get_word (_a, 5));
+	  do_strlen_word (get_word (_a, 6));
+	  do_strlen_word (get_word (_a, 7));
+	  do_strlen_word (get_word (_a, 8));
+	  inc_ptr_as (unsigned *, _a, 8);
+	}
+    }
+  while (1)
+    {
+      /* pull apart the last word processed and find the zero.  */
+      bitfields_t bx;
+      bx.v = x;
+#if __mips64
+      do_strlen_byte (0);
+      do_strlen_byte (1);
+      do_strlen_byte (2);
+      do_strlen_byte (3);
+      do_strlen_byte (4);
+      do_strlen_byte (5);
+      do_strlen_byte (6);
+#else
+      do_strlen_byte (0);
+      do_strlen_byte (1);
+      do_strlen_byte (2);
+#endif
+      /* last byte is zero */
+      break;
+    }
+  return cnt;
+}
+
+#undef do_strlen_byte
+#undef do_strlen_word
+
+#if SMOKE_TEST_MIPS_STRLEN
+#include <stdio.h>
+char str1[] = "DHRYSTONE PROGRAM, 1'ST STRING";
+char str2[] = "DHRYSTONE PROGRAM, 2'ST STRING";
+
+char str3[] = "another string";
+char str4[] = "another";
+
+char str5[] = "somes tring";
+char str6[] = "somes_tring";
+
+char str7[16], str8[16];
+
+static char *
+chk (unsigned mine, unsigned libs, int *errors)
+{
+  static char answer[1024];
+  char *result = mine == libs ? "PASS" : "FAIL";
+  sprintf (answer, "new_strlen=%u: lib_strlen=%u: %s!", mine, libs, result);
+  if (mine != libs)
+    (*errors)++;
+  return answer;
+}
+
+int
+main (int argc, char **argv)
+{
+  int errors = 0;
+  /* set -1 in one position */
+  str6[5] = 0xff;
+  /* set zero in same position with junk in following 3 */
+  str7[0] = str8[0] = 0;
+  str7[1] = 0xff;
+  str7[2] = 'a';
+  str7[3] = 2;
+  str8[1] = 's';
+  str8[2] = -2;
+  str8[3] = 0;
+
+  fprintf (stderr, "========== mips_strlen%s test...\n",
+	   argv[0] ? argv[0] : "unknown strlen");
+#define P(__x,__y) {\
+    int a = my_strlen(__x + __y);\
+    int b = (strlen)(__x + __y) /* library version */;\
+    fprintf(stderr,"%s+%d: %s\n",#__x,__y,chk(a,b,&errors));\
+    }
+
+  P (str1, 0);
+  P (str1, 1);
+  P (str1, 2);
+  P (str1, 3);
+
+  P (str2, 0);
+  P (str2, 1);
+  P (str2, 2);
+  P (str2, 3);
+
+  P (str3, 0);
+  P (str3, 1);
+  P (str3, 2);
+  P (str3, 3);
+
+  P (str4, 0);
+  P (str4, 1);
+  P (str4, 2);
+  P (str4, 3);
+
+  P (str5, 0);
+  P (str5, 1);
+  P (str5, 2);
+  P (str5, 3);
+
+  P (str6, 0);
+  P (str6, 1);
+  P (str6, 2);
+  P (str6, 3);
+
+  P (str7, 0);
+  P (str7, 1);
+  P (str7, 2);
+  P (str7, 3);
+
+  P (str8, 0);
+  P (str8, 1);
+  P (str8, 2);
+  P (str8, 3);
+
+  return errors;
+}
+#endif