/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28
29#include <machine/cpu-features.h>
30
31 .text
32
33 .global memcmp
34 .type memcmp, %function
35 .align 4
36
/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD will be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled in a way they won't stall
 */
45
46memcmp:
47 PLD (r0, #0)
48 PLD (r1, #0)
49
50 /* take of the case where length is 0 or the buffers are the same */
51 cmp r0, r1
52 cmpne r2, #0
53 moveq r0, #0
54 bxeq lr
55
56 /* save registers */
57 stmfd sp!, {r4, lr}
58
59 PLD (r0, #32)
60 PLD (r1, #32)
61
62 /* since r0 hold the result, move the first source
63 * pointer somewhere else
64 */
65
66 mov r4, r0
67
68 /* make sure we have at least 8+4 bytes, this simplify things below
69 * and avoid some overhead for small blocks
70 */
71 cmp r2, #(8+4)
72 bmi 8f
73
74 /* align first pointer to word boundary
75 * offset = -src & 3
76 */
77 rsb r3, r4, #0
78 ands r3, r3, #3
79 beq 0f
80
81 /* align first pointer */
82 sub r2, r2, r3
831: ldrb r0, [r4], #1
84 ldrb ip, [r1], #1
85 subs r0, r0, ip
86 bne 9f
87 subs r3, r3, #1
88 bne 1b
89
90
910: /* here the first pointer is aligned, and we have at least 4 bytes
92 * to process.
93 */
94
95 /* see if the pointers are congruent */
96 eor r0, r4, r1
97 ands r0, r0, #3
98 bne 5f
99
100 /* congruent case, 32 bytes per iteration
101 * We need to make sure there are at least 32+4 bytes left
102 * because we effectively read ahead one word, and we could
103 * read past the buffer (and segfault) if we're not careful.
104 */
105
106 ldr ip, [r1]
107 subs r2, r2, #(32 + 4)
108 bmi 1f
109
1100: PLD (r4, #64)
111 PLD (r1, #64)
112 ldr r0, [r4], #4
113 ldr lr, [r1, #4]!
114 eors r0, r0, ip
115 ldreq r0, [r4], #4
116 ldreq ip, [r1, #4]!
117 eoreqs r0, r0, lr
118 ldreq r0, [r4], #4
119 ldreq lr, [r1, #4]!
120 eoreqs r0, r0, ip
121 ldreq r0, [r4], #4
122 ldreq ip, [r1, #4]!
123 eoreqs r0, r0, lr
124 ldreq r0, [r4], #4
125 ldreq lr, [r1, #4]!
126 eoreqs r0, r0, ip
127 ldreq r0, [r4], #4
128 ldreq ip, [r1, #4]!
129 eoreqs r0, r0, lr
130 ldreq r0, [r4], #4
131 ldreq lr, [r1, #4]!
132 eoreqs r0, r0, ip
133 ldreq r0, [r4], #4
134 ldreq ip, [r1, #4]!
135 eoreqs r0, r0, lr
136 bne 2f
137 subs r2, r2, #32
138 bhs 0b
139
140 /* do we have at least 4 bytes left? */
1411: adds r2, r2, #(32 - 4 + 4)
142 bmi 4f
143
144 /* finish off 4 bytes at a time */
1453: ldr r0, [r4], #4
146 ldr ip, [r1], #4
147 eors r0, r0, ip
148 bne 2f
149 subs r2, r2, #4
150 bhs 3b
151
152 /* are we done? */
1534: adds r2, r2, #4
154 moveq r0, #0
155 beq 9f
156
157 /* finish off the remaining bytes */
158 b 8f
159
1602: /* the last 4 bytes are different, restart them */
161 sub r4, r4, #4
162 sub r1, r1, #4
163 mov r2, #4
164
165 /* process the last few bytes */
1668: ldrb r0, [r4], #1
167 ldrb ip, [r1], #1
168 // stall
169 subs r0, r0, ip
170 bne 9f
171 subs r2, r2, #1
172 bne 8b
173
1749: /* restore registers and return */
175 ldmfd sp!, {r4, lr}
176 bx lr
177
178
179
180
181
1825: /*************** non-congruent case ***************/
183 and r0, r1, #3
184 cmp r0, #2
185 bne 4f
186
187 /* here, offset is 2 (16-bits aligned, special cased) */
188
189 /* make sure we have at least 16 bytes to process */
190 subs r2, r2, #16
191 addmi r2, r2, #16
192 bmi 8b
193
194 /* align the unaligned pointer */
195 bic r1, r1, #3
196 ldr lr, [r1], #4
197
1986: PLD (r1, #64)
199 PLD (r4, #64)
200 mov ip, lr, lsr #16
201 ldr lr, [r1], #4
202 ldr r0, [r4], #4
203 orr ip, ip, lr, lsl #16
204 eors r0, r0, ip
205 moveq ip, lr, lsr #16
206 ldreq lr, [r1], #4
207 ldreq r0, [r4], #4
208 orreq ip, ip, lr, lsl #16
209 eoreqs r0, r0, ip
210 moveq ip, lr, lsr #16
211 ldreq lr, [r1], #4
212 ldreq r0, [r4], #4
213 orreq ip, ip, lr, lsl #16
214 eoreqs r0, r0, ip
215 moveq ip, lr, lsr #16
216 ldreq lr, [r1], #4
217 ldreq r0, [r4], #4
218 orreq ip, ip, lr, lsl #16
219 eoreqs r0, r0, ip
220 bne 7f
221 subs r2, r2, #16
222 bhs 6b
223 sub r1, r1, #2
224 /* are we done? */
225 adds r2, r2, #16
226 moveq r0, #0
227 beq 9b
228 /* finish off the remaining bytes */
229 b 8b
230
2317: /* fix up the 2 pointers and fallthrough... */
232 sub r1, r1, #(4+2)
233 sub r4, r4, #4
234 mov r2, #4
235 b 8b
236
237
2384: /*************** offset is 1 or 3 (less optimized) ***************/
239
240 stmfd sp!, {r5, r6, r7}
241
242 // r5 = rhs
243 // r6 = lhs
244 // r7 = scratch
245
246 mov r5, r0, lsl #3 /* r5 = right shift */
247 rsb r6, r5, #32 /* r6 = left shift */
248
249 /* align the unaligned pointer */
250 bic r1, r1, #3
251 ldr r7, [r1], #4
252 sub r2, r2, #8
253
2546: mov ip, r7, lsr r5
255 ldr r7, [r1], #4
256 ldr r0, [r4], #4
257 orr ip, ip, r7, lsl r6
258 eors r0, r0, ip
259 moveq ip, r7, lsr r5
260 ldreq r7, [r1], #4
261 ldreq r0, [r4], #4
262 orreq ip, ip, r7, lsl r6
263 eoreqs r0, r0, ip
264 bne 7f
265 subs r2, r2, #8
266 bhs 6b
267
268 sub r1, r1, r6, lsr #3
269 ldmfd sp!, {r5, r6, r7}
270
271 /* are we done? */
272 adds r2, r2, #8
273 moveq r0, #0
274 beq 9b
275
276 /* finish off the remaining bytes */
277 b 8b
278
2797: /* fix up the 2 pointers and fallthrough... */
280 sub r1, r1, #4
281 sub r1, r1, r6, lsr #3
282 sub r4, r4, #4
283 mov r2, #4
284 ldmfd sp!, {r5, r6, r7}
285 b 8b