/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
        .text

        .global memcmp
        .type memcmp, %function
        .align 4

/*
 * Optimized memcmp() for ARM9.
 * This would not be optimal on XScale or ARM11, where more prefetching
 * and use of PLD would be needed.
 * The 2 major optimizations here are
 * (1) The main loop compares 32 bytes at a time
 * (2) The loads are scheduled so that they don't stall
 */
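
/*
 * For reference, the routine below implements the usual byte-wise
 * memcmp() semantics; a minimal (unoptimized) C sketch of the same
 * behaviour would be:
 *
 *   #include <stddef.h>
 *
 *   int memcmp(const void *a, const void *b, size_t n)
 *   {
 *       const unsigned char *p = a, *q = b;
 *       while (n--) {
 *           int d = (int)*p++ - (int)*q++;
 *           if (d != 0)
 *               return d;   // difference of the first mismatching bytes
 *       }
 *       return 0;
 *   }
 */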

memcmp:
        pld     [r0, #0]
        pld     [r1, #0]

        /* take care of the case where the length is 0 or the buffers are the same */
        cmp     r0, r1
        cmpne   r2, #0
        moveq   r0, #0
        bxeq    lr

        /* save registers */
        stmfd   sp!, {r4, lr}

        pld     [r0, #32]
        pld     [r1, #32]

        /* since r0 holds the result, move the first source
         * pointer somewhere else
         */

        mov     r4, r0

        /* make sure we have at least 8+4 bytes; this simplifies things below
         * and avoids some overhead for small blocks
         */
        cmp     r2, #(8+4)
        bmi     8f

        /* align first pointer to word boundary
         * offset = -src & 3
         */
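        /* e.g. if r4 = 0x1003, offset = (-0x1003) & 3 = 1, so a single
         * leading byte is compared below before entering the word loop
         * (the address value is only illustrative)
         */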
        rsb     r3, r4, #0
        ands    r3, r3, #3
        beq     0f

        /* align first pointer */
        sub     r2, r2, r3
1:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        subs    r0, r0, ip
        bne     9f
        subs    r3, r3, #1
        bne     1b


0:      /* here the first pointer is aligned, and we have at least 4 bytes
         * to process.
         */

        /* see if the pointers are congruent */
        eor     r0, r4, r1
        ands    r0, r0, #3
        bne     5f

        /* congruent case, 32 bytes per iteration
         * We need to make sure there are at least 32+4 bytes left
         * because we effectively read ahead one word, and we could
         * read past the buffer (and segfault) if we're not careful.
         */
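        /* the loop below is software-pipelined: the second buffer's word is
         * always loaded one step ahead (into ip before the loop, then
         * alternating between lr and ip), so no ldr result is consumed by
         * the instruction that immediately follows the load
         */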

        ldr     ip, [r1]
        subs    r2, r2, #(32 + 4)
        bmi     1f

0:      pld     [r4, #64]
        pld     [r1, #64]
        ldr     r0, [r4], #4
        ldr     lr, [r1, #4]!
        eors    r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        ldreq   r0, [r4], #4
        ldreq   lr, [r1, #4]!
        eoreqs  r0, r0, ip
        ldreq   r0, [r4], #4
        ldreq   ip, [r1, #4]!
        eoreqs  r0, r0, lr
        bne     2f
        subs    r2, r2, #32
        bhs     0b

        /* do we have at least 4 bytes left? */
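        /* (r2 was biased by -(32+4) before the loop and by -32 per pass,
         * so adding 32 back here leaves r2 = bytes left - 4: bmi means
         * fewer than 4 bytes remain)
         */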
1:      adds    r2, r2, #(32 - 4 + 4)
        bmi     4f

        /* finish off 4 bytes at a time */
3:      ldr     r0, [r4], #4
        ldr     ip, [r1], #4
        eors    r0, r0, ip
        bne     2f
        subs    r2, r2, #4
        bhs     3b

        /* are we done? */
4:      adds    r2, r2, #4
        moveq   r0, #0
        beq     9f

        /* finish off the remaining bytes */
        b       8f

2:      /* the last 4 bytes are different, restart them */
        sub     r4, r4, #4
        sub     r1, r1, #4
        mov     r2, #4
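        /* redoing the mismatching word a byte at a time keeps the return
         * value equal to the difference of the first mismatching bytes
         */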

        /* process the last few bytes */
8:      ldrb    r0, [r4], #1
        ldrb    ip, [r1], #1
        // stall
        subs    r0, r0, ip
        bne     9f
        subs    r2, r2, #1
        bne     8b

9:      /* restore registers and return */
        ldmfd   sp!, {r4, lr}
        bx      lr


5:      /*************** non-congruent case ***************/
        and     r0, r1, #3
        cmp     r0, #2
        bne     4f

        /* here, offset is 2 (16-bits aligned, special cased) */

        /* make sure we have at least 16 bytes to process */
        subs    r2, r2, #16
        addmi   r2, r2, #16
        bmi     8b

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     lr, [r1], #4

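        /* each word of the second buffer is rebuilt from two aligned loads:
         * the high half of the previous word OR'd with the low half of the
         * next one shifted up; e.g. (little-endian) if r1 was 0x1002, the
         * bytes at 0x1002-0x1005 come from
         * (word@0x1000 >> 16) | (word@0x1004 << 16)
         */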
6:      pld     [r1, #64]
        pld     [r4, #64]
        mov     ip, lr, lsr #16
        ldr     lr, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, lr, lsl #16
        eors    r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        moveq   ip, lr, lsr #16
        ldreq   lr, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, lr, lsl #16
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #16
        bhs     6b
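        /* r1 has run 2 bytes ahead of the first byte not yet compared
         * (because of the aligned read-ahead); step it back before the
         * byte loop
         */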
        sub     r1, r1, #2
        /* are we done? */
        adds    r2, r2, #16
        moveq   r0, #0
        beq     9b
        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
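        /* the mismatching 4-byte group starts 4 bytes behind r4 and
         * 4+2 bytes behind the (aligned, read-ahead) second pointer
         */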
        sub     r1, r1, #(4+2)
        sub     r4, r4, #4
        mov     r2, #4
        b       8b


4:      /*************** offset is 1 or 3 (less optimized) ***************/

        stmfd   sp!, {r5, r6, r7}

        // r5 = right-shift amount = offset * 8
        // r6 = left-shift amount  = 32 - r5
        // r7 = scratch (read-ahead word from the unaligned pointer)

        mov     r5, r0, lsl #3          /* r5 = right shift */
        rsb     r6, r5, #32             /* r6 = left shift */

        /* align the unaligned pointer */
        bic     r1, r1, #3
        ldr     r7, [r1], #4
        sub     r2, r2, #8
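
        /* as in the offset-2 case, each word of the second buffer is
         * rebuilt from two aligned loads: ip = (prev >> r5) | (next << r6);
         * e.g. offset 1 gives r5 = 8 and r6 = 24
         */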

6:      mov     ip, r7, lsr r5
        ldr     r7, [r1], #4
        ldr     r0, [r4], #4
        orr     ip, ip, r7, lsl r6
        eors    r0, r0, ip
        moveq   ip, r7, lsr r5
        ldreq   r7, [r1], #4
        ldreq   r0, [r4], #4
        orreq   ip, ip, r7, lsl r6
        eoreqs  r0, r0, ip
        bne     7f
        subs    r2, r2, #8
        bhs     6b

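        /* r6, lsr #3 is 4 - offset: step the second pointer back over the
         * bytes it has read ahead of the data actually compared
         */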
        sub     r1, r1, r6, lsr #3
        ldmfd   sp!, {r5, r6, r7}

        /* are we done? */
        adds    r2, r2, #8
        moveq   r0, #0
        beq     9b

        /* finish off the remaining bytes */
        b       8b

7:      /* fix up the 2 pointers and fallthrough... */
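        /* as above: back 4 bytes for the read-ahead word, plus r6, lsr #3
         * (i.e. 4 - offset) for the skew between the two pointers
         */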
        sub     r1, r1, #4
        sub     r1, r1, r6, lsr #3
        sub     r4, r4, #4
        mov     r2, #4
        ldmfd   sp!, {r5, r6, r7}
        b       8b