Blame - libc/arch-arm/denver/bionic/memcpy_base.S - platform_bionic

blob: 2abb486717b2b6d474e833c7f5870cf12e6968cf [file] [log] [blame]

Shu Zhang	5b5d6e7	2014-03-12 11:18:41 +0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 The Android Open Source Project
				3	* All rights reserved.
				4	* Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
				5	*
				6	* Redistribution and use in source and binary forms, with or without
				7	* modification, are permitted provided that the following conditions
				8	* are met:
				9	* * Redistributions of source code must retain the above copyright
				10	* notice, this list of conditions and the following disclaimer.
				11	* * Redistributions in binary form must reproduce the above copyright
				12	* notice, this list of conditions and the following disclaimer in
				13	* the documentation and/or other materials provided with the
				14	* distribution.
				15	*
				16	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				17	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				18	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				19	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				20	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				21	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				22	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				23	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				24	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				25	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				26	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				27	* SUCH DAMAGE.
				28	*/
				29
				/* Byte stride used for the pld offsets in this file (presumably the */
				/* cache line size of the target core -- TODO confirm). */
				30	#define CACHE_LINE_SIZE (64)
				/* Byte offset ahead of the source pointer that the 128-byte copy */
				/* loops prefetch: six CACHE_LINE_SIZE units, i.e. 384 bytes. */
				31	#define PREFETCH_DISTANCE (CACHE_LINE_SIZE*6)
				32
				/*
				 * MEMCPY_BASE: copy r2 bytes from the address in r1 to the address in r0.
				 *
				 * On entry: r0 = destination, r1 = source, r2 = byte count.
				 * Scratch:  r3, ip, q0-q1 and the condition flags.
				 * Stack:    every exit is "pop {r0, pc}" and the CFI below places r0 at
				 *           [sp] and lr at [sp, #4], so the code that enters here is
				 *           expected to have pushed {r0, lr} already.
				 *           NOTE(review): that push is not visible in this file; confirm
				 *           against the file that includes this one.
				 *
				 * Strategy: counts below 32 and below 128 are copied directly by testing
				 * the bits of r2. Larger copies first advance the destination to a
				 * 64-byte boundary, then run a 128-bytes-per-iteration NEON loop, then
				 * copy the remaining 0..127 bytes by testing the bits of r2.
				 */
				33	ENTRY_PRIVATE(MEMCPY_BASE)
				34	.cfi_def_cfa_offset 8
				35	.cfi_rel_offset r0, 0
				36	.cfi_rel_offset lr, 4
				37
				38	cmp r2, #0	// zero-length copy: nothing to do
				39	beq .L_memcpy_done
				40	cmp r0, r1	// destination equals source: nothing to do
				41	beq .L_memcpy_done
				42
				43	/* preload next cache line */
				44	pld [r1, #CACHE_LINE_SIZE*1]
				45
				46	/* Deal with very small blocks (< 32bytes) asap */
				47	cmp r2, #32
				48	blo .L_memcpy_lt_32bytes
				49	/* no need to align if len < 128 bytes */
				50	cmp r2, #128
				51	blo .L_memcpy_lt_128bytes
				52
				53	/* large copy, align dest to 64 byte boundary */
				54	pld [r1, #CACHE_LINE_SIZE*2]
				55	rsb r3, r0, #0	// r3 = -dst
				56	ands r3, r3, #0x3F	// r3 = bytes needed to reach the next 64-byte boundary (0..63)
				57	pld [r1, #CACHE_LINE_SIZE*3]
				58	beq .L_memcpy_dispatch	// flags still from the ands: dst is already aligned
				59	sub r2, r2, r3	// the r3 head bytes copied below come out of the count
				60	/* copy 1 byte */
				61	movs ip, r3, lsl #31	// N = bit 0 of r3, C = bit 1 of r3
				62	itt mi
				63	ldrbmi ip, [r1], #1
				64	strbmi ip, [r0], #1
				65	/* copy 2 bytes */
				66	itt cs	// C is still live: the byte load/store above do not write flags
				67	ldrhcs ip, [r1], #2
				68	strhcs ip, [r0], #2
				69	/* copy 4 bytes */
				70	movs ip, r3, lsl #29	// N = bit 2 of r3, C = bit 3 of r3
				71	itt mi
				72	ldrmi ip, [r1], #4
				73	strmi ip, [r0], #4
				74	/* copy 8 bytes */
				75	bcc 1f	// bit 3 clear: no 8-byte step
				76	vld1.8 {d0}, [r1]!
				77	vst1.8 {d0}, [r0, :64]!
				78	1: /* copy 16 bytes */
				79	movs ip, r3, lsl #27	// N = bit 4 of r3, C = bit 5 of r3
				80	bpl 1f	// bit 4 clear: no 16-byte step
				81	vld1.8 {q0}, [r1]!
				82	vst1.8 {q0}, [r0, :128]!
				83	1: /* copy 32 bytes */
				84	bcc .L_memcpy_dispatch	// bit 5 clear: no 32-byte step
				85	vld1.8 {q0, q1}, [r1]!
				86	vst1.8 {q0, q1}, [r0, :256]!
				87
				88	.L_memcpy_dispatch:
				89	// pre-decrement by 128 to detect nearly-done condition easily, but
				90	// also need to check if we have less than 128 bytes left at this
				91	// point due to alignment code above
				92	subs r2, r2, #128	// r2 = bytes remaining - 128
				93	blo .L_memcpy_lt_128presub	// fewer than 128 bytes remain
				94
				95	// Denver does better if both source and dest are aligned so
				96	// we'll special-case that even though the code is virtually identical
				97	tst r1, #0xF	// is the source 16-byte aligned?
				98	bne .L_memcpy_neon_unalign_src_pld	// no: use the loop without source alignment hints
				99
				100	// DRAM memcpy should be throttled slightly to get full bandwidth
				101	//
				102	cmp r2, #32768	// r2 here is the pre-decremented count
				103	bhi .L_memcpy_neon_unalign_src_pld	// above the threshold: also use the loop without source hints
				104	.align 4
				105	1:
				106	/* copy 128 bytes in each loop */
				107	subs r2, r2, #128	// flags are consumed by the bhs below; pld/vld1/vst1 leave them intact
				108
				109	/* preload a cache line */
				110	pld [r1, #PREFETCH_DISTANCE]
				111	/* copy a cache line */
				112	vld1.8 {q0, q1}, [r1, :128]!
				113	vst1.8 {q0, q1}, [r0, :256]!
				114	vld1.8 {q0, q1}, [r1, :128]!
				115	vst1.8 {q0, q1}, [r0, :256]!
				116	/* preload a cache line */
				117	pld [r1, #PREFETCH_DISTANCE]
				118	/* copy a cache line */
				119	vld1.8 {q0, q1}, [r1, :128]!
				120	vst1.8 {q0, q1}, [r0, :256]!
				121	vld1.8 {q0, q1}, [r1, :128]!
				122	vst1.8 {q0, q1}, [r0, :256]!
				123
				124	bhs 1b	// no borrow in the subs: at least another 128 bytes remain
				125	adds r2, r2, #128	// undo the pre-decrement: r2 = bytes still to copy (0..127)
				126	bne .L_memcpy_lt_128bytes_align	// non-zero tail left
				127	pop {r0, pc}	// count was a multiple of 128: reload r0 from the stack and return
				128
				129	.align 4
				130	.L_memcpy_neon_unalign_src_pld:
				// Same loop as above except that the loads carry no source alignment hint.
				131	1:
				132	/* copy 128 bytes in each loop */
				133	subs r2, r2, #128	// flags are consumed by the bhs below; pld/vld1/vst1 leave them intact
				134
				135	/* preload a cache line */
				136	pld [r1, #PREFETCH_DISTANCE]
				137	/* copy a cache line */
				138	vld1.8 {q0, q1}, [r1]!
				139	vst1.8 {q0, q1}, [r0, :256]!
				140	vld1.8 {q0, q1}, [r1]!
				141	vst1.8 {q0, q1}, [r0, :256]!
				142	/* preload a cache line */
				143	pld [r1, #PREFETCH_DISTANCE]
				144	/* copy a cache line */
				145	vld1.8 {q0, q1}, [r1]!
				146	vst1.8 {q0, q1}, [r0, :256]!
				147	vld1.8 {q0, q1}, [r1]!
				148	vst1.8 {q0, q1}, [r0, :256]!
				149
				150	bhs 1b	// no borrow in the subs: at least another 128 bytes remain
				151	adds r2, r2, #128	// undo the pre-decrement: r2 = bytes still to copy (0..127)
				152	bne .L_memcpy_lt_128bytes_align	// non-zero tail left
				153	pop {r0, pc}	// count was a multiple of 128: reload r0 from the stack and return
				154
				155	.L_memcpy_lt_128presub:
				156	add r2, r2, #128	// undo the pre-decrement done at .L_memcpy_dispatch
				157	.L_memcpy_lt_128bytes_align:
				// Tail copy for the paths on which the destination was aligned to 64 bytes
				// first: r2 < 128 and each set bit of r2 selects one step; the NEON stores
				// keep their alignment hints.
				158	/* copy 64 bytes */
				159	movs ip, r2, lsl #26	// N = bit 5 of r2, C = bit 6 of r2
				160	bcc 1f	// bit 6 clear: no 64-byte step
				161	vld1.8 {q0, q1}, [r1]!
				162	vst1.8 {q0, q1}, [r0, :256]!
				163	vld1.8 {q0, q1}, [r1]!
				164	vst1.8 {q0, q1}, [r0, :256]!
				165	1: /* copy 32 bytes */
				166	bpl 1f	// bit 5 clear: no 32-byte step
				167	vld1.8 {q0, q1}, [r1]!
				168	vst1.8 {q0, q1}, [r0, :256]!
				169	1: /* copy 16 bytes */
				170	movs ip, r2, lsl #28	// N = bit 3 of r2, C = bit 4 of r2
				171	bcc 1f	// bit 4 clear: no 16-byte step
				172	vld1.8 {q0}, [r1]!
				173	vst1.8 {q0}, [r0, :128]!
				174	1: /* copy 8 bytes */
				175	bpl 1f	// bit 3 clear: no 8-byte step
				176	vld1.8 {d0}, [r1]!
				177	vst1.8 {d0}, [r0, :64]!
				178	1: /* copy 4 bytes */
				179	tst r2, #4
				180	itt ne
				181	ldrne ip, [r1], #4
				182	strne ip, [r0], #4
				183	/* copy 2 bytes */
				184	movs ip, r2, lsl #31	// N = bit 0 of r2, C = bit 1 of r2
				185	itt cs
				186	ldrhcs ip, [r1], #2
				187	strhcs ip, [r0], #2
				188	/* copy 1 byte */
				189	itt mi	// N is still live from the movs above
				190	ldrbmi ip, [r1]	// last byte: the pointers are not advanced any more
				191	strbmi ip, [r0]
				192
				193	pop {r0, pc}	// reload r0 from the stack and return
				194
				195	.L_memcpy_lt_128bytes:
				// Copy for 32 <= r2 < 128 with no destination alignment step: same bit
				// tests as the aligned tail, but the stores carry no alignment hint.
				196	/* copy 64 bytes */
				197	movs ip, r2, lsl #26	// N = bit 5 of r2, C = bit 6 of r2
				198	bcc 1f	// bit 6 clear: no 64-byte step
				199	vld1.8 {q0, q1}, [r1]!
				200	vst1.8 {q0, q1}, [r0]!
				201	vld1.8 {q0, q1}, [r1]!
				202	vst1.8 {q0, q1}, [r0]!
				203	1: /* copy 32 bytes */
				204	bpl .L_memcpy_lt_32bytes	// bit 5 clear: no 32-byte step
				205	vld1.8 {q0, q1}, [r1]!
				206	vst1.8 {q0, q1}, [r0]!
				207	.L_memcpy_lt_32bytes:
				// Also the direct entry point for counts below 32; only bits 0..4 of r2
				// are examined from here on.
				208	/* copy 16 bytes */
				209	movs ip, r2, lsl #28	// N = bit 3 of r2, C = bit 4 of r2
				210	bcc 1f	// bit 4 clear: no 16-byte step
				211	vld1.8 {q0}, [r1]!
				212	vst1.8 {q0}, [r0]!
				213	1: /* copy 8 bytes */
				214	bpl 1f	// bit 3 clear: no 8-byte step
				215	vld1.8 {d0}, [r1]!
				216	vst1.8 {d0}, [r0]!
				217	1: /* copy 4 bytes */
				218	tst r2, #4
				219	itt ne
				220	ldrne ip, [r1], #4
				221	strne ip, [r0], #4
				222	/* copy 2 bytes */
				223	movs ip, r2, lsl #31	// N = bit 0 of r2, C = bit 1 of r2
				224	itt cs
				225	ldrhcs ip, [r1], #2
				226	strhcs ip, [r0], #2
				227	/* copy 1 byte */
				228	itt mi	// N is still live from the movs above
				229	ldrbmi ip, [r1]	// last byte: the pointers are not advanced any more
				230	strbmi ip, [r0]
				231
				232	.L_memcpy_done:
				233	pop {r0, pc}	// reload r0 from the stack and return
				234	END(MEMCPY_BASE)