Blame - libc/arch-arm/krait/bionic/memcpy.S - platform_bionic

blob: 0cd4d445a469a6a4fa83fc0461d4eb5feb5371ee [file] [log] [blame]

Christopher Ferris	7c83a1e	2013-02-26 01:30:00 -0800	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 The Android Open Source Project
				3	* All rights reserved.
				4	*
				5	* Redistribution and use in source and binary forms, with or without
				6	* modification, are permitted provided that the following conditions
				7	* are met:
				8	* * Redistributions of source code must retain the above copyright
				9	* notice, this list of conditions and the following disclaimer.
				10	* * Redistributions in binary form must reproduce the above copyright
				11	* notice, this list of conditions and the following disclaimer in
				12	* the documentation and/or other materials provided with the
				13	* distribution.
				14	*
				15	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
				16	* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
				17	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
				18	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
				19	* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
				20	* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
				21	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
				22	* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
				23	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
				24	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
				25	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
				26	* SUCH DAMAGE.
				27	*/
				28
				29	/* Assumes neon instructions and a cache line size of 32 bytes. */
				30
				31	#include <machine/cpu-features.h>
				32	#include <machine/asm.h>
				33
				34	/*
				35	* This code assumes it is running on a processor that supports all arm v7
				36	* instructions, that supports neon instructions, and that has a 32 byte
				37	* cache line.
				38	*/
				39
				40	.text
				41	.fpu neon
				42
				43	#define CACHE_LINE_SIZE 32
				44
				45	ENTRY(memcpy)
					/*
					 * Copies r2 bytes from the address in r1 to the address in r0.
					 * r0 = destination (pushed on entry, popped back before returning)
					 * r1 = source, r2 = byte count
					 * r3, ip, lr and d0-d7 are used as scratch.
					 *
					 * Phases: (1) for counts >= 16, copy 0-15 head bytes so that the
					 * destination is 16-byte aligned; (2) 64 bytes per iteration;
					 * (3) 32 bytes per iteration; (4) one optional 16-byte chunk;
					 * (5) a 0-15 byte tail.
					 */
				46	.save {r0, lr} /* unwind annotation for the stmfd below */
				47	/* start preloading as early as possible */
				48	pld [r1, #(CACHE_LINE_SIZE*0)]
				49	stmfd sp!, {r0, lr} /* keep the original dst for the final ldmfd; lr becomes scratch */
				50	pld [r1, #(CACHE_LINE_SIZE*2)]
				51
				52	/* do we have at least 16-bytes to copy (needed for alignment below) */
				53	cmp r2, #16
				54	blo 5f /* fewer than 16 bytes: go straight to the tail copy */
				55
				56	/* align destination to a 16-byte boundary (mask 0xF below) so the :128-qualified stores further down can be used */
				57	rsb r3, r0, #0 /* r3 = -dst */
				58	ands r3, r3, #0xF /* r3 = number of bytes needed to reach a 16-byte boundary */
				59	beq 0f /* destination is already 16-byte aligned */
				60
				61	/* copy up to 15-bytes (count in r3) */
				62	sub r2, r2, r3 /* take the head bytes out of the remaining count */
				63	movs ip, r3, lsl #31 /* N = bit 0 of r3, C = bit 1 of r3 */
				64	ldrmib lr, [r1], #1 /* bit 0 set: copy 1 byte */
				65	strmib lr, [r0], #1
				66	ldrcsb ip, [r1], #1 /* bit 1 set: copy 2 bytes */
				67	ldrcsb lr, [r1], #1
				68	strcsb ip, [r0], #1
				69	strcsb lr, [r0], #1
				70	movs ip, r3, lsl #29 /* N = bit 2 of r3, C = bit 3 of r3 */
				71	bge 1f /* skip the 4-byte copy when bit 2 is clear; bge tests N == V -- NOTE(review): relies on V being clear here */
				72	// copies 4 bytes, destination 32-bits aligned
				73	vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
				74	vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]!
				75	1: bcc 2f /* skip the 8-byte copy when bit 3 is clear (C from the movs above) */
				76	// copies 8 bytes, destination 64-bits aligned
				77	vld1.8 {d0}, [r1]!
				78	vst1.8 {d0}, [r0, :64]!
				79	2:
				80
				81	0: /* preload immediately the next cache line, which we may need */
				82	pld [r1, #(CACHE_LINE_SIZE*0)]
				83	pld [r1, #(CACHE_LINE_SIZE*2)]
				84
				85	/* make sure we have at least 64 bytes to copy */
				86	subs r2, r2, #64 /* from here r2 holds (bytes remaining - 64) */
				87	blo 2f /* fewer than 64 bytes remain: skip the main loop */
				88
				89	/* Preload all the cache lines we need.
				90	* NOTE: The number of pld below depends on CACHE_LINE_SIZE,
				91	* ideally we would increase the distance in the main loop to
				92	* avoid the goofy code below. In practice this doesn't seem to make
				93	* a big difference.
				94	* NOTE: The value CACHE_LINE_SIZE * 8 was chosen through
				95	* experimentation.
				96	*/
				97	pld [r1, #(CACHE_LINE_SIZE*4)]
				98	pld [r1, #(CACHE_LINE_SIZE*6)]
				99	pld [r1, #(CACHE_LINE_SIZE*8)]
				100
				101	1: /* The main loop copies 64 bytes at a time */
				102	vld1.8 {d0 - d3}, [r1]!
				103	vld1.8 {d4 - d7}, [r1]!
				104	pld [r1, #(CACHE_LINE_SIZE*8)]
				105	subs r2, r2, #64 /* flags from this subtract are consumed by the bhs after the stores */
				106	vst1.8 {d0 - d3}, [r0, :128]!
				107	vst1.8 {d4 - d7}, [r0, :128]!
				108	bhs 1b /* loop while the subtract above did not borrow */
				109
				110	2: /* fix-up the remaining count and make sure we have >= 32 bytes left */
				111	add r2, r2, #64 /* undo the -64 bias */
				112	subs r2, r2, #32 /* r2 now holds (bytes remaining - 32) */
				113	blo 4f
				114
				115	3: /* 32 bytes at a time. These cache lines were already preloaded */
				116	vld1.8 {d0 - d3}, [r1]!
				117	subs r2, r2, #32
				118	vst1.8 {d0 - d3}, [r0, :128]!
				119	bhs 3b /* loop while the subtract above did not borrow */
				120	4: /* less than 32 left */
				121	add r2, r2, #32 /* undo the -32 bias; r2 = bytes remaining */
				122	tst r2, #0x10 /* bit 4 set: one 16-byte chunk remains */
				123	beq 5f
				124	// copies 16 bytes, 128-bits aligned
				125	vld1.8 {d0, d1}, [r1]!
				126	vst1.8 {d0, d1}, [r0, :128]!
				127
				128	5: /* copy up to 15-bytes (count in r2) */
					/* This label is also reached directly from the "blo 5f" near the top,
					 * before any alignment was done, so the stores here carry no
					 * alignment qualifier. Only bits 0-3 of r2 are examined. */
				129	movs ip, r2, lsl #29 /* N = bit 2 of r2, C = bit 3 of r2 */
				130	bcc 1f /* skip the 8-byte copy when bit 3 is clear */
				131	vld1.8 {d0}, [r1]!
				132	vst1.8 {d0}, [r0]!
				133	1: bge 2f /* skip the 4-byte copy when bit 2 is clear; bge tests N == V -- NOTE(review): relies on V being clear here */
				134	vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
				135	vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
				136	2: movs ip, r2, lsl #31 /* N = bit 0 of r2, C = bit 1 of r2 */
				137	ldrmib r3, [r1], #1 /* bit 0 set: load 1 byte */
				138	ldrcsb ip, [r1], #1 /* bit 1 set: load 2 bytes */
				139	ldrcsb lr, [r1], #1
				140	strmib r3, [r0], #1 /* store the bytes loaded above under the same conditions */
				141	strcsb ip, [r0], #1
				142	strcsb lr, [r0], #1
				143
				144	ldmfd sp!, {r0, lr} /* restore the original dst into r0 and the return address into lr */
				145	bx lr
				146	END(memcpy)