/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>

        .text
        .syntax unified
        .fpu neon

/* Cache-line geometry and size thresholds used to pick a prefetch strategy. */
#define CACHE_LINE_SIZE (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID (1048576)
#define PREFETCH_DISTANCE_NEAR (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR (CACHE_LINE_SIZE*16)

/*
 * void *memmove(void *dst, const void *src, size_t n)
 *
 * ABI:      AAPCS (ARM 32-bit), unified syntax, NEON required.
 * In:       r0 = dst, r1 = src, r2 = n (bytes)
 * Out:      r0 = dst (original value, restored from the stack on the
 *           backward path; preserved untouched on the memcpy path)
 * Clobbers: r3, ip, q0-q3, q8-q11, flags
 *
 * Strategy: any copy that is safe to perform front-to-back (dst below
 * src, or no overlap at all) is delegated to memcpy.  Only the truly
 * overlapping dst > src case is handled here, copying backwards from
 * the end of both buffers.
 */
ENTRY(memmove)
        /* Trivial cases: n == 0, or dst == src — nothing to move. */
        cmp     r2, #0
        cmpne   r0, r1
        bxeq    lr
        /* r3 = dst - src.  If dst <= src (unsigned), a forward copy can
         * never read a byte it has already overwritten. */
        subs    r3, r0, r1
        bls     .L_jump_to_memcpy
        /* dst > src: overlap only if n > (dst - src).  Otherwise the
         * regions are disjoint and memcpy is still safe. */
        cmp     r2, r3
        bhi     .L_reversed_memcpy

.L_jump_to_memcpy:
        b       memcpy

.L_reversed_memcpy:
        /* Save the return value (dst) and lr; both are popped at every
         * exit of the backward path via "pop {r0, pc}". */
        push    {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Point r0/r1 one past the END of each buffer; all copying
         * below walks downwards with pre-decrement addressing. */
        add     r0, r0, r2
        add     r1, r1, r2

        /* preload next cache line */
        pld     [r1, #-CACHE_LINE_SIZE]
        pld     [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32bytes) asap */
        cmp     r2, #32
        blo     .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp     r2, #128
        blo     .L_reversed_memcpy_lt_128bytes
        /* Align destination to 64 bytes (1 cache line): r0 is the end
         * pointer, so r3 = r0 & 63 is the byte count to peel off so the
         * remaining stores land on a cache-line boundary. */
        ands    r3, r0, #0x3f
        beq     .L_reversed_memcpy_dispatch
        sub     r2, r2, r3
        /* The peel below decodes r3 one bit at a time via "movs ...,
         * lsl #k", which parks a low bit of r3 in N and the next one in
         * C, then uses conditional execution — no extra branches. */
0:      /* copy 1 byte — N = r3 bit 0, C = r3 bit 1 (lsl #31) */
        movs    ip, r3, lsl #31
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!
1:      /* copy 2 bytes — executed when C (r3 bit 1) is set */
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
2:      /* copy 4 bytes — N = r3 bit 2, C = r3 bit 3 (lsl #29) */
        movs    ip, r3, lsl #29
        bpl     3f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes — on C (r3 bit 3) */
        bcc     4f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0, :64]
4:      /* copy 16 bytes — N = r3 bit 4, C = r3 bit 5 (lsl #27) */
        movs    ip, r3, lsl #27
        bpl     5f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0, :128]
5:      /* copy 32 bytes — on C (r3 bit 5) */
        bcc     .L_reversed_memcpy_dispatch
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0, :256]

.L_reversed_memcpy_dispatch:
        /* preload more cache lines */
        pld     [r1, #-CACHE_LINE_SIZE*3]
        pld     [r1, #-CACHE_LINE_SIZE*4]

        /* Pick a main loop by total size: short/mid/long copies use
         * increasingly aggressive prefetch distances. */
        cmp     r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo     .L_reversed_memcpy_neon_pld_near
        cmp     r2, #MEMCPY_BLOCK_SIZE_MID
        blo     .L_reversed_memcpy_neon_pld_mid
        b       .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs    r2, r2, #128
        blo     1f
        /* Bias the pointers by -32 and keep r3 = -32 so each "vld1/vst1
         * ..., r3" both accesses [ptr] and steps ptr down 32 bytes. */
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b
        /* Undo the -32 bias before handling the tail. */
        add     r1, r1, #32
        add     r0, r0, #32
1:
        /* r2 went negative by the amount left over; restore it and fall
         * into the sub-128-byte tail unless it is exactly zero. */
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

.L_reversed_memcpy_neon_pld_mid:
        /* Same structure as pld_near but with the mid-range prefetch
         * offset.  No "blo" guard is needed after the first subs: this
         * path is only entered with r2 >= MEMCPY_BLOCK_SIZE_SMALL. */
        subs    r2, r2, #128
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b
        add     r1, r1, #32
        add     r0, r0, #32
1:
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

.L_reversed_memcpy_neon_pld_far:
        /* Large copies: pre-bias both pointers by a whole 128-byte
         * chunk, load/store the chunk forward with post-increment
         * (using more q registers for deeper pipelining), then step
         * both pointers back 256 to reach the next lower chunk. */
        sub     r2, r2, #128
        sub     r0, r0, #128
        sub     r1, r1, #128
        .align 4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        vld1.8  {q0, q1}, [r1]!
        vld1.8  {q2, q3}, [r1]!
        vld1.8  {q8, q9}, [r1]!
        vld1.8  {q10, q11}, [r1]!
        /* write */
        vst1.8  {q0, q1}, [r0, :256]!
        vst1.8  {q2, q3}, [r0, :256]!
        vst1.8  {q8, q9}, [r0, :256]!
        vst1.8  {q10, q11}, [r0, :256]!

        sub     r0, r0, #256
        sub     r1, r1, #256
        bhs     0b
        /* Undo the -128 bias before handling the tail. */
        add     r0, r0, #128
        add     r1, r1, #128
1:
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

.L_reversed_memcpy_lt_128bytes:
        /* Tail: r2 < 128.  Decode r2's bits with the same movs/flag
         * trick as the alignment peel.  No :256 alignment qualifiers
         * here — the tail destination is not guaranteed aligned. */
6:      /* copy 64 bytes — N = r2 bit 5, C = r2 bit 6 (lsl #26) */
        movs    ip, r2, lsl #26
        bcc     5f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
5:      /* copy 32 bytes — on N (r2 bit 5) */
        bpl     4f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes — N = r2 bit 3, C = r2 bit 4 (lsl #28) */
        movs    ip, r2, lsl #28
        bcc     3f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0]
3:      /* copy 8 bytes — on N (r2 bit 3) */
        bpl     2f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0]
2:      /* copy 4 bytes — test r2 bit 2 directly */
        ands    ip, r2, #0x4
        beq     1f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes — N = r2 bit 0, C = r2 bit 1 (lsl #31) */
        movs    ip, r2, lsl #31
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
0:      /* copy 1 byte — on N (r2 bit 0) */
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!

        /* Restore the original dst into r0 and return. */
        pop     {r0, pc}

END(memmove)