/*
 * Copyright (C) 2013 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014 NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <private/bionic_asm.h>
#include <private/libc_events.h>

        .text
        .syntax unified
        .fpu    neon

#define CACHE_LINE_SIZE (64)
#define MEMCPY_BLOCK_SIZE_SMALL (32768)
#define MEMCPY_BLOCK_SIZE_MID (1048576)
#define PREFETCH_DISTANCE_NEAR (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_MID (CACHE_LINE_SIZE*4)
#define PREFETCH_DISTANCE_FAR (CACHE_LINE_SIZE*16)

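/*
 * void *memmove(void *dst, const void *src, size_t len)
 * with r0 = dst, r1 = src, r2 = len.
 *
 * Strategy: if dst <= src, or if the two regions do not overlap, a
 * plain forward copy is safe and the work is handed off to memcpy.
 * Only when dst lands inside [src, src+len) does the code below copy
 * backwards, starting from the end of both buffers.
 */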
ENTRY(memmove)
        cmp     r2, #0
        cmpne   r0, r1
        bxeq    lr
        subs    r3, r0, r1
        bls     .L_jump_to_memcpy
        cmp     r2, r3
        bhi     .L_reversed_memcpy

.L_jump_to_memcpy:
        b       memcpy

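/*
 * Overlapping copy with dst above src: move both pointers to the ends
 * of their buffers and copy downwards, so source bytes are read before
 * they can be overwritten.
 */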
.L_reversed_memcpy:
        push    {r0, lr}
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        add     r0, r0, r2
        add     r1, r1, r2

        /* preload next cache line */
        pld     [r1, #-CACHE_LINE_SIZE]
        pld     [r1, #-CACHE_LINE_SIZE*2]

.L_reversed_memcpy_align_dest:
        /* Deal with very small blocks (< 32 bytes) asap */
        cmp     r2, #32
        blo     .L_reversed_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp     r2, #128
        blo     .L_reversed_memcpy_lt_128bytes
        /* align destination to 64 bytes (1 cache line) */
        ands    r3, r0, #0x3f
        beq     .L_reversed_memcpy_dispatch
        sub     r2, r2, r3
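/*
 * r3 holds the byte count needed to align dst (1..63). Shifting it
 * left moves individual bits into the N and C flags: lsl #31 puts
 * bit 0 in N and bit 1 in C, lsl #29 puts bit 2 in N and bit 3 in C,
 * and lsl #27 puts bit 4 in N and bit 5 in C. Each conditional copy
 * below therefore runs only when its size bit is set in r3.
 */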
0:      /* copy 1 byte */
        movs    ip, r3, lsl #31
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!
1:      /* copy 2 bytes */
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
2:      /* copy 4 bytes */
        movs    ip, r3, lsl #29
        bpl     3f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0, :32]
3:      /* copy 8 bytes */
        bcc     4f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0, :64]
4:      /* copy 16 bytes */
        movs    ip, r3, lsl #27
        bpl     5f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0, :128]
5:      /* copy 32 bytes */
        bcc     .L_reversed_memcpy_dispatch
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0, :256]

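/*
 * Pick a copy loop based on the remaining length. The three variants
 * differ only in how far ahead they prefetch: short copies are likely
 * cache-resident and use a near prefetch distance, while very large
 * copies are presumably DRAM-bound and prefetch much further ahead.
 */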
.L_reversed_memcpy_dispatch:
        /* preload more cache lines */
        pld     [r1, #-CACHE_LINE_SIZE*3]
        pld     [r1, #-CACHE_LINE_SIZE*4]

        cmp     r2, #MEMCPY_BLOCK_SIZE_SMALL
        blo     .L_reversed_memcpy_neon_pld_near
        cmp     r2, #MEMCPY_BLOCK_SIZE_MID
        blo     .L_reversed_memcpy_neon_pld_mid
        b       .L_reversed_memcpy_neon_pld_far

.L_reversed_memcpy_neon_pld_near:
        /* less than 128 bytes? */
        subs    r2, r2, #128
        blo     1f
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32
        .align  4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_NEAR+CACHE_LINE_SIZE*2)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b
        add     r1, r1, #32
        add     r0, r0, #32
1:
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

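/*
 * Same 128-byte loop with a mid-range prefetch distance. No underflow
 * check is needed before entering the loop: this path is only reached
 * with at least MEMCPY_BLOCK_SIZE_SMALL bytes left to copy.
 */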
.L_reversed_memcpy_neon_pld_mid:
        subs    r2, r2, #128
        sub     r1, r1, #32
        sub     r0, r0, #32
        mov     r3, #-32
        .align  4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_MID+CACHE_LINE_SIZE)+32]
        /* copy a cache line */
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3
        vld1.8  {q0, q1}, [r1], r3
        vst1.8  {q0, q1}, [r0, :256], r3

        bhs     0b
        add     r1, r1, #32
        add     r0, r0, #32
1:
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

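/*
 * Far variant: each iteration loads a full 128-byte block into eight
 * q registers before storing any of it, presumably to keep more reads
 * outstanding at once, and prefetches 16 cache lines ahead to hide
 * DRAM latency.
 */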
.L_reversed_memcpy_neon_pld_far:
        sub     r2, r2, #128
        sub     r0, r0, #128
        sub     r1, r1, #128
        .align  4
0:
        /* copy 128 bytes in each loop */
        subs    r2, r2, #128

        /* preload to cache */
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE*2)+128]
        pld     [r1, #-(PREFETCH_DISTANCE_FAR+CACHE_LINE_SIZE)+128]
        /* read */
        vld1.8  {q0, q1}, [r1]!
        vld1.8  {q2, q3}, [r1]!
        vld1.8  {q8, q9}, [r1]!
        vld1.8  {q10, q11}, [r1]!
        /* write */
        vst1.8  {q0, q1}, [r0, :256]!
        vst1.8  {q2, q3}, [r0, :256]!
        vst1.8  {q8, q9}, [r0, :256]!
        vst1.8  {q10, q11}, [r0, :256]!

        sub     r0, r0, #256
        sub     r1, r1, #256
        bhs     0b
        add     r0, r0, #128
        add     r1, r1, #128
1:
        adds    r2, r2, #128
        bne     .L_reversed_memcpy_lt_128bytes
        pop     {r0, pc}

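/*
 * Tail copy for the last 0..127 bytes. The same flag trick as in the
 * alignment code tests the bits of the remaining length r2, from
 * bit 6 (64 bytes) down to bit 0 (1 byte), and performs each partial
 * copy whose bit is set.
 */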
.L_reversed_memcpy_lt_128bytes:
6:      /* copy 64 bytes */
        movs    ip, r2, lsl #26
        bcc     5f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
5:      /* copy 32 bytes */
        bpl     4f
        sub     r1, r1, #32
        sub     r0, r0, #32
        vld1.8  {q0, q1}, [r1]
        vst1.8  {q0, q1}, [r0]
.L_reversed_memcpy_lt_32bytes:
4:      /* copy 16 bytes */
        movs    ip, r2, lsl #28
        bcc     3f
        sub     r1, r1, #16
        sub     r0, r0, #16
        vld1.8  {q0}, [r1]
        vst1.8  {q0}, [r0]
3:      /* copy 8 bytes */
        bpl     2f
        sub     r1, r1, #8
        sub     r0, r0, #8
        vld1.8  {d0}, [r1]
        vst1.8  {d0}, [r0]
2:      /* copy 4 bytes */
        ands    ip, r2, #0x4
        beq     1f
        sub     r1, r1, #4
        sub     r0, r0, #4
        vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]
        vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [r0]
1:      /* copy 2 bytes */
        movs    ip, r2, lsl #31
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
        ldrbcs  ip, [r1, #-1]!
        strbcs  ip, [r0, #-1]!
0:      /* copy 1 byte */
        ldrbmi  ip, [r1, #-1]!
        strbmi  ip, [r0, #-1]!

        pop     {r0, pc}

END(memmove)