/*
 * Copyright (C) 2008 The Android Open Source Project
 * All rights reserved.
 * Copyright (c) 2013-2014, NVIDIA Corporation. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Cache line size, in bytes, assumed by the pld prefetch offsets below. */
#define CACHE_LINE_SIZE     (64)
/* Distance ahead of the source pointer that the 128-byte loops prefetch. */
#define PREFETCH_DISTANCE   (CACHE_LINE_SIZE*6)

/*
 * MEMCPY_BASE -- forward byte copy built on NEON vld1/vst1.
 *
 * Registers on entry:
 *   r0 = destination pointer (every store goes through r0)
 *   r1 = source pointer (every load and prefetch goes through r1)
 *   r2 = number of bytes to copy
 *
 * Stack on entry: the CFI directives describe an 8-byte frame holding r0 at
 * offset 0 and lr at offset 4, and every exit is `pop {r0, pc}`.  The code
 * that enters here is therefore expected to have pushed {r0, lr} already
 * (presumably the file that includes this one -- confirm against the caller).
 *
 * On exit: r0 is reloaded from that frame and control returns through the
 * saved lr.
 * Clobbers: r1, r2, r3, ip, q0, q1 and the condition flags.
 *
 * Strategy:
 *   - r2 == 0 or r0 == r1:  return immediately.
 *   - r2 <  32:             tail copy of 16/8/4/2/1 bytes, no alignment hints.
 *   - r2 <  128:            64/32 bytes, then the same tail.
 *   - otherwise:            copy 0..63 bytes so that r0 is 64-byte aligned,
 *                           run one of two 128-bytes-per-iteration loops, then
 *                           finish with a tail that uses destination alignment
 *                           hints.
 *
 * Throughout, `movs ip, rX, lsl #n` is used purely to move two bits of a
 * count into the flags: N receives bit (31 - n) and C receives bit (32 - n).
 * The loads, stores and pld that follow do not alter the flags, so both bits
 * can be tested by later conditional instructions.
 */
ENTRY_PRIVATE(MEMCPY_BASE)
        .cfi_def_cfa_offset 8
        .cfi_rel_offset r0, 0
        .cfi_rel_offset lr, 4

        /* Nothing to do for a zero length or when source == destination. */
        cmp         r2, #0
        beq         .L_memcpy_done
        cmp         r0, r1
        beq         .L_memcpy_done

        /* preload next cache line */
        pld         [r1, #CACHE_LINE_SIZE*1]

        /* Deal with very small blocks (< 32 bytes) asap */
        cmp         r2, #32
        blo         .L_memcpy_lt_32bytes
        /* no need to align if len < 128 bytes */
        cmp         r2, #128
        blo         .L_memcpy_lt_128bytes

        /* large copy, align dest to 64 byte boundary */
        pld         [r1, #CACHE_LINE_SIZE*2]
        rsb         r3, r0, #0
        ands        r3, r3, #0x3F           /* r3 = (-dest) & 63 = bytes up to the boundary */
        pld         [r1, #CACHE_LINE_SIZE*3] /* pld leaves the flags from ands intact */
        beq         .L_memcpy_dispatch      /* dest is already 64-byte aligned */
        sub         r2, r2, r3              /* r2 = bytes left once dest is aligned */
        /* copy 1 byte */
        movs        ip, r3, lsl #31         /* N = bit 0 of r3, C = bit 1 of r3 */
        itt         mi
        ldrbmi      ip, [r1], #1
        strbmi      ip, [r0], #1
        /* copy 2 bytes */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 4 bytes */
        movs        ip, r3, lsl #29         /* N = bit 2 of r3, C = bit 3 of r3 */
        itt         mi
        ldrmi       ip, [r1], #4
        strmi       ip, [r0], #4
        /* copy 8 bytes */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 16 bytes */
        movs        ip, r3, lsl #27         /* N = bit 4 of r3, C = bit 5 of r3 */
        bpl         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 32 bytes */
        bcc         .L_memcpy_dispatch
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

.L_memcpy_dispatch:
        // pre-decrement by 128 to detect nearly-done condition easily, but
        // also need to check if we have less than 128 bytes left at this
        // point due to alignment code above
        subs        r2, r2, #128
        blo         .L_memcpy_lt_128presub

        // Denver does better if both source and dest are aligned so
        // we'll special-case that even though the code is virtually identical
        tst         r1, #0xF
        bne         .L_memcpy_neon_unalign_src_pld

        // DRAM memcpy should be throttled slightly to get full bandwidth
        //
        cmp         r2, #32768
        bhi         .L_memcpy_neon_unalign_src_pld
        .align      4
1:
        /*
         * Source is 16-byte aligned here, so the loads carry a :128 hint.
         * The flags set by this subs are consumed by the bhs at the bottom
         * of the loop; nothing in between modifies them.
         */
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1, :128]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* Undo the pre-decrement: r2 = bytes still to copy (0..127). */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

        .align      4
.L_memcpy_neon_unalign_src_pld:
1:
        /*
         * Same loop as above but without an alignment hint on the loads.
         * Reached when the source is not 16-byte aligned, and also when the
         * pre-decremented count is above 32768.
         */
        /* copy 128 bytes in each loop */
        subs        r2, r2, #128

        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        /* preload a cache line */
        pld         [r1, #PREFETCH_DISTANCE]
        /* copy a cache line */
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!

        bhs         1b
        /* Undo the pre-decrement: r2 = bytes still to copy (0..127). */
        adds        r2, r2, #128
        bne         .L_memcpy_lt_128bytes_align
        pop         {r0, pc}

.L_memcpy_lt_128presub:
        /* Fewer than 128 bytes were left at dispatch: undo its subtraction. */
        add         r2, r2, #128
.L_memcpy_lt_128bytes_align:
        /*
         * Tail for the large-copy path, r2 < 128.  The destination was
         * aligned to 64 bytes before dispatch and has only advanced in
         * multiples of 128 since, so the stores keep their alignment hints.
         */
        /* copy 64 bytes */
        movs        ip, r2, lsl #26         /* C = bit 6 of r2 (64), N = bit 5 of r2 (32) */
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 32 bytes */
        bpl         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0, :256]!
1:      /* copy 16 bytes */
        movs        ip, r2, lsl #28         /* C = bit 4 of r2 (16), N = bit 3 of r2 (8) */
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0, :128]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0, :64]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31         /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        itt         mi
        ldrbmi      ip, [r1]                /* last access: no pointer write-back needed */
        strbmi      ip, [r0]

        pop         {r0, pc}

.L_memcpy_lt_128bytes:
        /*
         * Tail for copies that skipped the alignment step (r2 < 128 on
         * entry).  The destination alignment is unknown here, so the stores
         * carry no alignment hints.
         */
        /* copy 64 bytes */
        movs        ip, r2, lsl #26         /* C = bit 6 of r2 (64), N = bit 5 of r2 (32) */
        bcc         1f
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
1:      /* copy 32 bytes */
        bpl         .L_memcpy_lt_32bytes
        vld1.8      {q0, q1}, [r1]!
        vst1.8      {q0, q1}, [r0]!
.L_memcpy_lt_32bytes:
        /* copy 16 bytes */
        movs        ip, r2, lsl #28         /* C = bit 4 of r2 (16), N = bit 3 of r2 (8) */
        bcc         1f
        vld1.8      {q0}, [r1]!
        vst1.8      {q0}, [r0]!
1:      /* copy 8 bytes */
        bpl         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:      /* copy 4 bytes */
        tst         r2, #4
        itt         ne
        ldrne       ip, [r1], #4
        strne       ip, [r0], #4
        /* copy 2 bytes */
        movs        ip, r2, lsl #31         /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        itt         cs
        ldrhcs      ip, [r1], #2
        strhcs      ip, [r0], #2
        /* copy 1 byte */
        itt         mi
        ldrbmi      ip, [r1]                /* last access: no pointer write-back needed */
        strbmi      ip, [r0]

.L_memcpy_done:
        /* Restore the saved r0 and return through the saved lr. */
        pop         {r0, pc}
END(MEMCPY_BASE)