/***************************************************************************
 Copyright (c) 2009-2013 The Linux Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
     * Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
     * Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
     * Neither the name of The Linux Foundation nor the names of its contributors may
       be used to endorse or promote products derived from this software
       without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
 ***************************************************************************/

/* Assumes neon instructions and a cache line size of 64 bytes. */

#include <machine/cpu-features.h>
#include <machine/asm.h>

/*
 * Tuning constants for the copy routine below.  Block counts are in units
 * of PLDSIZE bytes (the routine derives them with "lsr #6").
 */
#define PLDOFFS     (10)        /* blocks ahead of the source pointer that are preloaded */
#define PLDTHRESH   (PLDOFFS)   /* copies of at most this many blocks skip the preloading loops */
#define BBTHRESH    (4096/64)   /* copies of at most this many blocks use the fixed PLDOFFS distance */
#define PLDSIZE     (64)        /* bytes per block; matches the assumed 64-byte cache line */

/* The routine subtracts one from PLDOFFS when forming a preload offset. */
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif

/* The preloading path subtracts PLDOFFS from a block count that exceeds PLDTHRESH. */
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif

        .text
        .fpu    neon

/*
 * .L_memcpy_base -- NEON copy body.
 *
 * This is not a stand-alone function.  It has no prologue, and it leaves
 * through "pop {r0, pc}", so whatever reaches this label must already have
 * two words on the stack: the value to restore into r0 at [sp] and the
 * return address at [sp, #4].  lr is used as a scratch register here.
 * NOTE(review): presumably the including entry point does
 * "push {r0, lr}" -- confirm against the caller.
 *
 * In:    r0 = destination pointer
 *        r1 = source pointer
 *        r2 = number of bytes to copy (compared as a signed value)
 * Out:   r0 and pc are popped from the stack.
 * Uses:  r1, r2, r3, r12, lr, q0-q3, q8-q11 and the condition flags.
 *        r9 and r10 are pushed and popped around the preloading loops.
 *
 * Outline:
 *   1. Counts below 64 bytes jump straight into the tail code.
 *   2. r12 = r2 >> 6 blocks of 64 bytes.  With at most PLDTHRESH blocks
 *      they are copied by the plain loop (.L_neon_copy_64_loop_nopld).
 *   3. Otherwise a look-ahead pointer (r10) runs lr bytes in front of the
 *      source pointer while the leading blocks are copied; the last
 *      lr >> 6 blocks are then copied by the plain loop.
 *   4. The remaining r2 & 0x3f bytes are copied as 32/16/8/4/2/1-byte
 *      pieces, selected by shifting the count bits into the C and N flags.
 *
 * Several branches depend on flags set by an earlier instruction on the
 * path taken; do not reorder flag-setting instructions.
 */
.L_memcpy_base:
        cmp         r2, #4
        blt         .L_neon_lt4
        cmp         r2, #16
        blt         .L_neon_lt16
        cmp         r2, #32
        blt         .L_neon_16              /* N is set by this cmp, so the bpl at the target falls through */
        cmp         r2, #64
        blt         .L_neon_copy_32_a

        mov         r12, r2, lsr #6         /* r12 = number of 64-byte blocks */
        cmp         r12, #PLDTHRESH
        ble         .L_neon_copy_64_loop_nopld

        push        {r9, r10}
        .cfi_adjust_cfa_offset 8
        .cfi_rel_offset r9, 0
        .cfi_rel_offset r10, 4

        cmp         r12, #BBTHRESH
        ble         .L_neon_prime_pump

        /*
         * lr = ((r0 + 0x400 - (r1 + PLDOFFS*PLDSIZE)) mod 2048) + PLDOFFS*PLDSIZE
         * The lsl/lsr pair keeps only the low 11 bits of the difference.
         * lr is the look-ahead distance in bytes; lr >> 6 is also the number
         * of trailing blocks left for the plain loop.
         */
        add         lr, r0, #0x400
        add         r9, r1, #(PLDOFFS*PLDSIZE)
        sub         lr, lr, r9
        lsl         lr, lr, #21
        lsr         lr, lr, #21
        add         lr, lr, #(PLDOFFS*PLDSIZE)
        cmp         r12, lr, lsr #6
        ble         .L_neon_prime_pump      /* not more blocks than the look-ahead distance */

        /* "gt" holds here because the ble above fell through. */
        itt         gt
        movgt       r9, #(PLDOFFS)
        rsbsgt      r9, r9, lr, lsr #6      /* r9 = (lr >> 6) - PLDOFFS, sets flags */
        ble         .L_neon_prime_pump      /* no blocks between the two look-ahead points */

        add         r10, r1, lr
        bic         r10, #0x3F              /* r10 = look-ahead pointer, rounded down to 64 bytes */

        sub         r12, r12, lr, lsr #6    /* blocks handled by the look-ahead loops */

        /* r9 = min(r9, r12); r12 = blocks left after the first loop. */
        cmp         r9, r12
        itee        le
        suble       r12, r12, r9
        movgt       r9, r12
        movgt       r12, #0

        pld         [r1, #((PLDOFFS-1)*PLDSIZE)]
        /* First loop (r9 blocks): pld near the source pointer plus a touch at r10. */
.L_neon_copy_64_loop_outer_doublepld:
        pld         [r1, #((PLDOFFS)*PLDSIZE)]
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]               /* loaded value is never read; r3 is overwritten before any use */
        subs        r9, r9, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer_doublepld
        cmp         r12, #0
        beq         .L_neon_pop_before_nopld

        /* Fewer than 512 KiB of blocks left: touch with ldr instead of pld. */
        cmp         r12, #(512*1024/64)
        blt         .L_neon_copy_64_loop_outer

        /* r12 blocks, using pld on the look-ahead pointer. */
.L_neon_copy_64_loop_ddr:
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        pld         [r10]
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_ddr
        b           .L_neon_pop_before_nopld

        /*
         * Fixed look-ahead of PLDOFFS blocks: the last PLDOFFS blocks are left
         * for the plain loop (lr >> 6 below), the rest go through the loop that
         * touches r10.  r12 exceeds PLDTHRESH here, so it stays positive.
         */
.L_neon_prime_pump:
        mov         lr, #(PLDOFFS*PLDSIZE)
        add         r10, r1, #(PLDOFFS*PLDSIZE)
        bic         r10, #0x3F
        sub         r12, r12, #PLDOFFS
        ldr         r3, [r10, #(-1*PLDSIZE)] /* touch the block just before r10; value unused */

        /* r12 blocks, touching the look-ahead pointer with a plain load. */
.L_neon_copy_64_loop_outer:
        vld1.32     {q0, q1}, [r1]!
        vld1.32     {q2, q3}, [r1]!
        ldr         r3, [r10]               /* value unused */
        subs        r12, r12, #1
        vst1.32     {q0, q1}, [r0]!
        vst1.32     {q2, q3}, [r0]!
        add         r10, #64
        bne         .L_neon_copy_64_loop_outer

.L_neon_pop_before_nopld:
        mov         r12, lr, lsr #6         /* trailing blocks that were left uncopied */
        pop         {r9, r10}
        .cfi_adjust_cfa_offset -8
        .cfi_restore r9
        .cfi_restore r10

        /* Plain loop: r12 (>= 1) blocks of 64 bytes, no look-ahead. */
.L_neon_copy_64_loop_nopld:
        vld1.32     {q8, q9}, [r1]!
        vld1.32     {q10, q11}, [r1]!
        subs        r12, r12, #1
        vst1.32     {q8, q9}, [r0]!
        vst1.32     {q10, q11}, [r0]!
        bne         .L_neon_copy_64_loop_nopld
        ands        r2, r2, #0x3f           /* bytes left after the whole blocks */
        beq         .L_neon_exit

.L_neon_copy_32_a:
        movs        r3, r2, lsl #27         /* C = bit 5 of r2 (32), N = bit 4 of r2 (16) */
        bcc         .L_neon_16
        vld1.32     {q0,q1}, [r1]!
        vst1.32     {q0,q1}, [r0]!

        /* Reached with N meaning "copy 16 bytes" (from the movs above or from cmp r2, #32). */
.L_neon_16:
        bpl         .L_neon_lt16
        vld1.32     {q8}, [r1]!
        vst1.32     {q8}, [r0]!
        ands        r2, r2, #0x0f
        beq         .L_neon_exit

.L_neon_lt16:
        movs        r3, r2, lsl #29         /* C = bit 3 of r2 (8), N = bit 2 of r2 (4) */
        bcc         1f
        vld1.8      {d0}, [r1]!
        vst1.8      {d0}, [r0]!
1:
        /*
         * bge tests N == V.  movs does not write V, so this relies on V being
         * clear from the earlier cmp/subs on every path that gets here.
         */
        bge         .L_neon_lt4
        vld4.8      {d0[0], d1[0], d2[0], d3[0]}, [r1]!     /* four bytes, one per register lane */
        vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r0]!

.L_neon_lt4:
        movs        r2, r2, lsl #31         /* C = bit 1 of r2 (2), N = bit 0 of r2 (1) */
        itt         cs
        ldrhcs      r3, [r1], #2
        strhcs      r3, [r0], #2
        itt         mi
        ldrbmi      r3, [r1]
        strbmi      r3, [r0]

.L_neon_exit:
        pop         {r0, pc}                /* restore the saved r0 and return */