/*
 * Copyright (c) 2009
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
29
/************************************************************************
 *
 *  memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
 *  Version: "043009"
 *
 ************************************************************************/
36
37
/************************************************************************
 *  Include files
 ************************************************************************/
41
42#include "machine/asm.h"
43
/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */
48
/*
 * Partial-word store selection.
 *
 * memset uses SWHI to fill from a word-unaligned start address up to the
 * next word boundary, and SWLO to fill the trailing bytes that end at the
 * "past the end" address.  Which of swl/swr plays which role depends on
 * the byte order, so exactly one of the two cases below must apply.
 */
#if defined(__MIPSEB__)
# define SWHI	swl		/* high part is left in big-endian	*/
# define SWLO	swr		/* low part is right in big-endian	*/
#elif defined(__MIPSEL__)
# define SWHI	swr		/* high part is right in little-endian	*/
# define SWLO	swl		/* low part is left in little-endian	*/
#else
# error "memset.S: neither __MIPSEB__ nor __MIPSEL__ is defined"
#endif

/*
 * Unless a profiling build (XGPROF or XPROF) is requested, SETUP_GP is
 * forced to expand to nothing.
 */
#if !(defined(XGPROF) || defined(XPROF))
#undef SETUP_GP
#define SETUP_GP
#endif

/*
 * DBG prefixes the parameter-check lines of _memset16 and _memset32.
 * With NDEBUG it expands to the assembler comment character, so the
 * prefixed line is ignored; otherwise it expands to nothing and the
 * check is assembled.
 */
#ifdef NDEBUG
#define DBG #
#else
#define DBG
#endif
69
/*
 * void _memset16(uint16_t* dst, uint16_t value, size_t size);
 *
 * Fill "size" bytes starting at "dst" with the 16-bit pattern "value".
 *
 * In:    a0 = dst   (must be halfword aligned)
 *        a1 = value (only the low 16 bits are used)
 *        a2 = size in bytes (must be even)
 * Uses:  t0 (past-the-end address), t1, t2, t3
 *
 * At most one leading halfword is stored here to reach word alignment.
 * Four or more remaining bytes are handed to the word-fill code at
 * .Laligned inside memset, which expects a0 word aligned, a1 holding the
 * replicated fill word, a2 the remaining byte count and t0 the
 * past-the-end address.  A single remaining halfword is stored here.
 *
 * The whole body runs under ".set noreorder": the instruction after each
 * branch or jump (indented by one extra space) is its delay slot and is
 * executed whether or not the branch is taken.
 */

LEAF(_memset16,0)
	.set	noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,1			# a0 must be halfword aligned
DBG	tne	t0,zero			# trap otherwise
DBG	andi	t2,a2,1			# a2 must be even
DBG	tne	t2,zero			# trap otherwise

#ifdef FIXARGS
	/* ensure count is even: clear bit 0 of a2 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,1
#else
	ori	a2,1
	xori	a2,1
#endif
#endif

	/* replicate the halfword into both halves of a1 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1,a1,16,16
#else
	andi	a1,0xffff
	sll	t3,a1,16
	or	a1,t3
#endif

	beqz	a2,.Ldone		# nothing to do for a zero count
	 andi	t1,a0,2			# is dst only halfword aligned?
	beqz	t1,.Lalignok
	 addu	t0,a0,a2		# t0 is the "past the end" address
	sh	a1,0(a0)		# store one halfword to get aligned
	addu	a0,2
	subu	a2,2
.Lalignok:
	/*
	 * The count is unsigned, so compare with sltiu: a signed compare
	 * would treat a count of 2 GiB or more as "less than 4" and skip
	 * the fill.
	 */
	sltiu	t1,a2,4			# .Laligned for 4 or more bytes
	beqz	t1,.Laligned
	/*
	 * Delay slot: a single real instruction.  t1 is zero exactly when
	 * a2 == 2, i.e. when one more halfword has to be stored.  (The
	 * "sne" macro would expand to two instructions, of which only the
	 * first would sit in the delay slot.)
	 */
	 xori	t1,a2,2			# one more halfword?
	bnez	t1,.Ldone
	 nop
	sh	a1,0(a0)
.Ldone:
	j	ra
	 nop
	.set	reorder
END(_memset16)
119
/*
 * void _memset32(uint32_t* dst, uint32_t value, size_t size);
 *
 * Fill "size" bytes starting at "dst" with the 32-bit pattern "value".
 *
 * In:    a0 = dst   (must be word aligned)
 *        a1 = value
 *        a2 = size in bytes (must be a multiple of 4)
 * Uses:  t0 (past-the-end address), t2
 *
 * A non-zero count is handed straight to the word-fill code at .Laligned
 * inside memset, which expects a0 word aligned, a1 holding the fill word,
 * a2 the byte count and t0 the past-the-end address.
 *
 * The body runs under ".set noreorder": the instruction after each branch
 * or jump (indented by one extra space) is its delay slot.
 */

LEAF(_memset32,0)
	.set	noreorder
DBG	/* Check parameters */
DBG	andi	t0,a0,3			# a0 must be word aligned
DBG	tne	t0,zero			# trap otherwise
DBG	andi	t2,a2,3			# a2 must be a multiple of 4 bytes
DBG	tne	t2,zero			# trap otherwise

#ifdef FIXARGS
	/* ensure count is a multiple of 4: clear bits 0-1 of a2 */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a2,zero,0,2		# same register names as the rest of the file
#else
	ori	a2,3
	xori	a2,3
#endif
#endif

	bnez	a2,.Laligned		# any work to do?
	 addu	t0,a0,a2		# t0 is the "past the end" address

	j	ra
	 nop
	.set	reorder
END(_memset32)
149
/*
 * void *memset(void *dst, int c, size_t n);
 *
 * In:    a0 = dst, a1 = fill byte, a2 = byte count
 * Out:   v0 = dst
 * Uses:  AT, v1, a3, t0, t4, t5, t6, t7, t8
 *
 * Outline:
 *   - counts below 4 are stored byte by byte (.Llast4);
 *   - otherwise the fill byte is replicated into a word, the start is
 *     brought to a word boundary with one SWHI, and the bulk is stored in
 *     64-byte chunks, then at most one 32-byte chunk, then single words;
 *   - the trailing 0-3 bytes are stored with one SWLO ending at t0.
 *
 * .Laligned is also the entry point used by _memset16 and _memset32.  On
 * entry there: a0 is word aligned, a1 holds the replicated fill word, a2
 * is the remaining byte count and t0 is the past-the-end address.
 *
 * The body runs under ".set noreorder": the instruction after each branch
 * or jump (indented by one extra space) is its delay slot and is executed
 * whether or not the branch is taken.
 */
LEAF(memset,0)

	.set	noreorder
	.set	noat

	addu	t0,a0,a2		# t0 is the "past the end" address
	/*
	 * The count is unsigned, so use sltiu (as the 160-byte check below
	 * already does).  A signed compare would send counts of 2 GiB or
	 * more down the byte-at-a-time path.
	 */
	sltiu	AT,a2,4			# is a2 less than 4?
	bne	AT,zero,.Llast4		# if yes, go to last4
	 move	v0,a0			# memset returns the dst pointer

	beq	a1,zero,.Lset0		# a zero fill needs no replication
	 subu	v1,zero,a0		# v1 = -dst, low bits used at .Lset0

	/* smear byte into 32 bit word */
#if (__mips==32) && (__mips_isa_rev>=2)
	ins	a1, a1, 8, 8		# Replicate fill byte into half-word.
	ins	a1, a1, 16, 16		# Replicate fill byte into word.
#else
	and	a1,0xff
	sll	AT,a1,8
	or	a1,AT
	sll	AT,a1,16
	or	a1,AT
#endif

.Lset0:
	andi	v1,v1,0x3		# word-unaligned address?
	beq	v1,zero,.Laligned	# v1 is the unalignment count
	 subu	a2,a2,v1		# (subtracts 0 when already aligned)
	SWHI	a1,0(a0)		# fill up to the next word boundary
	addu	a0,a0,v1

/* Here we have the "word-aligned" a0 (until the "last4") */
.Laligned:
	andi	t8,a2,0x3f		# any 64-byte chunks?
					# t8 is the byte count past 64-byte chunks
	beq	a2,t8,.Lchk8w		# when a2==t8, no 64-byte chunks
					# There will be at most 1 32-byte chunk then
	 subu	a3,a2,t8		# subtract from a2 the remainder
					# Here a3 counts bytes in 16w chunks
	addu	a3,a0,a3		# Now a3 is the final dst after 64-byte chunks

/*
 * Find out whether there are any 64-byte chunks after which at least
 * 96 bytes will still be left.  The value "96" is the buffer needed for
 * the "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)"
 * after incrementing "a0" by 64.
 * For "a2" below 160 there is no such "pref 30 safe" 64-byte chunk.
 */
	sltiu	v1,a2,160
	bgtz	v1,.Lloop16w_nopref30	# skip "pref 30,0(a0)"
	 subu	t7,a2,96		# subtract "pref 30 unsafe" region
		# below we have at least 1 64-byte chunk which is "pref 30 safe"
	andi	t6,t7,0x3f		# t6 is past "64-byte safe chunks" remainder
	subu	t5,t7,t6		# subtract from t7 the remainder
					# Here t5 counts bytes in 16w "safe" chunks
	addu	t4,a0,t5		# Now t4 is the dst after 64-byte "safe" chunks

/*
 * Do not use "pref 30,0(a0)" here, for a0 in the "middle" of a cache line:
 *	pref	30,0(a0)
 * From here on we are in the region where "pref 30,64(a0)" is safe.
 */
.Lloop16w:
	addiu	a0,a0,64
	pref	30,-32(a0)	# continue setting up the dest, addr 64-32
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	nop
	nop			# the extra nop instructions help to balance
	nop			# cycles needed for "store" + "fill" + "evict"
	nop			# For 64byte store there are needed 8 fill
	nop			# and 8 evict cycles, i.e. at least 32 instr.
	nop
	nop
	pref	30,0(a0)	# continue setting up the dest, addr 64-0
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	sw	a1,-4(a0)
	nop
	nop
	nop
	nop			# NOTE: adding 14 nop-s instead of 12 nop-s
	nop			# gives better results for "fast" memory
	nop
	bne	a0,t4,.Lloop16w
	 nop

	beq	a0,a3,.Lchk8w		# maybe no more 64-byte chunks?
	 nop				# nothing useful to put in this delay slot

.Lloop16w_nopref30:	# there could be up to 3 "64-byte nopref30" chunks
	addiu	a0,a0,64
	sw	a1,-64(a0)
	sw	a1,-60(a0)
	sw	a1,-56(a0)
	sw	a1,-52(a0)
	sw	a1,-48(a0)
	sw	a1,-44(a0)
	sw	a1,-40(a0)
	sw	a1,-36(a0)
	sw	a1,-32(a0)
	sw	a1,-28(a0)
	sw	a1,-24(a0)
	sw	a1,-20(a0)
	sw	a1,-16(a0)
	sw	a1,-12(a0)
	sw	a1,-8(a0)
	bne	a0,a3,.Lloop16w_nopref30
	 sw	a1,-4(a0)		# last word of the chunk, in the delay slot

.Lchk8w:		# t8 here is the byte count past 64-byte chunks

	andi	t7,t8,0x1f		# is there a 32-byte chunk?
					# t7 is the remainder count past 32 bytes
	beq	t8,t7,.Lchk1w		# when t8==t7, no 32-byte chunk
	 move	a2,t7			# a2 = bytes left after the 32-byte chunk

	sw	a1,0(a0)
	sw	a1,4(a0)
	sw	a1,8(a0)
	sw	a1,12(a0)
	sw	a1,16(a0)
	sw	a1,20(a0)
	sw	a1,24(a0)
	sw	a1,28(a0)
	addiu	a0,a0,32

.Lchk1w:
	andi	t8,a2,0x3		# now t8 is the remainder past 1w chunks
	beq	a2,t8,.Llast4aligned	# no whole words left
	 subu	a3,a2,t8		# a3 is the count of bytes in 1w chunks
	addu	a3,a0,a3		# now a3 is the dst address past the 1w chunks

/* storing in words (4-byte chunks) */
.LwordCopy_loop:
	addiu	a0,a0,4
	bne	a0,a3,.LwordCopy_loop
	 sw	a1,-4(a0)

/*
 * Store the last 0-3 bytes.
 * This repeats the last store if the memset finishes on a word boundary.
 */
.Llast4aligned:
	j	ra
	 SWLO	a1,-1(t0)

/*
 * Fewer than 4 bytes in total: store them one at a time.  The addiu at
 * .Llast4l doubles as the delay slot of the beq; when the count is zero
 * the branch is taken and the increment of a0 is harmless because v0
 * already holds the return value.
 */
.Llast4:
	beq	a0,t0,.Llast4e
.Llast4l:
	 addiu	a0,a0,1
	bne	a0,t0,.Llast4l
	 sb	a1,-1(a0)
.Llast4e:
	j	ra
	 nop

	.set	at
	.set	reorder

END(memset)
318
319
/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/
323