blob: 09b756b77ba94de95cce471aac4cfa008b981bd3 [file] [log] [blame]
Raghu Gandham405b8022012-07-25 18:16:42 -07001/*
2 * Copyright (c) 2009
3 * MIPS Technologies, Inc., California.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14 * contributors may be used to endorse or promote products derived from
15 * this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30/************************************************************************
31 *
32 * memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
33 * Version: "043009"
34 *
35 ************************************************************************/
36
37
38/************************************************************************
39 * Include files
40 ************************************************************************/
41
Elliott Hughes851e68a2014-02-19 16:53:20 -080042#include <private/bionic_asm.h>
Raghu Gandham405b8022012-07-25 18:16:42 -070043
Elliott Hughes851e68a2014-02-19 16:53:20 -080044/*
Raghu Gandham405b8022012-07-25 18:16:42 -070045 * This routine could be optimized for MIPS64. The current code only
46 * uses MIPS32 instructions.
Elliott Hughes851e68a2014-02-19 16:53:20 -080047 */
Raghu Gandham405b8022012-07-25 18:16:42 -070048
/*
 * Endian-neutral names for the two partial-word store instructions.
 * memset uses SWHI for the unaligned head of the buffer and SWLO for
 * the tail. Exactly one of the two blocks below is expected to match;
 * if neither __MIPSEB__ nor __MIPSEL__ is defined, SWHI and SWLO are
 * left undefined by this file.
 */
49#if defined(__MIPSEB__)
50# define SWHI swl /* high part is left in big-endian */
51# define SWLO swr /* low part is right in big-endian */
52#endif
53
54#if defined(__MIPSEL__)
55# define SWHI swr /* high part is right in little-endian */
56# define SWLO swl /* low part is left in little-endian */
57#endif
58
/*
 * Unless XGPROF or XPROF is defined, any existing SETUP_GP definition is
 * replaced by an empty one.
 * REVIEW NOTE: no use of SETUP_GP is visible in the memset body below.
 */
59#if !(defined(XGPROF) || defined(XPROF))
60#undef SETUP_GP
61#define SETUP_GP
62#endif
63
/*
 * DBG expands to a single '#' when NDEBUG is defined and to nothing
 * otherwise; presumably a DBG-prefixed line is meant to turn into an
 * assembler comment in NDEBUG builds (confirm with the author).
 * REVIEW NOTE: no use of DBG is visible in the memset body below.
 */
64#ifdef NDEBUG
65#define DBG #
66#else
67#define DBG
68#endif
69
/*
 * memset: store the low byte of a1 into the a2 bytes that start at a0 and
 * return the original a0 in v0.
 *
 * Register roles, as used by the code below:
 *   a0      running destination pointer
 *   a1      fill value; smeared across all four bytes on the a2 >= 4 path
 *   a2      bytes still to be written (reduced as the head is aligned)
 *   t0      address one past the last byte (a0 + a2 on entry)
 *   v0      return value (copy of the incoming a0)
 *   a3, t4  end addresses for the store loops
 *   t8      leftover byte count (after 64-byte chunks, later after words)
 *   v1, AT, t5-t7  scratch
 *
 * Overall flow:
 *   1. Fewer than 4 bytes: plain byte loop at .Llast4.
 *   2. Otherwise align a0 to a word with one partial-word store (SWHI).
 *   3. 64-byte chunks: first with "pref 30" while enough buffer remains,
 *      then without it.
 *   4. At most one 32-byte chunk, then single words.
 *   5. One partial-word store (SWLO) ending at t0-1 covers the tail.
 *
 * "pref 30" is the PrepareForStore hint of the MIPS32 PREF instruction;
 * per the MIPS32 ISA it may hand back a zero-filled cache line without
 * reading memory, so it is only issued for lines that will be completely
 * overwritten afterwards.
 *
 * The body is assembled with ".set noreorder", so the instruction that
 * follows every branch or jump is its delay slot and runs whether or not
 * the branch is taken. Several delay slots below do real work.
 */
Raghu Gandham405b8022012-07-25 18:16:42 -070070LEAF(memset,0)
71
72 .set noreorder
73 .set noat
74
75 addu t0,a0,a2 # t0 is the "past the end" address
76 slti AT,a2,4 # signed compare: is a2 less than 4?
77 bne AT,zero,.Llast4 # yes: take the byte-at-a-time path
78 move v0,a0 # delay slot: memset returns the dst pointer (set on both paths)
79
80 beq a1,zero,.Lset0 # an all-zero a1 needs no smearing
81 subu v1,zero,a0 # delay slot: v1 = -a0; masked to 0..3 at .Lset0
82
83 # smear the fill byte into all four bytes of a1
84#if (__mips==32) && (__mips_isa_rev>=2)
85 ins a1, a1, 8, 8 # Replicate fill byte into half-word.
86 ins a1, a1, 16, 16 # Replicate fill byte into word.
87#else
88 and a1,0xff # keep only the fill byte
89 sll AT,a1,8
90 or a1,AT # fill byte now in bits 0-15
91 sll AT,a1,16
92 or a1,AT # fill byte now in all four bytes
93#endif
94
95.Lset0:
96 andi v1,v1,0x3 # word-unaligned address?
97 beq v1,zero,.Laligned # v1 is the unalignment count
98 subu a2,a2,v1 # delay slot: remove the head bytes from the count (v1 is 0 when the branch is taken)
99 SWHI a1,0(a0) # partial-word store: bytes from a0 up to the next word boundary
100 addu a0,a0,v1 # a0 is word-aligned from here on
101
102# Here we have the "word-aligned" a0 (until the "last4")
103.Laligned:
104 andi t8,a2,0x3f # any 64-byte chunks?
105 # t8 is the byte count past 64-byte chunks
106 beq a2,t8,.Lchk8w # when a2==t8, no 64-byte chunks
107 # There will be at most 1 32-byte chunk then
108 subu a3,a2,t8 # delay slot: subtract the remainder from a2
109 # Here a3 counts bytes in 16w chunks
110 addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
111
112# Find out whether there are any 64-byte chunks after which at least
113# 96 bytes will still be left. The value "96" is the buffer needed for the
114# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
115# incrementing "a0" by 64.
116# For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
117#
118 sltiu v1,a2,160 # v1 = 1 when fewer than 160 bytes remain
119 bgtz v1,.Lloop16w_nopref30 # skip "pref 30,0(a0)"
120 subu t7,a2,96 # delay slot: subtract "pref 30 unsafe" region
121 # below we have at least 1 64-byte chunk which is "pref 30 safe"
122 andi t6,t7,0x3f # t6 is the remainder past the "64-byte safe chunks"
123 subu t5,t7,t6 # subtract the remainder from t7
124 # Here t5 counts bytes in 16w "safe" chunks
125 addu t4,a0,t5 # Now t4 is the dst after 64-byte "safe" chunks
126
127# Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line
128# pref 30,0(a0)
129# Here we are in the region, where it is safe to use "pref 30,64(a0)"
130.Lloop16w:
131 addiu a0,a0,64 # advance first; the stores below use negative offsets
132 pref 30,-32(a0) # continue setting up the dest, addr 64-32
133 sw a1,-64(a0)
134 sw a1,-60(a0)
135 sw a1,-56(a0)
136 sw a1,-52(a0)
137 sw a1,-48(a0)
138 sw a1,-44(a0)
139 sw a1,-40(a0)
140 sw a1,-36(a0)
141 nop
142 nop # the extra nop instructions help to balance
Elliott Hughes851e68a2014-02-19 16:53:20 -0800143 nop # cycles needed for "store" + "fill" + "evict"
Raghu Gandham405b8022012-07-25 18:16:42 -0700144 nop # A 64-byte store needs 8 fill
145 nop # and 8 evict cycles, i.e. at least 32 instr.
146 nop
147 nop
148 pref 30,0(a0) # continue setting up the dest, addr 64-0
149 sw a1,-32(a0)
150 sw a1,-28(a0)
151 sw a1,-24(a0)
152 sw a1,-20(a0)
153 sw a1,-16(a0)
154 sw a1,-12(a0)
155 sw a1,-8(a0)
156 sw a1,-4(a0)
157 nop
158 nop
159 nop
160 nop # NOTE: adding 14 nop-s instead of 12 nop-s
161 nop # gives better results for "fast" memory
162 nop
163 bne a0,t4,.Lloop16w # loop until the "pref 30 safe" chunks are done
164 nop # delay slot: the 14th nop of the loop
165
# REVIEW NOTE: from the setup above, a3 - t4 = 96 + t6 - t8, which works
# out to 64 or 128, so a0 == a3 looks impossible here and this branch
# appears never to be taken -- confirm before changing anything.
166 beq a0,a3,.Lchk8w # maybe no more 64-byte chunks?
167 nop # this "delayed slot" is useless ...
168
# REVIEW NOTE: direct entry has a2 below 160 and the fall-through case has
# 64 or 128 bytes of whole chunks left, so the loop below looks like it
# runs 1 or 2 times rather than "up to 3" -- confirm.
169.Lloop16w_nopref30: # there could be up to 3 "64-byte nopref30" chunks
170 addiu a0,a0,64
171 sw a1,-64(a0)
172 sw a1,-60(a0)
173 sw a1,-56(a0)
174 sw a1,-52(a0)
175 sw a1,-48(a0)
176 sw a1,-44(a0)
177 sw a1,-40(a0)
178 sw a1,-36(a0)
179 sw a1,-32(a0)
180 sw a1,-28(a0)
181 sw a1,-24(a0)
182 sw a1,-20(a0)
183 sw a1,-16(a0)
184 sw a1,-12(a0)
185 sw a1,-8(a0)
186 bne a0,a3,.Lloop16w_nopref30 # until a0 reaches the end of the 64-byte chunks
187 sw a1,-4(a0) # delay slot: 16th word of the chunk
188
189.Lchk8w: # t8 here is the byte count past 64-byte chunks
190
191 andi t7,t8,0x1f # is there a 32-byte chunk?
192 # t7 is the remainder count past the 32-byte chunk
193 beq t8,t7,.Lchk1w # when t8==t7, no 32-byte chunk
194 move a2,t7 # delay slot: a2 = bytes left after the optional 32-byte chunk
195
196 sw a1,0(a0)
197 sw a1,4(a0)
198 sw a1,8(a0)
199 sw a1,12(a0)
200 sw a1,16(a0)
201 sw a1,20(a0)
202 sw a1,24(a0)
203 sw a1,28(a0)
204 addiu a0,a0,32
205
206.Lchk1w:
207 andi t8,a2,0x3 # now t8 is the remainder past 1w chunks
208 beq a2,t8,.Llast4aligned # fewer than 4 bytes left: no whole words
209 subu a3,a2,t8 # delay slot: a3 is the count of bytes in 1w chunks
210 addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
211
212# storing in words (4-byte chunks)
213.LwordCopy_loop:
214 addiu a0,a0,4
215 bne a0,a3,.LwordCopy_loop
216 sw a1,-4(a0) # delay slot: store the word just stepped over
217
218# store the tail: one partial-word store that ends at byte t0-1
219# this will repeat the last store if the memset finishes on a word boundary
220.Llast4aligned:
221 j ra
222 SWLO a1,-1(t0) # delay slot: bytes from the start of the word holding t0-1 through t0-1
223
# Byte-at-a-time path, reached when a2 is less than 4 as a signed value.
# a1 has not been smeared here; sb only stores its low byte.
224.Llast4:
225 beq a0,t0,.Llast4e # zero length: nothing to store
226.Llast4l:
227 addiu a0,a0,1 # loop head and also the delay slot of the beq above (a0 is not used after .Llast4e)
228 bne a0,t0,.Llast4l
229 sb a1,-1(a0) # delay slot: store the byte just stepped over
230.Llast4e:
231 j ra
232 nop
233
234 .set at
235 .set reorder
236
237END(memset)
238
239
240/************************************************************************
241 * Implementation : Static functions
242 ************************************************************************/