/* Copyright (c) 2012, Linaro Limited
   All rights reserved.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are met:
       * Redistributions of source code must retain the above copyright
         notice, this list of conditions and the following disclaimer.
       * Redistributions in binary form must reproduce the above copyright
         notice, this list of conditions and the following disclaimer in the
         documentation and/or other materials provided with the distribution.
       * Neither the name of the Linaro nor the
         names of its contributors may be used to endorse or promote products
         derived from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Unaligned accesses
 *
 */

#include <private/bionic_asm.h>

#define dstin   x0
#define src     x1
#define count   x2
#define tmp1    x3
#define tmp1w   w3
#define tmp2    x4
#define tmp2w   w4
#define tmp3    x5
#define tmp3w   w5
#define dst     x6

#define A_l     x7
#define A_h     x8
#define B_l     x9
#define B_h     x10
#define C_l     x11
#define C_h     x12
#define D_l     x13
#define D_h     x14

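/* void *memcpy(void *dst, const void *src, size_t count).
 *
 * Copies of fewer than 64 bytes drop straight into the tail code below;
 * anything larger first aligns src to 16 bytes and then runs the
 * unrolled 64-bytes-per-iteration loop.  */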
ENTRY(memcpy)

        mov     dst, dstin
        cmp     count, #64
        b.ge    .Lcpy_not_short
        cmp     count, #15
        b.le    .Ltail15tiny

        /* Deal with small copies quickly by dropping straight into the
         * exit block.  */
.Ltail63:
        /* Copy up to 48 bytes of data.  At this point we only need the
         * bottom 6 bits of count to be accurate.  */
        ands    tmp1, count, #0x30
        b.eq    .Ltail15
        add     dst, dst, tmp1
        add     src, src, tmp1
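        /* dst and src now point just past the bytes handled here, so the
         * copies below use negative offsets and the larger cases fall
         * through into the smaller ones.  */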
        cmp     tmp1w, #0x20
        b.eq    1f
        b.lt    2f
        ldp     A_l, A_h, [src, #-48]
        stp     A_l, A_h, [dst, #-48]
1:
        ldp     A_l, A_h, [src, #-32]
        stp     A_l, A_h, [dst, #-32]
2:
        ldp     A_l, A_h, [src, #-16]
        stp     A_l, A_h, [dst, #-16]

.Ltail15:
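        /* Copy the final 1-15 bytes as a single 16-byte copy ending at
         * the end of the buffer.  This rewrites a few bytes that were
         * already stored, which is harmless because at least 16 bytes
         * have been copied by the time we get here.  */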
        ands    count, count, #15
        b.eq    1f
        add     src, src, count
        ldp     A_l, A_h, [src, #-16]
        add     dst, dst, count
        stp     A_l, A_h, [dst, #-16]
1:
        ret

.Ltail15tiny:
        /* Copy up to 15 bytes of data.  Does not assume additional data
         * being copied.  */
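        /* Test count one bit at a time (8, 4, 2, 1) and copy exactly
         * that many bytes, so nothing is read or written past the end.  */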
        tbz     count, #3, 1f
        ldr     tmp1, [src], #8
        str     tmp1, [dst], #8
1:
        tbz     count, #2, 1f
        ldr     tmp1w, [src], #4
        str     tmp1w, [dst], #4
1:
        tbz     count, #1, 1f
        ldrh    tmp1w, [src], #2
        strh    tmp1w, [dst], #2
1:
        tbz     count, #0, 1f
        ldrb    tmp1w, [src]
        strb    tmp1w, [dst]
1:
        ret

.Lcpy_not_short:
        /* We don't much care about the alignment of DST, but we want SRC
         * to be 128-bit (16 byte) aligned so that we don't cross cache line
         * boundaries on both loads and stores.  */
        neg     tmp2, src
        ands    tmp2, tmp2, #15         /* Bytes to reach alignment.  */
        b.eq    2f
        sub     count, count, tmp2
        /* Copy more data than needed; it's faster than jumping
         * around copying sub-Quadword quantities.  We know that
         * it can't overrun.  */
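        /* (count is at least 64 here and tmp2 at most 15, so this
         * 16-byte copy cannot run past the end of either buffer.)  */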
        ldp     A_l, A_h, [src]
        add     src, src, tmp2
        stp     A_l, A_h, [dst]
        add     dst, dst, tmp2
        /* There may be less than 63 bytes to go now.  */
        cmp     count, #63
        b.le    .Ltail63
2:
        subs    count, count, #128
        b.ge    .Lcpy_body_large
        /* Less than 128 bytes to copy, so handle 64 here and then jump
         * to the tail.  */
        ldp     A_l, A_h, [src]
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]
        stp     A_l, A_h, [dst]
        stp     B_l, B_h, [dst, #16]
        stp     C_l, C_h, [dst, #32]
        stp     D_l, D_h, [dst, #48]
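        /* count went negative in the subs above, but its low 6 bits still
         * equal the number of tail bytes left after the 64 just copied,
         * which is all .Ltail63 looks at.  */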
        tst     count, #0x3f
        add     src, src, #64
        add     dst, dst, #64
        b.ne    .Ltail63
        ret

        /* Critical loop.  Start at a new cache line boundary.  Assuming
         * 64 bytes per line this ensures the entire loop is in one line.  */
        .p2align 6
.Lcpy_body_large:
        /* There are at least 128 bytes to copy.  */
        ldp     A_l, A_h, [src, #0]
        sub     dst, dst, #16           /* Pre-bias.  */
        ldp     B_l, B_h, [src, #16]
        ldp     C_l, C_h, [src, #32]
        ldp     D_l, D_h, [src, #48]!   /* src += 64 - Pre-bias.  */
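        /* The loop is software pipelined: each pass stores the 64 bytes
         * loaded on the previous pass while loading the next 64.  The
         * pre-bias above is what lets the writeback forms below advance
         * src and dst by 64 per iteration.  */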
1:
        stp     A_l, A_h, [dst, #16]
        ldp     A_l, A_h, [src, #16]
        stp     B_l, B_h, [dst, #32]
        ldp     B_l, B_h, [src, #32]
        stp     C_l, C_h, [dst, #48]
        ldp     C_l, C_h, [src, #48]
        stp     D_l, D_h, [dst, #64]!
        ldp     D_l, D_h, [src, #64]!
        subs    count, count, #64
        b.ge    1b
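        /* The loop exits with one full set of 64 loaded bytes still in
         * A-D; store them, then step src and dst past everything copied
         * so far before handling any tail bytes.  */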
        stp     A_l, A_h, [dst, #16]
        stp     B_l, B_h, [dst, #32]
        stp     C_l, C_h, [dst, #48]
        stp     D_l, D_h, [dst, #64]
        add     src, src, #16
        add     dst, dst, #64 + 16
        tst     count, #0x3f
        b.ne    .Ltail63
        ret
END(memcpy)