1235 Use symbol capabilities to eliminate libc_hwcap*
--- old/usr/src/lib/libc/i386_hwcap1/gen/memcpy.s
+++ new/usr/src/lib/libc/capabilities/i386/common/memcpy.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 .file "memcpy.s"
28 28
29 29 #include <sys/asm_linkage.h>
30 30
31 31 ANSI_PRAGMA_WEAK(memmove,function)
32 32 ANSI_PRAGMA_WEAK(memcpy,function)
33 33
34 34 ENTRY(memmove)
35 35 movl 0+12(%esp),%ecx / get number of bytes to move
36 36 pushl %esi / save off %edi, %esi and move destination
37 37 pushl %edi
38 38 movl 8+ 4(%esp),%edi / destination buffer address
39 39 movl 8+ 8(%esp),%esi / source buffer address
40 40 movl %edi, %eax
41 41 testl %ecx,%ecx
42 42 jz .Return
43 43
44 44 cmpl %esi,%edi / if (source addr > dest addr)
45 45 leal -1(%esi,%ecx),%edx / %edx = src + size - 1
46 46 jbe .memcpy_post / jump if dst <= src
47 47 cmpl %edx,%edi
48 48 jbe .CopyLeft / jump if dst <= src + size - 1
49 49 jmp .memcpy_post
50 50
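
For reviewers, a minimal C sketch of the overlap test performed in ENTRY(memmove) above, assuming the same rule: a left-to-right copy is safe when the destination does not start inside the source range. The function name, variable names, and plain byte loops are illustrative only and are not part of this change.

    #include <stddef.h>
    #include <stdint.h>

    /* Illustrative sketch of the memmove dispatch above; not part of this webrev. */
    static void *
    sketch_memmove(void *dstv, const void *srcv, size_t n)
    {
            unsigned char *dst = dstv;
            const unsigned char *src = srcv;
            size_t i;

            if (n == 0)
                    return (dstv);
            if ((uintptr_t)dst <= (uintptr_t)src ||
                (uintptr_t)dst > (uintptr_t)src + n - 1) {
                    /* dst is at or below src, or past src + n - 1: .memcpy_post path */
                    for (i = 0; i < n; i++)
                            dst[i] = src[i];
            } else {
                    /* dst starts inside [src, src + n): .CopyLeft path, copy backwards */
                    for (i = n; i > 0; i--)
                            dst[i - 1] = src[i - 1];
            }
            return (dstv);
    }
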
51 51 ENTRY(memcpy)
52 52 pushl %esi
53 53 pushl %edi
54 54
55 55 movl 8+4(%esp),%edi / %edi = dest address
56 56 movl %edi, %eax / save this
57 57 movl 8+8(%esp),%esi / %esi = source address
58 58 movl 8+12(%esp),%ecx/ %ecx = length of string
59 59 / %edx scratch register
60 60 / %eax scratch register
61 -.memcpy_post:
61 +.memcpy_post:
62 62 nop / this really helps, don't know why
63 63 / note: cld is perf death on P4
64 64 cmpl $63,%ecx
65 65 ja .move_sse / not worth doing sse for less
66 66
67 -.movew:
67 +.movew:
68 68 movl %ecx,%edx / save byte cnt
69 69 shrl $2,%ecx / %ecx = number of words to move
70 70 rep ; smovl / move the words
71 71
72 72
73 73 andl $0x3,%edx / %edx = number of bytes left to move
74 74 jz .Return / %edx <= 3, so just unroll the loop
75 75
76 76 movb (%esi), %cl
77 77 movb %cl, (%edi)
78 78 decl %edx
79 79 jz .Return
80 80 movb 1(%esi), %cl
81 81 movb %cl, 1(%edi)
82 82 decl %edx
83 83 jz .Return
84 84 movb 2(%esi), %cl
85 85 movb %cl, 2(%edi)
86 86
87 87 .Return:
88 88 popl %edi / restore register variables
89 - popl %esi
89 + popl %esi
90 90 ret
91 91
92 92 .move_sse:
93 93 /
94 94 / time to 16 byte align destination
95 95 /
96 96 andl $15, %eax
97 97 jnz .sse_unaligned / jmp if dest is unaligned
98 98 .sse: / dest is aligned, check source
99 99 movl %ecx, %edx / get byte count
100 100 shrl $6, %edx / number of 64 byte blocks to move
101 101 testl $15, %esi
102 102 jnz .sse_da / go to slow loop if source is unaligned
103 103 cmpl $65535, %ecx
104 104 ja .sse_sa_nt_loop
105 -
105 +
106 106 /
107 107 / use aligned load since we're lucky
108 108 /
109 109 .sse_sa_loop:
110 110 prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
111 111 prefetcht0 568(%edi) / prefetch source & copy 64 byte at a time
112 112 movaps 0(%esi), %xmm0
113 - movaps %xmm0, 0(%edi)
113 + movaps %xmm0, 0(%edi)
114 114 movaps 16(%esi), %xmm1
115 115 movaps %xmm1, 16(%edi)
116 116 movaps 32(%esi), %xmm2
117 - movaps %xmm2, 32(%edi)
117 + movaps %xmm2, 32(%edi)
118 118 movaps 48(%esi), %xmm3
119 119 movaps %xmm3, 48(%edi)
120 120 addl $64, %esi
121 121 addl $64, %edi
122 122 decl %edx
123 123 jnz .sse_sa_loop
124 -
124 +
125 125 .sse_cleanup:
126 126 andl $63, %ecx / compute remaining bytes
127 127 movl 8+4(%esp), %eax / setup return value
128 128 jz .Return
129 129 jmp .movew
130 -
130 +
131 131 /
132 132 / use aligned load since we're lucky
133 133 /
134 134 .align 16
135 135 .sse_sa_nt_loop:
136 136 prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
137 137 movaps (%esi), %xmm0
138 - movntps %xmm0, 0(%edi)
138 + movntps %xmm0, 0(%edi)
139 139 movaps 16(%esi), %xmm1
140 140 movntps %xmm1, 16(%edi)
141 141 movaps 32(%esi), %xmm2
142 - movntps %xmm2, 32(%edi)
142 + movntps %xmm2, 32(%edi)
143 143 movaps 48(%esi), %xmm3
144 144 movntps %xmm3, 48(%edi)
145 145 addl $64, %esi
146 146 addl $64, %edi
147 147 decl %edx
148 148 jnz .sse_sa_nt_loop
149 149 #if defined(_SSE2_INSN)
150 150 mfence
151 151 #elif defined(_SSE_INSN)
152 152 sfence
153 153 #else
154 154 #error "Must have either SSE or SSE2"
155 155 #endif
156 156 jmp .sse_cleanup
157 157
158 158 /
159 159 / Make certain that destination buffer becomes aligned
160 160 /
161 161 .sse_unaligned:
162 162 neg %eax / subtract from 16 and get destination
163 163 andl $15, %eax / aligned on a 16 byte boundary
164 164 movl %ecx, %edx / saved count
165 165 subl %eax, %ecx / subtract from byte count
166 166 cmpl $64, %ecx / after aligning, will we still have 64 bytes?
167 167 cmovb %edx, %ecx / if not, restore original byte count,
168 168 cmovb 8+4(%esp), %eax / and restore return value,
169 169 jb .movew / and do a non-SSE move.
170 170 xchg %ecx, %eax / flip for copy
171 171 rep ; smovb / move the bytes
172 172 xchg %ecx, %eax / flip back
173 173 jmp .sse
174 -
174 +
175 175 .align 16
176 176 .sse_da:
177 177 cmpl $65535, %ecx
178 178 jbe .sse_da_loop
179 179
180 180 /
181 181 / use unaligned load since source doesn't line up
182 182 /
183 183 .sse_da_nt_loop:
184 184 prefetchnta 16384(%esi) / prefetch source & copy 64 byte at a time
185 185 movups 0(%esi), %xmm0
186 - movntps %xmm0, 0(%edi)
186 + movntps %xmm0, 0(%edi)
187 187 movups 16(%esi), %xmm1
188 188 movntps %xmm1, 16(%edi)
189 189 movups 32(%esi), %xmm2
190 - movntps %xmm2, 32(%edi)
190 + movntps %xmm2, 32(%edi)
191 191 movups 48(%esi), %xmm3
192 192 movntps %xmm3, 48(%edi)
193 193 addl $64, %esi
194 194 addl $64, %edi
195 195 decl %edx
196 196 jnz .sse_da_nt_loop
197 197 #if defined(_SSE2_INSN)
198 198 mfence
199 199 #elif defined(_SSE_INSN)
200 200 sfence
201 201 #else
202 202 #error "Must have either SSE or SSE2"
203 203 #endif
204 204 jmp .sse_cleanup
205 205 /
206 206 / use unaligned load since source doesn't line up
207 207 /
208 208 .align 16
209 209 .sse_da_loop:
210 210 prefetcht0 568(%esi) / prefetch source & copy 64 byte at a time
211 211 prefetcht0 568(%edi)
212 212 movups 0(%esi), %xmm0
213 - movaps %xmm0, 0(%edi)
213 + movaps %xmm0, 0(%edi)
214 214 movups 16(%esi), %xmm1
215 215 movaps %xmm1, 16(%edi)
216 216 movups 32(%esi), %xmm2
217 - movaps %xmm2, 32(%edi)
217 + movaps %xmm2, 32(%edi)
218 218 movups 48(%esi), %xmm3
219 219 movaps %xmm3, 48(%edi)
220 220 addl $64, %esi
221 221 addl $64, %edi
222 222 decl %edx
223 223 jnz .sse_da_loop
224 224 jmp .sse_cleanup
225 -
225 +
226 226 SET_SIZE(memcpy)
227 227
228 228
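
For reviewers, the size thresholds that steer ENTRY(memcpy) above, restated as a hedged C sketch. The constants 63 and 65535 are taken directly from the cmpl instructions; the enum and helper name are hypothetical and not part of this change. Copies above 64 KB use the non-temporal movntps stores, which avoid polluting the cache and, being weakly ordered, need the mfence/sfence issued after each nt loop.

    #include <stddef.h>

    /* Hypothetical restatement of the copy-path selection in memcpy above. */
    enum copy_path {
            PATH_REP_MOVS,          /* .movew: rep smovl plus an unrolled byte tail */
            PATH_SSE_TEMPORAL,      /* .sse_sa_loop / .sse_da_loop: movaps/movups + prefetcht0 */
            PATH_SSE_NONTEMPORAL    /* .sse_sa_nt_loop / .sse_da_nt_loop: movntps + fence */
    };

    static enum copy_path
    sketch_pick_path(size_t n)
    {
            if (n <= 63)            /* cmpl $63,%ecx ; ja .move_sse */
                    return (PATH_REP_MOVS);
            if (n > 65535)          /* cmpl $65535,%ecx ; ja to the nt loop */
                    return (PATH_SSE_NONTEMPORAL);
            return (PATH_SSE_TEMPORAL);
    }
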
229 229 / .CopyLeft handles the memmove case where we must perform the copy backwards,
230 230 / because of overlap between src and dst. This is not particularly optimized.
231 231
232 232 .CopyLeft:
233 233 movl $3,%eax / heavily used constant
234 234 std / reverse direction bit (RtoL)
235 235 cmpl $12,%ecx / if (size < 12)
236 236 ja .BigCopyLeft / {
237 237 movl %edx,%esi / src = src + size - 1
238 238 leal -1(%ecx,%edi),%edi / dst = dst + size - 1
239 239 rep; smovb / do the byte copy
240 240 cld / reset direction flag to LtoR
241 241 popl %edi / }
242 242 popl %esi / restore registers
243 243 movl 4(%esp),%eax / set up return value
244 244 ret / return(dba);
245 245 .BigCopyLeft: / } else {
246 246 xchgl %edx,%ecx
247 247 movl %ecx,%esi / align source w/byte copy
248 248 leal -1(%edx,%edi),%edi
249 249 andl %eax,%ecx
250 250 jz .SkipAlignLeft
251 251 addl $1, %ecx / we need to insure that future
252 252 subl %ecx,%edx / copy is done on aligned boundary
253 253 rep; smovb
254 254 .SkipAlignLeft:
255 - movl %edx,%ecx
255 + movl %edx,%ecx
256 256 subl %eax,%esi
257 257 shrl $2,%ecx / do 4 byte copy RtoL
258 258 subl %eax,%edi
259 259 rep; smovl
260 260 andl %eax,%edx / do 1 byte copy whats left
261 261 jz .CleanupReturnLeft
262 - movl %edx,%ecx
262 + movl %edx,%ecx
263 263 addl %eax,%esi / rep; smovl instruction will decrement
264 264 addl %eax,%edi / %edi, %esi by four after each copy
265 265 / adding 3 will restore pointers to byte
266 266 / before last double word copied
267 267 / which is where they are expected to
268 268 / be for the single byte copy code
269 269 rep; smovb
270 270 .CleanupReturnLeft:
271 271 cld / reset direction flag to LtoR
272 272 popl %edi
273 273 popl %esi / restore registers
274 274 movl 4(%esp),%eax / set up return value
275 275 ret / return(dba);
276 276 SET_SIZE(memmove)
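
For reviewers, the destination-alignment arithmetic in the .sse_unaligned block of memcpy above, written out as a hedged C sketch; the helper name and out-parameter are illustrative only, and the 64-byte cutoff comes from the cmpl $64, %ecx test.

    #include <stddef.h>
    #include <stdint.h>

    /*
     * Illustrative sketch of .sse_unaligned: how many leading bytes are copied
     * with rep smovb so the destination becomes 16-byte aligned, and when the
     * SSE path is abandoned because fewer than 64 bytes would remain.
     */
    static size_t
    sketch_dst_head_bytes(uintptr_t dst, size_t n, int *use_sse)
    {
            size_t head = (16 - (dst & 15)) & 15;   /* neg %eax ; andl $15, %eax */

            /* cmpl $64, %ecx ; jb .movew -- keep SSE only if >= 64 bytes remain */
            *use_sse = (n >= head && n - head >= 64);
            return (*use_sse ? head : 0);
    }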