Print this page
1235 Use symbol capabilities to eliminate libc_hwcap*
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libc/i386_hwcap1/gen/memset.s
+++ new/usr/src/lib/libc/capabilities/i386/common/memset.s
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26
27 27 .file "memset.s"
28 28
29 29 #include <sys/asm_linkage.h>
30 30
31 31 ANSI_PRAGMA_WEAK(memset,function)
32 32
/*
 * memset: fill a memory region with one byte value (i386, AT&T operand
 * order, arguments taken from the stack).
 *
 * Arguments, at the offsets used after the initial pushl %edi:
 *   8(%esp)   destination address  -> %edi
 *   12(%esp)  fill value           -> %eax (masked to its low byte and
 *                                     replicated on the word/SSE paths)
 *   16(%esp)  byte count           -> %ecx
 * Returns the destination address in %eax (reloaded from 8(%esp)).
 *
 * Strategy by byte count (every size test is an unsigned jbe):
 *   <= 20        .byteset       byte stores only
 *   21 .. 256    .wordset       4-byte stores, then 0-3 trailing bytes
 *   257 .. 511   .check_wordset advance to an 8-byte boundary if the
 *                               address is not already on one, then
 *                               .wordset
 *   >= 512       align to 64 bytes, fill 64-byte blocks from %xmm0, then
 *                send the < 64 byte tail through .byteset/.wordset
 *
 * %edi is saved and restored on every path; %ebx and %esi only on the
 * paths that use them. %edx is scratch and %xmm0 is overwritten on the
 * SSE path; neither is restored.
 *
 * NOTE(review): sstol/sstob are presumably this assembler's spellings of
 * the stosl/stosb string-store instructions -- confirm against the
 * assembler reference.
 */
33 33 ENTRY(memset)
34 34 pushl %edi / save register variable
35 35 movl 8(%esp),%edi / %edi = string address
↓ open down ↓ |
35 lines elided |
↑ open up ↑ |
36 36 movl 12(%esp),%eax / %al = byte to duplicate
37 37 movl 16(%esp),%ecx / %ecx = number of copies
38 38
39 39 / For all basic blocks in this routine, maintain the following
40 40 / entry conditions: %eax each byte is set to desired byte.
41 41 / NOTE: .byteset doesn't require this
42 42 / %ecx contains # bytes to set
43 43 / %edi contain address to set
44 44
45 45 cld / make sure we go the right way...
/*
 * The jbe below also sends a count of exactly 20 to .byteset, i.e. the
 * cutoff is "20 or fewer" bytes.
 */
46 - cmpl $20,%ecx / strings with fewer than 20 chars should be byte set
47 - jbe .byteset
46 + cmpl $20,%ecx / strings with fewer than 20 chars should be
47 + / byte set
48 + jbe .byteset
48 49
49 50 andl $0xff, %eax / trim anything above low byte
50 51 imul $0x01010101, %eax / extend low byte to each byte
51 -
52 +
52 53 cmpl $256, %ecx / smaller areas don't benefit from alignment
53 54 jbe .wordset
54 55
55 56 cmpl $511, %ecx / areas smaller than this should be wordset
56 - jbe .check_wordset
57 + jbe .check_wordset
57 58
/*
 * Large-area path (count >= 512). %ebx tracks the byte count, reduced by
 * the alignment bytes once they are known; %esi holds the number of bytes
 * needed to reach a 64-byte boundary, computed as (64 - (addr & 63)) & 63.
 * Those alignment bytes are written as 4-byte stores plus 0-3 single bytes.
 */
58 59 /
59 60 / prep work for sse temporal and non-temporal
60 61 /
61 62
62 63 pushl %ebx / more registers are needed
63 64 pushl %esi / for alignment work
64 65
65 66 /
66 67 / align address to 64 byte boundaries.
67 68 /
68 69
69 70 movl %ecx, %ebx / save byte count
70 71 movl %edi, %esi / esi is scratch register
71 72 andl $63, %esi / bytes to align to 64 byte align addr
72 - neg %esi / compute count of bytes
73 + neg %esi / compute count of bytes
73 74 addl $64, %esi / needed to align
74 75 andl $63, %esi / to 64 byte align addr
75 76 jz .sse_aligned / skip alignment if not needed
76 77 subl %esi, %ebx / ebx contains remainder of bytes to set
77 78 movl %esi, %ecx / alignment bytes
78 79 shrl $2,%ecx / %ecx = number of words to set
79 80 rep; sstol
80 81 movl %esi,%ecx
81 82 andl $3,%ecx / %ecx = number of bytes left
82 83 rep; sstob
83 84 movl %ebx, %ecx / remainder to be set
84 85
/*
 * On both the fall-through and the jz path, %ecx and %ebx hold the same
 * byte count here and %edi is 64-byte aligned. %ecx becomes the number of
 * whole 64-byte blocks; %ebx keeps the byte count for the tail.
 */
85 86 .sse_aligned:
86 -
87 +
87 88 shr $6, %ecx / number of 64 byte blocks to set
88 89
89 90 /
90 91 / load xmm0 with bytes to be set
91 92 /
92 93 subl $16,%esp / give ourselves some working room on the stack
93 94 movl %eax,(%esp) / copy eax into each of 4 bytes
94 95 movl %eax,4(%esp) / avoid pushl since it causes more interlocking
95 96 movl %eax,8(%esp) /
96 97 movl %eax,12(%esp) /
97 98 movups (%esp), %xmm0 / unaligned load from stack into xmm0
98 99 addl $16,%esp / restore stack position
99 -
100 +
/*
 * Pick the store flavour from the post-alignment byte count in %ebx:
 * up to 262143 bytes use movaps (.sse_loop); anything larger uses the
 * non-temporal movntps stores (.sse_nt_loop).
 */
100 101 cmpl $262143, %ebx / blocks smaller than this allocate in the cache
101 102 jbe .sse_loop
102 103 jmp .sse_nt_loop / branch across alignment nops
103 -
104 +
104 105 .align 16
105 106
106 -.sse_nt_loop:
107 +.sse_nt_loop:
107 108 movntps %xmm0, (%edi) / block non-temporal store
108 109 movntps %xmm0, 16(%edi) / use sse rather than sse2
109 110 movntps %xmm0, 32(%edi) / so we work more places
110 111 movntps %xmm0, 48(%edi) /
111 112
112 113 addl $64, %edi / increment dest address
113 114 dec %ecx / dec count of blocks
114 115 jnz .sse_nt_loop / jump if not done
115 116
116 117 andl $63, %ebx / remainder of bytes to copy
117 118 movl %ebx, %ecx / ecx contains remainer of bytes to set
118 119 popl %esi / restore stack config
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
119 120 popl %ebx /
/*
 * A fence is issued after the non-temporal stores and before the tail is
 * written. Which fence instruction is assembled is a build-time choice
 * made through _SSE2_INSN / _SSE_INSN; building with neither defined is
 * an error.
 */
120 121 #if defined(_SSE2_INSN)
121 122 mfence
122 123 #elif defined(_SSE_INSN)
123 124 sfence
124 125 #else
125 126 #error "Must have either SSE or SSE2"
126 127 #endif
127 128 cmpl $20, %ecx / compare and jump accordingly
128 129 jbe .byteset
129 - jmp .wordset
130 + jmp .wordset
130 131
/*
 * Cache-allocating variant of the block loop. It has the same structure as
 * .sse_nt_loop but uses movaps and needs no fence afterwards. The tail
 * (%ebx & 63 bytes) is handed to .byteset or .wordset.
 */
131 132 .align 16
132 133 .sse_loop:
133 134 movaps %xmm0, (%edi) / block copy w/ SSE
134 135 movaps %xmm0, 16(%edi)
135 136 movaps %xmm0, 32(%edi)
136 137 movaps %xmm0, 48(%edi)
137 138
138 139 addl $64, %edi / increment addr
139 140 dec %ecx / dec count of blocks
140 141 jnz .sse_loop / jump if not done
141 142
142 143 andl $63, %ebx / remainder of bytes to copy
143 144 movl %ebx, %ecx / in %ecx as normal
144 145 popl %esi / restore stack config
145 146 popl %ebx /
146 - cmpl $20, %ecx
147 + cmpl $20, %ecx
147 148 jbe .byteset
148 149 jmp .wordset
149 150
/*
 * Mid-size path (257 .. 511 bytes): test the low three address bits.
 * The andl sets ZF; the movl that follows restores %edi without touching
 * the flags, so the jz still acts on the andl result.
 */
150 151 .check_wordset:
151 152 movl %edi, %edx / save current store ptr
152 153 andl $7, %edi / check alignment
153 154 movl %edx,%edi / %edi = string address
154 - jz .wordset / all ok
155 -
155 + jz .wordset / all ok
156 156
157 -.align_wordset:
157 +
158 +.align_wordset:
/*
 * Write (8 - (addr & 7)) & 7 single bytes so that %edi lands on an 8-byte
 * boundary. %ebx carries the remaining byte count across the rep and is
 * moved back into %ecx before falling through to .wordset.
 */
158 159 pushl %ebx / more registers are needed
159 - pushl %esi
160 + pushl %esi
160 161
161 162 movl %ecx, %ebx
162 163 movl %edi, %esi
163 164 andl $7, %esi
164 165 neg %esi
165 166 addl $8, %esi
166 167 andl $7, %esi
167 168 subl %esi, %ebx / ebx contains remainder of bytes to copy
168 169 movl %esi, %ecx
169 - rep; sstob
170 + rep; sstob
170 171 movl %ebx, %ecx
171 172 popl %esi / restore stack config
172 173 popl %ebx /
173 174
/*
 * Word fill: %edx keeps the full byte count while %ecx (count / 4) is
 * consumed by the rep. Control then falls through to .byteset with
 * %ecx = count & 3.
 */
174 175 .wordset:
175 176 movl %ecx, %edx / save cont
176 177 shrl $2,%ecx / %ecx = number of words to set
177 178 rep; sstol
178 179 movl %edx,%ecx
179 180 andl $3,%ecx / %ecx = number of bytes left
180 181
/*
 * Common exit for every path: write the last %ecx bytes, reload the
 * destination address from the first argument slot as the return value,
 * and restore %edi.
 */
181 182 .byteset:
182 183 rep; sstob
183 184 movl 8(%esp),%eax / return string address
184 185 popl %edi / restore register variable
185 186 ret
186 187 SET_SIZE(memset)
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX