1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 .file "__vrsqrtf.S"
30
31 #include "libm.h"
32
33 RO_DATA
34 .align 64
35
36 ! i = [0,63]
37 ! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
38 ! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
39 ! i = [64,127]
40 ! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
41 ! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
42
43 .CONST_TBL: /* 128 entries of 16 bytes (2048 bytes): entry = { tbl_div, tbl_sqrt } doubles; byte offset = (ax >> 13) & 0x7f0 */
44 .word 0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd, /* i = 0: argument 0.5 */
45 .word 0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03,
46 .word 0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2,
47 .word 0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671,
48 .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911,
49 .word 0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342,
50 .word 0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a,
51 .word 0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9,
52 .word 0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555,
53 .word 0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54,
54 .word 0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70,
55 .word 0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032,
56 .word 0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74,
57 .word 0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92,
58 .word 0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f,
59 .word 0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3,
60 .word 0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f,
61 .word 0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199,
62 .word 0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577,
63 .word 0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58,
64 .word 0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03,
65 .word 0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37,
66 .word 0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e,
67 .word 0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92,
68 .word 0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826,
69 .word 0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0,
70 .word 0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91,
71 .word 0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50,
72 .word 0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e,
73 .word 0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428,
74 .word 0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4,
75 .word 0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5,
76 .word 0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c,
77 .word 0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55,
78 .word 0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492,
79 .word 0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a,
80 .word 0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a,
81 .word 0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d,
82 .word 0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9,
83 .word 0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3,
84 .word 0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896,
85 .word 0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f,
86 .word 0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9,
87 .word 0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee,
88 .word 0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4,
89 .word 0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62,
90 .word 0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db,
91 .word 0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253,
92 .word 0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a,
93 .word 0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26,
94 .word 0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad,
95 .word 0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c,
96 .word 0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc,
97 .word 0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412,
98 .word 0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488,
99 .word 0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499,
100 .word 0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db,
101 .word 0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438,
102 .word 0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a,
103 .word 0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa,
104 .word 0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d,
105 .word 0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 0x4ea03a72,
106 .word 0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a,
107 .word 0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9,
108 .word 0x3e800000, 0x00000000, 0x3ff00000, 0x00000000, /* i = 64: argument 1.0 */
109 .word 0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9,
110 .word 0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b,
111 .word 0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc,
112 .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c,
113 .word 0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957,
114 .word 0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2,
115 .word 0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc,
116 .word 0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66,
117 .word 0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350,
118 .word 0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549,
119 .word 0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d,
120 .word 0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937,
121 .word 0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86,
122 .word 0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213,
123 .word 0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358,
124 .word 0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9,
125 .word 0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c,
126 .word 0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2,
127 .word 0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b,
128 .word 0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39,
129 .word 0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118,
130 .word 0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347,
131 .word 0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11,
132 .word 0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550,
133 .word 0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e,
134 .word 0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169,
135 .word 0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394,
136 .word 0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a,
137 .word 0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c,
138 .word 0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7,
139 .word 0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899,
140 .word 0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e,
141 .word 0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee,
142 .word 0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458,
143 .word 0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588,
144 .word 0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a,
145 .word 0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54,
146 .word 0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44,
147 .word 0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31,
148 .word 0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c,
149 .word 0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96,
150 .word 0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009,
151 .word 0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3,
152 .word 0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426,
153 .word 0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6,
154 .word 0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d,
155 .word 0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2,
156 .word 0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7,
157 .word 0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d,
158 .word 0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1,
159 .word 0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5,
160 .word 0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88,
161 .word 0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72,
162 .word 0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729,
163 .word 0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea,
164 .word 0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098,
165 .word 0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746,
166 .word 0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5,
167 .word 0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f,
168 .word 0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467,
169 .word 0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1,
170 .word 0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d,
171 .word 0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6,
172
173 .word 0x3fefffff, 0xfee7f18f ! K0 = 9.99999997962321453275e-01 (loaded from TBL+2048)
174 .word 0xbfdfffff, 0xfe07e52f ! K1 = -4.99999998166077580600e-01 (loaded from TBL+2048+8)
175 .word 0x3fd80118, 0x0ca296d9 ! K2 = 3.75066768969515586277e-01 (loaded from TBL+2048+16)
176 .word 0xbfd400fc, 0x0bbb8e78 ! K3 = -3.12560092408808548438e-01 (loaded from TBL+2048+24)
177 .word 0x7ffe0000, 0x7ffe0000 ! DC0: per-lane fand mask keeping the exponent and top 6 mantissa bits
178 .word 0x3f800000, 0x40000000 ! FONE = 1.0f, FTWO = 2.0f (one ldd fills both registers)
179
180 #define stridex %l4 /* input stride in bytes: %i2 shifted left by 2 */
181 #define stridex2 %l1 /* 2 * stridex; %l1 is also used as scratch for lexp1, so it is recomputed */
182 #define stridey %l3 /* output stride in bytes: %i4 shifted left by 2 */
183 #define stridey2 %i2 /* 2 * stridey; %i2 holds the copy of %i3 (output pointer) until this is computed */
184 #define TBL %l2 /* address of .CONST_TBL, set up by PIC_SET */
185 #define counter %i5 /* loop count loaded from tmp_counter; reduced by 6 per main-loop pass */
186
187 #define K3 %f38 /* polynomial coefficient K3 (double), from TBL+2048+24 */
188 #define K2 %f36 /* polynomial coefficient K2 (double), from TBL+2048+16 */
189 #define K1 %f34 /* polynomial coefficient K1 (double), from TBL+2048+8 */
190 #define K0 %f32 /* polynomial coefficient K0 (double), from TBL+2048 */
191 #define DC0 %f4 /* 0x7ffe0000 in both 32-bit halves; mask operand of fand */
192 #define FONE %f2 /* 1.0f (0x3f800000) */
193 #define FTWO %f3 /* 2.0f (0x40000000); loaded together with FONE by one ldd */
194
195 #define _0x00800000 %o2 /* bit pattern of the smallest normal float; smaller ax takes the special path */
196 #define _0x7f800000 %o4 /* bit pattern of +Inf; ax at or above this is Inf or NaN */
197
198 #define tmp0 STACK_BIAS-0x30 /* 8-byte slot: packed exponent words (lexp0) for the (4,5) element pair */
199 #define tmp1 STACK_BIAS-0x28 /* 8-byte slot: packed exponent words (lexp0) for the (0,1) element pair */
200 #define tmp2 STACK_BIAS-0x20 /* 8-byte slot: packed exponent words (lexp0) for the (2,3) element pair */
201 #define tmp3 STACK_BIAS-0x18 /* 8-byte slot; NOTE(review): usage not confirmed, verify against the rest of the routine */
202 #define tmp_counter STACK_BIAS-0x10 /* saved copy of %i0 on entry; reloaded into counter at .begin */
203 #define tmp_px STACK_BIAS-0x08 /* saved copy of %i1 (input pointer); reloaded into %l7 at .begin */
204
205 ! sizeof temp storage - must be a multiple of 16 for V9
206 #define tmps 0x30 /* six 8-byte slots (tmp0 .. tmp_px) */
207
208 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
209 ! !!!!! algorithm !!!!!
210 ! ((float*)&ddx0)[0] = *px;
211 ! ax0 = *(int*)px;
212 !
213 ! ((float*)&ddx0)[1] = *(px + stridex);
214 ! ax1 = *(int*)(px + stridex);
215 !
216 ! px += stridex2;
217 !
218 ! if ( ax0 >= 0x7f800000 )
219 ! {
220 ! RETURN ( FONE / ((float*)&dres0)[0] );
221 ! }
222 ! if ( ax0 < 0x00800000 )
223 ! {
224 ! float res = ((float*)&dres0)[0];
225 !
226 ! if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */
227 ! {
228 ! RETURN ( FONE / res )
229 ! }
230 ! else if ( ax0 >= 0 ) /* X = denormal */
231 ! {
232 ! double res0, xx0, tbl_div0, tbl_sqrt0;
233 ! float fres0;
234 ! int iax0, si0, iexp0;
235 !
236 ! res = *(int*)&res;
237 ! res *= FTWO;
238 ! ax0 = *(int*)&res;
239 ! iexp0 = ax0 >> 24;
240 ! iexp0 = 0x3f + 0x4b - iexp0;
241 ! iexp0 = iexp0 << 23;
242 !
243 ! si0 = (ax0 >> 13) & 0x7f0;
244 !
245 ! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
246 ! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
247 ! iax0 = ax0 & 0x7ffe0000;
248 ! iax0 = ax0 - iax0;
249 ! xx0 = iax0 * tbl_div0;
250 ! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
251 !
252 ! fres0 = res0;
253 ! iexp0 += *(int*)&fres0;
254 ! RETURN(*(float*)&iexp0)
255 ! }
256 ! else /* X = negative */
257 ! {
258 ! RETURN ( sqrtf(res) )
259 ! }
260 ! }
261 ! if ( ax1 >= 0x7f800000 )
262 ! {
263 ! RETURN ( FONE / ((float*)&dres0)[1] )
264 ! }
265 ! if ( ax1 < 0x00800000 )
266 ! {
267 ! float res = ((float*)&dres0)[1];
268 ! if ( (ax1 & 0x7fffffff) == 0 ) /* |X| = zero */
269 ! {
270 ! RETURN ( FONE / res )
271 ! }
272 ! else if ( ax1 >= 0 ) /* X = denormal */
273 ! {
274 ! double res0, xx0, tbl_div0, tbl_sqrt0;
275 ! float fres0;
276 ! int iax1, si0, iexp0;
277 !
278 ! res = *(int*)&res;
279 ! res *= FTWO;
280 ! ax1 = *(int*)&res;
281 ! iexp0 = ax1 >> 24;
282 ! iexp0 = 0x3f + 0x4b - iexp0;
283 ! iexp0 = iexp0 << 23;
284 !
285 ! si0 = (ax1 >> 13) & 0x7f0;
286 !
287 ! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
288 ! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
289 ! iax1 = ax1 & 0x7ffe0000;
290 ! iax1 = ax1 - iax1;
291 ! xx0 = iax1 * tbl_div0;
292 ! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
293 !
294 ! fres0 = res0;
295 ! iexp0 += *(int*)&fres0;
296 ! RETURN(*(float*)&iexp0)
297 ! }
298 ! else /* X = negative */
299 ! {
300 ! RETURN ( sqrtf(res) )
301 ! }
302 ! }
303 !
304 ! iexp0 = ax0 >> 24;
305 ! iexp1 = ax1 >> 24;
306 ! iexp0 = 0x3f - iexp0;
307 ! iexp1 = 0x3f - iexp1;
308 ! iexp1 &= 0x1ff;
309 ! lexp0 = iexp0 << 55;
310 ! lexp1 = iexp1 << 23;
311 !
312 ! lexp0 |= lexp1;
313 !
314 ! fdx0 = *((double*)&lexp0);
315 !
316 ! si0 = ax0 >> 13;
317 ! si1 = ax1 >> 13;
318 ! si0 &= 0x7f0;
319 ! si1 &= 0x7f0;
320 !
321 ! addr0 = (char*)TBL + si0;
322 ! addr1 = (char*)TBL + si1;
323 ! tbl_div0 = ((double*)((char*)TBL + si0))[0];
324 ! tbl_div1 = ((double*)((char*)TBL + si1))[0];
325 ! tbl_sqrt0 = ((double*)addr0)[1];
326 ! tbl_sqrt1 = ((double*)addr1)[1];
327 ! dfx0 = vis_fand(ddx0,DC0);
328 ! dfx0 = vis_fpsub32(ddx0,dfx0);
329 ! dtmp0 = (double)(((int*)&dfx0)[0]);
330 ! dtmp1 = (double)(((int*)&dfx0)[1]);
331 ! xx0 = dtmp0 * tbl_div0;
332 ! xx1 = dtmp1 * tbl_div1;
333 ! res0 = K3 * xx0;
334 ! res1 = K3 * xx1;
335 ! res0 += K2;
336 ! res1 += K2;
337 ! res0 *= xx0;
338 ! res1 *= xx1;
339 ! res0 += K1;
340 ! res1 += K1;
341 ! res0 *= xx0;
342 ! res1 *= xx1;
343 ! res0 += K0;
344 ! res1 += K0;
345 ! res0 = tbl_sqrt0 * res0;
346 ! res1 = tbl_sqrt1 * res1;
347 ! ((float*)&dres0)[0] = (float)res0;
348 ! ((float*)&dres0)[1] = (float)res1;
349 ! dres0 = vis_fpadd32(dres0,fdx0);
350 ! *py = ((float*)&dres0)[0];
351 ! *(py + stridey) = ((float*)&dres0)[1];
352 ! py += stridey2;
353 !
354 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
355
356 ENTRY(__vrsqrtf)
357 save %sp,-SA(MINFRAME)-tmps,%sp
358 PIC_SETUP(l7)
359 PIC_SET(l7,.CONST_TBL,l2)
360
361 st %i0,[%fp+tmp_counter]
362 stx %i1,[%fp+tmp_px]
363
364 ldd [TBL+2048],K0
365 sll %i2,2,stridex
366
367 ldd [TBL+2048+8],K1
368 sll %i4,2,stridey
369 mov %i3,%i2
370
371 ldd [TBL+2048+16],K2
372 sethi %hi(0x7f800000),_0x7f800000
373 sll stridex,1,stridex2
374
375 ldd [TBL+2048+24],K3
376 sethi %hi(0x00800000),_0x00800000
377
378 ldd [TBL+2048+32],DC0
379 add %g0,0x3f,%l0
380
381 ldd [TBL+2048+40],FONE
382 ! ld [TBL+2048+44],FTWO
383 .begin:
384 ld [%fp+tmp_counter],counter
385 ldx [%fp+tmp_px],%l7
386 st %g0,[%fp+tmp_counter]
387 .begin1:
388 cmp counter,0
389 ble,pn %icc,.exit
390
391 lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px;
392
393 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
394 sethi %hi(0x7ffffc00),%o0
395
396 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px;
397 add %l7,stridex2,%i1 ! px += stridex2
398 add %o0,0x3ff,%o0
399
400 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex);
401 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
402
403 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13;
404 add %i1,stridex2,%o5 ! px += stridex2
405
406 cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000
407 bge,pn %icc,.spec0 ! (4_1) if ( ax0 >= 0x7f800000 )
408 nop
409
410 cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000
411 bl,pn %icc,.spec1 ! (4_1) if ( ax0 < 0x00800000 )
412 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13;
413 .cont_spec:
414 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0;
415
416 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
417 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24;
418 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0;
419 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
420
421 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
422 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24;
423 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1;
424
425 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff;
426 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1;
427
428 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23;
429 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0;
430 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
431
432 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55;
433 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
434
435 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1;
436
437 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0);
438
439 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0;
440
441 lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px;
442 fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1;
443
444 lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
445
446 lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px;
447
448 lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex);
449 cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000
450 bge,pn %icc,.update0 ! (5_1) if ( ax1 >= 0x7f800000 )
451 fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0;
452 .cont0:
453 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1;
454 cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000
455 bl,pn %icc,.update1 ! (5_1) if ( ax1 < 0x00800000 )
456 fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
457 .cont1:
458 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13;
459 cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000
460
461 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13;
462 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0;
463
464 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
465 sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24;
466 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0;
467 fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
468
469 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
470 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24;
471 sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1;
472 faddd %f52,K2,%f62 ! (4_1) res0 += K2;
473
474 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0;
475 bge,pn %icc,.update2 ! (0_0) if ( ax0 >= 0x7f800000 )
476 faddd %f50,K2,%f60 ! (5_1) res1 += K2;
477 .cont2:
478 cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000
479 and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff;
480 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
481
482 sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23;
483 bl,pn %icc,.update3 ! (0_0) if ( ax0 < 0x00800000 )
484 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
485 .cont3:
486 fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0;
487 sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55;
488
489 fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1;
490 or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1;
491 stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0);
492
493 fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0;
494 sll stridex,1,stridex2 ! stridex2 = stridex * 2;
495
496 lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px;
497 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0;
498 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0;
499
500 lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
501 add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0;
502 faddd %f30,K1,%f62 ! (4_1) res0 += K1;
503
504 lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px;
505 add %o5,stridex2,%l7 ! px += stridex2
506 faddd %f48,K1,%f42 ! (5_1) res1 += K1;
507
508 lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex);
509 cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000
510 bge,pn %icc,.update4 ! (1_0) if ( ax1 >= 0x7f800000 )
511 fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0;
512 .cont4:
513 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1;
514 cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000
515 bl,pn %icc,.update5 ! (1_0) if ( ax1 < 0x00800000 )
516 fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
517 .cont5:
518 fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0;
519 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13;
520 cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000
521
522 fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1;
523 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13;
524 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0;
525
526 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
527 sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24;
528 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0;
529 fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
530
531 ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
532 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24;
533 sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1;
534 faddd %f52,K2,%f40 ! (0_0) res0 += K2;
535
536 ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
537 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0;
538 and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff;
539 faddd %f50,K2,%f60 ! (1_0) res0 += K2;
540
541 ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
542 sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55;
543 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0;
544 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
545
546 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23;
547 fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
548
549 fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0;
550 or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1;
551 faddd %f48,K0,%f62 ! (4_1) res0 += K0;
552
553 fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1;
554 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1;
555 stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0);
556 faddd %f58,K0,%f60 ! (5_1) res1 += K0;
557
558 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0;
559 bge,pn %icc,.update6 ! (2_0) if ( ax0 >= 0x7f800000 )
560 lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px;
561 .cont6:
562 cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000
563 bl,pn %icc,.update7 ! (2_0) if ( ax0 < 0x00800000 )
564 nop
565 .cont7:
566 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
567
568 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
569 cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000
570 fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0;
571 faddd %f40,K1,%f46 ! (0_0) res0 += K1;
572
573 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px;
574 add %l7,stridex2,%i1 ! px += stridex2
575 fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1;
576 faddd %f48,K1,%f62 ! (1_0) res1 += K1;
577
578 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex);
579 add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0;
580 bge,pn %icc,.update8 ! (3_0) if ( ax1 >= 0x7f800000 )
581 fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0;
582 .cont8:
583 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1;
584 cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000
585 bl,pn %icc,.update9 ! (3_0) if ( ax1 < 0x00800000 )
586 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
587 .cont9:
588 fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0;
589 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13;
590 add %i1,stridex2,%o5 ! px += stridex2
591 fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0;
592
593 fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1;
594 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13;
595 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0;
596 fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1;
597
598 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
599 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24;
600 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0;
601 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
602
603 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
604 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24;
605 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1;
606 faddd %f52,K2,%f58 ! (2_0) res0 += K2;
607
608 ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
609 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff;
610 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1;
611 faddd %f50,K2,%f60 ! (3_0) res1 += K2;
612
613 ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
614 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23;
615 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0;
616 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
617
618 ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0);
619 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55;
620 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
621
622 fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0;
623 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1;
624 faddd %f48,K0,%f22 ! (0_0) res0 += K0;
625
626 fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1;
627 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0);
628 faddd %f40,K0,%f26 ! (1_0) res1 += K0;
629
630 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0;
631 fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
632
633 or %g0,%i2,%l7
634 add stridey,stridey,stridey2
635
636 cmp counter,6
637 bl,pn %icc,.tail
638 nop
639
640 ba .main_loop
641 sub counter,6,counter ! counter
642
643 .align 16
644 .main_loop:
645 lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px;
646 cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000
647 bge,pn %icc,.update10 ! (4_1) if ( ax0 >= 0x7f800000 )
648 fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1;
649 .cont10:
650 lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
651 cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000
652 fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0;
653 faddd %f62,K1,%f42 ! (2_1) res0 += K1;
654
655 lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px;
656 fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1;
657 bl,pn %icc,.update11 ! (4_1) if ( ax0 < 0x00800000 )
658 faddd %f58,K1,%f62 ! (3_1) res1 += K1;
659 .cont11:
660 lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex);
661 cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000
662 bge,pn %icc,.update12 ! (5_1) if ( ax1 >= 0x7f800000 )
663 fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0;
664 .cont12:
665 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1;
666 cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000
667 bl,pn %icc,.update13 ! (5_1) if ( ax1 < 0x00800000 )
668 fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
669 .cont13:
670 fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0;
671 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13;
672 cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000
673 fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0;
674
675 fmuld %f62,%f24,%f58 ! (3_1) res1 *= xx1;
676 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13;
677 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0;
678 fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1;
679
680 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
681 sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24;
682 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0;
683 fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
684
685 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
686 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24;
687 sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1;
688 faddd %f52,K2,%f62 ! (4_1) res0 += K2;
689
690 ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
691 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0;
692 bge,pn %icc,.update14 ! (0_0) if ( ax0 >= 0x7f800000 )
693 faddd %f50,K2,%f60 ! (5_1) res1 += K2;
694 .cont14:
695 ldd [%o1+8],%f28 ! (3_1) tbl_sqrt1 = ((double*)addr0)[1];
696 cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000
697 and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff;
698 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
699
700 ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0);
701 sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23;
702 bl,pn %icc,.update15 ! (0_0) if ( ax0 < 0x00800000 )
703 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
704 .cont15:
705 fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0;
706 sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55;
707 st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0];
708 faddd %f48,K0,%f62 ! (2_1) res0 += K0;
709
710 fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1;
711 or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1;
712 stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0);
713 faddd %f58,K0,%f60 ! (3_1) res1 += K0;
714
715 fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0;
716 sll stridex,1,stridex2 ! stridex2 = stridex * 2;
717 st %f11,[stridey+%l7] ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
718 fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
719
720 lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px;
721 add %l7,stridey2,%i1 ! py += stridey2
722 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0;
723 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0;
724
725 lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
726 add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0;
727 fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0;
728 faddd %f30,K1,%f62 ! (4_1) res0 += K1;
729
730 lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px;
731 add %o5,stridex2,%l7 ! px += stridex2
732 fmuld %f28,%f60,%f56 ! (3_1) res1 = tbl_sqrt1 * res1;
733 faddd %f48,K1,%f42 ! (5_1) res1 += K1;
734
735 lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex);
736 cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000
737 bge,pn %icc,.update16 ! (1_0) if ( ax1 >= 0x7f800000 )
738 fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0;
739 .cont16:
740 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1;
741 cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000
742 bl,pn %icc,.update17 ! (1_0) if ( ax1 < 0x00800000 )
743 fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
744 .cont17:
745 fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0;
746 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13;
747 cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000
748 fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0;
749
750 fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1;
751 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13;
752 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0;
753 fdtos %f56,%f21 ! (3_1) ((float*)&dres0)[0] = (float)res0;
754
755 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
756 sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24;
757 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0;
758 fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
759
760 ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
761 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24;
762 sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1;
763 faddd %f52,K2,%f40 ! (0_0) res0 += K2;
764
765 ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
766 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0;
767 and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff;
768 faddd %f50,K2,%f60 ! (1_0) res0 += K2;
769
770 ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
771 sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55;
772 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0;
773 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
774
775 ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0);
776 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23;
777 add %i1,stridey2,%o3 ! py += stridey2
778 fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
779
780 fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0;
781 or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1;
782 st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0];
783 faddd %f48,K0,%f62 ! (4_1) res0 += K0;
784
785 fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1;
786 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1;
787 stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0);
788 faddd %f58,K0,%f60 ! (5_1) res1 += K0;
789
790 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0;
791 bge,pn %icc,.update18 ! (2_0) if ( ax0 >= 0x7f800000 )
792 st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
793 fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
794 .cont18:
795 cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000
796 bl,pn %icc,.update19 ! (2_0) if ( ax0 < 0x00800000 )
797 lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px;
798 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
799 .cont19:
800 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
801 cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000
802 fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0;
803 faddd %f40,K1,%f46 ! (0_0) res0 += K1;
804
805 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px;
806 add %l7,stridex2,%i1 ! px += stridex2
807 fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1;
808 faddd %f48,K1,%f62 ! (1_0) res1 += K1;
809
810 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex);
811 add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0;
812 bge,pn %icc,.update20 ! (3_0) if ( ax1 >= 0x7f800000 )
813 fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0;
814 .cont20:
815 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1;
816 cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000
817 bl,pn %icc,.update21 ! (3_0) if ( ax1 < 0x00800000 )
818 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
819 .cont21:
820 fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0;
821 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13;
822 add %i1,stridex2,%o5 ! px += stridex2
823 fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0;
824
825 fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1;
826 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13;
827 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0;
828 fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1;
829
830 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
831 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24;
832 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0;
833 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
834
835 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
836 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24;
837 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1;
838 faddd %f52,K2,%f58 ! (2_0) res0 += K2;
839
840 ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
841 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff;
842 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1;
843 faddd %f50,K2,%f60 ! (3_0) res1 += K2;
844
845 ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
846 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23;
847 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0;
848 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
849
850 ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0);
851 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55;
852 add %o3,stridey2,%l7 ! py += stridey2
853 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
854
855 fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0;
856 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1;
857 st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0];
858 faddd %f48,K0,%f22 ! (0_0) res0 += K0;
859
860 fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1;
861 subcc counter,6,counter ! counter -= 6;
862 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0);
863 faddd %f40,K0,%f26 ! (1_0) res1 += K0;
864
865 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0;
866 st %f1,[stridey+%o3] ! (3_1) *(py + stridey) = ((float*)&dres0)[1];
867 bpos,pt %icc,.main_loop
868 fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
869
870 add counter,6,counter
! .tail: drain the results that are still in flight in the software pipeline.
! Reached by fall-through from the main loop once "subcc counter,6" has gone
! negative (the add just before this label restores counter); other entries,
! if any, are outside this section.
! counter is decremented before each pending store. As soon as it goes
! negative, control leaves for .begin and the annulled delay slot sets %i2 to
! the output address that follows the last element stored. At most five
! results are stored here.
! The (n_m) tags are the pipeline-stage labels used in the main loop.
871 .tail:
872 sll stridex,1,stridex2 ! stridex2 = stridex << 1
873 subcc counter,1,counter ! counter--, sets the condition codes
874 bneg,a .begin ! negative: nothing is stored here
875 mov %l7,%i2 ! (annulled slot, taken only) py = %l7
876
877 fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0;
878 faddd %f62,K1,%f42 ! (2_1) res0 += K1;
879
880 fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1;
881
882 fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0;
883 fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0;
884
885 fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1;
886
887 ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
888
889 ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0);
890
891 st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0];
892 subcc counter,1,counter
893 bneg,a .begin ! done after one store
894 add %l7,stridey,%i2 ! (annulled slot, taken only) py = %l7 + stridey
895
896 faddd %f48,K0,%f62 ! (2_1) res0 += K0;
897 st %f11,[stridey+%l7] ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
898 subcc counter,1,counter
899 bneg,a .begin ! done after two stores
900 add %l7,stridey2,%i2 ! (annulled slot, taken only) py = %l7 + stridey2
901 fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
902
903 add %l7,stridey2,%i1 ! py += stridey2
904
905 fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0;
906
907 fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0;
908
909 ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0);
910 add %i1,stridey2,%o3 ! py += stridey2
911
912 st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0];
913 subcc counter,1,counter
914 bneg,a .begin ! done after three stores
915 add %i1,stridey,%i2 ! (annulled slot, taken only) py = %i1 + stridey
916
917 st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
918 subcc counter,1,counter
919 bneg,a .begin ! done after four stores
920 mov %o3,%i2 ! (annulled slot, taken only) py = %o3
921 fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
922
923 st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0];
924 ba .begin ! fifth store done; counter is not decremented again
925 add %o3,stridey,%i2 ! (delay slot, always executed) py = %o3 + stridey
926
927 .align 16
! .spec0: scalar special case. The result is FONE / x0; it is stored to
! *py (%i2), px (%l7) and py are advanced by one stride, counter is
! decremented and control continues at .begin1.
! NOTE(review): the branch that selects this path is outside this section;
! presumably it is taken for Inf/NaN inputs -- confirm at the branch site.
928 .spec0:
929 fdivs FONE,%f14,%f14 ! x0 = FONE / x0;
930 add %l7,stridex,%l7 ! px += stridex
931 st %f14,[%i2] ! *py = x0;
932 sub counter,1,counter ! one element handled
933 ba .begin1
934 add %i2,stridey,%i2 ! py += stridey (delay slot, always executed)
935
936 .align 16
! .spec1: scalar special case for the input whose bits are in %g1 and whose
! value is in %f14. %o0 is used as the mask for the zero test.
! NOTE(review): %o0 is set outside this section, presumably to 0x7fffffff;
! the entry condition is presumably ax0 < 0x00800000 -- confirm both.
!   (ax0 & %o0) == 0 : store FONE / x0 and continue at .begin1
!   ax0 < 0 : store sqrtf(x0) and continue at .begin1
!   otherwise        : rescale x0 and rejoin the regular path at .cont_spec
! Rescaling converts the raw bit pattern as an integer (fitod/fdtos),
! multiplies it by FTWO and subtracts 0x4b000000 from the resulting bit
! pattern in %g1. NOTE(review): with FTWO == 2.0f this denotes the same
! value as the subnormal input, with an extended exponent -- confirm FTWO.
937 .spec1:
938 andcc %g1,%o0,%g0 ! test ax0 & mask
939 bz,a 1f
940 fdivs FONE,%f14,%f14 ! x0 = FONE / x0; (annulled slot, taken only)
941
942 cmp %g1,0
943 bl,a 1f
944 fsqrts %f14,%f14 ! x0 = sqrtf(x0); (annulled slot, taken only)
945
946 fitod %f14,%f0 ! raw bits of x0, read as an integer, to double
947 fdtos %f0,%f14 ! ... and back to single
948 fmuls %f14,FTWO,%f14 ! x0 *= FTWO
949 st %f14,[%fp+tmp3] ! move the float bits through memory ...
950 ld [%fp+tmp3],%g1 ! ... into the integer register ax0
951 sethi %hi(0x4b000000),%o0 ! %o0 = 0x4b000000
952 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13;
953 fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
954 ba .cont_spec
955 sub %g1,%o0,%g1 ! ax0 -= 0x4b000000 (delay slot, always executed)
956 1:
957 add %l7,stridex,%l7 ! px += stridex
958 sub counter,1,counter ! one element handled
959 st %f14,[%i2] ! *py = x0;
960 ba .begin1
961 add %i2,stridey,%i2 ! py += stridey (delay slot, always executed)
962
963 .align 16
! .update0: cut the current batch short.
! If counter <= 1 nothing is recorded. Otherwise tmp_px = %i1 - stridex,
! tmp_counter = counter - 1 and counter = 1. Either way resume at .cont0.
! NOTE(review): the branch into this handler and the reader of
! tmp_px/tmp_counter are outside this section; presumably this is the
! ax >= 0x7f800000 case and the saved pointer/count restart the outer
! loop at the offending element -- confirm.
964 .update0:
965 cmp counter,1
966 ble .cont0 ! counter <= 1: leave everything as it is
967 nop
968
969 sub %i1,stridex,%o1 ! %o1 = %i1 - stridex
970 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
971
972 sub counter,1,counter
973 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 1
974
975 ba .cont0
976 mov 1,counter ! counter = 1 (delay slot, always executed)
977
978 .align 16
! .update1: handler for the input whose bits are in %g5 and whose value is
! in %f15. NOTE(review): the branch site is outside this section;
! presumably entered when ax1 < 0x00800000 -- confirm.
!   counter <= 1                      : resume at .cont1 (only %o0 is touched)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       tmp_px = %i1 - stridex, tmp_counter = counter - 1, counter = 1
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g5, %f17, %l6, %l1, the word at tmp0+4, %f46 and %f50
!       from it; %l7, %f0 and %f44 are used as scratch.
! NOTE(review): with FTWO == 2.0f the rescaled bits denote the same value as
! the subnormal input, with an extended exponent -- confirm FTWO.
979 .update1:
980 sethi %hi(0x7ffffc00),%o0
981 cmp counter,1
982 ble .cont1
983
984 add %o0,0x3ff,%o0 ! %o0 = 0x7fffffff (delay slot, always executed)
985
986 andcc %g5,%o0,%g0 ! magnitude bits all zero?
987 bz,a 1f
988 nop
989
990 cmp %g5,0 ! sign bit set?
991 bl,a 1f
992 nop
993
994 fitod %f15,%f0 ! raw bits, read as an integer, to double
995 fdtos %f0,%f15 ! ... and back to single
996 fmuls %f15,FTWO,%f15 ! scale by FTWO
997 st %f15,[%fp+tmp3] ! move the float bits through memory ...
998 ld [%fp+tmp3],%g5 ! ... into the integer register
999 sethi %hi(0x4b000000),%o0 ! %o0 = 0x4b000000
1000 sub %g5,%o0,%g5 ! ax1 -= 0x4b000000
1001
1002 fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1003
1004 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13;
1005
1006 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24;
1007 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0;
1008
1009 fpsub32s %f15,%f17,%f17 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1010
1011 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1012 sub %l0,%l7,%l1 ! (5_0) iexp1 = 0x3f - iexp1;
1013
1014 sll %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23;
1015 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1;
1016 st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0);
1017 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1018
1019 fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1;
1020
1021 ba .cont1
1022 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1;
1023 1:
1024 sub %i1,stridex,%o1 ! %o1 = %i1 - stridex
1025 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
1026
1027 sub counter,1,counter
1028 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 1
1029
1030 ba .cont1
1031 mov 1,counter ! counter = 1 (delay slot, always executed)
1032
1033 .align 16
! .update2: cut the current batch short.
! If counter <= 2 nothing is recorded (only %o1 is touched). Otherwise
! tmp_px = %o5 - 2 * stridex, tmp_counter = counter - 2 and counter = 2.
! Either way resume at .cont2.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1034 .update2:
1035 cmp counter,2
1036 ble .cont2 ! counter <= 2: nothing to record
1037 sub %o5,stridex,%o1 ! %o1 = %o5 - stridex (delay slot, always executed)
1038
1039 sub %o1,stridex,%o1 ! %o1 = %o5 - 2 * stridex
1040 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
1041
1042 sub counter,2,counter
1043 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 2
1044
1045 ba .cont2
1046 mov 2,counter ! counter = 2 (delay slot, always executed)
1047
1048 .align 16
! .update3: handler for the input whose bits are in %g1 and whose value is
! in %f18. NOTE(review): the branch site is outside this section;
! presumably entered when ax0 < 0x00800000 -- confirm.
!   counter <= 2                      : resume at .cont3 (only %o1 is touched)
!   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : cut the batch short at label 1:
!       tmp_px = %o5 - 2 * stridex, tmp_counter = counter - 2, counter = 2
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g1, %o0, %f54, %f30/%f31, %i3, %g5 and %f56 from it.
! NOTE(review): unlike the sibling handlers this one uses the full-width
! fand/fpsub32, so %f57 and %f31 are rewritten from %f19 as well -- confirm
! that is intended at this point of the pipeline.
1049 .update3:
1050 sethi %hi(0x7ffffc00),%o1
1051 cmp counter,2
1052 ble .cont3
1053
1054 add %o1,0x3ff,%o1 ! %o1 = 0x7fffffff (delay slot, always executed)
1055
1056 andcc %g1,%o1,%g0 ! magnitude bits all zero?
1057 bz,a 1f
1058 sub %o5,stridex,%o1 ! (annulled slot, taken only) %o1 = %o5 - stridex
1059
1060 cmp %g1,0 ! sign bit set?
1061 bl,a 1f
1062 sub %o5,stridex,%o1 ! (annulled slot, taken only) %o1 = %o5 - stridex
1063
1064 fitod %f18,%f0 ! raw bits, read as an integer, to double
1065 fdtos %f0,%f18 ! ... and back to single
1066 fmuls %f18,FTWO,%f18 ! scale by FTWO
1067 st %f18,[%fp+tmp3] ! move the float bits through memory ...
1068 ld [%fp+tmp3],%g1 ! ... into the integer register
1069 sethi %hi(0x4b000000),%o1 ! %o1 = 0x4b000000
1070 sub %g1,%o1,%g1 ! ax0 -= 0x4b000000
1071
1072 fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1073 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13;
1074
1075 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0;
1076
1077 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1078 fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1079
1080 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24;
1081 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0;
1082 ba .cont3
1083 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1084 1:
1085 sub %o1,stridex,%o1 ! %o1 = %o5 - 2 * stridex
1086 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
1087
1088 sub counter,2,counter
1089 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 2
1090
1091 ba .cont3
1092 mov 2,counter ! counter = 2 (delay slot, always executed)
1093
1094 .align 16
! .update4: cut the current batch short.
! If counter <= 3 nothing is recorded (only %o1 is touched). Otherwise
! tmp_px = %l7 - stridex2 - stridex, tmp_counter = counter - 3 and
! counter = 3. Either way resume at .cont4.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1095 .update4:
1096 cmp counter,3
1097 ble .cont4 ! counter <= 3: nothing to record
1098 sub %l7,stridex2,%o1 ! %o1 = %l7 - stridex2 (delay slot, always executed)
1099
1100 sub %o1,stridex,%o1 ! %o1 = %l7 - stridex2 - stridex
1101 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
1102
1103 sub counter,3,counter
1104 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 3
1105
1106 ba .cont4
1107 mov 3,counter ! counter = 3 (delay slot, always executed)
1108
1109 .align 16
! .update5: handler for the input whose bits are in %i4 and whose value is
! in %f19. NOTE(review): the branch site is outside this section;
! presumably entered when ax1 < 0x00800000 -- confirm.
!   counter <= 3                      : resume at .cont5 (only %o1 is touched)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - stridex2 - stridex, tmp_counter = counter - 3,
!       counter = 3
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %i4, %f31, %o7, %i0, the word at tmp1+4, %f44 and %f50
!       from it; %g5, %i1 and %f0 are used as scratch.
1110 .update5:
1111 sethi %hi(0x7ffffc00),%o1
1112 cmp counter,3
1113 ble .cont5
1114
1115 add %o1,0x3ff,%o1 ! %o1 = 0x7fffffff (delay slot, always executed)
1116
1117 andcc %i4,%o1,%g0 ! magnitude bits all zero?
1118 bz,a 1f
1119 sub %l7,stridex2,%o1 ! (annulled slot, taken only) %o1 = %l7 - stridex2
1120
1121 cmp %i4,0 ! sign bit set?
1122 bl,a 1f
1123 sub %l7,stridex2,%o1 ! (annulled slot, taken only) %o1 = %l7 - stridex2
1124
1125 fitod %f19,%f0 ! raw bits, read as an integer, to double
1126 fdtos %f0,%f19 ! ... and back to single
1127 fmuls %f19,FTWO,%f19 ! scale by FTWO
1128 st %f19,[%fp+tmp3] ! move the float bits through memory ...
1129 ld [%fp+tmp3],%i4 ! ... into the integer register
1130 sethi %hi(0x4b000000),%o1 ! %o1 = 0x4b000000
1131 sub %i4,%o1,%i4 ! ax1 -= 0x4b000000
1132
1133 fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1134
1135 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13;
1136
1137 sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24;
1138 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0;
1139 fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1140
1141 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1142 sub %l0,%i1,%i0 ! (1_0) iexp1 = 0x3f - iexp1;
1143
1144 sll %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23;
1145 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
1146
1147 st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0);
1148
1149 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0;
1150 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0;
1151
1152 ba .cont5
1153 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1;
1154 1:
1155 sub %o1,stridex,%o1 ! %o1 = %l7 - stridex2 - stridex
1156 stx %o1,[%fp+tmp_px] ! tmp_px = %o1
1157
1158 sub counter,3,counter
1159 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 3
1160
1161 ba .cont5
1162 mov 3,counter ! counter = 3 (delay slot, always executed)
1163
1164 .align 16
! .update6: cut the current batch short.
! If counter <= 4 nothing is recorded (only %o3 is touched). Otherwise
! tmp_px = %l7 - 2 * stridex, tmp_counter = counter - 4 and counter = 4.
! Either way resume at .cont6.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1165 .update6:
1166 cmp counter,4
1167 ble .cont6 ! counter <= 4: nothing to record
1168 sub %l7,stridex,%o3 ! %o3 = %l7 - stridex (delay slot, always executed)
1169
1170 sub %o3,stridex,%o3 ! %o3 = %l7 - 2 * stridex
1171 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1172
1173 sub counter,4,counter
1174 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 4
1175
1176 ba .cont6
1177 mov 4,counter ! counter = 4 (delay slot, always executed)
1178
1179 .align 16
! .update7: handler for the input whose bits are in %g1 and whose value is
! in %f24. NOTE(review): the branch site is outside this section;
! presumably entered when ax0 < 0x00800000 -- confirm.
!   counter <= 4                      : resume at .cont7 (only %o3 is touched)
!   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - 2 * stridex, tmp_counter = counter - 4, counter = 4
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g1, %f12, %i0, %g5, the first word of tmp2, %f56 and
!       %f30 from it; %i3 and %f0 are used as scratch.
! The exponent word is built with a 32-bit shift by 23 and stored with a
! 32-bit st to tmp2+0, where the main loop shifts by 55 and stores 64 bits.
1180 .update7:
1181 sethi %hi(0x7ffffc00),%o3
1182 cmp counter,4
1183 ble .cont7
1184
1185 add %o3,0x3ff,%o3 ! %o3 = 0x7fffffff (delay slot, always executed)
1186
1187 andcc %g1,%o3,%g0 ! magnitude bits all zero?
1188 bz,a 1f
1189 sub %l7,stridex,%o3 ! (annulled slot, taken only) %o3 = %l7 - stridex
1190
1191 cmp %g1,0 ! sign bit set?
1192 bl,a 1f
1193 sub %l7,stridex,%o3 ! (annulled slot, taken only) %o3 = %l7 - stridex
1194
1195 fitod %f24,%f0 ! raw bits, read as an integer, to double
1196 fdtos %f0,%f24 ! ... and back to single
1197 fmuls %f24,FTWO,%f24 ! scale by FTWO
1198 st %f24,[%fp+tmp3] ! move the float bits through memory ...
1199 ld [%fp+tmp3],%g1 ! ... into the integer register
1200 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1201 sub %g1,%o3,%g1 ! ax0 -= 0x4b000000
1202
1203 fands %f24,DC0,%f0 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1204 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13;
1205
1206 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0;
1207
1208 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1209 fpsub32s %f24,%f0,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1210
1211 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24;
1212
1213 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0;
1214
1215 sll %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55;
1216 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0;
1217 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1218
1219 st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0);
1220 ba .cont7
1221 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0;
1222 1:
1223 sub %o3,stridex,%o3 ! %o3 = %l7 - 2 * stridex
1224 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1225
1226 sub counter,4,counter
1227 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 4
1228
1229 ba .cont7
1230 mov 4,counter ! counter = 4 (delay slot, always executed)
1231
1232 .align 16
! .update8: cut the current batch short.
! If counter <= 5 nothing is recorded. Otherwise tmp_px = %l7 - stridex,
! tmp_counter = counter - 5 and counter = 5. Either way resume at .cont8.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1233 .update8:
1234 cmp counter,5
1235 ble .cont8 ! counter <= 5: leave everything as it is
1236 nop
1237
1238 sub %l7,stridex,%o3 ! %o3 = %l7 - stridex
1239 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1240
1241 sub counter,5,counter
1242 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 5
1243
1244 ba .cont8
1245 mov 5,counter ! counter = 5 (delay slot, always executed)
1246
1247 .align 16
! .update9: handler for the input whose bits are in %o5; its value is
! reloaded from memory at %l7 - stridex into %f0.
! NOTE(review): the branch site is outside this section; presumably entered
! when ax1 < 0x00800000 -- confirm.
!   counter <= 5 : resume at .cont9 (%o3 and %i3 are already overwritten)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - stridex, tmp_counter = counter - 5, counter = 5
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %o5, %o1, the word at tmp2+4, %f24 and %f50 from it;
!       %o3, %i3, %f0 and %f8 are used as scratch.
1248 .update9:
1249 sethi %hi(0x7ffffc00),%o3
1250 cmp counter,5
1251 ble .cont9
1252 sub %l7,stridex,%i3 ! %i3 = %l7 - stridex (delay slot, always executed)
1253
1254 add %o3,0x3ff,%o3 ! %o3 = 0x7fffffff
1255
1256 andcc %o5,%o3,%g0 ! magnitude bits all zero?
1257 bz 1f
1258 ld [%i3],%f0 ! reload the input value (delay slot, always executed)
1259
1260 cmp %o5,0 ! sign bit set?
1261 bl,a 1f
1262 nop
1263
1264 fitod %f0,%f0 ! raw bits, read as an integer, to double
1265 fdtos %f0,%f0 ! ... and back to single
1266 fmuls %f0,FTWO,%f0 ! scale by FTWO
1267 st %f0,[%fp+tmp3] ! move the float bits through memory ...
1268 ld [%fp+tmp3],%o5 ! ... into the integer register
1269 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1270 sub %o5,%o3,%o5 ! ax1 -= 0x4b000000
1271
1272 fands %f0,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1273
1274 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13;
1275
1276 sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24;
1277 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0;
1278 fpsub32s %f0,%f8,%f0 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1279
1280 ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1281 sub %l0,%o3,%i3 ! (3_0) iexp1 = 0x3f - iexp1;
1282
1283 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23;
1284 fitod %f0,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1285
1286 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1;
1287 st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0);
1288
1289 fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
1290
1291 ba .cont9
1292 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1;
1293 1:
1294 stx %i3,[%fp+tmp_px] ! tmp_px = %l7 - stridex (set in the ble delay slot)
1295
1296 sub counter,5,counter
1297 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 5
1298
1299 ba .cont9
1300 mov 5,counter ! counter = 5 (delay slot, always executed)
1301
1302 .align 16
! .update10: cut the current batch short.
! If counter <= 0 nothing is recorded (only %o3 is touched). Otherwise
! tmp_px = %i1 - 2 * stridex, tmp_counter = counter (unchanged) and
! counter = 0. Either way resume at .cont10.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1303 .update10:
1304 cmp counter,0
1305 ble .cont10 ! counter <= 0: nothing to record
1306 sub %i1,stridex,%o3 ! %o3 = %i1 - stridex (delay slot, always executed)
1307
1308 sub %o3,stridex,%o3 ! %o3 = %i1 - 2 * stridex
1309 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1310
1311 st counter,[%fp+tmp_counter] ! tmp_counter = counter
1312
1313 ba .cont10
1314 mov 0,counter ! counter = 0 (delay slot, always executed)
1315
1316 .align 16
! .update11: handler for the input at %i1 - 2 * stridex, whose value is in
! %f14; its bits are reloaded from memory into %i3.
! NOTE(review): the branch site is outside this section; presumably entered
! when ax0 < 0x00800000 -- confirm. %i4 is overwritten with the mask before
! the early exit, so it must not be live at .cont11 -- confirm.
!   counter <= 0 : resume at .cont11 (%i4 and %o3 are already overwritten)
!   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : cut the batch short at label 1:
!       tmp_px = %i1 - 2 * stridex, tmp_counter = counter, counter = 0
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %f16, %l5, %f54, %o0, the first word of tmp0, %f56 and
!       %f40 from it; %i3, %o3 and %f0 are used as scratch.
1317 .update11:
1318 sethi %hi(0x7ffffc00),%i4
1319 cmp counter,0
1320 ble .cont11
1321 sub %i1,stridex,%o3 ! %o3 = %i1 - stridex (delay slot, always executed)
1322
1323 sub %o3,stridex,%o3 ! %o3 = %i1 - 2 * stridex
1324 add %i4,0x3ff,%i4 ! %i4 = 0x7fffffff
1325 ld [%o3],%i3 ! reload the input bits
1326
1327 andcc %i3,%i4,%g0 ! magnitude bits all zero?
1328 bz 1f
1329
1330 cmp %i3,0 ! sign test; sits in the delay slot of the bz above
1331 bl,a 1f
1332 nop
1333
1334 fitod %f14,%f0 ! raw bits, read as an integer, to double
1335 fdtos %f0,%f14 ! ... and back to single
1336 fmuls %f14,FTWO,%f14 ! scale by FTWO
1337 st %f14,[%fp+tmp3] ! move the float bits through memory ...
1338 ld [%fp+tmp3],%i3 ! ... into the integer register
1339 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1340 sub %i3,%o3,%i3 ! ax0 -= 0x4b000000
1341
1342 fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1343 sra %i3,13,%l5 ! (4_0) si0 = ax0 >> 13;
1344
1345 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0;
1346
1347 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1348 fpsub32s %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1349
1350 sra %i3,24,%i3 ! (4_0) iexp0 = ax0 >> 24;
1351
1352 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0;
1353 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
1354
1355 sllx %o0,23,%o0 ! (4_0) lexp0 = iexp0 << 55;
1356
1357 st %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0);
1358
1359 ba .cont11
1360 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0;
1361 1:
1362 stx %o3,[%fp+tmp_px] ! tmp_px = %i1 - 2 * stridex
1363
1364 st counter,[%fp+tmp_counter] ! tmp_counter = counter
1365
1366 ba .cont11
1367 mov 0,counter ! counter = 0 (delay slot, always executed)
1368
1369 .align 16
! .update12: cut the current batch short.
! If counter <= 1 nothing is recorded. Otherwise %i1 itself is moved back
! by stridex, tmp_px = %i1, tmp_counter = counter - 1 and counter = 1.
! Either way resume at .cont12.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case. Unlike .update0 this
! modifies %i1 in place rather than using a scratch register -- confirm
! that the code after .cont12 tolerates the changed %i1.
1370 .update12:
1371 cmp counter,1
1372 ble .cont12 ! counter <= 1: leave everything as it is
1373 nop
1374
1375 sub %i1,stridex,%i1 ! %i1 -= stridex (in place)
1376 stx %i1,[%fp+tmp_px] ! tmp_px = %i1
1377
1378 sub counter,1,counter
1379 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 1
1380
1381 ba .cont12
1382 mov 1,counter ! counter = 1 (delay slot, always executed)
1383
1384 .align 16
! .update13: handler for the input whose bits are in %g5 and whose value is
! in %f15. NOTE(review): the branch site is outside this section;
! presumably entered when ax1 < 0x00800000 -- confirm.
!   counter <= 1                      : resume at .cont13 (only %o3 is touched)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       %i1 -= stridex (in place), tmp_px = %i1, tmp_counter = counter - 1,
!       counter = 1
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g5, %f17, %l6, %l1, the word at tmp0+4, %f46 and %f50
!       from it; %o3 and %f0 are used as scratch.
1385 .update13:
1386 sethi %hi(0x7ffffc00),%o3
1387 cmp counter,1
1388 ble .cont13
1389
1390 add %o3,0x3ff,%o3 ! %o3 = 0x7fffffff (delay slot, always executed)
1391
1392 andcc %g5,%o3,%g0 ! magnitude bits all zero?
1393 bz 1f
1394
1395 cmp %g5,0 ! sign test; sits in the delay slot of the bz above
1396 bl,a 1f
1397 nop
1398
1399 fitod %f15,%f0 ! raw bits, read as an integer, to double
1400 fdtos %f0,%f15 ! ... and back to single
1401 fmuls %f15,FTWO,%f15 ! scale by FTWO
1402 st %f15,[%fp+tmp3] ! move the float bits through memory ...
1403 ld [%fp+tmp3],%g5 ! ... into the integer register
1404 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1405 sub %g5,%o3,%g5 ! ax1 -= 0x4b000000
1406
1407 fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1408
1409 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13;
1410 sra %g5,24,%o3 ! (5_0) iexp1 = ax1 >> 24;
1411 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0;
1412 fpsub32s %f15,%f17,%f17 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1413
1414 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1415 sub %l0,%o3,%l1 ! (5_0) iexp1 = 0x3f - iexp1;
1416
1417 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1;
1418
1419 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23;
1420 st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0);
1421
1422 fitod %f17,%f0 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1423
1424 fmuld %f0,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1;
1425 ba .cont13
1426 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1;
1427 1:
1428 sub %i1,stridex,%i1 ! %i1 -= stridex (in place)
1429 stx %i1,[%fp+tmp_px] ! tmp_px = %i1
1430
1431 sub counter,1,counter
1432 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 1
1433
1434 ba .cont13
1435 mov 1,counter ! counter = 1 (delay slot, always executed)
1436
1437 .align 16
! .update14: cut the current batch short.
! If counter <= 2 nothing is recorded (only %o3 is touched). Otherwise
! tmp_px = %o5 - 2 * stridex, tmp_counter = counter - 2 and counter = 2.
! Either way resume at .cont14.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1438 .update14:
1439 cmp counter,2
1440 ble .cont14 ! counter <= 2: nothing to record
1441 sub %o5,stridex,%o3 ! %o3 = %o5 - stridex (delay slot, always executed)
1442
1443 sub %o3,stridex,%o3 ! %o3 = %o5 - 2 * stridex
1444 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1445
1446 sub counter,2,counter
1447 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 2
1448
1449 ba .cont14
1450 mov 2,counter ! counter = 2 (delay slot, always executed)
1451
1452 .align 16
! .update15: handler for the input whose bits are in %g1 and whose value is
! in %f18. NOTE(review): the branch site is outside this section;
! presumably entered when ax0 < 0x00800000 -- confirm.
!   counter <= 2 : resume at .cont15 (%i3 and %o3 are already overwritten)
!   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : cut the batch short at label 1:
!       tmp_px = %o5 - 2 * stridex, tmp_counter = counter - 2, counter = 2
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g1, %o0, %f54, %f30, %g5 and %f56 from it; %i3, %o3
!       and %f0 are used as scratch.
1453 .update15:
1454 sethi %hi(0x7ffffc00),%i3
1455 cmp counter,2
1456 ble .cont15
1457 sub %o5,stridex,%o3 ! %o3 = %o5 - stridex (delay slot, always executed)
1458
1459 add %i3,0x3ff,%i3 ! %i3 = 0x7fffffff
1460
1461 andcc %g1,%i3,%g0 ! magnitude bits all zero?
1462 bz 1f
1463 sub %o3,stridex,%o3 ! %o3 = %o5 - 2 * stridex (delay slot, always executed)
1464
1465 cmp %g1,0 ! sign bit set?
1466 bl,a 1f
1467 nop
1468
1469 fitod %f18,%f0 ! raw bits, read as an integer, to double
1470 fdtos %f0,%f18 ! ... and back to single
1471 fmuls %f18,FTWO,%f18 ! scale by FTWO
1472 st %f18,[%fp+tmp3] ! move the float bits through memory ...
1473 ld [%fp+tmp3],%g1 ! ... into the integer register
1474 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1475 sub %g1,%o3,%g1 ! ax0 -= 0x4b000000
1476
1477 fands %f18,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1478 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13;
1479 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0;
1480
1481 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1482 fpsub32s %f18,%f0,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1483
1484 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24;
1485
1486 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0;
1487
1488 ba .cont15
1489 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1490 1:
1491 stx %o3,[%fp+tmp_px] ! tmp_px = %o5 - 2 * stridex
1492
1493 sub counter,2,counter
1494 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 2
1495
1496 ba .cont15
1497 mov 2,counter ! counter = 2 (delay slot, always executed)
1498
1499 .align 16
! .update16: cut the current batch short.
! If counter <= 3 nothing is recorded (only %o3 is touched). Otherwise
! tmp_px = %l7 - stridex2 - stridex, tmp_counter = counter - 3 and
! counter = 3. Either way resume at .cont16.
! NOTE(review): the branch into this handler is outside this section;
! presumably this is the ax >= 0x7f800000 case -- confirm.
1500 .update16:
1501 cmp counter,3
1502 ble .cont16 ! counter <= 3: nothing to record
1503 sub %l7,stridex2,%o3 ! %o3 = %l7 - stridex2 (delay slot, always executed)
1504
1505 sub %o3,stridex,%o3 ! %o3 = %l7 - stridex2 - stridex
1506 stx %o3,[%fp+tmp_px] ! tmp_px = %o3
1507
1508 sub counter,3,counter
1509 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 3
1510
1511 ba .cont16
1512 mov 3,counter ! counter = 3 (delay slot, always executed)
1513
1514 .align 16
! .update17: handler for the input whose bits are in %i4 and whose value is
! in %f19. NOTE(review): the branch site is outside this section;
! presumably entered when ax1 < 0x00800000 -- confirm.
!   counter <= 3 : resume at .cont17 (%i3 and %o3 are already overwritten)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - stridex2 - stridex, tmp_counter = counter - 3,
!       counter = 3
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %i4, %f31, %o7, %i0, the word at tmp1+4, %f44 and %f50
!       from it; %g5, %i3, %o3 and %f0 are used as scratch.
1515 .update17:
1516 sethi %hi(0x7ffffc00),%i3
1517 cmp counter,3
1518 ble .cont17
1519 sub %l7,stridex2,%o3 ! %o3 = %l7 - stridex2 (delay slot, always executed)
1520
1521 add %i3,0x3ff,%i3 ! %i3 = 0x7fffffff
1522
1523 andcc %i4,%i3,%g0 ! magnitude bits all zero?
1524 bz 1f
1525 sub %o3,stridex,%o3 ! %o3 = %l7 - stridex2 - stridex (delay slot, always executed)
1526
1527 cmp %i4,0 ! sign bit set?
1528 bl,a 1f
1529 nop
1530
1531 fitod %f19,%f0 ! raw bits, read as an integer, to double
1532 fdtos %f0,%f19 ! ... and back to single
1533 fmuls %f19,FTWO,%f19 ! scale by FTWO
1534 st %f19,[%fp+tmp3] ! move the float bits through memory ...
1535 ld [%fp+tmp3],%i4 ! ... into the integer register
1536 sethi %hi(0x4b000000),%o3 ! %o3 = 0x4b000000
1537 sub %i4,%o3,%i4 ! ax1 -= 0x4b000000
1538
1539 fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1540
1541 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13;
1542
1543 sra %i4,24,%i0 ! (1_0) iexp1 = ax1 >> 24;
1544 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0;
1545 fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1546
1547 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1548 sub %l0,%i0,%i0 ! (1_0) iexp1 = 0x3f - iexp1;
1549
1550 sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23;
1551 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
1552
1553 st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0);
1554
1555 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0;
1556 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0;
1557
1558 ba .cont17
1559 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1;
1560 1:
1561 stx %o3,[%fp+tmp_px] ! tmp_px = %l7 - stridex2 - stridex
1562
1563 sub counter,3,counter
1564 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 3
1565
1566 ba .cont17
1567 mov 3,counter ! counter = 3 (delay slot, always executed)
1568
1569 .align 16
! .update18: entered from the main loop when (2_0) ax0 >= 0x7f800000.
! The branch skips the fpadd32 that sits just before .cont18, so it is
! performed here in the delay slot of the first branch, on both paths.
! If counter <= 4 nothing is recorded. Otherwise tmp_px = %l7 - stridex2,
! tmp_counter = counter - 4 and counter = 4. Either way resume at .cont18.
! NOTE(review): the reader of tmp_px/tmp_counter is outside this section;
! presumably they restart the outer loop at the offending element -- confirm.
1570 .update18:
1571 cmp counter,4
1572 ble .cont18 ! counter <= 4: nothing to record
1573 fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
1574
1575 sub %l7,stridex2,%i3 ! %i3 = %l7 - stridex2
1576 stx %i3,[%fp+tmp_px] ! tmp_px = %i3
1577
1578 sub counter,4,counter
1579 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 4
1580
1581 ba .cont18
1582 mov 4,counter ! counter = 4 (delay slot, always executed)
1583
1584 .align 16
! .update19: entered from the main loop when (2_0) ax0 < 0x00800000; the
! bits are in %g1 and the value in %f24.
! The branch skips the "xx1 = dtmp1 * tbl_div1" fmuld that sits just before
! .cont19, so every exit path below performs it before resuming there.
!   counter <= 4                      : resume at .cont19 (only %i3 is touched)
!   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - stridex2, tmp_counter = counter - 4, counter = 4
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %g1, %f12, %i0, %g5, the first word of tmp2, %f56 and
!       %f30 from it; %i3 and %f8 are used as scratch.
! fitod %f24,%f24 writes a double, so %f25 is overwritten too.
1585 .update19:
1586 sethi %hi(0x7ffffc00),%i3
1587 cmp counter,4
1588 ble,a .cont19
1589 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
1590
1591 add %i3,0x3ff,%i3 ! %i3 = 0x7fffffff
1592
1593 andcc %g1,%i3,%g0 ! magnitude bits all zero?
1594 bz 1f
1595 nop
1596
1597 cmp %g1,0 ! sign bit set?
1598 bl,a 1f
1599 nop
1600
1601 fitod %f24,%f24 ! raw bits, read as an integer, to double
1602 fdtos %f24,%f24 ! ... and back to single
1603 fmuls %f24,FTWO,%f24 ! scale by FTWO
1604 st %f24,[%fp+tmp3] ! move the float bits through memory ...
1605 ld [%fp+tmp3],%g1 ! ... into the integer register
1606 sethi %hi(0x4b000000),%i3 ! %i3 = 0x4b000000
1607 sub %g1,%i3,%g1 ! ax0 -= 0x4b000000
1608
1609 fands %f24,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1610 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13;
1611
1612 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0;
1613
1614 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1615 fpsub32s %f24,%f8,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1616
1617 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24;
1618
1619 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0;
1620
1621 sllx %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55;
1622 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0;
1623 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1624
1625 st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0);
1626 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0;
1627
1628 ba .cont19
1629 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
1630 1:
1631 sub %l7,stridex2,%i3 ! %i3 = %l7 - stridex2
1632 stx %i3,[%fp+tmp_px] ! tmp_px = %i3
1633
1634 sub counter,4,counter
1635 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 4
1636
1637 mov 4,counter ! counter = 4
1638 ba .cont19
1639 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
1640
1641 .align 16
! .update20: entered from the main loop when (3_0) ax1 >= 0x7f800000.
! If counter <= 5 nothing is recorded. Otherwise tmp_px = %l7 - stridex,
! tmp_counter = counter - 5 and counter = 5. Either way resume at .cont20.
! NOTE(review): the reader of tmp_px/tmp_counter is outside this section;
! presumably they restart the outer loop at the offending element -- confirm.
1642 .update20:
1643 cmp counter,5
1644 ble .cont20 ! counter <= 5: leave everything as it is
1645 nop
1646
1647 sub %l7,stridex,%i3 ! %i3 = %l7 - stridex
1648 stx %i3,[%fp+tmp_px] ! tmp_px = %i3
1649
1650 sub counter,5,counter
1651 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 5
1652
1653 ba .cont20
1654 mov 5,counter ! counter = 5 (delay slot, always executed)
1655
1656 .align 16
! .update21: entered from the main loop when (3_0) ax1 < 0x00800000; the
! bits are in %o5 and the value is reloaded from memory at %l7 - stridex.
!   counter <= 5                      : resume at .cont21 (only %i3 is touched)
!   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : cut the batch short at label 1:
!       tmp_px = %l7 - stridex, tmp_counter = counter - 5, counter = 5
!   otherwise : rescale the value (fitod/fdtos, * FTWO, bits - 0x4b000000)
!       and recompute %o5, %o1, the word at tmp2+4, %f24 and %f50 from it;
!       %i3, %i4 and %f8 are used as scratch.
! NOTE(review): %i4 is overwritten whenever counter > 5, so it must not be
! live at .cont21 -- confirm against the main loop.
1657 .update21:
1658 sethi %hi(0x7ffffc00),%i3
1659 cmp counter,5
1660 ble,a .cont21
1661 nop
1662
1663 sub %l7,stridex,%i4 ! %i4 = %l7 - stridex
1664 add %i3,0x3ff,%i3 ! %i3 = 0x7fffffff
1665
1666 andcc %o5,%i3,%g0 ! magnitude bits all zero?
1667 bz 1f
1668 ld [%i4],%f8 ! reload the input value (delay slot, always executed)
1669
1670 cmp %o5,0 ! sign bit set?
1671 bl,a 1f
1672 nop
1673
1674 fitod %f8,%f8 ! raw bits, read as an integer, to double
1675 fdtos %f8,%f8 ! ... and back to single
1676 fmuls %f8,FTWO,%f8 ! scale by FTWO
1677 st %f8,[%fp+tmp3] ! move the float bits through memory ...
1678 ld [%fp+tmp3],%o5 ! ... into the integer register
1679 sethi %hi(0x4b000000),%i3 ! %i3 = 0x4b000000
1680 sub %o5,%i3,%o5 ! ax1 -= 0x4b000000
1681
1682 fands %f8,DC0,%f24 ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1683
1684 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13;
1685
1686 sra %o5,24,%i3 ! (3_0) iexp1 = ax1 >> 24;
1687 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0;
1688 fpsub32s %f8,%f24,%f24 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1689
1690 ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1691 sub %l0,%i3,%i3 ! (3_0) iexp1 = 0x3f - iexp1;
1692
1693 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23;
1694 fitod %f24,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1695
1696 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1;
1697 st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0);
1698
1699 fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1;
1700
1701 ba .cont21
1702 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1;
1703 1:
1704 sub %l7,stridex,%i3 ! %i3 = %l7 - stridex
1705 stx %i3,[%fp+tmp_px] ! tmp_px = %i3
1706
1707 sub counter,5,counter
1708 st counter,[%fp+tmp_counter] ! tmp_counter = counter - 5
1709
1710 ba .cont21
1711 mov 5,counter ! counter = 5 (delay slot, always executed)
1712
1713 .align 16
! .exit: return to the caller. The restore in the delay slot of ret
! releases the register window before control reaches the caller.
! NOTE(review): the branch to .exit is outside this section.
1714 .exit:
1715 ret
1716 restore ! (delay slot, always executed)
1717
! Marks the end of __vrsqrtf. NOTE(review): SET_SIZE is a macro defined
! outside this file section; presumably it emits the symbol size -- confirm.
1718 SET_SIZE(__vrsqrtf)
1719