1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
	.file "__vsqrtf_ultra3.S"

#include "libm.h"

! Export __vsqrtf as a weak alias for this UltraSPARC-III tuned entry
! point; the strong symbol remains __vsqrtf_ultra3.
	.weak	__vsqrtf
	.type	__vsqrtf,#function
__vsqrtf = __vsqrtf_ultra3
35
RO_DATA
	.align	64

! Constants loaded once at entry:
!   K1, K2 - correction-polynomial coefficients (values close to 1/2 and
!            -1/8, the leading terms of sqrt(1+x) about x = 0)
!   DC0    - mask selecting the 52 mantissa bits of a double
!   DC1    - exponent bits of 1.0; OR-ed in to pin the exponent at 0x3ff
!   DC2    - mask keeping the exponent and top 8 mantissa bits (the "hi"
!            part split off before forming xx0 = db0 - hi0)
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1 = 5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2 = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

! Register aliases.  TBL holds the PIC address of the shared table
! __vlibm_TBL_sqrtf; _0x1ff0 / _0x00800000 / _0x7f800000 cache the
! table-index mask and the special-operand range bounds.
#define DC0		%f6
#define DC1		%f4
#define DC2		%f2
#define K2		%f38
#define K1		%f36
#define TBL		%l2
#define stridex		%l3
#define stridey		%l4
#define _0x1ff0		%l5
#define counter		%l6
#define _0x00800000	%l7
#define _0x7f800000	%o0

! Stack temporaries: tmp_px/tmp_counter record where the next .begin
! pass resumes (re-armed by the .updateN handlers); tmp0..tmp4 carry
! the five in-flight exponent-scale doubles (lexp0) from the integer
! side to the FP side, one slot per pipeline stage.
#define tmp_px		STACK_BIAS-0x40
#define tmp_counter	STACK_BIAS-0x38
#define tmp0		STACK_BIAS-0x30
#define tmp1		STACK_BIAS-0x28
#define tmp2		STACK_BIAS-0x20
#define tmp3		STACK_BIAS-0x18
#define tmp4		STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40
69
70 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
71 ! !!!!! algorithm !!!!!
72 !
73 ! x0 = *px;
74 ! ax = *(int*)px;
75 ! px += stridex;
76 !
77 ! if( ax >= 0x7f800000 )
78 ! {
79 ! *py = sqrtf(x0);
80 ! py += stridey;
81 ! continue;
82 ! }
83 ! if( ax < 0x00800000 )
84 ! {
85 ! *py = sqrtf(x0);
86 ! py += stridey;
87 ! continue;
88 ! }
89 !
90 ! db0 = (double)x0;
91 ! iexp0 = ax >> 24;
92 ! iexp0 += 0x3c0;
93 ! lexp0 = (long long)iexp0 << 52;
94 !
95 ! db0 = vis_fand(db0,DC0);
96 ! db0 = vis_for(db0,DC1);
97 ! hi0 = vis_fand(db0,DC2);
98 !
99 ! ax >>= 11;
100 ! si0 = ax & 0x1ff0;
101 ! dtmp0 = ((double*)((char*)TBL + si0))[0];
102 ! xx0 = (db0 - hi0);
103 ! xx0 *= dtmp0;
! dtmp0 = ((double*)((char*)TBL + si0))[1];
105 ! res0 = K2 * xx0;
106 ! res0 += K1;
107 ! res0 *= xx0;
108 ! res0 += DC1;
109 ! res0 = dtmp0 * res0;
110 ! dtmp1 = *((double*)&lexp0);
111 ! res0 *= dtmp1;
112 ! fres0 = (float)res0;
113 ! *py = fres0;
114 ! py += stridey;
115 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
116
! void __vsqrtf_ultra3(int n, float *x, int stridex, float *y, int stridey)
! computes y[i*stridey] = sqrtf(x[i*stridex]) for i = 0 .. n-1
! (see the algorithm pseudo-code above).
! In: %i0 = n, %i1 = x, %i2 = stridex, %i3 = y, %i4 = stridey;
! strides arrive in elements and are widened to byte offsets below.
ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)	! %o2 = address of polynomial/mask constants
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)	! TBL = shared sqrtf table

	st	%i0,[%fp+tmp_counter]	! park n; .begin reloads it
	sll	%i2,2,stridex		! stridex in bytes (floats)
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]	! park px; .begin reloads it
	sll	%l5,1,_0x1ff0		! %l5 = 0x1ff0, the table-index mask

	ldd	[%o2],K1
	sll	%i4,2,stridey		! stridey in bytes (floats)

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5		! %g5 = py

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7

	ldd	[%o2+32],DC2
! Top of a processing pass: fetch the remaining element count and resume
! pointer, then clear tmp_counter so the pass terminates at .exit unless
! an .updateN handler re-arms it for a follow-up pass.
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit
	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px; (delay slot)

! Route the first element to the scalar .spec path if its bit pattern is
! special: ax >= 0x7f800000 or ax < 0x00800000 (signed compares).
	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000	! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop
164
! Software-pipeline fill: launch the first five elements so .main_loop
! can run at full depth.  Stage tags "(k_j)" in the per-line comments
! name pipeline slot k (0..4), generation j.  If fewer than five
! elements remain, branch to .tail to drain instead.
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

! Pipeline is full: enter the steady-state loop only if at least five
! elements remain, otherwise drain what is already in flight.
	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter
387
	.align	16
! Steady-state software-pipelined loop: each pass fetches five new
! inputs, advances five partial results one stage, and stores five
! finished results (f15, f19, f23, f27, f8), decrementing counter by 5
! at the bottom.  Instruction order encodes the UltraSPARC-III schedule
! and delay-slot usage -- do not reorder.
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0; (delay slot)

	add	counter,5,counter	! undo the final decrement before .tail
! Fewer than five elements remain: finish and store the results still in
! flight one per step.  Each subcc/bneg pair stops the drain as soon as
! counter underflows; the annulled delay slots (",a") update %g5 (py)
! only when the branch back to .begin is taken.
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5
648
	.align	16
! Scalar fallback for special first elements: ax >= 0x7f800000 (+Inf,
! NaN) or ax < 0x00800000 (signed compare: +-0, subnormals, and all
! negative inputs).  Hardware fsqrts produces the correct result and
! raises any required FP exceptions; then resume the vector path.
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5		! py += stridey (delay slot)
657
	.align	16
! .update0 .. .update21 handlers: entered from the pipelined code when a
! newly fetched element is special (>= 0x7f800000 for the even-numbered
! handlers, < 0x00800000 for the odd ones).  The fzeros in the branch
! delay slot (not annulled, so always executed) substitutes 0.0f for the
! offending operand, keeping the in-flight FP operations harmless.  If
! more than N results are still owed (counter > N, where N is the number
! of good results ahead of the bad element), the handler records the bad
! element's address and the remaining count in tmp_px/tmp_counter and
! clamps counter to N, so this pass stores only the good results and the
! next .begin pass re-examines the special element via .spec.  The
! sethi/clr gives the saved ax word a benign value so the element's
! other range test and later exponent arithmetic stay harmless.
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

! The .update12 .. .update21 handlers mirror .update2 .. .update11 but
! are reached from .main_loop, where counter has already been biased
! down by 5 for the pass in progress.
	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter
987
.exit:
	ret
	restore			! pop the register window in the delay slot
	SET_SIZE(__vsqrtf_ultra3)
992