1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 .file "__vsqrtf_ultra3.S"
30
31 #include "libm.h"
32 #if defined(LIBMVEC_SO_BUILD)
33 .weak __vsqrtf
34 .type __vsqrtf,#function
35 __vsqrtf = __vsqrtf_ultra3
36 #endif
37
38 RO_DATA
39 .align 64
40
41 .CONST_TBL:
42 .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01
43 .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01
44 .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff
45 .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000
46 .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000
47
48 #define DC0 %f6
49 #define DC1 %f4
50 #define DC2 %f2
51 #define K2 %f38
52 #define K1 %f36
53 #define TBL %l2
54 #define stridex %l3
55 #define stridey %l4
56 #define _0x1ff0 %l5
57 #define counter %l6
58 #define _0x00800000 %l7
59 #define _0x7f800000 %o0
60
61 #define tmp_px STACK_BIAS-0x40
62 #define tmp_counter STACK_BIAS-0x38
63 #define tmp0 STACK_BIAS-0x30
64 #define tmp1 STACK_BIAS-0x28
65 #define tmp2 STACK_BIAS-0x20
66 #define tmp3 STACK_BIAS-0x18
67 #define tmp4 STACK_BIAS-0x10
68
69 ! sizeof temp storage - must be a multiple of 16 for V9
70 #define tmps 0x40
71
72 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
73 ! !!!!! algorithm !!!!!
74 !
75 ! x0 = *px;
76 ! ax = *(int*)px;
77 ! px += stridex;
78 !
79 ! if( ax >= 0x7f800000 )
80 ! {
81 ! *py = sqrtf(x0);
82 ! py += stridey;
83 ! continue;
84 ! }
85 ! if( ax < 0x00800000 )
86 ! {
87 ! *py = sqrtf(x0);
88 ! py += stridey;
89 ! continue;
90 ! }
91 !
92 ! db0 = (double)x0;
93 ! iexp0 = ax >> 24;
94 ! iexp0 += 0x3c0;
95 ! lexp0 = (long long)iexp0 << 52;
96 !
97 ! db0 = vis_fand(db0,DC0);
98 ! db0 = vis_for(db0,DC1);
99 ! hi0 = vis_fand(db0,DC2);
100 !
101 ! ax >>= 11;
102 ! si0 = ax & 0x1ff0;
103 ! dtmp0 = ((double*)((char*)TBL + si0))[0];
104 ! xx0 = (db0 - hi0);
105 ! xx0 *= dtmp0;
! dtmp0 = ((double*)((char*)TBL + si0))[1];
107 ! res0 = K2 * xx0;
108 ! res0 += K1;
109 ! res0 *= xx0;
110 ! res0 += DC1;
111 ! res0 = dtmp0 * res0;
112 ! dtmp1 = *((double*)&lexp0);
113 ! res0 *= dtmp1;
114 ! fres0 = (float)res0;
115 ! *py = fres0;
116 ! py += stridey;
117 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
118
119 ENTRY(__vsqrtf_ultra3)
120 save %sp,-SA(MINFRAME)-tmps,%sp
121 PIC_SETUP(l7)
122 PIC_SET(l7,.CONST_TBL,o2)
123 PIC_SET(l7,__vlibm_TBL_sqrtf,l2)
124
125 st %i0,[%fp+tmp_counter]
126 sll %i2,2,stridex
127 or %g0,0xff8,%l5
128
129 stx %i1,[%fp+tmp_px]
130 sll %l5,1,_0x1ff0
131
132 ldd [%o2],K1
133 sll %i4,2,stridey
134
135 ldd [%o2+8],K2
136 or %g0,%i3,%g5
137
138 ldd [%o2+16],DC0
139 sethi %hi(0x7f800000),%o0
140
141 ldd [%o2+24],DC1
142 sethi %hi(0x00800000),%l7
143
144 ldd [%o2+32],DC2
145
146 .begin:
147 ld [%fp+tmp_counter],counter
148 ldx [%fp+tmp_px],%i1
149 st %g0,[%fp+tmp_counter]
150 .begin1:
151 cmp counter,0
152 ble,pn %icc,.exit
153
154 lda [%i1]0x82,%o2 ! (2_0) ax = *(int*)px;
155
156 or %g0,%i1,%o7
157 lda [%i1]0x82,%f25 ! (2_0) x0 = *px;
158
159 cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000
160 bge,pn %icc,.spec ! (2_0) if( ax >= 0x7f800000 )
161 nop
162
163 cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000
164 bl,pn %icc,.spec ! (2_0) if( ax < 0x00800000 )
165 nop
166
167 fstod %f25,%f56 ! (2_0) db0 = (double)x0;
168
169 lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px;
170
171 sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24;
172
173 add %o7,stridex,%i1 ! px += stridex
174 add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0;
175 lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px;
176 fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0);
177
178 cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000
179 bge,pn %icc,.update0 ! (3_0) if( ax >= 0x7f800000 )
180 nop
181 .cont0:
182 sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52;
183
184 sra %o2,11,%i2 ! (2_0) ax >>= 11;
185 stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0);
186 for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1);
187
188 cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000
189 bl,pn %icc,.update1 ! (3_0) if( ax < 0x00800000 )
190 nop
191 .cont1:
192 fstod %f0,%f48 ! (3_0) db0 = (double)x0;
193
194 and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0;
195 lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px;
196
197 add %i1,stridex,%i1 ! px += stridex
198 add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0
199 fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2);
200
201 sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24;
202
203 lda [%i1]0x82,%f13 ! (4_0) x0 = *px;
204 fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0);
205
206 add %o4,960,%i0 ! (3_0) iexp0 += 0x3c0;
207
208 cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000
209 bge,pn %icc,.update2 ! (4_1) if( ax >= 0x7f800000 )
210 nop
211 .cont2:
212 fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0);
213 sllx %i0,52,%g1 ! (3_1) lexp0 = (long long)iexp0 << 52;
214 ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
215
216 sra %o1,11,%l0 ! (3_1) ax >>= 11;
217 stx %g1,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0);
218 for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1);
219
220 cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000
221 bl,pn %icc,.update3 ! (4_1) if( ax < 0x00800000 )
222 nop
223 .cont3:
224 fstod %f13,%f50 ! (4_1) db0 = (double)x0;
225
226 fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0;
227 and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0;
228 lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px;
229
230 add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0
231 fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2);
232
233 sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24;
234
235 add %i1,stridex,%o4 ! px += stridex
236 add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0;
237 lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px;
238 fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0);
239
240 fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0;
241 cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000
242 bge,pn %icc,.update4 ! (0_0) if( ax >= 0x7f800000 )
243 fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0);
244 .cont4:
245 sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52;
246 ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
247
248 sra %o2,11,%i5 ! (4_1) ax >>= 11;
249 stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0);
250 for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1);
251
252 cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000
253 bl,pn %icc,.update5 ! (0_0) if( ax < 0x00800000 )
254 nop
255 .cont5:
256 fstod %f17,%f56 ! (0_0) db0 = (double)x0;
257
258 fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0;
259 lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px;
260 faddd %f52,K1,%f52 ! (2_1) res0 += K1;
261
262 sra %l1,24,%g1 ! (0_0) iexp0 = ax >> 24;
263 and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0;
264 fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2);
265
266 add %o4,stridex,%i1 ! px += stridex
267
268 add %g1,960,%o5 ! (0_0) iexp0 += 0x3c0;
269 add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0
270 lda [stridex+%o4]0x82,%f21 ! (1_0) x0 = *px;
271 fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0);
272
273 fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0;
274 cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000
275 bge,pn %icc,.update6 ! (1_0) if( ax >= 0x7f800000 )
276 fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0);
277 .cont6:
278 fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0;
279 sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52;
280 ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
281
282 sra %l1,11,%i4 ! (0_0) ax >>= 11;
283 stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0);
284 for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1);
285
286 cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000
287 bl,pn %icc,.update7 ! (1_0) if( ax < 0x00800000 )
288 nop
289 .cont7:
290 fstod %f21,%f56 ! (1_0) db0 = (double)x0;
291
292 fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0;
293 and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0;
294 lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px;
295 faddd %f50,K1,%f62 ! (3_1) res0 += K1;
296
297 add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0
298 fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2);
299
300 sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24;
301 ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
302 faddd %f52,DC1,%f58 ! (2_1) res0 += DC1;
303
304 add %i1,stridex,%o7 ! px += stridex
305 add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0;
306 lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px;
307 fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0);
308
309 fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0;
310 cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000
311 bge,pn %icc,.update8 ! (2_0) if( ax >= 0x7f800000 )
312 fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0);
313 .cont8:
314 fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0;
315 sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52;
316 ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
317
318 fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0;
319 sra %i0,11,%g1 ! (1_0) ax >>= 11;
320 stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0);
321 for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1);
322
323 cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000
324 bl,pn %icc,.update9 ! (2_0) if( ax < 0x00800000 )
325 ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0);
326 fstod %f25,%f56 ! (2_0) db0 = (double)x0;
327 .cont9:
328 fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0;
329 and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0;
330 lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px;
331 faddd %f50,K1,%f34 ! (4_1) res0 += K1;
332
333 add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0
334 fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2);
335
336 fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1;
337 sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24;
338 ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
339 faddd %f54,DC1,%f58 ! (3_1) res0 += DC1;
340
341 add %o7,stridex,%i1 ! px += stridex
342 add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0;
343 lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px;
344 fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0);
345
346 fmuld K2,%f42,%f50 ! (0_0) res0 = K2 * xx0;
347 cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000
348 bge,pn %icc,.update10 ! (3_0) if( ax >= 0x7f800000 )
349 fsubd %f48,%f62,%f54 ! (1_0) xx0 = (db0 - hi0);
350 .cont10:
351 fmuld %f34,%f46,%f52 ! (4_1) res0 *= xx0;
352 sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52;
353 ldd [TBL+%o5],%f56 ! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
354
355 fmuld %f40,%f58,%f34 ! (3_1) res0 = dtmp0 * res0;
356 sra %o2,11,%i2 ! (2_0) ax >>= 11;
357 stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0);
358 for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1);
359
360 cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000
361 bl,pn %icc,.update11 ! (3_0) if( ax < 0x00800000 )
362 ldd [%fp+tmp1],%f62 ! (3_1) dtmp1 = *((double*)&lexp0);
363 fstod %f0,%f48 ! (3_0) db0 = (double)x0;
364 .cont11:
365 fmuld %f54,%f56,%f30 ! (1_0) xx0 *= dtmp0;
366 and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0;
367 lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px;
368 faddd %f50,K1,%f56 ! (0_0) res0 += K1;
369
370 add %i1,stridex,%i1 ! px += stridex
371 add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0
372 fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2);
373
374 fmuld %f34,%f62,%f28 ! (3_1) res0 *= dtmp1;
375 sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24;
376 ldd [%i3+8],%f50 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
377 faddd %f52,DC1,%f54 ! (4_1) res0 += DC1;
378
379 lda [%i1]0x82,%f13 ! (4_0) x0 = *px;
380 fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0);
381
382 or %g0,%g5,%i3
383 cmp counter,5
384 bl,pn %icc,.tail
385 add %o4,960,%g5 ! (3_0) iexp0 += 0x3c0;
386
387 ba .main_loop
388 sub counter,5,counter ! counter
389
390 .align 16
391 .main_loop:
392 fmuld K2,%f30,%f60 ! (1_1) res0 = K2 * xx0;
393 cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000
394 bge,pn %icc,.update12 ! (4_1) if( ax >= 0x7f800000 )
395 fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0);
396 .cont12:
397 fmuld %f56,%f42,%f52 ! (0_1) res0 *= xx0;
398 sllx %g5,52,%g5 ! (3_1) lexp0 = (long long)iexp0 << 52;
399 ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
400 fdtos %f32,%f15 ! (2_2) fres0 = (float)res0;
401
402 fmuld %f50,%f54,%f42 ! (4_2) res0 = dtmp0 * res0;
403 sra %o1,11,%l0 ! (3_1) ax >>= 11;
404 stx %g5,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0);
405 for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1);
406
407 cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000
408 bl,pn %icc,.update13 ! (4_1) if( ax < 0x00800000 )
409 ldd [%fp+tmp2],%f56 ! (4_2) dtmp1 = *((double*)&lexp0);
410 fstod %f13,%f50 ! (4_1) db0 = (double)x0;
411 .cont13:
412 fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0;
413 and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0;
414 lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px;
415 faddd %f60,K1,%f32 ! (1_1) res0 += K1;
416
417 add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0
418 add %i3,stridey,%o3 ! py += stridey
419 st %f15,[%i3] ! (2_2) *py = fres0;
420 fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2);
421
422 fmuld %f42,%f56,%f44 ! (4_2) res0 *= dtmp1;
423 sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24;
424 ldd [%i5+8],%f58 ! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
425 faddd %f52,DC1,%f34 ! (0_1) res0 += DC1;
426
427 add %i1,stridex,%o4 ! px += stridex
428 add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0;
429 lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px;
430 fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0);
431
432 fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0;
433 cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000
434 bge,pn %icc,.update14 ! (0_0) if( ax >= 0x7f800000 )
435 fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0);
436 .cont14:
437 fmuld %f32,%f30,%f48 ! (1_1) res0 *= xx0;
438 sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52;
439 ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
440 fdtos %f28,%f19 ! (3_2) fres0 = (float)res0;
441
442 fmuld %f58,%f34,%f32 ! (0_1) res0 = dtmp0 * res0;
443 sra %o2,11,%i5 ! (4_1) ax >>= 11;
444 stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0);
445 for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1);
446
447 cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000
448 bl,pn %icc,.update15 ! (0_0) if( ax < 0x00800000 )
449 ldd [%fp+tmp3],%f60 ! (0_1) dtmp1 = *((double*)&lexp0);
450 fstod %f17,%f56 ! (0_0) db0 = (double)x0;
451 .cont15:
452 fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0;
453 add %o3,stridey,%g5 ! py += stridey
454 lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px;
455 faddd %f52,K1,%f52 ! (2_1) res0 += K1;
456
457 sra %l1,24,%g1 ! (0_0) iexp0 = ax >> 24;
458 and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0;
459 st %f19,[%o3] ! (3_2) *py = fres0;
460 fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2);
461
462 fmuld %f32,%f60,%f40 ! (0_1) res0 *= dtmp1;
463 add %o4,stridex,%i1 ! px += stridex
464 ldd [%i4+8],%f60 ! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
465 faddd %f48,DC1,%f58 ! (1_1) res0 += DC1;
466
467 add %g1,960,%o5 ! (0_0) iexp0 += 0x3c0;
468 add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0
469 lda [stridex+%o4]0x82,%f21 ! (1_0) x0 = *px;
470 fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0);
471
472 fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0;
473 cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000
474 bge,pn %icc,.update16 ! (1_0) if( ax >= 0x7f800000 )
475 fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0);
476 .cont16:
477 fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0;
478 sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52;
479 ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
480 fdtos %f44,%f23 ! (4_2) fres0 = (float)res0;
481
482 fmuld %f60,%f58,%f44 ! (1_1) res0 = dtmp0 * res0;
483 sra %l1,11,%i4 ! (0_0) ax >>= 11;
484 stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0);
485 for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1);
486
487 cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000
488 bl,pn %icc,.update17 ! (1_0) if( ax < 0x00800000 )
489 ldd [%fp+tmp4],%f34 ! (1_1) dtmp1 = *((double*)&lexp0);
490 fstod %f21,%f56 ! (1_0) db0 = (double)x0;
491 .cont17:
492 fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0;
493 and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0;
494 lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px;
495 faddd %f50,K1,%f62 ! (3_1) res0 += K1;
496
497 add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0
498 add %g5,stridey,%g5 ! py += stridey
499 st %f23,[stridey+%o3] ! (4_2) *py = fres0;
500 fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2);
501
502 fmuld %f44,%f34,%f44 ! (1_1) res0 *= dtmp1;
503 sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24;
504 ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
505 faddd %f52,DC1,%f58 ! (2_1) res0 += DC1;
506
507 add %i1,stridex,%o7 ! px += stridex
508 add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0;
509 lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px;
510 fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0);
511
512 fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0;
513 cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000
514 bge,pn %icc,.update18 ! (2_0) if( ax >= 0x7f800000 )
515 fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0);
516 .cont18:
517 fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0;
518 sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52;
519 ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
520 fdtos %f40,%f27 ! (0_1) fres0 = (float)res0;
521
522 fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0;
523 sra %i0,11,%g1 ! (1_0) ax >>= 11;
524 stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0);
525 for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1);
526
527 cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000
528 bl,pn %icc,.update19 ! (2_0) if( ax < 0x00800000 )
529 ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0);
530 fstod %f25,%f56 ! (2_0) db0 = (double)x0;
531 .cont19:
532 fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0;
533 and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0;
534 lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px;
535 faddd %f50,K1,%f34 ! (4_1) res0 += K1;
536
537 add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0
538 add %g5,stridey,%g1 ! py += stridey
539 st %f27,[%g5] ! (0_1) *py = fres0;
540 fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2);
541
542 fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1;
543 sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24;
544 ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
545 faddd %f54,DC1,%f58 ! (3_1) res0 += DC1;
546
547 add %o7,stridex,%i1 ! px += stridex
548 add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0;
549 lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px;
550 fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0);
551
552 fmuld K2,%f42,%f50 ! (0_0) res0 = K2 * xx0;
553 cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000
554 bge,pn %icc,.update20 ! (3_0) if( ax >= 0x7f800000 )
555 fsubd %f48,%f62,%f54 ! (1_0) xx0 = (db0 - hi0);
556 .cont20:
557 fmuld %f34,%f46,%f52 ! (4_1) res0 *= xx0;
558 sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52;
559 ldd [TBL+%o5],%f56 ! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
560 fdtos %f44,%f8 ! (1_1) fres0 = (float)res0;
561
562 fmuld %f40,%f58,%f34 ! (3_1) res0 = dtmp0 * res0;
563 sra %o2,11,%i2 ! (2_0) ax >>= 11;
564 stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0);
565 for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1);
566
567 cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000
568 bl,pn %icc,.update21 ! (3_0) if( ax < 0x00800000 )
569 ldd [%fp+tmp1],%f62 ! (3_1) dtmp1 = *((double*)&lexp0);
570 fstod %f0,%f48 ! (3_0) db0 = (double)x0;
571 .cont21:
572 fmuld %f54,%f56,%f30 ! (1_0) xx0 *= dtmp0;
573 and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0;
574 lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px;
575 faddd %f50,K1,%f56 ! (0_0) res0 += K1;
576
577 add %i1,stridex,%i1 ! px += stridex
578 add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0
579 st %f8,[stridey+%g5] ! (1_1) *py = fres0;
580 fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2);
581
582 fmuld %f34,%f62,%f28 ! (3_1) res0 *= dtmp1;
583 sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24;
584 ldd [%i3+8],%f50 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
585 faddd %f52,DC1,%f54 ! (4_1) res0 += DC1;
586
587 add %g1,stridey,%i3 ! py += stridey
588 subcc counter,5,counter ! counter
589 lda [%i1]0x82,%f13 ! (4_0) x0 = *px;
590 fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0);
591
592 bpos,pt %icc,.main_loop
593 add %o4,960,%g5 ! (3_0) iexp0 += 0x3c0;
594
595 add counter,5,counter
596 .tail:
597 subcc counter,1,counter
598 bneg,a .begin
599 or %g0,%i3,%g5
600
601 fmuld %f56,%f42,%f52 ! (0_1) res0 *= xx0;
602 fdtos %f32,%f15 ! (2_2) fres0 = (float)res0;
603
604 fmuld %f50,%f54,%f42 ! (4_2) res0 = dtmp0 * res0;
605
606 ldd [%fp+tmp2],%f56 ! (4_2) dtmp1 = *((double*)&lexp0);
607
608 add %i3,stridey,%o3 ! py += stridey
609 st %f15,[%i3] ! (2_2) *py = fres0;
610
611 subcc counter,1,counter
612 bneg,a .begin
613 or %g0,%o3,%g5
614
615 fmuld %f42,%f56,%f44 ! (4_2) res0 *= dtmp1;
616 ldd [%i5+8],%f58 ! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
617 faddd %f52,DC1,%f34 ! (0_1) res0 += DC1;
618
619 fdtos %f28,%f19 ! (3_2) fres0 = (float)res0;
620
621 fmuld %f58,%f34,%f32 ! (0_1) res0 = dtmp0 * res0;
622
623 ldd [%fp+tmp3],%f60 ! (0_1) dtmp1 = *((double*)&lexp0);
624
625 add %o3,stridey,%g5 ! py += stridey
626
627 st %f19,[%o3] ! (3_2) *py = fres0;
628
629 subcc counter,1,counter
630 bneg,a .begin
631 nop
632
633 fmuld %f32,%f60,%f40 ! (0_1) res0 *= dtmp1;
634
635 fdtos %f44,%f23 ! (4_2) fres0 = (float)res0;
636
637 add %g5,stridey,%g5 ! py += stridey
638 st %f23,[stridey+%o3] ! (4_2) *py = fres0;
639
640 subcc counter,1,counter
641 bneg,a .begin
642 nop
643
644 fdtos %f40,%f27 ! (0_1) fres0 = (float)res0;
645
646 st %f27,[%g5] ! (0_1) *py = fres0;
647
648 ba .begin
649 add %g5,stridey,%g5
650
651 .align 16
652 .spec:
653 fsqrts %f25,%f25
654 sub counter,1,counter
655 add %i1,stridex,%i1
656 st %f25,[%g5]
657 ba .begin1
658 add %g5,stridey,%g5
659
660 .align 16
661 .update0:
662 cmp counter,1
663 ble .cont0
664 fzeros %f0
665
666 stx %i1,[%fp+tmp_px]
667 sethi %hi(0x7f800000),%o1
668
669 sub counter,1,counter
670 st counter,[%fp+tmp_counter]
671
672 ba .cont0
673 or %g0,1,counter
674
675 .align 16
676 .update1:
677 cmp counter,1
678 ble .cont1
679 fzeros %f0
680
681 stx %i1,[%fp+tmp_px]
682 clr %o1
683
684 sub counter,1,counter
685 st counter,[%fp+tmp_counter]
686
687 ba .cont1
688 or %g0,1,counter
689
690 .align 16
691 .update2:
692 cmp counter,2
693 ble .cont2
694 fzeros %f13
695
696 stx %i1,[%fp+tmp_px]
697 sethi %hi(0x7f800000),%o2
698
699 sub counter,2,counter
700 st counter,[%fp+tmp_counter]
701
702 ba .cont2
703 or %g0,2,counter
704
705 .align 16
706 .update3:
707 cmp counter,2
708 ble .cont3
709 fzeros %f13
710
711 stx %i1,[%fp+tmp_px]
712 clr %o2
713
714 sub counter,2,counter
715 st counter,[%fp+tmp_counter]
716
717 ba .cont3
718 or %g0,2,counter
719
720 .align 16
721 .update4:
722 cmp counter,3
723 ble .cont4
724 fzeros %f17
725
726 stx %o4,[%fp+tmp_px]
727 sethi %hi(0x7f800000),%l1
728
729 sub counter,3,counter
730 st counter,[%fp+tmp_counter]
731
732 ba .cont4
733 or %g0,3,counter
734
735 .align 16
736 .update5:
737 cmp counter,3
738 ble .cont5
739 fzeros %f17
740
741 stx %o4,[%fp+tmp_px]
742 clr %l1
743
744 sub counter,3,counter
745 st counter,[%fp+tmp_counter]
746
747 ba .cont5
748 or %g0,3,counter
749
750 .align 16
751 .update6:
752 cmp counter,4
753 ble .cont6
754 fzeros %f21
755
756 stx %i1,[%fp+tmp_px]
757 sethi %hi(0x7f800000),%i0
758
759 sub counter,4,counter
760 st counter,[%fp+tmp_counter]
761
762 ba .cont6
763 or %g0,4,counter
764
765 .align 16
766 .update7:
767 cmp counter,4
768 ble .cont7
769 fzeros %f21
770
771 stx %i1,[%fp+tmp_px]
772 clr %i0
773
774 sub counter,4,counter
775 st counter,[%fp+tmp_counter]
776
777 ba .cont7
778 or %g0,4,counter
779
780 .align 16
781 .update8:
782 cmp counter,5
783 ble .cont8
784 fzeros %f25
785
786 stx %o7,[%fp+tmp_px]
787 sethi %hi(0x7f800000),%o2
788
789 sub counter,5,counter
790 st counter,[%fp+tmp_counter]
791
792 ba .cont8
793 or %g0,5,counter
794
795 .align 16
796 .update9:
797 cmp counter,5
798 ble .cont9
799 fzeros %f25
800
801 stx %o7,[%fp+tmp_px]
802 clr %o2
803
804 sub counter,5,counter
805 st counter,[%fp+tmp_counter]
806
807 ba .cont9
808 or %g0,5,counter
809
810 .align 16
811 .update10:
812 cmp counter,6
813 ble .cont10
814 fzeros %f0
815
816 stx %i1,[%fp+tmp_px]
817 sethi %hi(0x7f800000),%o1
818
819 sub counter,6,counter
820 st counter,[%fp+tmp_counter]
821
822 ba .cont10
823 or %g0,6,counter
824
825 .align 16
826 .update11:
827 cmp counter,6
828 ble .cont11
829 fzeros %f0
830
831 stx %i1,[%fp+tmp_px]
832 clr %o1
833
834 sub counter,6,counter
835 st counter,[%fp+tmp_counter]
836
837 ba .cont11
838 or %g0,6,counter
839
840 .align 16
841 .update12:
842 cmp counter,2
843 ble .cont12
844 fzeros %f13
845
846 stx %i1,[%fp+tmp_px]
847 sethi %hi(0x7f800000),%o2
848
849 sub counter,2,counter
850 st counter,[%fp+tmp_counter]
851
852 ba .cont12
853 or %g0,2,counter
854
855 .align 16
856 .update13:
857 cmp counter,2
858 ble .cont13
859 fzeros %f13
860
861 stx %i1,[%fp+tmp_px]
862 clr %o2
863
864 sub counter,2,counter
865 st counter,[%fp+tmp_counter]
866
867 ba .cont13
868 or %g0,2,counter
869
870 .align 16
871 .update14:
872 cmp counter,3
873 ble .cont14
874 fzeros %f17
875
876 stx %o4,[%fp+tmp_px]
877 sethi %hi(0x7f800000),%l1
878
879 sub counter,3,counter
880 st counter,[%fp+tmp_counter]
881
882 ba .cont14
883 or %g0,3,counter
884
885 .align 16
886 .update15:
887 cmp counter,3
888 ble .cont15
889 fzeros %f17
890
891 stx %o4,[%fp+tmp_px]
892 clr %l1
893
894 sub counter,3,counter
895 st counter,[%fp+tmp_counter]
896
897 ba .cont15
898 or %g0,3,counter
899
900 .align 16
901 .update16:
902 cmp counter,4
903 ble .cont16
904 fzeros %f21
905
906 stx %i1,[%fp+tmp_px]
907 sethi %hi(0x7f800000),%i0
908
909 sub counter,4,counter
910 st counter,[%fp+tmp_counter]
911
912 ba .cont16
913 or %g0,4,counter
914
915 .align 16
916 .update17:
917 cmp counter,4
918 ble .cont17
919 fzeros %f21
920
921 stx %i1,[%fp+tmp_px]
922 clr %i0
923
924 sub counter,4,counter
925 st counter,[%fp+tmp_counter]
926
927 ba .cont17
928 or %g0,4,counter
929
930 .align 16
931 .update18:
932 cmp counter,5
933 ble .cont18
934 fzeros %f25
935
936 stx %o7,[%fp+tmp_px]
937 sethi %hi(0x7f800000),%o2
938
939 sub counter,5,counter
940 st counter,[%fp+tmp_counter]
941
942 ba .cont18
943 or %g0,5,counter
944
945 .align 16
946 .update19:
947 cmp counter,5
948 ble .cont19
949 fzeros %f25
950
951 stx %o7,[%fp+tmp_px]
952 clr %o2
953
954 sub counter,5,counter
955 st counter,[%fp+tmp_counter]
956
957 ba .cont19
958 or %g0,5,counter
959
960 .align 16
961 .update20:
962 cmp counter,6
963 ble .cont20
964 fzeros %f0
965
966 stx %i1,[%fp+tmp_px]
967 sethi %hi(0x7f800000),%o1
968
969 sub counter,6,counter
970 st counter,[%fp+tmp_counter]
971
972 ba .cont20
973 or %g0,6,counter
974
975 .align 16
976 .update21:
977 cmp counter,6
978 ble .cont21
979 fzeros %f0
980
981 stx %i1,[%fp+tmp_px]
982 clr %o1
983
984 sub counter,6,counter
985 st counter,[%fp+tmp_counter]
986
987 ba .cont21
988 or %g0,6,counter
989
990 .exit:
991 ret
992 restore
993 SET_SIZE(__vsqrtf_ultra3)
994