1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 */
24 /*
25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
26 * Use is subject to license terms.
27 */
28
29 .file "__vatan.S"
30
31 #include "libm.h"
32
33 RO_DATA
34
35 ! following is the C version of the ATAN algorithm
36 ! #include <math.h>
37 ! #include <stdio.h>
38 ! double jkatan(double *x)
39 ! {
40 ! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
41 ! int index, sign, intf, intz;
42 ! extern const double __vlibm_TBL_atan1[];
43 ! long *pf = (long *) &f, *pz = (long *) &z;
44 !
45 ! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
46 ! * Error = -3.08254E-18 On the interval |x| < 1/64 */
47 !
48 ! /* define dummy names for readability. Use parray to help compiler optimize loads */
49 ! #define p3 parray[0]
50 ! #define p2 parray[1]
51 ! #define p1 parray[2]
52 ! #define soffset 3
53 !
54 ! static const double parray[] = {
55 ! -1.428029046844299722E-01, /* p[3] */
56 ! 1.999999917247000615E-01, /* p[2] */
57 ! -3.333333333329292858E-01, /* p[1] */
58 ! 1.0, /* not used for p[0], though */
59 ! -1.0, /* used to flip sign of answer */
60 ! };
61 !
62 ! f = *x; /* fetch argument */
63 ! intf = pf[0]; /* grab upper half */
64 ! sign = intf & 0x80000000; /* sign of argument */
65 ! intf ^= sign; /* abs(upper argument) */
66 ! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */
67 ! pf[0] = intf;
68 !
69 ! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
70 ! {
71 ! if( (intf > 0x7ff00000) ||
72 ! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
73 ! if( intf < 0x3e300000 ) /* avoid underflow for small arg */
74 ! {
75 ! dummy = 1.0e37 + f;
76 ! dummy = dummy;
77 ! return (*x);
78 ! }
79 ! if( intf > 0x43600000 ) /* avoid underflow for big arg */
80 ! {
81 ! index = 2;
82 ! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */
83 ! f = parray[soffset + sign] * f; /* put sign bit on ans */
84 ! return (f);
85 ! }
86 ! }
87 !
88 ! index = 0; /* points to 0,0 in table */
89 ! if (intf > 0x40500000) /* if(|x| > 64 */
90 ! { f = -1.0/f;
91 ! index = 2; /* point to pi/2 upper, lower */
92 ! }
93 ! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */
94 ! {
95 ! intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */
96 ! pz[0] = intz; /* store as a double (z) */
97 ! pz[1] = 0; /* ...lower */
98 ! f = (f - z)/(1.0 + f*z); /* get reduced argument */
99 ! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */
100 ! index += 4; /* skip over 0,0,pi/2,pi/2 */
101 ! }
102 ! conup = __vlibm_TBL_atan1[index]; /* upper table */
103 ! conlo = __vlibm_TBL_atan1[index+1]; /* lower table */
104 ! tmp = f*f;
105 ! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1);
106 ! ansu = conup + f; /* compute atan(f) upper */
107 ! ansl = (((conup - ansu) + f) + poly) + conlo;
108 ! ans = ansu + ansl;
109 ! ans = parray[soffset + sign] * ans;
110 ! return ans;
111 ! }
112
113 /* 8 bytes = 1 double f.p. word */
114 #define WSIZE 8
115
116 .align 32 !align with full D-cache line
117 .COEFFS:
118 .double 0r-1.428029046844299722E-01 !p[3]
119 .double 0r1.999999917247000615E-01 !p[2]
120 .double 0r-3.333333333329292858E-01 !p[1]
121 .double 0r-1.0, !constant -1.0
122 .word 0x00008000,0x0 !for fp rounding of reduced arg
123 .word 0x7fff0000,0x0 !for fp truncation
124 .word 0x47900000,0 !a number close to 1.0E37
125 .word 0x80000000,0x0 !mask for fp sign bit
126 .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument
127 .type .COEFFS,#object
128
129 ENTRY(__vatan)
130 save %sp,-SA(MINFRAME)-16,%sp
131 PIC_SETUP(g5)
132 PIC_SET(g5,__vlibm_TBL_atan1,o4)
133 PIC_SET(g5,.COEFFS,o0)
134 /*
135 __vatan(int n, double *x, int stridex, double *y, stridey)
136 computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey
137 are the distance between x and y elements
138
139 %i0 n
140 %i1 address of x
141 %i2 stride x
142 %i3 address of y
143 %i4 stride y
144 */
145 cmp %i0,0 !if n <=0,
146 ble,pn %icc,.RETURN !....then do nothing
147 sll %i2,3,%i2 !convert stride to byte count
148 sll %i4,3,%i4 !convert stride to byte count
149
150 /* pre-load constants before beginning main loop */
151
152 ldd [%o0],%f58 !load p[3]
153 mov 2,%i5 !argcount = 3
154
155 ldd [%o0+WSIZE],%f60 !load p[2]
156 add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy
157 fzero %f18 !ansu1 = 0
158
159 ldd [%o0+2*WSIZE],%f62 !load p[1]
160 add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy
161 fzero %f12 !(poly1) = 0
162
163 ldd [%o0+3*WSIZE],%f56 !-1.0
164 fzero %f14 !tmp1 = 0
165
166 ldd [%o0+4*WSIZE],%f52 !load rounding mask
167 fzero %f16 !conup1 = 0
168
169 ldd [%o0+5*WSIZE],%f54 !load truncation mask
170 fzero %f36 !f1 = 0
171
172 ldd [%o0+6*WSIZE],%f50 !1.0e37
173 fzero %f38 !f2 = 0
174
175 ldd [%o0+7*WSIZE],%f32 !mask for sign bit
176
177 ldd [%o4+2*WSIZE],%f46 !pi/2 upper
178 ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower
179 sethi %hi(0x40500000),%l6 !64.0
180 sethi %hi(0x3f900000),%l7 !1/64.0
181 mov 0,%l4 !index1 = 0
182 mov 0,%l5 !index2 = 0
183
184 .MAINLOOP:
185
186 /*--------------------------------------------------------------------------*/
187 /*--------------------------------------------------------------------------*/
188 /*--------------------------------------------------------------------------*/
189
190 .LOOP0:
191 deccc %i0 !--n
192 bneg 1f
193 mov %i1,%o5 !xuse = x (delay slot)
194
195 ba 2f
196 nop !delay slot
197 1:
198 PIC_SET(g5,.COEFFS+8*WSIZE,o5)
199 dec %i5 !argcount--
200 2:
201 sethi %hi(0x80000000),%o7 !mask for sign bit
202 /*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0
203 ld [%o5],%o0 !intf = pf[0] = f upper
204 ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2]
205
206 sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
207 /*4 */ andn %o0,%o7,%o0 !intf = fabs(intf)
208 ldd [%o5],%f34 !f = *x into f34
209
210 sub %o1,%o0,%o1 !(-) if intf > big
211 /*6 */ sub %o0,%o2,%o2 !(-) if intf < small
212 fand %f34,%f32,%f40 !sign0 = sign bit
213 fmuld %f38,%f38,%f24 !tmp2= f2*f2
214
215 /*7 */ orcc %o1,%o2,%g0 !(-) if either true
216 bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below
217 fabsd %f34,%f34 !abs(f) (delay slot)
218 !----------------------
219
220
221 sethi %hi(0x8000),%o7 !rounding bit
222 /*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again)
223 faddd %f26,%f38,%f28 !ansu2 = conup2 + f2
224
225 add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
226 /*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
227 fmuld %f58,%f24,%f22 !p[3]*tmp2
228
229 /*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument
230 fmuld %f34,%f0,%f10 !f*z
231 fsubd %f34,%f0,%f20 !f - z
232 add %o4,%l4,%l4 !base addr + index1
233 fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
234 faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1
235
236 /*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
237 faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2]
238 ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1]
239
240 /*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000
241 fsubd %f10,%f56,%f10 !(f*z - (-1.0))
242 faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1
243
244 cmp %o0,%l6 !(|f| > 64)
245 ble .ELSE0 !if(|f| > 64) then
246 /*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
247 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
248 ba .ENDIF0 !continue
249 /*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot)
250 .ELSE0: !else f( |x| >= (1/64))
251 cmp %o0,%l7 !if intf >= 1/64
252 bl .ENDIF0 !if( |x| >= (1/64) ) then...
253 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
254 add %o3,4,%o1 !index = index + 4
255 /*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument
256 .ENDIF0:
257
258 /*17*/ sll %o1,3,%l3 !index0 = index
259 mov %i3,%l0 !yaddr0 = address of y
260 faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
261 fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2
262 fsubd %f26,%f28,%f26 !conup2 - ansu2
263
264 /*20*/ add %i1,%i2,%i1 !x += stridex
265 add %i3,%i4,%i3 !y += stridey
266 faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1
267 fmuld %f38,%f24,%f24 !f*tmp2
268 faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1
269
270 /*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument
271 std %f36,[%l1] !*yaddr1 = ans1
272 add %o4,%l5,%l5 !base addr + index2
273 fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
274 faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2
275 cmp %i5,0 !if argcount =0, we are done
276 be .RETURN
277 nop
278
279 /*--------------------------------------------------------------------------*/
280 /*--------------------------------------------------------------------------*/
281 /*--------------------------------------------------------------------------*/
282
283 .LOOP1:
284 /*25*/ deccc %i0 !--n
285 bneg 1f
286 mov %i1,%o5 !xuse = x (delay slot)
287 ba 2f
288 nop !delay slot
289 1:
290 PIC_SET(g5,.COEFFS+8*WSIZE,o5)
291 dec %i5 !argcount--
292 2:
293
294 /*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit
295 sethi %hi(0x43600000),%o1 !big = 0x43600000,0
296 ld [%o5],%o0 !intf = pf[0] = f upper
297
298 /*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
299 andn %o0,%o7,%o0 !intf = fabs(intf)
300 ldd [%o5],%f36 !f = *x into f36
301
302 /*30*/ sub %o1,%o0,%o1 !(-) if intf > big
303 sub %o0,%o2,%o2 !(-) if intf < small
304 fand %f36,%f32,%f42 !sign1 = sign bit
305
306 /*31*/ orcc %o1,%o2,%g0 !(-) if either true
307 bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below
308 fabsd %f36,%f36 !abs(f) (delay slot)
309 !----------------------
310
311 /*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again)
312 ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1]
313
314 /*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
315 sethi %hi(0x8000),%o7 !rounding bit
316 faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2
317
318 /*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
319 sethi %hi(0x7fff0000),%o7 !mask for rounding argument
320 fmuld %f36,%f0,%f10 !f*z
321 fsubd %f36,%f0,%f20 !f - z
322
323 /*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
324 faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
325
326 /*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000
327 fsubd %f10,%f56,%f10 !(f*z - (-1.0))
328 ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0]
329
330 cmp %o0,%l6 !(|f| > 64)
331 ble .ELSE1 !if(|f| > 64) then
332 /*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
333 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
334 ba .ENDIF1 !continue
335 /*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot)
336 .ELSE1: !else f( |x| >= (1/64))
337 cmp %o0,%l7 !if intf >= 1/64
338 bl .ENDIF1 !if( |x| >= (1/64) ) then...
339 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
340 add %o3,4,%o1 !index = index + 4
341 /*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument
342 .ENDIF1:
343
344 /*41*/sll %o1,3,%l4 !index1 = index
345 mov %i3,%l1 !yaddr1 = address of y
346 fmuld %f34,%f34,%f4 !tmp0= f0*f0
347 faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2
348
349 /*44*/add %i1,%i2,%i1 !x += stridex
350 add %i3,%i4,%i3 !y += stridey
351 fmuld %f58,%f4,%f2 !p[3]*tmp0
352 faddd %f6,%f34,%f8 !ansu0 = conup0 + f0
353 for %f38,%f44,%f38 !sign(ans2) = sign of argument
354 std %f38,[%l2] !*yaddr2 = ans2
355 cmp %i5,0 !if argcount =0, we are done
356 be .RETURN
357 nop
358
359 /*--------------------------------------------------------------------------*/
360 /*--------------------------------------------------------------------------*/
361 /*--------------------------------------------------------------------------*/
362
363 .LOOP2:
364 /*46*/ deccc %i0 !--n
365 bneg 1f
366 mov %i1,%o5 !xuse = x (delay slot)
367 ba 2f
368 nop !delay slot
369 1:
370 PIC_SET(g5,.COEFFS+8*WSIZE,o5)
371 dec %i5 !argcount--
372 2:
373
374 /*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit
375 sethi %hi(0x43600000),%o1 !big = 0x43600000,0
376 ld [%o5],%o0 !intf = pf[0] = f upper
377
378 /*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0
379 andn %o0,%o7,%o0 !intf = fabs(intf)
380 ldd [%o5],%f38 !f = *x into f38
381
382 /*51*/ sub %o1,%o0,%o1 !(-) if intf > big
383 sub %o0,%o2,%o2 !(-) if intf < small
384 fand %f38,%f32,%f44 !sign2 = sign bit
385
386 /*52*/ orcc %o1,%o2,%g0 !(-) if either true
387 bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below
388 fabsd %f38,%f38 !abs(f) (delay slot)
389 !----------------------
390
391 /*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again)
392 faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2]
393
394 /*54*/ sethi %hi(0x8000),%o7 !rounding bit
395 fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
396
397 /*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot)
398 sethi %hi(0x7fff0000),%o7 !mask for rounding argument
399 fmuld %f38,%f0,%f10 !f*z
400 fsubd %f38,%f0,%f20 !f - z
401
402 /*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000
403 fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0
404 fsubd %f6,%f8,%f6 !conup0 - ansu0
405
406 /*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000
407 fsubd %f10,%f56,%f10 !(f*z - (-1.0))
408 ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1]
409
410 cmp %o0,%l6 !(|f| > 64)
411 ble .ELSE2 !if(|f| > 64) then
412 /*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15
413 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower
414 ba .ENDIF2 !continue
415 /*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot)
416 .ELSE2: !else f( |x| >= (1/64))
417 cmp %o0,%l7 !if intf >= 1/64
418 bl .ENDIF2 !if( |x| >= (1/64) ) then...
419 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0
420 add %o3,4,%o1 !index = index + 4
421 /*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument
422 .ENDIF2:
423
424
425 /*62*/ sll %o1,3,%l5 !index2 = index
426 mov %i3,%l2 !yaddr2 = address of y
427 fmuld %f34,%f4,%f4 !f0*tmp0
428 faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1
429 fmuld %f36,%f36,%f14 !tmp1= f1*f1
430
431 /*65*/add %o4,%l3,%l3 !base addr + index0
432 fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
433 faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0
434 fmuld %f58,%f14,%f12 !p[3]*tmp1
435 faddd %f16,%f36,%f18 !ansu1 = conup1 + f1
436 ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1]
437
438 /*68*/ add %i1,%i2,%i1 !x += stridex
439 add %i3,%i4,%i3 !y += stridey
440 faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0
441 faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2]
442
443 /*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
444 fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1
445 fsubd %f16,%f18,%f16 !conup1 - ansu1
446
447 /*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0
448 fmuld %f36,%f14,%f14 !f1*tmp1
449 faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1
450
451 /*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument
452 std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot)
453 cmp %i5,0 !if argcount =0, we are done
454 bg .MAINLOOP
455 nop
456
457 /*--------------------------------------------------------------------------*/
458 /*--------------------------------------------------------------------------*/
459 /*--------------------------------------------------------------------------*/
460
461 .RETURN:
462 ret
463 restore %g0,%g0,%g0
464
465 /*--------------------------------------------------------------------------*/
466 /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
467 /*--------------------------------------------------------------------------*/
468
469 /* at this point
470 %i1 x address
471 %o0 intf
472 %o2 intf - 0x3e300000
473 %f34,36,38 f0,f1,f2
474 %f40,42,44 sign0,sign1,sign2
475 */
476
477 .align 32 !align on I-cache boundary
478 .SPECIAL0:
479 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
480 bpos 1f !if >=...continue
481 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
482 ba 3f
483 faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot)
484 1:
485 ld [%o5+4],%o5 !load x lower word
486 sllx %o0,32,%o0 !left justify intf
487 sllx %g1,32,%g1 !left justify Inf
488 or %o0,%o5,%o0 !merge in lower intf
489 cmp %o0,%g1 !if intf > 0x7ff00000 00000000
490 ble,pt %xcc,2f !pass thru if NaN
491 nop
492 fmuld %f34,%f34,%f34 !...... (x*x) trigger invalid exception
493 ba 3f
494 nop
495 2:
496 faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower
497 3:
498 add %i1,%i2,%i1 !x += stridex
499 for %f34,%f40,%f34 !sign(ans) = sign of argument
500 std %f34,[%i3] !*y = ans
501 ba .LOOP0 !keep looping
502 add %i3,%i4,%i3 !y += stridey (delay slot)
503
504 /*--------------------------------------------------------------------------*/
505 /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
506 /*--------------------------------------------------------------------------*/
507
508 .align 32 !align on I-cache boundary
509 .SPECIAL1:
510 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
511 bpos 1f !if >=...continue
512 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
513 ba 3f
514 faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot)
515 1:
516 ld [%o5+4],%o5 !load x lower word
517 sllx %o0,32,%o0 !left justify intf
518 sllx %g1,32,%g1 !left justify Inf
519 or %o0,%o5,%o0 !merge in lower intf
520 cmp %o0,%g1 !if intf > 0x7ff00000 00000000
521 ble,pt %xcc,2f !pass thru if NaN
522 nop
523 fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception
524 ba 3f
525 nop
526 2:
527 faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower
528 3:
529 add %i1,%i2,%i1 !x += stridex
530 for %f36,%f42,%f36 !sign(ans) = sign of argument
531 std %f36,[%i3] !*y = ans
532 ba .LOOP1 !keep looping
533 add %i3,%i4,%i3 !y += stridey (delay slot)
534
535 /*--------------------------------------------------------------------------*/
536 /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
537 /*--------------------------------------------------------------------------*/
538
539 .align 32 !align on I-cache boundary
540 .SPECIAL2:
541 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000
542 bpos 1f !if >=...continue
543 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this)
544 ba 3f
545 faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot)
546 1:
547 ld [%o5+4],%o5 !load x lower word
548 sllx %o0,32,%o0 !left justify intf
549 sllx %g1,32,%g1 !left justify Inf
550 or %o0,%o5,%o0 !merge in lower intf
551 cmp %o0,%g1 !if intf > 0x7ff00000 00000000
552 ble,pt %xcc,2f !pass thru if NaN
553 nop
554 fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception
555 ba 3f
556 nop
557 2:
558 faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower
559 3:
560 add %i1,%i2,%i1 !x += stridex
561 for %f38,%f44,%f38 !sign(ans) = sign of argument
562 std %f38,[%i3] !*y = ans
563 ba .LOOP2 !keep looping
564 add %i3,%i4,%i3 !y += stridey
565
566 /*--------------------------------------------------------------------------*/
567 /*--------------------------------------------------------------------------*/
568 /*--------------------------------------------------------------------------*/
569
570 SET_SIZE(__vatan)
571
572 ! .ident "03-20-96 Sparc V9 3-way-unrolled version"