1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vatan.S"
  30 
  31 #include "libm.h"
  32 
  33         RO_DATA
  34 
  35 ! following is the C version of the ATAN algorithm
  36 !  #include <math.h>
  37 !  #include <stdio.h>
  38 !  double jkatan(double *x)
  39 !  {
  40 !    double  f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
  41 !    int index, sign, intf, intz;
  42 !    extern const double __vlibm_TBL_atan1[];
  43 !    long *pf = (long *) &f, *pz = (long *) &z;
  44 !  
  45 !  /*    Power series  atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
  46 !   *    Error =  -3.08254E-18   On the interval  |x| < 1/64 */
  47 !  
  48 !  /* define dummy names for readability.  Use parray to help compiler optimize loads */
  49 !  #define p3    parray[0]
  50 !  #define p2    parray[1]
  51 !  #define p1    parray[2]
  52 !  #define soffset      3  
  53 !  
  54 !    static const double parray[] = { 
  55 !     -1.428029046844299722E-01,                /* p[3]         */
  56 !      1.999999917247000615E-01,                /* p[2]         */
  57 !     -3.333333333329292858E-01,                /* p[1]         */
  58 !      1.0,                             /* not used for p[0], though            */
  59 !     -1.0,                             /* used to flip sign of answer          */
  60 !    };
  61 !  
  62 !    f        = *x;                             /* fetch argument               */
  63 !    intf     = pf[0];                          /* grab upper half              */
  64 !    sign     = intf & 0x80000000;                  /* sign of argument             */
  65 !    intf    ^= sign;                           /* abs(upper argument)          */
  66 !    sign     = (unsigned) sign >> 31;            /* sign bit = 0 or 1            */
  67 !    pf[0]    = intf;
  68 !  
  69 !    if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
  70 !    {
  71 !      if(  (intf > 0x7ff00000) || 
  72 !        ((intf == 0x7ff00000) &&  (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
  73 !      if( intf < 0x3e300000 )                       /* avoid underflow for small arg */
  74 !      {
  75 !        dummy = 1.0e37 + f;
  76 !        dummy = dummy;
  77 !        return (*x);
  78 !      }
  79 !      if( intf > 0x43600000 )                       /* avoid underflow for big arg  */
  80 !      {
  81 !        index = 2;
  82 !        f     = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low   */
  83 !        f     = parray[soffset + sign] * f;    /* put sign bit on ans          */
  84 !        return (f);
  85 !      }
  86 !    }
  87 !  
  88 !    index    = 0;                              /* points to 0,0 in table       */
  89 !    if (intf > 0x40500000)                  /* if(|x| > 64                       */
  90 !    { f = -1.0/f;
  91 !      index  = 2;                              /* point to pi/2 upper, lower   */
  92 !    }
  93 !    else if( intf >= 0x3f900000 )           /* if |x| >= (1/64)...               */
  94 !    {
  95 !      intz   = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper   */
  96 !      pz[0]  = intz;                           /* store as a double (z)        */
  97 !      pz[1]  = 0;                              /* ...lower                     */
  98 !      f      = (f - z)/(1.0 + f*z);            /* get reduced argument         */
  99 !      index  = (intz - 0x3f900000) >> 15;        /* (index >> 16) << 1)              */
 100 !      index += 4;                              /* skip over 0,0,pi/2,pi/2      */
 101 !    }
 102 !    conup    = __vlibm_TBL_atan1[index];       /* upper table                  */
 103 !    conlo    = __vlibm_TBL_atan1[index+1];     /* lower table                  */
 104 !    tmp      = f*f;
 105 !    poly     = (f*tmp)*((p3*tmp + p2)*tmp + p1);
 106 !    ansu     = conup + f;                      /* compute atan(f)  upper       */
 107 !    ansl     = (((conup - ansu) + f) + poly) + conlo;
 108 !    ans      = ansu + ansl;
 109 !    ans      = parray[soffset + sign] * ans;
 110 !    return ans;
 111 !  }
 112 
 113 /* 8 bytes = 1 double f.p. word */
 114 #define WSIZE   8
 115 
 116         .align  32                      !align with full D-cache line
 117 .COEFFS:
 118         .double 0r-1.428029046844299722E-01     !p[3]
 119         .double 0r1.999999917247000615E-01      !p[2]
 120         .double 0r-3.333333333329292858E-01     !p[1]
 121         .double 0r-1.0,                         !constant -1.0
 122         .word   0x00008000,0x0                  !for fp rounding of reduced arg
 123         .word   0x7fff0000,0x0                  !for fp truncation
 124         .word   0x47900000,0                    !a number close to 1.0E37
 125         .word   0x80000000,0x0                  !mask for fp sign bit
 126         .word   0x3f800000,0x0                  !1.0/128.0 dummy "safe" argument
 127         .type   .COEFFS,#object
 128 
 129         ENTRY(__vatan)
 130         save    %sp,-SA(MINFRAME)-16,%sp
 131         PIC_SETUP(g5)
 132         PIC_SET(g5,__vlibm_TBL_atan1,o4)
 133         PIC_SET(g5,.COEFFS,o0)
 134 /*
 135    __vatan(int n, double *x, int stridex, double *y, stridey)
 136    computes y(i) = atan( x(i) ), for 1=1,n.  Stridex, stridey
 137    are the distance between x and y elements                     
 138 
 139         %i0    n
 140         %i1    address of x
 141         %i2    stride x
 142         %i3    address of y
 143         %i4    stride y
 144 */
 145         cmp     %i0,0                   !if n <=0,
 146         ble,pn  %icc,.RETURN            !....then do nothing
 147         sll     %i2,3,%i2               !convert stride to byte count
 148         sll     %i4,3,%i4               !convert stride to byte count
 149 
 150 /*  pre-load constants before beginning main loop */
 151 
 152         ldd     [%o0],%f58              !load p[3]
 153         mov     2,%i5                   !argcount = 3 
 154 
 155         ldd     [%o0+WSIZE],%f60        !load p[2]
 156         add     %fp,STACK_BIAS-8,%l1            !yaddr1 = &dummy
 157         fzero   %f18                    !ansu1 = 0
 158 
 159         ldd     [%o0+2*WSIZE],%f62      !load p[1]
 160         add     %fp,STACK_BIAS-8,%l2            !yaddr2 = &dummy
 161         fzero   %f12                    !(poly1) = 0
 162 
 163         ldd     [%o0+3*WSIZE],%f56      !-1.0 
 164         fzero   %f14                    !tmp1 = 0
 165 
 166         ldd     [%o0+4*WSIZE],%f52      !load rounding mask
 167         fzero   %f16                    !conup1 = 0
 168 
 169         ldd     [%o0+5*WSIZE],%f54      !load truncation mask
 170         fzero   %f36                    !f1 = 0
 171 
 172         ldd     [%o0+6*WSIZE],%f50      !1.0e37
 173         fzero   %f38                    !f2 = 0
 174 
 175         ldd     [%o0+7*WSIZE],%f32      !mask for sign bit
 176 
 177         ldd     [%o4+2*WSIZE],%f46      !pi/2 upper
 178         ldd     [%o4+(2*WSIZE+8)],%f48  !pi/2 lower 
 179         sethi   %hi(0x40500000),%l6     !64.0
 180         sethi   %hi(0x3f900000),%l7     !1/64.0
 181         mov     0,%l4                   !index1 = 0
 182         mov     0,%l5                   !index2 = 0
 183 
 184 .MAINLOOP:
 185 
 186     /*--------------------------------------------------------------------------*/
 187     /*--------------------------------------------------------------------------*/
 188     /*--------------------------------------------------------------------------*/
 189 
 190 .LOOP0:
 191         deccc   %i0                     !--n
 192         bneg    1f
 193         mov     %i1,%o5                 !xuse = x (delay slot)
 194 
 195         ba      2f
 196         nop                             !delay slot
 197 1:
 198         PIC_SET(g5,.COEFFS+8*WSIZE,o5)
 199         dec     %i5                     !argcount--
 200 2:
 201         sethi   %hi(0x80000000),%o7     !mask for sign bit
 202 /*2 */  sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
 203         ld      [%o5],%o0               !intf = pf[0] = f upper
 204         ldd     [%o4+%l5],%f26          !conup2 = __vlibm_TBL_atan1[index2]
 205 
 206         sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
 207 /*4 */  andn    %o0,%o7,%o0             !intf = fabs(intf)
 208         ldd     [%o5],%f34              !f = *x into f34
 209 
 210         sub     %o1,%o0,%o1             !(-) if intf > big
 211 /*6 */  sub     %o0,%o2,%o2             !(-) if intf < small
 212         fand    %f34,%f32,%f40          !sign0 = sign bit 
 213             fmuld   %f38,%f38,%f24      !tmp2= f2*f2
 214 
 215 /*7 */  orcc    %o1,%o2,%g0             !(-) if either true
 216         bneg,pn  %icc,.SPECIAL0         !if (-) goto special cases below
 217         fabsd   %f34,%f34               !abs(f)  (delay slot)
 218         !----------------------
 219 
 220 
 221         sethi   %hi(0x8000),%o7         !rounding bit
 222 /*8 */  fpadd32 %f34,%f52,%f0           !intf + 0x00008000 (again)
 223             faddd   %f26,%f38,%f28      !ansu2 = conup2 + f2
 224 
 225         add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
 226 /*9*/   fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
 227             fmuld   %f58,%f24,%f22      !p[3]*tmp2
 228 
 229 /*10 */ sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
 230         fmuld   %f34,%f0,%f10           !f*z
 231         fsubd   %f34,%f0,%f20           !f - z
 232           add     %o4,%l4,%l4           !base addr + index1
 233           fmuld   %f14,%f12,%f12        !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
 234           faddd   %f16,%f36,%f16        !(conup1 - ansu1) + f1
 235 
 236 /*12 */ and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
 237             faddd   %f22,%f60,%f22      !p[3]*tmp2 + p[2]
 238           ldd     [%l4+WSIZE],%f14      !conlo1 = __vlibm_TBL_atan1[index+1]
 239 
 240 /*13 */ sub     %o0,%l7,%o2             !intz - 0x3f900000
 241         fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
 242           faddd   %f16,%f12,%f12        !((conup1 - ansu1) + f1) + poly1
 243 
 244         cmp     %o0,%l6                 !(|f| > 64)
 245         ble     .ELSE0                  !if(|f| > 64) then
 246 /*15 */ sra       %o2,15,%o3            !index  = (intz - 0x3f900000) >> 15
 247           mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
 248           ba      .ENDIF0               !continue
 249 /*16 */   fdivd   %f56,%f34,%f34        !f = -1.0/f (delay slot)
 250         .ELSE0:                         !else f( |x| >= (1/64))
 251         cmp     %o0,%l7                 !if intf >= 1/64
 252         bl      .ENDIF0                 !if( |x| >= (1/64) ) then...
 253         mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
 254           add     %o3,4,%o1             !index = index + 4
 255 /*16 */   fdivd   %f20,%f10,%f34        !f = (f - z)/(1.0 + f*z), reduced argument
 256         .ENDIF0:
 257 
 258 /*17*/  sll     %o1,3,%l3               !index0 = index
 259         mov     %i3,%l0                 !yaddr0 = address of y
 260           faddd   %f12,%f14,%f12        !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
 261             fmuld   %f22,%f24,%f22      !(p3*tmp2 + p2)*tmp2
 262             fsubd   %f26,%f28,%f26      !conup2 - ansu2
 263 
 264 /*20*/  add     %i1,%i2,%i1             !x     += stridex
 265         add     %i3,%i4,%i3             !y     += stridey
 266           faddd   %f18,%f12,%f36        !ans1 = ansu1 + ansl1
 267             fmuld   %f38,%f24,%f24      !f*tmp2
 268             faddd   %f22,%f62,%f22      !(p3*tmp2 + p2)*tmp2 + p1
 269 
 270 /*23*/    for     %f36,%f42,%f36        !sign(ans1) = sign of argument
 271           std     %f36,[%l1]            !*yaddr1 = ans1
 272             add     %o4,%l5,%l5         !base addr + index2
 273             fmuld   %f24,%f22,%f22      !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
 274             faddd   %f26,%f38,%f26      !(conup2 - ansu2) + f2
 275         cmp     %i5,0                   !if argcount =0, we are done
 276         be      .RETURN
 277           nop
 278 
 279     /*--------------------------------------------------------------------------*/
 280     /*--------------------------------------------------------------------------*/
 281     /*--------------------------------------------------------------------------*/
 282 
 283 .LOOP1:
 284 /*25*/  deccc   %i0                     !--n
 285         bneg    1f
 286         mov     %i1,%o5                 !xuse = x (delay slot)
 287         ba      2f
 288         nop                             !delay slot
 289 1:
 290         PIC_SET(g5,.COEFFS+8*WSIZE,o5)
 291         dec     %i5                     !argcount--
 292 2:
 293 
 294 /*26*/  sethi   %hi(0x80000000),%o7     !mask for sign bit
 295         sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
 296         ld      [%o5],%o0               !intf = pf[0] = f upper
 297 
 298 /*28*/  sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
 299         andn    %o0,%o7,%o0             !intf = fabs(intf)
 300         ldd     [%o5],%f36              !f = *x into f36
 301 
 302 /*30*/  sub     %o1,%o0,%o1             !(-) if intf > big
 303         sub     %o0,%o2,%o2             !(-) if intf < small
 304         fand    %f36,%f32,%f42          !sign1 = sign bit 
 305 
 306 /*31*/  orcc    %o1,%o2,%g0             !(-) if either true
 307         bneg,pn  %icc,.SPECIAL1         !if (-) goto special cases below
 308         fabsd   %f36,%f36               !abs(f)  (delay slot)
 309         !----------------------
 310 
 311 /*32*/  fpadd32 %f36,%f52,%f0           !intf + 0x00008000 (again)
 312           ldd     [%l5+WSIZE],%f24      !conlo2 = __vlibm_TBL_atan1[index2+1]
 313 
 314 /*33*/  fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
 315         sethi   %hi(0x8000),%o7         !rounding bit
 316           faddd   %f26,%f22,%f22        !((conup2 - ansu2) + f2) + poly2
 317 
 318 /*34*/  add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
 319         sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
 320         fmuld   %f36,%f0,%f10           !f*z
 321         fsubd   %f36,%f0,%f20           !f - z
 322 
 323 /*35*/  and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
 324           faddd   %f22,%f24,%f22        !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
 325 
 326 /*37*/  sub     %o0,%l7,%o2             !intz - 0x3f900000
 327         fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
 328       ldd     [%o4+%l3],%f6             !conup0 = __vlibm_TBL_atan1[index0]
 329 
 330         cmp     %o0,%l6                 !(|f| > 64)
 331         ble     .ELSE1                  !if(|f| > 64) then
 332 /*38*/  sra     %o2,15,%o3              !index  = (intz - 0x3f900000) >> 15
 333           mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
 334           ba      .ENDIF1               !continue
 335 /*40*/    fdivd   %f56,%f36,%f36        !f = -1.0/f (delay slot)
 336         .ELSE1:                         !else f( |x| >= (1/64))
 337         cmp     %o0,%l7                 !if intf >= 1/64
 338         bl      .ENDIF1                 !if( |x| >= (1/64) ) then...
 339         mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
 340           add     %o3,4,%o1             !index = index + 4
 341 /*40*/    fdivd   %f20,%f10,%f36        !f = (f - z)/(1.0 + f*z), reduced argument
 342         .ENDIF1:
 343 
 344 /*41*/sll     %o1,3,%l4                 !index1 = index
 345       mov     %i3,%l1                   !yaddr1 = address of y
 346       fmuld   %f34,%f34,%f4             !tmp0= f0*f0
 347           faddd   %f28,%f22,%f38        !ans2 = ansu2 + ansl2
 348         
 349 /*44*/add       %i1,%i2,%i1             !x     += stridex
 350       add       %i3,%i4,%i3             !y     += stridey
 351     fmuld   %f58,%f4,%f2                !p[3]*tmp0
 352     faddd   %f6,%f34,%f8                !ansu0 = conup0 + f0
 353           for     %f38,%f44,%f38        !sign(ans2) = sign of argument
 354           std     %f38,[%l2]            !*yaddr2 = ans2
 355           cmp   %i5,0                   !if argcount =0, we are done
 356           be    .RETURN
 357           nop
 358 
 359     /*--------------------------------------------------------------------------*/
 360     /*--------------------------------------------------------------------------*/
 361     /*--------------------------------------------------------------------------*/
 362 
 363 .LOOP2:
 364 /*46*/  deccc   %i0                     !--n
 365         bneg    1f
 366         mov     %i1,%o5                 !xuse = x (delay slot)
 367         ba      2f
 368         nop                             !delay slot
 369 1:
 370         PIC_SET(g5,.COEFFS+8*WSIZE,o5)
 371         dec     %i5                     !argcount--
 372 2:
 373 
 374 /*47*/  sethi   %hi(0x80000000),%o7     !mask for sign bit
 375         sethi   %hi(0x43600000),%o1     !big = 0x43600000,0
 376         ld      [%o5],%o0               !intf = pf[0] = f upper
 377 
 378 /*49*/  sethi   %hi(0x3e300000),%o2     !small = 0x3e300000,0
 379         andn    %o0,%o7,%o0             !intf = fabs(intf)
 380         ldd     [%o5],%f38              !f = *x into f38
 381 
 382 /*51*/  sub     %o1,%o0,%o1             !(-) if intf > big
 383         sub     %o0,%o2,%o2             !(-) if intf < small
 384         fand    %f38,%f32,%f44          !sign2 = sign bit 
 385 
 386 /*52*/  orcc    %o1,%o2,%g0             !(-) if either true
 387         bneg,pn  %icc,.SPECIAL2         !if (-) goto special cases below
 388         fabsd   %f38,%f38               !abs(f)  (delay slot)
 389         !----------------------
 390 
 391 /*53*/  fpadd32 %f38,%f52,%f0           !intf + 0x00008000 (again)
 392       faddd   %f2,%f60,%f2              !p[3]*tmp0 + p[2]
 393 
 394 /*54*/  sethi   %hi(0x8000),%o7         !rounding bit
 395         fand    %f0,%f54,%f0            !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
 396 
 397 /*55*/  add     %o0,%o7,%o0             !intf + 0x00008000  (delay slot)
 398         sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
 399         fmuld   %f38,%f0,%f10           !f*z
 400         fsubd   %f38,%f0,%f20           !f - z
 401 
 402 /*56*/  and     %o0,%o7,%o0             !intz = (intf + 0x00008000) & 0x7fff0000
 403       fmuld   %f2,%f4,%f2               !(p3*tmp0 + p2)*tmp0
 404       fsubd   %f6,%f8,%f6               !conup0 - ansu0
 405 
 406 /*58*/  sub     %o0,%l7,%o2             !intz - 0x3f900000
 407         fsubd   %f10,%f56,%f10          !(f*z - (-1.0))
 408             ldd     [%o4+%l4],%f16      !conup1 = __vlibm_TBL_atan1[index1]
 409 
 410         cmp     %o0,%l6                 !(|f| > 64)
 411         ble     .ELSE2                  !if(|f| > 64) then
 412 /*60*/  sra     %o2,15,%o3              !index  = (intz - 0x3f900000) >> 15
 413           mov     2,%o1                 !index == 2, point to conup, conlo = pi/2 upper, lower
 414           ba      .ENDIF2               !continue
 415 /*61*/    fdivd   %f56,%f38,%f38        !f = -1.0/f (delay slot)
 416         .ELSE2:                         !else f( |x| >= (1/64))
 417         cmp     %o0,%l7                 !if intf >= 1/64
 418         bl      .ENDIF2                 !if( |x| >= (1/64) ) then...
 419         mov     0,%o1                   !index == 0 , point to conup,conlo = 0,0
 420           add     %o3,4,%o1             !index = index + 4
 421 /*61*/    fdivd   %f20,%f10,%f38        !f = (f - z)/(1.0 + f*z), reduced argument
 422         .ENDIF2:
 423 
 424         
 425 /*62*/  sll     %o1,3,%l5               !index2 = index
 426         mov     %i3,%l2                 !yaddr2 = address of y
 427       fmuld   %f34,%f4,%f4              !f0*tmp0
 428       faddd   %f2,%f62,%f2              !(p3*tmp0 + p2)*tmp0 + p1
 429             fmuld   %f36,%f36,%f14      !tmp1= f1*f1
 430 
 431 /*65*/add     %o4,%l3,%l3               !base addr + index0
 432       fmuld   %f4,%f2,%f2               !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
 433       faddd   %f6,%f34,%f6              !(conup0 - ansu0) + f0
 434             fmuld   %f58,%f14,%f12      !p[3]*tmp1
 435             faddd   %f16,%f36,%f18      !ansu1 = conup1 + f1
 436       ldd     [%l3+WSIZE],%f4           !conlo0 = __vlibm_TBL_atan1[index0+1]
 437 
 438 /*68*/  add     %i1,%i2,%i1             !x     += stridex
 439         add     %i3,%i4,%i3             !y     += stridey
 440       faddd   %f6,%f2,%f2               !((conup0 - ansu0) + f0) + poly0
 441             faddd   %f12,%f60,%f12      !p[3]*tmp1 + p[2]
 442 
 443 /*71*/faddd   %f2,%f4,%f2               !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
 444             fmuld   %f12,%f14,%f12      !(p3*tmp1 + p2)*tmp1
 445             fsubd   %f16,%f18,%f16      !conup1 - ansu1
 446 
 447 /*74*/faddd   %f8,%f2,%f34              !ans0 = ansu0 + ansl0
 448           fmuld   %f36,%f14,%f14        !f1*tmp1
 449           faddd   %f12,%f62,%f12        !(p3*tmp1 + p2)*tmp1 + p1
 450 
 451 /*77*/  for     %f34,%f40,%f34          !sign(ans0) = sign of argument
 452         std     %f34,[%l0]              !*yaddr0 = ans, always gets stored (delay slot)
 453         cmp     %i5,0                   !if argcount =0, we are done
 454         bg      .MAINLOOP
 455           nop
 456 
 457     /*--------------------------------------------------------------------------*/
 458     /*--------------------------------------------------------------------------*/
 459     /*--------------------------------------------------------------------------*/
 460 
 461 .RETURN:
 462         ret
 463         restore %g0,%g0,%g0
 464 
 465     /*--------------------------------------------------------------------------*/
 466     /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
 467     /*--------------------------------------------------------------------------*/
 468 
 469 /* at this point
 470    %i1     x address
 471    %o0     intf
 472    %o2     intf - 0x3e300000
 473    %f34,36,38     f0,f1,f2
 474    %f40,42,44     sign0,sign1,sign2
 475 */
 476 
 477         .align  32                              !align on I-cache boundary
 478 .SPECIAL0:
 479         orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
 480         bpos    1f                              !if >=...continue
 481         sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
 482           ba      3f
 483           faddd   %f34,%f50,%f30                !dummy op just to generate exception (delay slot)
 484 1:
 485         ld      [%o5+4],%o5                     !load x lower word  
 486         sllx    %o0,32,%o0                      !left justify intf
 487         sllx    %g1,32,%g1              !left justify Inf
 488         or      %o0,%o5,%o0                     !merge in lower intf
 489         cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
 490         ble,pt  %xcc,2f                         !pass thru if NaN
 491           nop
 492           fmuld   %f34,%f34,%f34                !...... (x*x) trigger invalid exception
 493           ba      3f
 494           nop
 495 2:
 496         faddd   %f46,%f48,%f34                  !ans = pi/2 upper + pi/2 lower 
 497 3:   
 498         add     %i1,%i2,%i1                     !x += stridex
 499         for     %f34,%f40,%f34                  !sign(ans) = sign of argument
 500         std     %f34,[%i3]                      !*y = ans
 501         ba      .LOOP0                          !keep looping
 502         add     %i3,%i4,%i3                     !y += stridey (delay slot)
 503 
 504     /*--------------------------------------------------------------------------*/
 505     /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
 506     /*--------------------------------------------------------------------------*/
 507 
 508         .align  32                              !align on I-cache boundary
 509 .SPECIAL1:
 510         orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
 511         bpos    1f                              !if >=...continue
 512         sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
 513           ba      3f
 514           faddd   %f36,%f50,%f30                !dummy op just to generate exception (delay slot)
 515 1:
 516         ld      [%o5+4],%o5                     !load x lower word  
 517         sllx    %o0,32,%o0                      !left justify intf
 518         sllx    %g1,32,%g1              !left justify Inf
 519         or      %o0,%o5,%o0                     !merge in lower intf
 520         cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
 521         ble,pt  %xcc,2f                         !pass thru if NaN
 522           nop
 523           fmuld   %f36,%f36,%f36                !...... (x*x) trigger invalid exception
 524           ba      3f
 525           nop
 526 2:
 527         faddd   %f46,%f48,%f36                  !ans = pi/2 upper + pi/2 lower 
 528 3:   
 529         add     %i1,%i2,%i1                     !x += stridex
 530         for     %f36,%f42,%f36                  !sign(ans) = sign of argument
 531         std     %f36,[%i3]                      !*y = ans
 532         ba      .LOOP1                          !keep looping
 533         add     %i3,%i4,%i3                     !y += stridey (delay slot)
 534 
 535     /*--------------------------------------------------------------------------*/
 536     /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
 537     /*--------------------------------------------------------------------------*/
 538 
 539         .align  32                              !align on I-cache boundary
 540 .SPECIAL2:
 541         orcc    %o2,%g0,%g0                     !(-) if intf < 0x3e300000
 542         bpos    1f                              !if >=...continue
 543         sethi   %hi(0x7ff00000),%g1     !upper word of Inf (we use 64-bit wide int for this)
 544           ba      3f
 545           faddd   %f38,%f50,%f30                !dummy op just to generate exception (delay slot)
 546 1:
 547         ld      [%o5+4],%o5                     !load x lower word  
 548         sllx    %o0,32,%o0                      !left justify intf
 549         sllx    %g1,32,%g1              !left justify Inf
 550         or      %o0,%o5,%o0                     !merge in lower intf
 551         cmp     %o0,%g1                         !if intf > 0x7ff00000 00000000
 552         ble,pt  %xcc,2f                         !pass thru if NaN
 553           nop
 554           fmuld   %f38,%f38,%f38                !...... (x*x) trigger invalid exception
 555           ba      3f
 556           nop
 557 2:
 558         faddd   %f46,%f48,%f38                  !ans = pi/2 upper + pi/2 lower 
 559 3:   
 560         add     %i1,%i2,%i1                     !x += stridex
 561         for     %f38,%f44,%f38                  !sign(ans) = sign of argument
 562         std     %f38,[%i3]                      !*y = ans
 563         ba      .LOOP2                          !keep looping
 564         add     %i3,%i4,%i3                     !y += stridey
 565 
 566     /*--------------------------------------------------------------------------*/
 567     /*--------------------------------------------------------------------------*/
 568     /*--------------------------------------------------------------------------*/
 569 
 570         SET_SIZE(__vatan)
 571 
 572 !       .ident  "03-20-96 Sparc V9 3-way-unrolled version"