illumos-gate New usr/src/lib/libmvec/common/vis/_

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vhypotf.S"
  30 
  31 #include "libm.h"
  32 
  33         RO_DATA
  34         .align  64
  35 
   36 .CONST_TBL:                             ! "+N" below = byte offset from this label used by the loads at the top of __vhypotf
   37         .word   0x3fe00001, 0x80007e00  ! +0  K1  =  5.00000715259318464227e-01 (multiplies xx0 in the polynomial)
   38         .word   0xbfc00003, 0xc0017a01  ! +8  K2  = -1.25000447037521686593e-01 (multiplies xx0*xx0 in the polynomial)
   39         .word   0x000fffff, 0xffffffff  ! +16 DC0 = 0x000fffffffffffff (fand mask applied to db0 to form h0)
   40         .word   0x3ff00000, 0x00000000  ! +24 DC1 = 0x3ff0000000000000 (bit pattern of 1.0; or-ed into h0, also the polynomial constant term)
   41         .word   0x7ffff000, 0x00000000  ! +32 DC2 = 0x7ffff00000000000 (fand mask giving h_hi0 from h0)
   42         .word   0x7fe00000, 0x00000000  ! +40 DA0 = 0x7fe0000000000000 (fand mask applied to db0 before fmul8x16)
   43         .word   0x47efffff, 0xe0000000  ! +48 DFMAX = 3.402823e+38 (loaded as a double)
   44         .word   0x7f7fffff, 0x80808080  ! +56 FMAX = 3.402823e+38 (single word) , +60 SCALE = 0x80808080 (single word)
   45         .word   0x20000000, 0x00000000  ! +64 DA1 = 0x2000000000000000 (fpadd32 addend applied after fmul8x16)
  46 
   47 #define DC0             %f12
   48 #define DC1             %f10
   49 #define DC2             %f42
   50 #define DA0             %f6
   51 #define DA1             %f4
   52 #define K2              %f26
   53 #define K1              %f28
   54 #define SCALE           %f3
   55 #define FMAX            %f2
   56 #define DFMAX           %f50
   57 ! The names above are FP registers that __vhypotf fills from .CONST_TBL at entry.
   58 #define stridex         %l6
   59 #define stridey         %i4
   60 #define stridez         %l5
   61 #define _0x7fffffff     %o1
   62 #define _0x7f3504f3     %o2
   63 #define _0x1ff0         %l2
   64 #define TBL             %l1
   65 ! The names above are integer registers set up at entry: strides shifted left by 2, the literal constants their names spell, and the __vlibm_TBL_sqrtf address.
   66 #define counter         %l0
   67 ! counter is reloaded from tmp_counter at .begin.  The names below are %fp-relative slots: tmp_px, tmp_py and tmp_counter hold the saved x pointer, y pointer and loop count.
   68 #define tmp_px          STACK_BIAS-0x30
   69 #define tmp_py          STACK_BIAS-0x28
   70 #define tmp_counter     STACK_BIAS-0x20
   71 #define tmp0            STACK_BIAS-0x18
   72 #define tmp1            STACK_BIAS-0x10
   73 #define tmp2            STACK_BIAS-0x0c
   74 #define tmp3            STACK_BIAS-0x08
   75 #define tmp4            STACK_BIAS-0x04
   76 ! tmp0-tmp4 are word slots used to pass the first word of db0 (iexp0) from an FP register to an integer register via st/ld.
   77 ! size in bytes of the temp storage above, added to the frame by the save in __vhypotf - must be a multiple of 16 for V9
   78 #define tmps            0x30
  79 
  80 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  81 !      !!!!!   algorithm   !!!!!
  82 !  hx0 = *(int*)px;
  83 !  x0 = *px;
  84 !  px += stridex;
  85 !
  86 !  hy0 = *(int*)py;
  87 !  y0 = *py;
  88 !  py += stridey;
  89 !
  90 !  hx0 &= 0x7fffffff;
  91 !  hy0 &= 0x7fffffff;
  92 !
   93 !  if ( hx0 >= 0x7f3504f3 || hy0 >= 0x7f3504f3 )
   94 !  {
   95 !    if ( hx0 >= 0x7f800000 || hy0 >= 0x7f800000 )
   96 !    {
   97 !      if ( hx0 == 0x7f800000 || hy0 == 0x7f800000 )
   98 !        *(int*)pz = 0x7f800000;
   99 !      else *pz = x0 * y0;
  100 !    }
  101 !    else
  102 !    {
  103 !      hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
  104 !      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
 105 !      else ftmp0 = FMAX * FMAX;
 106 !      *pz = ftmp0;
 107 !    }
 108 !    pz += stridez;
 109 !    continue;
 110 !  }
  111 !  if ( (hx0 | hy0) == 0 )
 112 !  {
 113 !    *pz = 0;
 114 !    pz += stridez;
 115 !    continue;
 116 !  }
 117 !  dx0 = x0 * (double)x0;
 118 !  dy0 = y0 * (double)y0;
 119 !  db0 = dx0 + dy0;
 120 !
 121 !  iexp0 = ((int*)&db0)[0];
 122 !
 123 !  h0 = vis_fand(db0,DC0);
 124 !  h0 = vis_for(h0,DC1);
 125 !  h_hi0 = vis_fand(h0,DC2);
 126 !
 127 !  db0 = vis_fand(db0,DA0);
 128 !  db0 = vis_fmul8x16(SCALE, db0);
 129 !  db0 = vis_fpadd32(db0,DA1);
 130 !
 131 !  iexp0 >>= 8;
 132 !  di0 = iexp0 & 0x1ff0;
 133 !  si0 = (char*)sqrt_arr + di0;
 134 !
 135 !  dtmp0 = ((double*)((char*)div_arr + di0))[0];
 136 !  xx0 = h0 - h_hi0;
  137 !  xx0 *= dtmp0;
 138 !
 139 !  dtmp0 = ((double*)si0)[1];
 140 !  res0 = K2 * xx0;
 141 !  res0 += K1;
 142 !  res0 *= xx0;
 143 !  res0 += DC1;
 144 !  res0 = dtmp0 * res0;
 145 !  res0 *= db0;
 146 !  ftmp0 = (float)res0;
 147 !  *pz = ftmp0;
 148 !  pz += stridez;
 149 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 150 
 151         ENTRY(__vhypotf)
 152         save    %sp,-SA(MINFRAME)-tmps,%sp
 153         PIC_SETUP(l7)
 154         PIC_SET(l7,.CONST_TBL,o3)
 155         PIC_SET(l7,__vlibm_TBL_sqrtf,l1)
 156 
 157 #ifdef __sparcv9
 158         ldx     [%fp+STACK_BIAS+176],stridez
 159 #else
 160         ld      [%fp+STACK_BIAS+92],stridez
 161 #endif
 162         st      %i0,[%fp+tmp_counter]
 163 
 164         stx     %i1,[%fp+tmp_px]
 165 
 166         stx     %i3,[%fp+tmp_py]
 167 
 168         ldd     [%o3],K1
 169         sethi   %hi(0x7ffffc00),%o1
 170 
 171         ldd     [%o3+8],K2
 172         sethi   %hi(0x7f350400),%o2
 173 
 174         ldd     [%o3+16],DC0
 175         add     %o1,1023,_0x7fffffff
 176         add     %o2,0xf3,_0x7f3504f3
 177 
 178         ldd     [%o3+24],DC1
 179         sll     %i2,2,stridex
 180 
 181         ld      [%o3+56],FMAX
 182 
 183         ldd     [%o3+32],DC2
 184         sll     %i4,2,stridey
 185 
 186         ldd     [%o3+40],DA0
 187         sll     stridez,2,stridez
 188 
 189         ldd     [%o3+48],DFMAX
 190 
 191         ld      [%o3+60],SCALE
 192         or      %g0,0xff8,%l2
 193 
 194         ldd     [%o3+64],DA1
 195         sll     %l2,1,_0x1ff0
 196         or      %g0,%i5,%l7
 197 
 198 .begin:
 199         ld      [%fp+tmp_counter],counter
 200         ldx     [%fp+tmp_px],%i1
 201         ldx     [%fp+tmp_py],%i2
 202         st      %g0,[%fp+tmp_counter]
 203 .begin1:
 204         cmp     counter,0
 205         ble,pn  %icc,.exit
 206         lda     [%i1]0x82,%l3           ! (3_0) hx0 = *(int*)px;
 207 
 208         lda     [%i2]0x82,%l4           ! (3_0) hy0 = *(int*)py;
 209 
 210         lda     [%i1]0x82,%f17          ! (3_0) x0 = *px;
 211         and     %l3,_0x7fffffff,%l3     ! (3_0) hx0 &= 0x7fffffff;
 212 
 213         cmp     %l3,_0x7f3504f3         ! (3_0) hx ? 0x7f3504f3
 214         bge,pn  %icc,.spec              ! (3_0) if ( hx >= 0x7f3504f3 )
 215         and     %l4,_0x7fffffff,%l4     ! (3_0) hy0 &= 0x7fffffff;
 216 
 217         cmp     %l4,_0x7f3504f3         ! (3_0) hy ? 0x7f3504f3
 218         bge,pn  %icc,.spec              ! (3_0) if ( hy >= 0x7f3504f3 )
 219         or      %g0,%i2,%o7
 220 
 221         orcc    %l3,%l4,%g0
 222         bz,pn   %icc,.spec1
 223 
 224         add     %i1,stridex,%i1         ! px += stridex
 225         fsmuld  %f17,%f17,%f44          ! (3_0) dx0 = x0 * (double)x0;
 226         lda     [%i2]0x82,%f17          ! (3_0) y0 = *py;
 227 
 228         lda     [%i1]0x82,%l3           ! (4_0) hx0 = *(int*)px;
 229 
 230         lda     [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;
 231 
 232         and     %l3,_0x7fffffff,%l3     ! (4_0) hx0 &= 0x7fffffff;
 233 
 234         fsmuld  %f17,%f17,%f24          ! (3_0) dy0 = y0 * (double)y0;
 235         cmp     %l3,_0x7f3504f3         ! (4_0) hx ? 0x7f3504f3
 236         bge,pn  %icc,.update0           ! (4_0) if ( hx >= 0x7f3504f3 )
 237         and     %l4,_0x7fffffff,%l4     ! (4_0) hy0 &= 0x7fffffff;
 238 
 239         orcc    %l3,%l4,%g0
 240         bz,pn   %icc,.update0
 241         lda     [%i1]0x82,%f17          ! (4_0) x0 = *px;
 242 .cont0:
 243         faddd   %f44,%f24,%f24          ! (3_0) db0 = dx0 + dy0;
 244 
 245         fsmuld  %f17,%f17,%f40          ! (4_1) dy0 = x0 * (double)x0;
 246         cmp     %l4,_0x7f3504f3         ! (4_1) hy ? 0x7f3504f3
 247         lda     [stridey+%o7]0x82,%f17  ! (4_1) hy0 = *py;
 248 
 249         add     %o7,stridey,%i5         ! py += stridey
 250         lda     [%i1+stridex]0x82,%l3   ! (0_0) hx0 = *(int*)px;
 251 
 252         bge,pn  %icc,.update1           ! (4_1) if ( hy >= 0x7f3504f3 )
 253         st      %f24,[%fp+tmp0]         ! (3_1) iexp0 = ((int*)&db0)[0];
 254 .cont1:
 255         and     %l3,_0x7fffffff,%l3     ! (0_0) hx0 &= 0x7fffffff;
 256 
 257         fsmuld  %f17,%f17,%f48          ! (4_1) dy0 = y0 * (double)y0;
 258         lda     [%i1+stridex]0x82,%f8   ! (0_0) x0 = *px;
 259 
 260         add     %i1,stridex,%i1         ! px += stridex
 261 
 262         lda     [%i5+stridey]0x82,%l4   ! (0_0) hy0 = *(int*)py;
 263         cmp     %l3,_0x7f3504f3         ! (0_0) hx ? 0x7f3504f3
 264         bge,pn  %icc,.update2           ! (0_0) if ( hx >= 0x7f3504f3 )
 265         add     %i5,stridey,%o4         ! py += stridey
 266 .cont2:
 267         faddd   %f40,%f48,%f20          ! (4_1) db0 = dx0 + dy0;
 268 
 269         fsmuld  %f8,%f8,%f40            ! (0_0) dx0 = x0 * (double)x0;
 270         and     %l4,_0x7fffffff,%l4     ! (0_0) hy0 &= 0x7fffffff;
 271         lda     [%i5+stridey]0x82,%f17  ! (0_0) hy0 = *py;
 272 
 273         cmp     %l4,_0x7f3504f3         ! (0_0) hy ? 0x7f3504f3
 274         bge,pn  %icc,.update3           ! (0_0) if ( hy >= 0x7f3504f3 )
 275         st      %f20,[%fp+tmp1]         ! (4_1) iexp0 = ((int*)&db0)[0];
 276 
 277         orcc    %l3,%l4,%g0
 278         bz,pn   %icc,.update3
 279 .cont3:
 280         lda     [%i1+stridex]0x82,%l3   ! (1_0) hx0 = *(int*)px;
 281 
 282         fand    %f24,DC0,%f60           ! (3_1) h0 = vis_fand(db0,DC0);
 283 
 284         and     %l3,_0x7fffffff,%l3     ! (1_0) hx0 &= 0x7fffffff;
 285 
 286         fsmuld  %f17,%f17,%f34          ! (0_0) dy0 = y0 * (double)y0;
 287         cmp     %l3,_0x7f3504f3         ! (1_0) hx ? 0x7f3504f3
 288         lda     [%o4+stridey]0x82,%l4   ! (1_0) hy0 = *(int*)py;
 289 
 290         add     %i1,stridex,%i1         ! px += stridex
 291 
 292         lda     [%i1]0x82,%f17          ! (1_0) x0 = *px;
 293         bge,pn  %icc,.update4           ! (1_0) if ( hx >= 0x7f3504f3 )
 294         add     %o4,stridey,%i5         ! py += stridey
 295 .cont4:
 296         and     %l4,_0x7fffffff,%l4     ! (1_0) hy0 &= 0x7fffffff;
 297         for     %f60,DC1,%f46           ! (3_1) h0 = vis_for(h0,DC1);
 298 
 299         cmp     %l4,_0x7f3504f3         ! (1_0) hy ? 0x7f3504f3
 300         ld      [%fp+tmp0],%o0          ! (3_1) iexp0 = ((int*)&db0)[0];
 301         faddd   %f40,%f34,%f0           ! (0_0) db0 = dx0 + dy0;
 302 
 303         fsmuld  %f17,%f17,%f40          ! (1_0) dx0 = x0 * (double)x0;
 304         add     %i1,stridex,%i1         ! px += stridex
 305         lda     [%o4+stridey]0x82,%f17  ! (1_0) y0 = *py;
 306 
 307         srax    %o0,8,%o0               ! (3_1) iexp0 >>= 8;
 308         bge,pn  %icc,.update5           ! (1_0) if ( hy >= 0x7f3504f3 )
 309         fand    %f46,DC2,%f38           ! (3_1) h_hi0 = vis_fand(h0,DC2);
 310 
 311         orcc    %l3,%l4,%g0
 312         bz,pn   %icc,.update5
 313 .cont5:
 314         lda     [%i1]0x82,%l3           ! (2_0) hx0 = *(int*)px;
 315 
 316         and     %o0,_0x1ff0,%o0         ! (3_1) di0 = iexp0 & 0x1ff0;
 317         st      %f0,[%fp+tmp2]          ! (0_0) iexp0 = ((int*)&db0)[0];
 318         fand    %f20,DC0,%f60           ! (4_1) h0 = vis_fand(db0,DC0);
 319 
 320         ldd     [TBL+%o0],%f22          ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 321         fsubd   %f46,%f38,%f38          ! (3_1) xx0 = h0 - h_hi0;
 322 
 323         fsmuld  %f17,%f17,%f32          ! (1_0) dy0 = y0 * (double)y0;
 324         add     %i5,stridey,%i2         ! py += stridey
 325         lda     [stridey+%i5]0x82,%l4   ! (2_0) hy0 = *(int*)py;
 326 
 327         and     %l3,_0x7fffffff,%l3     ! (2_0) hx0 &= 0x7fffffff;
 328 
 329         lda     [%i1]0x82,%f17          ! (2_0) x0 = *px;
 330         cmp     %l3,_0x7f3504f3         ! (2_0) hx ? 0x7f3504f3
 331 
 332         fmuld   %f38,%f22,%f38          ! (3_1) xx0 *= dmp0;
 333         and     %l4,_0x7fffffff,%l4     ! (2_0) hy0 &= 0x7fffffff;
 334         for     %f60,DC1,%f46           ! (4_1) h0 = vis_for(h0,DC1);
 335 
 336         bge,pn  %icc,.update6           ! (2_0) if ( hx >= 0x7f3504f3 )
 337         ld      [%fp+tmp1],%o3          ! (4_1) iexp0 = ((int*)&db0)[0];
 338 .cont6:
 339         faddd   %f40,%f32,%f18          ! (1_0) db0 = dx0 + dy0;
 340 
 341         fsmuld  %f17,%f17,%f44          ! (2_0) dx0 = x0 * (double)x0;
 342         cmp     %l4,_0x7f3504f3         ! (2_0) hy ? 0x7f3504f3
 343         lda     [stridey+%i5]0x82,%f17  ! (2_0) y0 = *py;
 344 
 345         add     %i1,stridex,%i1         ! px += stridex
 346         bge,pn  %icc,.update7           ! (2_0) if ( hy >= 0x7f3504f3 )
 347         fand    %f46,DC2,%f58           ! (4_1) h_hi0 = vis_fand(h0,DC2);
 348 
 349         orcc    %l3,%l4,%g0
 350         bz,pn   %icc,.update7
 351         nop
 352 .cont7:
 353         fmuld   K2,%f38,%f56            ! (3_1) res0 = K2 * xx0;
 354         srax    %o3,8,%o3               ! (4_1) iexp0 >>= 8;
 355         lda     [%i1]0x82,%l3           ! (3_0) hx0 = *(int*)px;
 356 
 357         and     %o3,_0x1ff0,%o3         ! (4_1) di0 = iexp0 & 0x1ff0;
 358         st      %f18,[%fp+tmp3]         ! (1_0) iexp0 = ((int*)&db0)[0];
 359         fand    %f0,DC0,%f60            ! (0_0) h0 = vis_fand(db0,DC0);
 360 
 361         ldd     [TBL+%o3],%f22          ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 362         add     %i2,stridey,%o7         ! py += stridey
 363         fsubd   %f46,%f58,%f58          ! (4_1) xx0 = h0 - h_hi0;
 364 
 365         fsmuld  %f17,%f17,%f30          ! (2_0) dy0 = y0 * (double)y0;
 366         lda     [stridey+%i2]0x82,%l4   ! (3_0) hy0 = *(int*)py;
 367         and     %l3,_0x7fffffff,%l3     ! (3_0) hx0 &= 0x7fffffff;
 368 
 369         faddd   %f56,K1,%f54            ! (3_1) res0 += K1;
 370         cmp     %l3,_0x7f3504f3         ! (3_0) hx ? 0x7f3504f3
 371 
 372         lda     [%i1]0x82,%f17          ! (3_0) x0 = *px;
 373         add     %i1,stridex,%i1         ! px += stridex
 374         bge,pn  %icc,.update8           ! (3_0) if ( hx >= 0x7f3504f3 )
 375 
 376         fmuld   %f58,%f22,%f58          ! (4_1) xx0 *= dmp0;
 377 .cont8:
 378         and     %l4,_0x7fffffff,%l4     ! (3_0) hy0 &= 0x7fffffff;
 379         for     %f60,DC1,%f46           ! (0_0) h0 = vis_for(h0,DC1);
 380 
 381         cmp     %l4,_0x7f3504f3         ! (3_0) hy ? 0x7f3504f3
 382         ld      [%fp+tmp2],%g1          ! (0_0) iexp0 = ((int*)&db0)[0];
 383         faddd   %f44,%f30,%f30          ! (2_0) db0 = dx0 + dy0;
 384 
 385         fsmuld  %f17,%f17,%f44          ! (3_0) dx0 = x0 * (double)x0;
 386         bge,pn  %icc,.update9           ! (3_0) if ( hy >= 0x7f3504f3 )
 387         lda     [stridey+%i2]0x82,%f17  ! (3_0) y0 = *py;
 388 
 389         orcc    %l3,%l4,%g0
 390         bz,pn   %icc,.update9
 391         nop
 392 .cont9:
 393         fmuld   %f54,%f38,%f40          ! (3_1) res0 *= xx0;
 394         lda     [%i1]0x82,%l3           ! (4_0) hx0 = *(int*)px;
 395         fand    %f46,DC2,%f38           ! (0_0) h_hi0 = vis_fand(h0,DC2);
 396 
 397         fmuld   K2,%f58,%f54            ! (4_1) res0 = K2 * xx0;
 398         srax    %g1,8,%o5               ! (0_0) iexp0 >>= 8;
 399         lda     [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;
 400         fand    %f24,DA0,%f56           ! (3_1) db0 = vis_fand(db0,DA0);
 401 
 402         and     %o5,_0x1ff0,%o5         ! (0_0) di0 = iexp0 & 0x1ff0;
 403         st      %f30,[%fp+tmp4]         ! (2_0) iexp0 = ((int*)&db0)[0];
 404         fand    %f18,DC0,%f60           ! (1_0) h0 = vis_fand(db0,DC0);
 405 
 406         ldd     [TBL+%o5],%f22          ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 407         add     %o0,TBL,%g1             ! (3_1) si0 = (char*)sqrt_arr + di0;
 408         and     %l3,_0x7fffffff,%l3     ! (4_0) hx0 &= 0x7fffffff;
 409         fsubd   %f46,%f38,%f38          ! (0_0) xx0 = h0 - h_hi0;
 410 
 411         fsmuld  %f17,%f17,%f24          ! (3_0) dy0 = y0 * (double)y0;
 412         cmp     %l3,_0x7f3504f3         ! (4_0) hx ? 0x7f3504f3
 413         bge,pn  %icc,.update10          ! (4_0) if ( hx >= 0x7f3504f3 )
 414         faddd   %f40,DC1,%f40           ! (3_1) res0 += DC1;
 415 
 416         fmul8x16        SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
 417         and     %l4,_0x7fffffff,%l4     ! (4_0) hy0 &= 0x7fffffff;
 418         ldd     [%g1+8],%f56            ! (3_1) dtmp0 = ((double*)si0)[1];
 419         faddd   %f54,K1,%f54            ! (4_1) res0 += K1;
 420 
 421         lda     [%i1]0x82,%f17          ! (4_0) x0 = *px;
 422 .cont10:
 423         fmuld   %f38,%f22,%f38          ! (0_0) xx0 *= dmp0;
 424         cmp     counter,5
 425         for     %f60,DC1,%f46           ! (1_0) h0 = vis_for(h0,DC1);
 426 
 427         ld      [%fp+tmp3],%g1          ! (1_0) iexp0 = ((int*)&db0)[0];
 428         fmuld   %f56,%f40,%f62          ! (3_1) res0 = dtmp0 * res0;
 429         faddd   %f44,%f24,%f24          ! (3_0) db0 = dx0 + dy0;
 430 
 431         bl,pn   %icc,.tail
 432         nop
 433 
 434         ba      .main_loop
 435         sub     counter,5,counter
 436 
 437         .align  16
 438 .main_loop:
 439         fsmuld  %f17,%f17,%f40          ! (4_1) dy0 = x0 * (double)x0;
 440         cmp     %l4,_0x7f3504f3         ! (4_1) hy ? 0x7f3504f3
 441         lda     [stridey+%o7]0x82,%f17  ! (4_1) hy0 = *py;
 442         fpadd32 %f36,DA1,%f36           ! (3_2) db0 = vis_fpadd32(db0,DA1);
 443 
 444         fmuld   %f54,%f58,%f58          ! (4_2) res0 *= xx0;
 445         add     %o7,stridey,%i5         ! py += stridey
 446         st      %f24,[%fp+tmp0]         ! (3_1) iexp0 = ((int*)&db0)[0];
 447         fand    %f46,DC2,%f44           ! (1_1) h_hi0 = vis_fand(h0,DC2);
 448 
 449         fmuld   K2,%f38,%f56            ! (0_1) res0 = K2 * xx0;
 450         srax    %g1,8,%g5               ! (1_1) iexp0 >>= 8;
 451         bge,pn  %icc,.update11          ! (4_1) if ( hy >= 0x7f3504f3 )
 452         fand    %f20,DA0,%f54           ! (4_2) db0 = vis_fand(db0,DA0);
 453 
 454         orcc    %l3,%l4,%g0
 455         nop
 456         bz,pn   %icc,.update11
 457         fzero   %f52
 458 .cont11:
 459         fmuld   %f62,%f36,%f62          ! (3_2) res0 *= db0;
 460         and     %g5,_0x1ff0,%g5         ! (1_1) di0 = iexp0 & 0x1ff0;
 461         lda     [%i1+stridex]0x82,%l3   ! (0_0) hx0 = *(int*)px;
 462         fand    %f30,DC0,%f60           ! (2_1) h0 = vis_fand(db0,DC0);
 463 
 464         ldd     [%g5+TBL],%f22          ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 465         add     %o3,TBL,%g1             ! (4_2) si0 = (char*)sqrt_arr + di0;
 466         add     %i1,stridex,%i0         ! px += stridex
 467         fsubd   %f46,%f44,%f44          ! (1_1) xx0 = h0 - h_hi0;
 468 
 469         fsmuld  %f17,%f17,%f48          ! (4_1) dy0 = y0 * (double)y0;
 470         nop
 471         lda     [%i1+stridex]0x82,%f8   ! (0_0) x0 = *px;
 472         faddd   %f58,DC1,%f36           ! (4_2) res0 += DC1;
 473 
 474         faddd   %f56,K1,%f58            ! (0_1) res0 += K1;
 475         and     %l3,_0x7fffffff,%l3     ! (0_0) hx0 &= 0x7fffffff;
 476         ldd     [%g1+8],%f56            ! (4_2) dtmp0 = ((double*)si0)[1];
 477         fmul8x16        SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
 478 
 479         lda     [%i5+stridey]0x82,%l4   ! (0_0) hy0 = *(int*)py;
 480         cmp     %l3,_0x7f3504f3         ! (0_0) hx ? 0x7f3504f3
 481         bge,pn  %icc,.update12          ! (0_0) if ( hx >= 0x7f3504f3 )
 482         fdtos   %f62,%f14               ! (3_2) ftmp0 = (float)res0;
 483 .cont12:
 484         fmuld   %f44,%f22,%f44          ! (1_1) xx0 *= dmp0;
 485         add     %l7,stridez,%o7         ! pz += stridez
 486         st      %f14,[%l7]              ! (3_2) *pz = ftmp0;
 487         for     %f60,DC1,%f46           ! (2_1) h0 = vis_for(h0,DC1);
 488 
 489         fmuld   %f56,%f36,%f36          ! (4_2) res0 = dtmp0 * res0;
 490         add     %i5,stridey,%o4         ! py += stridey
 491         ld      [%fp+tmp4],%g1          ! (2_1) iexp0 = ((int*)&db0)[0];
 492         faddd   %f40,%f48,%f20          ! (4_1) db0 = dx0 + dy0;
 493 
 494         fsmuld  %f8,%f8,%f40            ! (0_0) dx0 = x0 * (double)x0;
 495         and     %l4,_0x7fffffff,%l4     ! (0_0) hy0 &= 0x7fffffff;
 496         lda     [%i5+stridey]0x82,%f17  ! (0_0) hy0 = *py;
 497         fpadd32 %f54,DA1,%f62           ! (4_2) db0 = vis_fpadd32(db0,DA1);
 498 
 499         fmuld   %f58,%f38,%f38          ! (0_1) res0 *= xx0;
 500         cmp     %l4,_0x7f3504f3         ! (0_0) hy ? 0x7f3504f3
 501         st      %f20,[%fp+tmp1]         ! (4_1) iexp0 = ((int*)&db0)[0];
 502         fand    %f46,DC2,%f58           ! (2_1) h_hi0 = vis_fand(h0,DC2);
 503 
 504         fmuld   K2,%f44,%f56            ! (1_1) res0 = K2 * xx0;
 505         srax    %g1,8,%g1               ! (2_1) iexp0 >>= 8;
 506         bge,pn  %icc,.update13          ! (0_0) if ( hy >= 0x7f3504f3 )
 507         fand    %f0,DA0,%f54            ! (0_1) db0 = vis_fand(db0,DA0);
 508 
 509         orcc    %l3,%l4,%g0
 510         nop
 511         bz,pn   %icc,.update13
 512         fzero   %f52
 513 .cont13:
 514         fmuld   %f36,%f62,%f62          ! (4_2) res0 *= db0;
 515         and     %g1,_0x1ff0,%g1         ! (2_1) di0 = iexp0 & 0x1ff0;
 516         lda     [%i0+stridex]0x82,%l3   ! (1_0) hx0 = *(int*)px;
 517         fand    %f24,DC0,%f60           ! (3_1) h0 = vis_fand(db0,DC0);
 518 
 519         ldd     [TBL+%g1],%f22          ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 520         add     %o5,TBL,%o0             ! (0_1) si0 = (char*)sqrt_arr + di0;
 521         add     %i0,stridex,%i1         ! px += stridex
 522         fsubd   %f46,%f58,%f58          ! (2_1) xx0 = h0 - h_hi0;
 523 
 524         fsmuld  %f17,%f17,%f34          ! (0_0) dy0 = y0 * (double)y0;
 525         add     %o7,stridez,%i0         ! pz += stridez
 526         lda     [%o4+stridey]0x82,%l4   ! (1_0) hy0 = *(int*)py;
 527         faddd   %f38,DC1,%f36           ! (0_1) res0 += DC1;
 528 
 529         faddd   %f56,K1,%f38            ! (1_1) res0 += K1;
 530         and     %l3,_0x7fffffff,%l3     ! (1_0) hx0 &= 0x7fffffff;
 531         ldd     [%o0+8],%f56            ! (0_1) dtmp0 = ((double*)si0)[1];
 532         fmul8x16        SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
 533 
 534         lda     [%i1]0x82,%f17          ! (1_0) x0 = *px;
 535         cmp     %l3,_0x7f3504f3         ! (1_0) hx ? 0x7f3504f3
 536         bge,pn  %icc,.update14          ! (1_0) if ( hx >= 0x7f3504f3 )
 537         fdtos   %f62,%f14               ! (4_2) ftmp0 = (float)res0;
 538 .cont14:
 539         fmuld   %f58,%f22,%f58          ! (2_1) xx0 *= dmp0;
 540         and     %l4,_0x7fffffff,%l4     ! (1_0) hy0 &= 0x7fffffff;
 541         add     %o4,stridey,%i5         ! py += stridey
 542         for     %f60,DC1,%f46           ! (3_1) h0 = vis_for(h0,DC1);
 543 
 544         fmuld   %f56,%f36,%f36          ! (0_1) res0 = dtmp0 * res0;
 545         cmp     %l4,_0x7f3504f3         ! (1_0) hy ? 0x7f3504f3
 546         ld      [%fp+tmp0],%o0          ! (3_1) iexp0 = ((int*)&db0)[0];
 547         faddd   %f40,%f34,%f0           ! (0_0) db0 = dx0 + dy0;
 548 
 549         fsmuld  %f17,%f17,%f40          ! (1_0) dx0 = x0 * (double)x0;
 550         add     %i1,stridex,%i1         ! px += stridex
 551         lda     [%o4+stridey]0x82,%f17  ! (1_0) y0 = *py;
 552         fpadd32 %f54,DA1,%f62           ! (0_1) db0 = vis_fpadd32(db0,DA1);
 553 
 554         fmuld   %f38,%f44,%f44          ! (1_1) res0 *= xx0;
 555         st      %f14,[%o7]              ! (4_2) *pz = ftmp0;
 556         bge,pn  %icc,.update15          ! (1_0) if ( hy >= 0x7f3504f3 )
 557         fand    %f46,DC2,%f38           ! (3_1) h_hi0 = vis_fand(h0,DC2);
 558 
 559         orcc    %l3,%l4,%g0
 560         bz,pn   %icc,.update15
 561         nop
 562 .cont15:
 563         fmuld   K2,%f58,%f54            ! (2_1) res0 = K2 * xx0;
 564         srax    %o0,8,%o0               ! (3_1) iexp0 >>= 8;
 565         st      %f0,[%fp+tmp2]          ! (0_0) iexp0 = ((int*)&db0)[0];
 566         fand    %f18,DA0,%f56           ! (1_1) db0 = vis_fand(db0,DA0);
 567 
 568         fmuld   %f36,%f62,%f62          ! (0_1) res0 *= db0;
 569         and     %o0,_0x1ff0,%o0         ! (3_1) di0 = iexp0 & 0x1ff0;
 570         lda     [%i1]0x82,%l3           ! (2_0) hx0 = *(int*)px;
 571         fand    %f20,DC0,%f60           ! (4_1) h0 = vis_fand(db0,DC0);
 572 
 573         ldd     [TBL+%o0],%f22          ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 574         add     %g5,TBL,%o3             ! (1_1) si0 = (char*)sqrt_arr + di0;
 575         add     %i0,stridez,%i3         ! pz += stridez
 576         fsubd   %f46,%f38,%f38          ! (3_1) xx0 = h0 - h_hi0;
 577 
 578         fsmuld  %f17,%f17,%f32          ! (1_0) dy0 = y0 * (double)y0;
 579         add     %i5,stridey,%i2         ! py += stridey
 580         lda     [stridey+%i5]0x82,%l4   ! (2_0) hy0 = *(int*)py;
 581         faddd   %f44,DC1,%f44           ! (1_1) res0 += DC1;
 582 
 583         fmul8x16        SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
 584         and     %l3,_0x7fffffff,%l3     ! (2_0) hx0 &= 0x7fffffff;
 585         ldd     [%o3+8],%f56            ! (1_1) dtmp0 = ((double*)si0)[1];
 586         faddd   %f54,K1,%f54            ! (2_1) res0 += K1;
 587 
 588         lda     [%i1]0x82,%f17          ! (2_0) x0 = *px;
 589         cmp     %l3,_0x7f3504f3         ! (2_0) hx ? 0x7f3504f3
 590         add     %i3,stridez,%o4         ! pz += stridez
 591         fdtos   %f62,%f14               ! (0_1) ftmp0 = (float)res0;
 592 
 593         fmuld   %f38,%f22,%f38          ! (3_1) xx0 *= dmp0;
 594         and     %l4,_0x7fffffff,%l4     ! (2_0) hy0 &= 0x7fffffff;
 595         st      %f14,[%i0]              ! (0_1) *pz = ftmp0;
 596         for     %f60,DC1,%f46           ! (4_1) h0 = vis_for(h0,DC1);
 597 
 598         fmuld   %f56,%f44,%f62          ! (1_1) res0 = dtmp0 * res0;
 599         bge,pn  %icc,.update16          ! (2_0) if ( hx >= 0x7f3504f3 )
 600         ld      [%fp+tmp1],%o3          ! (4_1) iexp0 = ((int*)&db0)[0];
 601         faddd   %f40,%f32,%f18          ! (1_0) db0 = dx0 + dy0;
 602 .cont16:
 603         fsmuld  %f17,%f17,%f44          ! (2_0) dx0 = x0 * (double)x0;
 604         cmp     %l4,_0x7f3504f3         ! (2_0) hy ? 0x7f3504f3
 605         lda     [stridey+%i5]0x82,%f17  ! (2_0) y0 = *py;
 606         fpadd32 %f36,DA1,%f36           ! (1_1) db0 = vis_fpadd32(db0,DA1);
 607 
 608         fmuld   %f54,%f58,%f54          ! (2_1) res0 *= xx0;
 609         add     %i1,stridex,%l7         ! px += stridex
 610         bge,pn  %icc,.update17          ! (2_0) if ( hy >= 0x7f3504f3 )
 611         fand    %f46,DC2,%f58           ! (4_1) h_hi0 = vis_fand(h0,DC2);
 612 
 613         orcc    %l3,%l4,%g0
 614         nop
 615         bz,pn   %icc,.update17
 616         fzero   %f52
 617 .cont17:
 618         fmuld   K2,%f38,%f56            ! (3_1) res0 = K2 * xx0;
 619         srax    %o3,8,%o3               ! (4_1) iexp0 >>= 8;
 620         st      %f18,[%fp+tmp3]         ! (1_0) iexp0 = ((int*)&db0)[0];
 621         fand    %f30,DA0,%f40           ! (2_1) db0 = vis_fand(db0,DA0);
 622 
 623         fmuld   %f62,%f36,%f62          ! (1_1) res0 *= db0;
 624         and     %o3,_0x1ff0,%o3         ! (4_1) di0 = iexp0 & 0x1ff0;
 625         lda     [%l7]0x82,%l3           ! (3_0) hx0 = *(int*)px;
 626         fand    %f0,DC0,%f60            ! (0_0) h0 = vis_fand(db0,DC0);
 627 
 628         ldd     [TBL+%o3],%f22          ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 629         add     %g1,TBL,%g1             ! (2_1) si0 = (char*)sqrt_arr + di0;
 630         add     %i2,stridey,%o7         ! py += stridey
 631         fsubd   %f46,%f58,%f58          ! (4_1) xx0 = h0 - h_hi0;
 632 
 633         fsmuld  %f17,%f17,%f30          ! (2_0) dy0 = y0 * (double)y0;
 634         lda     [stridey+%i2]0x82,%l4   ! (3_0) hy0 = *(int*)py;
 635         add     %l7,stridex,%i1         ! px += stridex
 636         faddd   %f54,DC1,%f36           ! (2_1) res0 += DC1;
 637 
 638         faddd   %f56,K1,%f54            ! (3_1) res0 += K1;
 639         and     %l3,_0x7fffffff,%l3     ! (3_0) hx0 &= 0x7fffffff;
 640         ldd     [%g1+8],%f56            ! (2_1) dtmp0 = ((double*)si0)[1];
 641         fmul8x16        SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0);
 642 
 643         lda     [%l7]0x82,%f17          ! (3_0) x0 = *px;
 644         cmp     %l3,_0x7f3504f3         ! (3_0) hx ? 0x7f3504f3
 645         bge,pn  %icc,.update18          ! (3_0) if ( hx >= 0x7f3504f3 )
 646         fdtos   %f62,%f14               ! (1_1) ftmp0 = (float)res0;
 647 .cont18:
 648         fmuld   %f58,%f22,%f58          ! (4_1) xx0 *= dmp0;
 649         and     %l4,_0x7fffffff,%l4     ! (3_0) hy0 &= 0x7fffffff;
 650         st      %f14,[%i3]              ! (1_1) *pz = ftmp0;
 651         for     %f60,DC1,%f46           ! (0_0) h0 = vis_for(h0,DC1);
 652 
 653         fmuld   %f56,%f36,%f36          ! (2_1) res0 = dtmp0 * res0;
 654         cmp     %l4,_0x7f3504f3         ! (3_0) hy ? 0x7f3504f3
 655         ld      [%fp+tmp2],%g1          ! (0_0) iexp0 = ((int*)&db0)[0];
 656         faddd   %f44,%f30,%f30          ! (2_0) db0 = dx0 + dy0;
 657 
 658         fsmuld  %f17,%f17,%f44          ! (3_0) dx0 = x0 * (double)x0;
 659         bge,pn  %icc,.update19          ! (3_0) if ( hy >= 0x7f3504f3 )
 660         lda     [stridey+%i2]0x82,%f17  ! (3_0) y0 = *py;
 661         fpadd32 %f40,DA1,%f62           ! (2_1) db0 = vis_fpadd32(db0,DA1);
 662 
 663 .cont19:
 664         fmuld   %f54,%f38,%f40          ! (3_1) res0 *= xx0;
 665         orcc    %l3,%l4,%g0
 666         st      %f30,[%fp+tmp4]         ! (2_0) iexp0 = ((int*)&db0)[0];
 667         fand    %f46,DC2,%f38           ! (0_0) h_hi0 = vis_fand(h0,DC2);
 668 
 669         fmuld   K2,%f58,%f54            ! (4_1) res0 = K2 * xx0;
 670         srax    %g1,8,%o5               ! (0_0) iexp0 >>= 8;
 671         lda     [%i1]0x82,%l3           ! (4_0) hx0 = *(int*)px;
 672         fand    %f24,DA0,%f56           ! (3_1) db0 = vis_fand(db0,DA0);
 673 
 674         fmuld   %f36,%f62,%f62          ! (2_1) res0 *= db0;
 675         and     %o5,_0x1ff0,%o5         ! (0_0) di0 = iexp0 & 0x1ff0;
 676         bz,pn   %icc,.update19a
 677         fand    %f18,DC0,%f60           ! (1_0) h0 = vis_fand(db0,DC0);
 678 .cont19a:
 679         ldd     [TBL+%o5],%f22          ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 680         add     %o0,TBL,%g1             ! (3_1) si0 = (char*)sqrt_arr + di0;
 681         and     %l3,_0x7fffffff,%l3     ! (4_0) hx0 &= 0x7fffffff;
 682         fsubd   %f46,%f38,%f38          ! (0_0) xx0 = h0 - h_hi0;
 683 
 684         fsmuld  %f17,%f17,%f24          ! (3_0) dy0 = y0 * (double)y0;
 685         cmp     %l3,_0x7f3504f3         ! (4_0) hx ? 0x7f3504f3
 686         lda     [stridey+%o7]0x82,%l4   ! (4_0) hy0 = *(int*)py;
 687         faddd   %f40,DC1,%f40           ! (3_1) res0 += DC1;
 688 
 689         fmul8x16        SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
 690         bge,pn  %icc,.update20          ! (4_0) if ( hx >= 0x7f3504f3 )
 691         ldd     [%g1+8],%f56            ! (3_1) dtmp0 = ((double*)si0)[1];
 692         faddd   %f54,K1,%f54            ! (4_1) res0 += K1;
 693 
 694         lda     [%i1]0x82,%f17          ! (4_0) x0 = *px;
 695 .cont20:
 696         subcc   counter,5,counter       ! counter -= 5
 697         add     %o4,stridez,%l7         ! pz += stridez
 698         fdtos   %f62,%f14               ! (2_1) ftmp0 = (float)res0;
 699 
 700         fmuld   %f38,%f22,%f38          ! (0_0) xx0 *= dmp0;
 701         and     %l4,_0x7fffffff,%l4     ! (4_0) hy0 &= 0x7fffffff;
 702         st      %f14,[%o4]              ! (2_1) *pz = ftmp0;
 703         for     %f60,DC1,%f46           ! (1_0) h0 = vis_for(h0,DC1);
 704 
 705         ld      [%fp+tmp3],%g1          ! (1_0) iexp0 = ((int*)&db0)[0];
 706         fmuld   %f56,%f40,%f62          ! (3_1) res0 = dtmp0 * res0;
 707         bpos,pt %icc,.main_loop
 708         faddd   %f44,%f24,%f24          ! (3_0) db0 = dx0 + dy0;
 709 
 710         add     counter,5,counter
 711 
 712 .tail:                                  ! pipeline drain: finish and store up to four results still in flight
 713         subcc   counter,1,counter       ! counter--
 714         bneg    .begin                  ! if ( counter < 0 ) goto .begin (label is outside this chunk)
 715         nop                             ! (delay slot)
 716 
 717         fpadd32 %f36,DA1,%f36           ! (3_2) db0 = vis_fpadd32(db0,DA1);
 718 
 719         fmuld   %f54,%f58,%f58          ! (4_2) res0 *= xx0;
 720         fand    %f46,DC2,%f44           ! (1_1) h_hi0 = vis_fand(h0,DC2);
 721 
 722         fmuld   K2,%f38,%f56            ! (0_1) res0 = K2 * xx0;
 723         srax    %g1,8,%g5               ! (1_1) iexp0 >>= 8;
 724         fand    %f20,DA0,%f54           ! (4_2) db0 = vis_fand(db0,DA0);
 725 
 726         fmuld   %f62,%f36,%f62          ! (3_2) res0 *= db0;
 727         and     %g5,_0x1ff0,%g5         ! (1_1) di0 = iexp0 & 0x1ff0;
 728 
 729         ldd     [%g5+TBL],%f22          ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
 730         add     %o3,TBL,%g1             ! (4_2) si0 = (char*)sqrt_arr + di0;
 731         fsubd   %f46,%f44,%f44          ! (1_1) xx0 = h0 - h_hi0;
 732 
 733         faddd   %f58,DC1,%f36           ! (4_2) res0 += DC1;
 734 
 735         faddd   %f56,K1,%f58            ! (0_1) res0 += K1;
 736         ldd     [%g1+8],%f56            ! (4_2) dtmp0 = ((double*)si0)[1];
 737         fmul8x16        SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0);
 738 
 739         fdtos   %f62,%f14               ! (3_2) ftmp0 = (float)res0;
 740 
 741         fmuld   %f44,%f22,%f44          ! (1_1) xx0 *= dtmp0;
 742         add     %l7,stridez,%o7         ! pz += stridez
 743         st      %f14,[%l7]              ! (3_2) *pz = ftmp0;
 744 
 745         subcc   counter,1,counter       ! counter--
 746         bneg    .begin                  ! if ( counter < 0 ) goto .begin
 747         or      %g0,%o7,%l7             ! (delay slot, always executed) %l7 = %o7, the slot after the result just stored
 748 
 749         fmuld   %f56,%f36,%f36          ! (4_2) res0 = dtmp0 * res0;
 750 
 751         fpadd32 %f54,DA1,%f62           ! (4_2) db0 = vis_fpadd32(db0,DA1);
 752 
 753         fmuld   %f58,%f38,%f38          ! (0_1) res0 *= xx0;
 754 
 755         fmuld   K2,%f44,%f56            ! (1_1) res0 = K2 * xx0;
 756         fand    %f0,DA0,%f54            ! (0_1) db0 = vis_fand(db0,DA0);
 757 
 758         fmuld   %f36,%f62,%f62          ! (4_2) res0 *= db0;
 759 
 760         add     %o5,TBL,%o0             ! (0_1) si0 = (char*)sqrt_arr + di0;
 761 
 762         faddd   %f38,DC1,%f36           ! (0_1) res0 += DC1;
 763 
 764         faddd   %f56,K1,%f38            ! (1_1) res0 += K1;
 765         ldd     [%o0+8],%f56            ! (0_1) dtmp0 = ((double*)si0)[1];
 766         fmul8x16        SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0);
 767 
 768         add     %o7,stridez,%i0         ! pz += stridez
 769         fdtos   %f62,%f14               ! (4_2) ftmp0 = (float)res0;
 770 
 771         fmuld   %f56,%f36,%f36          ! (0_1) res0 = dtmp0 * res0;
 772 
 773         fpadd32 %f54,DA1,%f62           ! (0_1) db0 = vis_fpadd32(db0,DA1);
 774 
 775         fmuld   %f38,%f44,%f44          ! (1_1) res0 *= xx0;
 776         add     %i0,stridez,%i3         ! pz += stridez
 777         st      %f14,[%o7]              ! (4_2) *pz = ftmp0;
 778 
 779         subcc   counter,1,counter       ! counter--
 780         bneg    .begin                  ! if ( counter < 0 ) goto .begin
 781         or      %g0,%i0,%l7             ! (delay slot, always executed) %l7 = %i0, the slot after the result just stored
 782 
 783         fand    %f18,DA0,%f56           ! (1_1) db0 = vis_fand(db0,DA0);
 784 
 785         fmuld   %f36,%f62,%f62          ! (0_1) res0 *= db0;
 786 
 787         add     %g5,TBL,%o3             ! (1_1) si0 = (char*)sqrt_arr + di0;
 788 
 789         faddd   %f44,DC1,%f44           ! (1_1) res0 += DC1;
 790 
 791         fmul8x16        SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0);
 792         ldd     [%o3+8],%f56            ! (1_1) dtmp0 = ((double*)si0)[1];
 793 
 794         add     %i3,stridez,%o4         ! pz += stridez
 795         fdtos   %f62,%f14               ! (0_1) ftmp0 = (float)res0;
 796 
 797         st      %f14,[%i0]              ! (0_1) *pz = ftmp0;
 798 
 799         subcc   counter,1,counter       ! counter--
 800         bneg    .begin                  ! if ( counter < 0 ) goto .begin
 801         or      %g0,%i3,%l7             ! (delay slot, always executed) %l7 = %i3, the slot after the result just stored
 802 
 803         fmuld   %f56,%f44,%f62          ! (1_1) res0 = dtmp0 * res0;
 804 
 805         fpadd32 %f36,DA1,%f36           ! (1_1) db0 = vis_fpadd32(db0,DA1);
 806 
 807         fmuld   %f62,%f36,%f62          ! (1_1) res0 *= db0;
 808 
 809         fdtos   %f62,%f14               ! (1_1) ftmp0 = (float)res0;
 810 
 811         st      %f14,[%i3]              ! (1_1) *pz = ftmp0;
 812 
 813         ba      .begin                  ! fourth and last tail result stored; back to .begin
 814         or      %g0,%o4,%l7             ! (delay slot) %l7 = %o4 = %i3 + stridez, the slot after the last store
 815 
 816         .align  16
 817 .spec1:                                 ! special case: store 0 as the result and step to the next element
 818         st      %g0,[%l7]               ! *pz = 0;
 819         add     %l7,stridez,%l7         ! pz += stridez
 820                                         ! NOTE(review): px (%i1) is not advanced in this block; presumably done at the branch site - confirm
 821         add     %i2,stridey,%i2         ! py += stridey
 822         ba      .begin1                 ! continue at .begin1 (label is outside this chunk)
 823         sub     counter,1,counter       ! counter--
 824 
 825         .align  16
 826 .spec:                                  ! scalar path for one element: finite operands use fsqrtd plus an overflow check, Inf/NaN go to 2:
 827         sethi   %hi(0x7f800000),%i0     ! %i0 = 0x7f800000; assumes %l3/%l4 hold hx/hy with the sign bit cleared - set outside this chunk
 828         cmp     %l3,%i0                 ! hx ? 0x7f800000
 829         bge,pt  %icc,2f                 ! if ( hx >= 0x7f800000 )
 830         ld      [%i2],%f8               ! (delay slot, always executed) y = *py
 831 
 832         cmp     %l4,%i0                 ! hy ? 0x7f800000
 833         bge,pt  %icc,2f                 ! if ( hy >= 0x7f800000 )
 834         nop                             ! (delay slot)
 835 
 836         fsmuld  %f17,%f17,%f44          ! x * (double)x
 837         fsmuld  %f8,%f8,%f24            ! y * (double)y
 838         faddd   %f44,%f24,%f24          ! x * (double)x + y * (double)y
 839         fsqrtd  %f24,%f24               ! hyp = sqrt(x * (double)x + y * (double)y);
 840         fcmped  %f24,DFMAX              ! hyp ? DFMAX
 841         fbug,a  1f                      ! if ( hyp > DFMAX ) (fbug is also taken when the compare is unordered)
 842         fmuls   FMAX,FMAX,%f20          ! (annulled delay slot, taken path only) ftmp0 = FMAX * FMAX;
 843 
 844         fdtos   %f24,%f20               ! ftmp0 = (float)hyp;
 845 1:
 846         st      %f20,[%l7]              ! *pz = ftmp0;
 847         add     %l7,stridez,%l7         ! pz += stridez
 848         add     %i1,stridex,%i1         ! px += stridex
 849 
 850         add     %i2,stridey,%i2         ! py += stridey
 851         ba      .begin1                 ! continue at .begin1 (label is outside this chunk)
 852         sub     counter,1,counter       ! counter--
 853 2:
 854         fcmps   %f17,%f8                ! exceptions (the branches below test %icc, not the result of this compare)
 855         cmp     %l3,%i0                 ! hx ? 0x7f800000
 856         be,a    %icc,1f                 ! if ( hx == 0x7f800000 )
 857         st      %i0,[%l7]               ! (annulled delay slot, taken path only) *(int*)pz = 0x7f800000;
 858 
 859         cmp     %l4,%i0                 ! hy ? 0x7f800000
 860         be,a    %icc,1f                 ! if ( hy == 0x7f800000 )
 861         st      %i0,[%l7]               ! (annulled delay slot, taken path only) *(int*)pz = 0x7f800000;
 862 
 863         fmuls   %f17,%f8,%f8            ! x * y
 864         st      %f8,[%l7]               ! *pz = x * y;
 865 
 866 1:
 867         add     %l7,stridez,%l7         ! pz += stridez
 868         add     %i1,stridex,%i1         ! px += stridex
 869 
 870         add     %i2,stridey,%i2         ! py += stridey
 871         ba      .begin1                 ! continue at .begin1 (label is outside this chunk)
 872         sub     counter,1,counter       ! counter--
 873 
 874         .align  16
 875 .update0:                               ! if counter > 1: save tmp_px, tmp_py and tmp_counter = counter-1, continue with counter = 1
 876         cmp     counter,1               ! counter ? 1
 877         ble     .cont0                  ! if ( counter <= 1 ) goto .cont0 with nothing saved
 878         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 879 
 880         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 881 
 882         add     %o7,stridey,%i5         ! %i5 = %o7 + stridey
 883         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
 884 
 885         sub     counter,1,counter       ! counter -= 1
 886         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 887 
 888         ba      .cont0                  ! back to .cont0 (label is outside this chunk)
 889         or      %g0,1,counter           ! (delay slot) counter = 1
 890 
 891         .align  16
 892 .update1:                               ! if counter > 1: save tmp_px, tmp_py and tmp_counter = counter-1, continue with counter = 1
 893         cmp     counter,1               ! counter ? 1
 894         ble     .cont1                  ! if ( counter <= 1 ) goto .cont1 with nothing saved
 895         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 896 
 897         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 898         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
 899 
 900         sub     counter,1,counter       ! counter -= 1
 901         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 902 
 903         ba      .cont1                  ! back to .cont1 (label is outside this chunk)
 904         or      %g0,1,counter           ! (delay slot) counter = 1
 905 
 906         .align  16
 907 .update2:                               ! if counter > 2: save tmp_px, tmp_py and tmp_counter = counter-2, continue with counter = 2
 908         cmp     counter,2               ! counter ? 2
 909         ble     .cont2                  ! if ( counter <= 2 ) goto .cont2 with nothing saved
 910         fzeros  %f8                     ! (delay slot, always executed) %f8 = 0 (this handler clears %f8, not %f17)
 911 
 912         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 913         stx     %o4,[%fp+tmp_py]        ! tmp_py = %o4
 914 
 915         sub     counter,2,counter       ! counter -= 2
 916         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 917 
 918         ba      .cont2                  ! back to .cont2 (label is outside this chunk)
 919         or      %g0,2,counter           ! (delay slot) counter = 2
 920 
 921         .align  16
 922 .update3:                               ! if counter > 2: save tmp_px, tmp_py and tmp_counter = counter-2, continue with counter = 2
 923         cmp     counter,2               ! counter ? 2
 924         ble     .cont3                  ! if ( counter <= 2 ) goto .cont3 with nothing saved
 925         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 926 
 927         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 928         stx     %o4,[%fp+tmp_py]        ! tmp_py = %o4
 929 
 930         sub     counter,2,counter       ! counter -= 2
 931         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 932 
 933         ba      .cont3                  ! back to .cont3 (label is outside this chunk)
 934         or      %g0,2,counter           ! (delay slot) counter = 2
 935 
 936         .align  16
 937 .update4:                               ! if counter > 3: save tmp_px, tmp_py and tmp_counter = counter-3, continue with counter = 3
 938         cmp     counter,3               ! counter ? 3
 939         ble     .cont4                  ! if ( counter <= 3 ) goto .cont4 with nothing saved
 940         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 941 
 942         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 943         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
 944 
 945         sub     counter,3,counter       ! counter -= 3
 946         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 947 
 948         ba      .cont4                  ! back to .cont4 (label is outside this chunk)
 949         or      %g0,3,counter           ! (delay slot) counter = 3
 950 
 951         .align  16
 952 .update5:                               ! if counter > 3: save tmp_px, tmp_py and tmp_counter = counter-3, continue with counter = 3
 953         cmp     counter,3               ! counter ? 3
 954         ble     .cont5                  ! if ( counter <= 3 ) goto .cont5 with nothing saved
 955         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 956 
 957         sub     %i1,stridex,%i2         ! %i2 = %i1 - stridex (overwrites %i2)
 958         stx     %i2,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
 959         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
 960 
 961         sub     counter,3,counter       ! counter -= 3
 962         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 963 
 964         ba      .cont5                  ! back to .cont5 (label is outside this chunk)
 965         or      %g0,3,counter           ! (delay slot) counter = 3
 966 
 967         .align  16
 968 .update6:                               ! if counter > 4: save tmp_px, tmp_py and tmp_counter = counter-4, continue with counter = 4
 969         cmp     counter,4               ! counter ? 4
 970         ble     .cont6                  ! if ( counter <= 4 ) goto .cont6 with nothing saved
 971         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 972 
 973         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
 974         stx     %i2,[%fp+tmp_py]        ! tmp_py = %i2
 975 
 976         sub     counter,4,counter       ! counter -= 4
 977         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 978 
 979         ba      .cont6                  ! back to .cont6 (label is outside this chunk)
 980         or      %g0,4,counter           ! (delay slot) counter = 4
 981 
 982         .align  16
 983 .update7:                               ! if counter > 4: save tmp_px, tmp_py and tmp_counter = counter-4, continue with counter = 4
 984         cmp     counter,4               ! counter ? 4
 985         ble     .cont7                  ! if ( counter <= 4 ) goto .cont7 with nothing saved
 986         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
 987 
 988         sub     %i1,stridex,%o7         ! %o7 = %i1 - stridex (overwrites %o7)
 989         stx     %o7,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
 990         stx     %i2,[%fp+tmp_py]        ! tmp_py = %i2
 991 
 992         sub     counter,4,counter       ! counter -= 4
 993         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
 994 
 995         ba      .cont7                  ! back to .cont7 (label is outside this chunk)
 996         or      %g0,4,counter           ! (delay slot) counter = 4
 997 
 998         .align  16
 999 .update8:                               ! if counter > 5: save tmp_px, tmp_py and tmp_counter = counter-5, continue with counter = 5
1000         cmp     counter,5               ! counter ? 5
1001         ble     .cont8                  ! if ( counter <= 5 ) goto .cont8 with nothing saved
1002         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1003 
1004         sub     %i1,stridex,%o5         ! %o5 = %i1 - stridex (overwrites %o5)
1005         stx     %o5,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1006         stx     %o7,[%fp+tmp_py]        ! tmp_py = %o7
1007 
1008         sub     counter,5,counter       ! counter -= 5
1009         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1010 
1011         ba      .cont8                  ! back to .cont8 (label is outside this chunk)
1012         or      %g0,5,counter           ! (delay slot) counter = 5
1013 
1014         .align  16
1015 .update9:                               ! if counter > 5: save tmp_px, tmp_py and tmp_counter = counter-5, continue with counter = 5
1016         cmp     counter,5               ! counter ? 5
1017         ble     .cont9                  ! if ( counter <= 5 ) goto .cont9 with nothing saved
1018         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1019 
1020         sub     %i1,stridex,%o5         ! %o5 = %i1 - stridex (overwrites %o5)
1021         stx     %o5,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1022         stx     %o7,[%fp+tmp_py]        ! tmp_py = %o7
1023 
1024         sub     counter,5,counter       ! counter -= 5
1025         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1026 
1027         ba      .cont9                  ! back to .cont9 (label is outside this chunk)
1028         or      %g0,5,counter           ! (delay slot) counter = 5
1029 
1030         .align  16
1031 .update10:                              ! if counter > 6: save tmp_px, tmp_py and tmp_counter = counter-6, continue with counter = 6
1032         fmul8x16        SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0);
1033         and     %l4,_0x7fffffff,%l4     ! (4_0) hy0 &= 0x7fffffff;
1034         ldd     [%g1+8],%f56            ! (3_1) dtmp0 = ((double*)si0)[1];
1035         faddd   %f54,K1,%f54            ! (4_1) res0 += K1;
1036                                         ! NOTE(review): the four instructions above presumably replay work the branch to .update10 jumps over; branch site not in view
1037         cmp     counter,6               ! counter ? 6
1038         ble     .cont10                 ! if ( counter <= 6 ) goto .cont10 with nothing saved
1039         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1040 
1041         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1042         add     %o7,stridey,%i5         ! %i5 = %o7 + stridey
1043         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
1044 
1045         sub     counter,6,counter       ! counter -= 6
1046         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1047 
1048         ba      .cont10                 ! back to .cont10 (label is outside this chunk)
1049         or      %g0,6,counter           ! (delay slot) counter = 6
1050 
1051         .align  16
1052 .update11:                              ! if counter > 1: save tmp_px, tmp_py and tmp_counter = counter-1, continue with counter = 1
1053         cmp     counter,1               ! counter ? 1
1054         ble     .cont11                 ! if ( counter <= 1 ) goto .cont11 with nothing saved
1055         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1056 
1057         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1058         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
1059 
1060         sub     counter,1,counter       ! counter -= 1
1061         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1062 
1063         ba      .cont11                 ! back to .cont11 (label is outside this chunk)
1064         or      %g0,1,counter           ! (delay slot) counter = 1
1065 
1066         .align  16
1067 .update12:                              ! if counter > 2: save tmp_px, tmp_py and tmp_counter = counter-2, continue with counter = 2
1068         cmp     counter,2               ! counter ? 2
1069         ble     .cont12                 ! if ( counter <= 2 ) goto .cont12 with nothing saved
1070         fzeros  %f8                     ! (delay slot, always executed) %f8 = 0 (this handler clears %f8, not %f17)
1071 
1072         stx     %i0,[%fp+tmp_px]        ! tmp_px = %i0
1073         add     %i5,stridey,%o4         ! %o4 = %i5 + stridey
1074         stx     %o4,[%fp+tmp_py]        ! tmp_py = %o4
1075 
1076         sub     counter,2,counter       ! counter -= 2
1077         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1078 
1079         ba      .cont12                 ! back to .cont12 (label is outside this chunk)
1080         or      %g0,2,counter           ! (delay slot) counter = 2
1081 
1082         .align  16
1083 .update13:                              ! if counter > 2: save tmp_px, tmp_py and tmp_counter = counter-2, continue with counter = 2
1084         cmp     counter,2               ! counter ? 2
1085         ble     .cont13                 ! if ( counter <= 2 ) goto .cont13 with nothing saved
1086         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1087 
1088         stx     %i0,[%fp+tmp_px]        ! tmp_px = %i0
1089         stx     %o4,[%fp+tmp_py]        ! tmp_py = %o4
1090 
1091         sub     counter,2,counter       ! counter -= 2
1092         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1093 
1094         ba      .cont13                 ! back to .cont13 (label is outside this chunk)
1095         or      %g0,2,counter           ! (delay slot) counter = 2
1096 
1097         .align  16
1098 .update14:                              ! if counter > 3: save tmp_px, tmp_py and tmp_counter = counter-3, continue with counter = 3
1099         cmp     counter,3               ! counter ? 3
1100         ble     .cont14                 ! if ( counter <= 3 ) goto .cont14 with nothing saved
1101         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1102 
1103         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1104         add     %o4,stridey,%i5         ! %i5 = %o4 + stridey
1105         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
1106 
1107         sub     counter,3,counter       ! counter -= 3
1108         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1109 
1110         ba      .cont14                 ! back to .cont14 (label is outside this chunk)
1111         or      %g0,3,counter           ! (delay slot) counter = 3
1112 
1113         .align  16
1114 .update15:                              ! if counter > 3: save tmp_px, tmp_py and tmp_counter = counter-3, continue with counter = 3
1115         cmp     counter,3               ! counter ? 3
1116         ble     .cont15                 ! if ( counter <= 3 ) goto .cont15 with nothing saved
1117         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1118 
1119         sub     %i1,stridex,%i2         ! %i2 = %i1 - stridex (overwrites %i2)
1120         stx     %i2,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1121         stx     %i5,[%fp+tmp_py]        ! tmp_py = %i5
1122 
1123         sub     counter,3,counter       ! counter -= 3
1124         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1125 
1126         ba      .cont15                 ! back to .cont15 (label is outside this chunk)
1127         or      %g0,3,counter           ! (delay slot) counter = 3
1128 
1129         .align  16
1130 .update16:                              ! if counter > 4: save tmp_px, tmp_py and tmp_counter = counter-4, continue with counter = 4
1131         faddd   %f40,%f32,%f18          ! (1_0) db0 = dx0 + dy0; NOTE(review): presumably replays an instruction the branch here jumps over - branch site not in view
1132         cmp     counter,4               ! counter ? 4
1133         ble     .cont16                 ! if ( counter <= 4 ) goto .cont16 with nothing saved
1134         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1135 
1136         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1137         stx     %i2,[%fp+tmp_py]        ! tmp_py = %i2
1138 
1139         sub     counter,4,counter       ! counter -= 4
1140         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1141 
1142         ba      .cont16                 ! back to .cont16 (label is outside this chunk)
1143         or      %g0,4,counter           ! (delay slot) counter = 4
1144 
1145         .align  16
1146 .update17:                              ! if counter > 4: save tmp_px, tmp_py and tmp_counter = counter-4, continue with counter = 4
1147         cmp     counter,4               ! counter ? 4
1148         ble     .cont17                 ! if ( counter <= 4 ) goto .cont17 with nothing saved
1149         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1150 
1151         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1152         stx     %i2,[%fp+tmp_py]        ! tmp_py = %i2
1153 
1154         sub     counter,4,counter       ! counter -= 4
1155         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1156 
1157         ba      .cont17                 ! back to .cont17 (label is outside this chunk)
1158         or      %g0,4,counter           ! (delay slot) counter = 4
1159 
1160         .align  16
1161 .update18:                              ! if counter > 5: save tmp_px, tmp_py and tmp_counter = counter-5, continue with counter = 5
1162         cmp     counter,5               ! counter ? 5
1163         ble     .cont18                 ! if ( counter <= 5 ) goto .cont18 with nothing saved
1164         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1165                                         ! NOTE(review): %l7 is used as pz in .tail; presumably it holds an x pointer at this branch site - confirm
1166         stx     %l7,[%fp+tmp_px]        ! tmp_px = %l7
1167         stx     %o7,[%fp+tmp_py]        ! tmp_py = %o7
1168 
1169         sub     counter,5,counter       ! counter -= 5
1170         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1171 
1172         ba      .cont18                 ! back to .cont18 (label is outside this chunk)
1173         or      %g0,5,counter           ! (delay slot) counter = 5
1174 
1175         .align  16
1176 .update19:                              ! taken when hy >= 0x7f3504f3; if counter > 5: save tmp_px, tmp_py, tmp_counter = counter-5, continue with counter = 5
1177         fpadd32 %f40,DA1,%f62           ! (2_1) db0 = vis_fpadd32(db0,DA1); same instruction the branch skips ahead of .cont19
1178         cmp     counter,5               ! counter ? 5
1179         ble     .cont19                 ! if ( counter <= 5 ) goto .cont19 with nothing saved
1180         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0, replacing the y0 loaded in the delay slot of the branch here
1181                                         ! NOTE(review): %l7 is used as pz in .tail; presumably it holds an x pointer at this branch site - confirm
1182         stx     %l7,[%fp+tmp_px]        ! tmp_px = %l7
1183         stx     %o7,[%fp+tmp_py]        ! tmp_py = %o7
1184 
1185         sub     counter,5,counter       ! counter -= 5
1186         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1187 
1188         ba      .cont19                 ! back into the main loop at .cont19
1189         or      %g0,5,counter           ! (delay slot) counter = 5
1190 
1191         .align  16
1192 .update19a:                             ! taken when (hx | hy) == 0; if counter > 5: save tmp_px, tmp_py, tmp_counter = counter-5, continue with counter = 5
1193         cmp     counter,5               ! counter ? 5
1194         ble     .cont19a                ! if ( counter <= 5 ) goto .cont19a with nothing saved
1195         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0
1196                                         ! NOTE(review): %l7 is used as pz in .tail; presumably it holds an x pointer at this branch site - confirm
1197         stx     %l7,[%fp+tmp_px]        ! tmp_px = %l7
1198         stx     %o7,[%fp+tmp_py]        ! tmp_py = %o7
1199 
1200         sub     counter,5,counter       ! counter -= 5
1201         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1202 
1203         ba      .cont19a                ! back into the main loop at .cont19a
1204         or      %g0,5,counter           ! (delay slot) counter = 5
1205 
1206         .align  16
1207 .update20:                              ! taken when hx >= 0x7f3504f3; if counter > 6: save tmp_px, tmp_py, tmp_counter = counter-6, continue with counter = 6
1208         faddd   %f54,K1,%f54            ! (4_1) res0 += K1; same instruction the branch skips ahead of .cont20
1209         cmp     counter,6               ! counter ? 6
1210         ble     .cont20                 ! if ( counter <= 6 ) goto .cont20 with nothing saved
1211         fzeros  %f17                    ! (delay slot, always executed) %f17 = 0, in place of the skipped load x0 = *px
1212 
1213         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1
1214         add     %o7,stridey,%g1         ! %g1 = %o7 + stridey
1215         stx     %g1,[%fp+tmp_py]        ! tmp_py = %g1
1216 
1217         sub     counter,6,counter       ! counter -= 6
1218         st      counter,[%fp+tmp_counter]  ! tmp_counter = counter
1219 
1220         ba      .cont20                 ! back into the main loop at .cont20
1221         or      %g0,6,counter           ! (delay slot) counter = 6
1222 
1223 .exit:                                  ! function epilogue
1224         ret                             ! return to the caller
1225         restore                         ! (delay slot) restore the caller register window
1226         SET_SIZE(__vhypotf)
1227