1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vhypot.S"
  30 
  31 #include "libm.h"
  32 
  33         RO_DATA
  34         .align  64
  35 
  36 .CONST_TBL:
        /*
         * Double-precision constants, each stored as a (high word, low word)
         * pair of 32-bit words and fetched with ldd at function entry.
         *
         *   DC0     0x7ff00000:0  exponent-field mask; "fand dmax,DC0" keeps
         *                         only the exponent bits of dmax
         *   DC1     0x7fe00000:0  upper bound used by the fcmple32 range
         *                         test, and the minuend of the fpsub32 that
         *                         forms dnorm
         *   DC2     0x00100000:0  lower bound used by the fcmpgt32 range test
         *   D2ON28  2**28         added and then subtracted to split a value
         *                         into its _hi part (the _lo part is the rest)
         *   DC3     0x7fd00000:0  loaded into %f0 at entry; its use is not
         *                         visible in this part of the file
         */
  37         .word   0x7ff00000, 0   ! DC0
  38         .word   0x7fe00000, 0   ! DC1
  39         .word   0x00100000, 0   ! DC2
  40         .word   0x41b00000, 0   ! D2ON28 = 268435456.0
  41         .word   0x7fd00000, 0   ! DC3
  42 
  43 #define counter         %i0	/* remaining element count; tested against 0 at .begin1 */
  44 #define tmp_counter     %l3	/* count copied into counter at .begin, then cleared */
  45 #define tmp_px          %l5	/* x pointer copied into %i1 at .begin */
  46 #define tmp_py          %o7	/* y pointer copied into %i3 at .begin */
  47 #define stridex         %i2	/* px step: incoming %i2 shifted left by 3 at entry */
  48 #define stridey         %i4	/* py step: incoming %i4 shifted left by 3 at entry */
  49 #define stridez         %l0	/* pz step: loaded from [%fp+...] at entry, shifted left by 3 */
  50 
  51 #define DC0             %f8	/* double loaded from .CONST_TBL+0 (0x7ff00000:0) */
  52 #define DC0_HI          %f8	/* single-precision register holding the first word of DC0 */
  53 #define DC0_LO          %f9	/* single-precision register holding the second word of DC0 */
  54 #define DC1             %f46	/* double loaded from .CONST_TBL+8 (0x7fe00000:0) */
  55 #define DC2             %f48	/* double loaded from .CONST_TBL+16 (0x00100000:0) */
  56 #define DC3             %f0	/* double loaded from .CONST_TBL+32 (0x7fd00000:0) */
  57 #define D2ON28          %f62	/* double loaded from .CONST_TBL+24 (268435456.0) */
  58 
  59 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  60 !      !!!!!   algorithm   !!!!!
  61 !  ((float*)&x)[0] = ((float*)px)[0];
  62 !  ((float*)&x)[1] = ((float*)px)[1];
  63 !
  64 !  ((float*)&y)[0] = ((float*)py)[0];
  65 !  ((float*)&y)[1] = ((float*)py)[1];
  66 !
  67 !  x = fabs(x);
  68 !  y = fabs(y);
  69 !
  70 !  c0 = vis_fcmple32(DC1,x);
  71 !  c2 = vis_fcmple32(DC1,y);
  72 !  c1 = vis_fcmpgt32(DC2,x);
  73 !  c3 = vis_fcmpgt32(DC2,y);
  74 !
  75 !  c0 |= c2;
  76 !  c1 &= c3;
  77 !  if ( (c0 & 2) != 0 )
  78 !  {
  79 !    lx = ((int*)px)[1];
  80 !    ly = ((int*)py)[1];
  81 !    hx = *(int*)px;
  82 !    hy = *(int*)py;
  83 !
  84 !    hx &= 0x7fffffff;
  85 !    hy &= 0x7fffffff;
  86 !
  87 !    j0 = hx;
  88 !    if ( j0 < hy ) j0 = hy;
  89 !    j0 &= 0x7ff00000;
  90 !    if ( j0 >= 0x7ff00000 )
  91 !    {
  92 !      if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x;
  93 !      else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y;
  94 !      else res = x * y;
  95 !
  96 !      ((float*)pz)[0] = ((float*)&res)[0];
  97 !      ((float*)pz)[1] = ((float*)&res)[1];
  98 !    }
  99 !    else
 100 !    {
 101 !      diff = hy - hx;
 102 !      j0 = diff >> 31;
 103 !      if ( ((diff ^ j0) - j0) < 0x03600000 )
 104 !      {
 105 !        x *= D2ONM1022;
 106 !        y *= D2ONM1022;
 107 !
 108 !        x_hi = ( x + D2ON28 ) - D2ON28;
 109 !        x_lo = x - x_hi;
 110 !        y_hi = ( y + D2ON28 ) - D2ON28;
 111 !        y_lo = y - y_hi;
 112 !        res = (x_hi * x_hi + y_hi * y_hi);
 113 !        res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
 114 !
 115 !        res = sqrt(res);
 116 !
 117 !        res = D2ONP1022 * res;
 118 !        ((float*)pz)[0] = ((float*)&res)[0];
 119 !        ((float*)pz)[1] = ((float*)&res)[1];
 120 !      }
 121 !      else
 122 !      {
 123 !        res = x + y;
 124 !        ((float*)pz)[0] = ((float*)&res)[0];
 125 !        ((float*)pz)[1] = ((float*)&res)[1];
 126 !      }
 127 !    }
 128 !    px += stridex;
 129 !    py += stridey;
 130 !    pz += stridez;
 131 !    continue;
 132 !  }
 133 !  if ( (c1 & 2) != 0 )
 134 !  {
 135 !    x *= D2ONP1022;
 136 !    y *= D2ONP1022;
 137 !
 138 !    x_hi = ( x + D2ON28 ) - D2ON28;
 139 !    x_lo = x - x_hi;
 140 !    y_hi = ( y + D2ON28 ) - D2ON28;
 141 !    y_lo = y - y_hi;
 142 !    res = (x_hi * x_hi + y_hi * y_hi);
 143 !    res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
 144 !
 145 !    res = sqrt(res);
 146 !
 147 !    res = D2ONM1022 * res;
 148 !    ((float*)pz)[0] = ((float*)&res)[0];
 149 !    ((float*)pz)[1] = ((float*)&res)[1];
 150 !    px += stridex;
 151 !    py += stridey;
 152 !    pz += stridez;
 153 !    continue;
 154 !  }
 155 !
 156 !  dmax = x;
 157 !  if ( dmax < y ) dmax = y;
 158 !
 159 !  dmax = vis_fand(dmax,DC0);
 160 !  dnorm = vis_fpsub32(DC1,dmax);
 161 !
 162 !  x *= dnorm;
 163 !  y *= dnorm;
 164 !
 165 !  x_hi = x + D2ON28;
 166 !  x_hi -= D2ON28;
 167 !  x_lo = x - x_hi;
 168 !
 169 !  y_hi = y + D2ON28;
 170 !  y_hi -= D2ON28;
 171 !  y_lo = y - y_hi;
 172 !
 173 !  res = x_hi * x_hi;
 174 !  dtmp1 = x + x_hi;
 175 !  dtmp0 = y_hi * y_hi;
 176 !  dtmp2 = y + y_hi;
 177 !
 178 !  res += dtmp0;
 179 !  dtmp1 *= x_lo;
 180 !  dtmp2 *= y_lo;
 181 !  dtmp1 += dtmp2;
 182 !  res += dtmp1;
 183 !
 184 !  res = sqrt(res);
 185 !
 186 !  res = dmax * res;
 187 !  ((float*)pz)[0] = ((float*)&res)[0];
 188 !  ((float*)pz)[1] = ((float*)&res)[1];
 189 !
 190 !  px += stridex;
 191 !  py += stridey;
 192 !  pz += stridez;
 193 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 194 
 195         ENTRY(__vhypot)
 196         save    %sp,-SA(MINFRAME),%sp
 197         PIC_SETUP(l7)
 198         PIC_SET(l7,.CONST_TBL,o3)
 199         wr      %g0,0x82,%asi
 200 
 201 #ifdef __sparcv9
 202         ldx     [%fp+STACK_BIAS+176],%l0
 203 #else
 204         ld      [%fp+STACK_BIAS+92],%l0
 205 #endif
 206         ldd     [%o3],DC0
 207         sll     %i2,3,stridex
 208         mov     %i0,tmp_counter
 209 
 210         ldd     [%o3+8],DC1
 211         sll     %i4,3,stridey
 212         mov     %i1,tmp_px
 213 
 214         ldd     [%o3+16],DC2
 215         sll     %l0,3,stridez
 216         mov     %i3,tmp_py
 217 
 218         ldd     [%o3+24],D2ON28
 219 
 220         ldd     [%o3+32],DC3
 221 
 222 .begin:
 223         mov     tmp_counter,counter
 224         mov     tmp_px,%i1
 225         mov     tmp_py,%i3
 226         clr     tmp_counter
 227 .begin1:
 228         cmp     counter,0
 229         ble,pn  %icc,.exit
 230         nop
 231 
 232         lda     [%i1]%asi,%o0
 233         sethi   %hi(0x7ffffc00),%o5
 234 
 235         lda     [%i3]%asi,%o2
 236         add     %o5,1023,%o5
 237 
 238         lda     [%i1]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
 239 
 240         lda     [%i1+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
 241         add     %i1,stridex,%o1         ! px += stridex
 242 
 243         lda     [%i3]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
 244         sethi   %hi(0x00100000),%l7
 245         and     %o0,%o5,%o0
 246 
 247         lda     [%i3+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
 248         and     %o2,%o5,%o2
 249         sethi   %hi(0x7fe00000),%l6
 250 
 251         fabsd   %f26,%f36               ! (1_0) x = fabs(x);
 252         cmp     %o0,%o2
 253         mov     %o2,%l4
 254 
 255         fabsd   %f24,%f54               ! (1_0) y = fabs(y);
 256         add     %i3,stridey,%o5         ! py += stridey
 257         movg    %icc,%o0,%o2
 258         lda     [%o5]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];
 259 
 260         cmp     %o2,%l6
 261         sethi   %hi(0x7ff00000),%o4
 262         bge,pn  %icc,.spec0
 263         lda     [%o5+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
 264 
 265         cmp     %o2,%l7
 266         bl,pn   %icc,.spec1
 267         nop
 268         lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
 269 
 270         lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
 271         add     %i3,stridey,%i3         ! py += stridey
 272 
 273         fabsd   %f28,%f34               ! (2_0) y = fabs(y);
 274 
 275         fabsd   %f26,%f50               ! (2_0) x = fabs(x);
 276 
 277         fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);
 278 
 279         fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);
 280 
 281         fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);
 282 
 283         fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);
 284 
 285         or      %o3,%o0,%o3             ! (2_0) c0 |= c2;
 286 
 287         andcc   %o3,2,%g0               ! (2_0) c0 & 2
 288         bnz,pn  %icc,.update0           ! (2_0) if ( (c0 & 2) != 0 )
 289         and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
 290 .cont0:
 291         add     %i3,stridey,%l4         ! py += stridey
 292         andcc   %o4,2,%g0               ! (2_0) c1 & 2
 293         bnz,pn  %icc,.update1           ! (2_0) if ( (c1 & 2) != 0 )
 294         fmovd   %f36,%f56               ! (1_0) dmax = x;
 295 .cont1:
 296         lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
 297         add     %o1,stridex,%l2         ! px += stridex
 298 
 299         lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
 300 
 301         lda     [%l2]%asi,%f18          ! (3_1) ((float*)&x)[0] = ((float*)px)[0];
 302 
 303         lda     [%l2+4]%asi,%f19        ! (3_1) ((float*)&x)[1] = ((float*)px)[1];
 304 
 305         fabsd   %f30,%f30               ! (3_1) y = fabs(y);
 306 
 307         fabsd   %f18,%f18               ! (3_1) x = fabs(x);
 308 
 309         fcmped  %fcc2,%f54,%f56         ! (1_1) dmax ? y
 310 
 311         fmovdg  %fcc2,%f54,%f56         ! (1_1) if ( dmax < y ) dmax = y;
 312 
 313         fcmple32        DC1,%f18,%o3    ! (3_1) c0 = vis_fcmple32(DC1,x);
 314 
 315         fcmple32        DC1,%f30,%o0    ! (3_1) c2 = vis_fcmple32(DC1,y);
 316 
 317         fcmpgt32        DC2,%f18,%o4    ! (3_1) c1 = vis_fcmpgt32(DC2,x);
 318 
 319         fcmpgt32        DC2,%f30,%o1    ! (3_1) c3 = vis_fcmpgt32(DC2,y);
 320 
 321         fand    %f56,DC0,%f38           ! (1_1) dmax = vis_fand(dmax,DC0);
 322 
 323         or      %o3,%o0,%o3             ! (3_1) c0 |= c2;
 324 
 325         andcc   %o3,2,%g0               ! (3_1) c0 & 2
 326         bnz,pn  %icc,.update2           ! (3_1) if ( (c0 & 2) != 0 )
 327         and     %o4,%o1,%o4             ! (3_1) c1 &= c3;
 328 .cont2:
 329         add     %l4,stridey,%i3         ! py += stridey
 330         andcc   %o4,2,%g0               ! (3_1) c1 & 2
 331         bnz,pn  %icc,.update3           ! (3_1) if ( (c1 & 2) != 0 )
 332         fmovd   %f50,%f32               ! (2_1) dmax = x;
 333 .cont3:
 334         fpsub32 DC1,%f38,%f10           ! (1_1) dnorm = vis_fpsub32(DC1,dmax);
 335         lda     [%i3]%asi,%f20          ! (0_0) ((float*)&y)[0] = ((float*)py)[0];
 336 
 337         lda     [%i3+4]%asi,%f21        ! (0_0) ((float*)&y)[1] = ((float*)py)[1];
 338 
 339         add     %l2,stridex,%l1         ! px += stridex
 340 
 341         fmuld   %f36,%f10,%f36          ! (1_1) x *= dnorm;
 342         lda     [%l1]%asi,%f22          ! (0_0) ((float*)&x)[0] = ((float*)px)[0]
 343 
 344         lda     [%l1+4]%asi,%f23        ! (0_0) ((float*)&x)[1] = ((float*)px)[1];
 345 
 346         fmuld   %f54,%f10,%f56          ! (1_1) y *= dnorm;
 347         fabsd   %f20,%f40               ! (0_0) y = fabs(y);
 348 
 349         fabsd   %f22,%f20               ! (0_0) x = fabs(x);
 350 
 351         fcmped  %fcc3,%f34,%f32         ! (2_1) dmax ? y
 352 
 353 
 354         fmovdg  %fcc3,%f34,%f32         ! (2_1) if ( dmax < y ) dmax = y;
 355 
 356         faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
 357         fcmple32        DC1,%f20,%g5    ! (0_0) c0 = vis_fcmple32(DC1,x);
 358 
 359         faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
 360         fcmple32        DC1,%f40,%o2    ! (0_0) c2 = vis_fcmple32(DC1,y);
 361 
 362         fcmpgt32        DC2,%f20,%g1    ! (0_0) c1 = vis_fcmpgt32(DC2,x);
 363 
 364         fcmpgt32        DC2,%f40,%o4    ! (0_0) c3 = vis_fcmpgt32(DC2,y);
 365 
 366         fand    %f32,DC0,%f52           ! (2_1) dmax = vis_fand(dmax,DC0);
 367 
 368         or      %g5,%o2,%g5             ! (0_0) c0 |= c2;
 369         fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;
 370 
 371         andcc   %g5,2,%g0               ! (0_0) c0 & 2
 372         bnz,pn  %icc,.update4           ! (0_0) if ( (c0 & 2) != 0 )
 373         fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
 374 .cont4:
 375         and     %g1,%o4,%g1             ! (0_0) c1 &= c3;
 376 
 377         add     %i3,stridey,%l2         ! py += stridey
 378         andcc   %g1,2,%g0               ! (0_0) c1 & 2
 379         bnz,pn  %icc,.update5           ! (0_0) if ( (c1 & 2) != 0 )
 380         fmovd   %f18,%f44               ! (3_1) dmax = x;
 381 .cont5:
 382         fpsub32 DC1,%f52,%f10           ! (2_1) dnorm = vis_fpsub32(DC1,dmax);
 383         lda     [%l2]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
 384 
 385         fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
 386         lda     [%l2+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
 387         add     %l1,stridex,%l7         ! px += stridex
 388         faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;
 389 
 390         faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;
 391         lda     [%l7]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
 392 
 393         fmuld   %f50,%f10,%f50          ! (2_1) x *= dnorm;
 394         fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
 395         lda     [%l7+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
 396 
 397         fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
 398         fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;
 399 
 400         fmuld   %f34,%f10,%f34          ! (2_1) y *= dnorm;
 401         fabsd   %f24,%f54               ! (1_0) y = fabs(y);
 402 
 403         fabsd   %f26,%f36               ! (1_0) x = fabs(x);
 404 
 405         fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
 406         fcmped  %fcc0,%f30,%f44         ! (3_1) dmax ? y
 407 
 408         fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;
 409 
 410         fmovdg  %fcc0,%f30,%f44         ! (3_1) if ( dmax < y ) dmax = y;
 411 
 412         faddd   %f50,D2ON28,%f58        ! (2_1) x_hi = x + D2ON28;
 413         fcmple32        DC1,%f36,%g1    ! (1_0) c0 = vis_fcmple32(DC1,x);
 414 
 415         faddd   %f34,D2ON28,%f22        ! (2_1) y_hi = y + D2ON28;
 416         fcmple32        DC1,%f54,%g5    ! (1_0) c2 = vis_fcmple32(DC1,y);
 417 
 418         faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
 419         fcmpgt32        DC2,%f36,%o5    ! (1_0) c1 = vis_fcmpgt32(DC2,x);
 420 
 421         faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
 422         fcmpgt32        DC2,%f54,%o1    ! (1_0) c3 = vis_fcmpgt32(DC2,y);
 423 
 424         fand    %f44,DC0,%f14           ! (3_1) dmax = vis_fand(dmax,DC0);
 425 
 426         or      %g1,%g5,%g1             ! (1_0) c0 |= c2;
 427         fsubd   %f58,D2ON28,%f44        ! (2_1) x_hi -= D2ON28;
 428 
 429         andcc   %g1,2,%g0               ! (1_0) c0 & 2
 430         bnz,pn  %icc,.update6           ! (1_0) if ( (c0 & 2) != 0 )
 431         fsubd   %f22,D2ON28,%f58        ! (2_1) y_hi -= D2ON28;
 432 .cont6:
 433         and     %o5,%o1,%o5             ! (1_0) c1 &= c3;
 434         faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;
 435 
 436         add     %l2,stridey,%i3         ! py += stridey
 437         andcc   %o5,2,%g0               ! (1_0) c1 & 2
 438         bnz,pn  %icc,.update7           ! (1_0) if ( (c1 & 2) != 0 )
 439         fmovd   %f20,%f4                ! (0_0) dmax = x;
 440 .cont7:
 441         fpsub32 DC1,%f14,%f10           ! (3_1) dnorm = vis_fpsub32(DC1,dmax);
 442         lda     [%i3]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];
 443 
 444         fmuld   %f44,%f44,%f2           ! (2_1) res = x_hi * x_hi;
 445         lda     [%i3+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
 446         add     %l7,stridex,%o1         ! px += stridex
 447         faddd   %f34,%f58,%f60          ! (2_1) dtmp2 = y + y_hi;
 448 
 449         fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
 450         lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
 451         faddd   %f50,%f44,%f56          ! (2_1) dtmp1 = x + x_hi;
 452 
 453         fmuld   %f18,%f10,%f6           ! (3_1) x *= dnorm;
 454         fsubd   %f50,%f44,%f18          ! (2_1) x_lo = x - x_hi;
 455         lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
 456 
 457         fmuld   %f58,%f58,%f44          ! (2_1) dtmp0 = y_hi * y_hi;
 458         fsubd   %f34,%f58,%f22          ! (2_1) y_lo = y - y_hi;
 459 
 460         fmuld   %f30,%f10,%f58          ! (3_1) y *= dnorm;
 461         fabsd   %f28,%f34               ! (2_0) y = fabs(y);
 462 
 463         fabsd   %f26,%f50               ! (2_0) x = fabs(x);
 464 
 465         fmuld   %f56,%f18,%f10          ! (2_1) dtmp1 *= x_lo;
 466         fcmped  %fcc1,%f40,%f4          ! (0_0) dmax ? y
 467 
 468         fmuld   %f60,%f22,%f12          ! (2_1) dtmp2 *= y_lo;
 469 
 470         fmovdg  %fcc1,%f40,%f4          ! (0_0) if ( dmax < y ) dmax = y;
 471 
 472         faddd   %f6,D2ON28,%f56         ! (3_1) x_hi = x + D2ON28;
 473         fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);
 474 
 475         faddd   %f58,D2ON28,%f28        ! (3_1) y_hi = y + D2ON28;
 476         fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);
 477 
 478         faddd   %f2,%f44,%f30           ! (2_1) res += dtmp0;
 479         fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);
 480 
 481         faddd   %f10,%f12,%f26          ! (2_1) dtmp1 += dtmp2;
 482         fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);
 483 
 484         fand    %f4,DC0,%f16            ! (0_0) dmax = vis_fand(dmax,DC0);
 485 
 486         or      %o3,%o0,%o3             ! (2_0) c0 |= c2;
 487         fsubd   %f56,D2ON28,%f18        ! (3_1) x_hi -= D2ON28;
 488 
 489         andcc   %o3,2,%g0               ! (2_0) c0 & 2
 490         bnz,pn  %icc,.update8           ! (2_0) if ( (c0 & 2) != 0 )
 491         fsubd   %f28,D2ON28,%f4         ! (3_1) y_hi -= D2ON28;
 492 .cont8:
 493         and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
 494         faddd   %f30,%f26,%f12          ! (2_1) res += dtmp1;
 495 
 496         add     %i3,stridey,%l4         ! py += stridey
 497         andcc   %o4,2,%g0               ! (2_0) c1 & 2
 498         bnz,pn  %icc,.update9           ! (2_0) if ( (c1 & 2) != 0 )
 499         fmovd   %f36,%f56               ! (1_0) dmax = x;
 500 .cont9:
 501         lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
 502         add     %o1,stridex,%l2         ! px += stridex
 503         fpsub32 DC1,%f16,%f44           ! (0_0) dnorm = vis_fpsub32(DC1,dmax);
 504 
 505         fmuld   %f18,%f18,%f60          ! (3_1) res = x_hi * x_hi;
 506         lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
 507         faddd   %f58,%f4,%f32           ! (3_1) dtmp2 = y + y_hi;
 508 
 509         fsqrtd  %f12,%f12               ! (2_1) res = sqrt(res);
 510         faddd   %f6,%f18,%f28           ! (3_1) dtmp1 = x + x_hi;
 511 
 512         cmp     counter,4
 513         bl,pn   %icc,.tail
 514         nop
 515 
 516         ba      .main_loop
 517         sub     counter,4,counter
 518 
 519         .align  16
 520 .main_loop:
 521         fmuld   %f20,%f44,%f2           ! (0_1) x *= dnorm;
 522         fsubd   %f6,%f18,%f20           ! (3_2) x_lo = x - x_hi;
 523         lda     [%l2]%asi,%f18          ! (3_1) ((float*)&x)[0] = ((float*)px)[0];
 524 
 525         fmuld   %f4,%f4,%f22            ! (3_2) dtmp0 = y_hi * y_hi;
 526         lda     [%l2+4]%asi,%f19        ! (3_1) ((float*)&x)[1] = ((float*)px)[1];
 527         fsubd   %f58,%f4,%f58           ! (3_2) y_lo = y - y_hi;
 528 
 529         fmuld   %f40,%f44,%f44          ! (0_1) y *= dnorm;
 530         fabsd   %f30,%f30               ! (3_1) y = fabs(y);
 531 
 532         fmuld   %f38,%f24,%f10          ! (1_2) res = dmax * res;
 533         fabsd   %f18,%f18               ! (3_1) x = fabs(x);
 534         st      %f10,[%i5]              ! (1_2) ((float*)pz)[0] = ((float*)&res)[0];
 535 
 536         fmuld   %f28,%f20,%f28          ! (3_2) dtmp1 *= x_lo;
 537         st      %f11,[%i5+4]            ! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
 538         fcmped  %fcc2,%f54,%f56         ! (1_1) dmax ? y
 539 
 540         fmuld   %f32,%f58,%f24          ! (3_2) dtmp2 *= y_lo;
 541 
 542         fmovdg  %fcc2,%f54,%f56         ! (1_1) if ( dmax < y ) dmax = y;
 543 
 544         faddd   %f2,D2ON28,%f10         ! (0_1) x_hi = x + D2ON28;
 545         fcmple32        DC1,%f18,%o3    ! (3_1) c0 = vis_fcmple32(DC1,x);
 546 
 547         faddd   %f44,D2ON28,%f20        ! (0_1) y_hi = y + D2ON28;
 548         fcmple32        DC1,%f30,%o0    ! (3_1) c2 = vis_fcmple32(DC1,y);
 549 
 550         faddd   %f60,%f22,%f22          ! (3_2) res += dtmp0;
 551         fcmpgt32        DC2,%f18,%o4    ! (3_1) c1 = vis_fcmpgt32(DC2,x);
 552 
 553         faddd   %f28,%f24,%f26          ! (3_2) dtmp1 += dtmp2;
 554         fcmpgt32        DC2,%f30,%o1    ! (3_1) c3 = vis_fcmpgt32(DC2,y);
 555 
 556         fand    %f56,DC0,%f38           ! (1_1) dmax = vis_fand(dmax,DC0);
 557 
 558         or      %o3,%o0,%o3             ! (3_1) c0 |= c2;
 559         fsubd   %f10,D2ON28,%f58        ! (0_1) x_hi -= D2ON28;
 560 
 561         andcc   %o3,2,%g0               ! (3_1) c0 & 2
 562         bnz,pn  %icc,.update10          ! (3_1) if ( (c0 & 2) != 0 )
 563         fsubd   %f20,D2ON28,%f56        ! (0_1) y_hi -= D2ON28;
 564 .cont10:
 565         faddd   %f22,%f26,%f28          ! (3_2) res += dtmp1;
 566         and     %o4,%o1,%o4             ! (3_1) c1 &= c3;
 567 
 568         add     %l4,stridey,%i3         ! py += stridey
 569         andcc   %o4,2,%g0               ! (3_1) c1 & 2
 570         bnz,pn  %icc,.update11          ! (3_1) if ( (c1 & 2) != 0 )
 571         fmovd   %f50,%f32               ! (2_1) dmax = x;
 572 .cont11:
 573         fpsub32 DC1,%f38,%f10           ! (1_1) dnorm = vis_fpsub32(DC1,dmax);
 574         add     %l2,stridex,%l1         ! px += stridex
 575         lda     [%i3]%asi,%f20          ! (0_0) ((float*)&y)[0] = ((float*)py)[0];
 576 
 577         fmuld   %f58,%f58,%f6           ! (0_1) res = x_hi * x_hi;
 578         lda     [%i3+4]%asi,%f21        ! (0_0) ((float*)&y)[1] = ((float*)py)[1];
 579         add     %i5,stridez,%l6         ! pz += stridez
 580         faddd   %f44,%f56,%f60          ! (0_1) dtmp2 = y + y_hi;
 581 
 582         fsqrtd  %f28,%f4                ! (3_2) res = sqrt(res);
 583         lda     [%l1]%asi,%f22          ! (0_0) ((float*)&x)[0] = ((float*)px)[0];
 584         faddd   %f2,%f58,%f24           ! (0_1) dtmp1 = x + x_hi;
 585 
 586         fmuld   %f36,%f10,%f36          ! (1_1) x *= dnorm;
 587         fsubd   %f2,%f58,%f26           ! (0_1) x_lo = x - x_hi;
 588         lda     [%l1+4]%asi,%f23        ! (0_0) ((float*)&x)[1] = ((float*)px)[1];
 589 
 590         fmuld   %f56,%f56,%f28          ! (0_1) dtmp0 = y_hi * y_hi;
 591         fsubd   %f44,%f56,%f44          ! (0_1) y_lo = y - y_hi;
 592 
 593         fmuld   %f54,%f10,%f56          ! (1_1) y *= dnorm;
 594         fabsd   %f20,%f40               ! (0_0) y = fabs(y);
 595 
 596         fmuld   %f52,%f12,%f12          ! (2_2) res = dmax * res;
 597         fabsd   %f22,%f20               ! (0_0) x = fabs(x);
 598         st      %f12,[%l6]              ! (2_2) ((float*)pz)[0] = ((float*)&res)[0];
 599 
 600         fmuld   %f24,%f26,%f10          ! (0_1) dtmp1 *= x_lo;
 601         st      %f13,[%l6+4]            ! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
 602         fcmped  %fcc3,%f34,%f32         ! (2_1) dmax ? y
 603 
 604         fmuld   %f60,%f44,%f12          ! (0_1) dtmp2 *= y_lo;
 605 
 606         fmovdg  %fcc3,%f34,%f32         ! (2_1) if ( dmax < y ) dmax = y;
 607 
 608         faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
 609         fcmple32        DC1,%f20,%g5    ! (0_0) c0 = vis_fcmple32(DC1,x);
 610 
 611         faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
 612         fcmple32        DC1,%f40,%o2    ! (0_0) c2 = vis_fcmple32(DC1,y);
 613 
 614         faddd   %f6,%f28,%f24           ! (0_1) res += dtmp0;
 615         fcmpgt32        DC2,%f20,%g1    ! (0_0) c1 = vis_fcmpgt32(DC2,x);
 616 
 617         faddd   %f10,%f12,%f26          ! (0_1) dtmp1 += dtmp2;
 618         fcmpgt32        DC2,%f40,%o4    ! (0_0) c3 = vis_fcmpgt32(DC2,y);
 619 
 620         fand    %f32,DC0,%f52           ! (2_1) dmax = vis_fand(dmax,DC0);
 621 
 622         or      %g5,%o2,%g5             ! (0_0) c0 |= c2;
 623         fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;
 624 
 625         andcc   %g5,2,%g0               ! (0_0) c0 & 2
 626         bnz,pn  %icc,.update12          ! (0_0) if ( (c0 & 2) != 0 )
 627         fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
 628 .cont12:
 629         and     %g1,%o4,%g1             ! (0_0) c1 &= c3;
 630         faddd   %f24,%f26,%f12          ! (0_1) res += dtmp1;
 631 
 632         add     %i3,stridey,%l2         ! py += stridey
 633         andcc   %g1,2,%g0               ! (0_0) c1 & 2
 634         bnz,pn  %icc,.update13          ! (0_0) if ( (c1 & 2) != 0 )
 635         fmovd   %f18,%f44               ! (3_1) dmax = x;
 636 .cont13:
 637         fpsub32 DC1,%f52,%f10           ! (2_1) dnorm = vis_fpsub32(DC1,dmax);
 638         add     %l1,stridex,%l7         ! px += stridex
 639         lda     [%l2]%asi,%f24          ! (1_0) ((float*)&y)[0] = ((float*)py)[0];
 640 
 641         fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
 642         add     %l6,stridez,%i5         ! pz += stridez
 643         lda     [%l2+4]%asi,%f25        ! (1_0) ((float*)&y)[1] = ((float*)py)[1];
 644         faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;
 645 
 646         fsqrtd  %f12,%f12               ! (0_1) res = sqrt(res);
 647         lda     [%l7]%asi,%f26          ! (1_0) ((float*)&x)[0] = ((float*)px)[0];
 648         faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;
 649 
 650         fmuld   %f50,%f10,%f50          ! (2_1) x *= dnorm;
 651         fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
 652         lda     [%l7+4]%asi,%f27        ! (1_0) ((float*)&x)[1] = ((float*)px)[1];
 653 
 654         fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
 655         fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;
 656 
 657         fmuld   %f34,%f10,%f34          ! (2_1) y *= dnorm;
 658         fabsd   %f24,%f54               ! (1_0) y = fabs(y);
 659 
 660         fmuld   %f14,%f4,%f14           ! (3_2) res = dmax * res;
 661         fabsd   %f26,%f36               ! (1_0) x = fabs(x);
 662         st      %f14,[%i5]              ! (3_2) ((float*)pz)[0] = ((float*)&res)[0];
 663 
 664         fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
 665         st      %f15,[%i5+4]            ! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
 666         fcmped  %fcc0,%f30,%f44         ! (3_1) dmax ? y
 667 
 668         fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;
 669 
 670         fmovdg  %fcc0,%f30,%f44         ! (3_1) if ( dmax < y ) dmax = y;
 671 
 672         faddd   %f50,D2ON28,%f58        ! (2_1) x_hi = x + D2ON28;
 673         fcmple32        DC1,%f36,%g1    ! (1_0) c0 = vis_fcmple32(DC1,x);
 674 
 675         faddd   %f34,D2ON28,%f22        ! (2_1) y_hi = y + D2ON28;
 676         fcmple32        DC1,%f54,%g5    ! (1_0) c2 = vis_fcmple32(DC1,y);
 677 
 678         faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
 679         fcmpgt32        DC2,%f36,%o5    ! (1_0) c1 = vis_fcmpgt32(DC2,x);
 680 
 681         faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
 682         fcmpgt32        DC2,%f54,%o1    ! (1_0) c3 = vis_fcmpgt32(DC2,y);
 683 
 684         fand    %f44,DC0,%f14           ! (3_1) dmax = vis_fand(dmax,DC0);
 685 
 686         or      %g1,%g5,%g1             ! (1_0) c0 |= c2;
 687         fsubd   %f58,D2ON28,%f44        ! (2_1) x_hi -= D2ON28;
 688 
 689         andcc   %g1,2,%g0               ! (1_0) c0 & 2
 690         bnz,pn  %icc,.update14          ! (1_0) if ( (c0 & 2) != 0 )
 691         fsubd   %f22,D2ON28,%f58        ! (2_1) y_hi -= D2ON28;
 692 .cont14:
 693         and     %o5,%o1,%o5             ! (1_0) c1 &= c3;
 694         faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;
 695 
 696         add     %l2,stridey,%i3         ! py += stridey
 697         andcc   %o5,2,%g0               ! (1_0) c1 & 2
 698         bnz,pn  %icc,.update15          ! (1_0) if ( (c1 & 2) != 0 )
 699         fmovd   %f20,%f4                ! (0_0) dmax = x;
 700 .cont15:
 701         fpsub32 DC1,%f14,%f10           ! (3_1) dnorm = vis_fpsub32(DC1,dmax);
 702         add     %l7,stridex,%o1         ! px += stridex
 703         lda     [%i3]%asi,%f28          ! (2_0) ((float*)&y)[0] = ((float*)py)[0];
 704 
 705         fmuld   %f44,%f44,%f2           ! (2_1) res = x_hi * x_hi;
 706         add     %i5,stridez,%g5         ! pz += stridez
 707         lda     [%i3+4]%asi,%f29        ! (2_0) ((float*)&y)[1] = ((float*)py)[1];
 708         faddd   %f34,%f58,%f60          ! (2_1) dtmp2 = y + y_hi;
 709 
 710         fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
 711         lda     [%o1]%asi,%f26          ! (2_0) ((float*)&x)[0] = ((float*)px)[0];
 712         faddd   %f50,%f44,%f56          ! (2_1) dtmp1 = x + x_hi;
 713 
 714         fmuld   %f18,%f10,%f6           ! (3_1) x *= dnorm;
 715         fsubd   %f50,%f44,%f18          ! (2_1) x_lo = x - x_hi;
 716         lda     [%o1+4]%asi,%f27        ! (2_0) ((float*)&x)[1] = ((float*)px)[1];
 717 
 718         fmuld   %f58,%f58,%f44          ! (2_1) dtmp0 = y_hi * y_hi;
 719         fsubd   %f34,%f58,%f22          ! (2_1) y_lo = y - y_hi;
 720 
 721         fmuld   %f30,%f10,%f58          ! (3_1) y *= dnorm;
 722         fabsd   %f28,%f34               ! (2_0) y = fabs(y);
 723 
 724         fmuld   %f16,%f12,%f16          ! (0_1) res = dmax * res;
 725         fabsd   %f26,%f50               ! (2_0) x = fabs(x);
 726         st      %f16,[%g5]              ! (0_1) ((float*)pz)[0] = ((float*)&res)[0];
 727 
 728         fmuld   %f56,%f18,%f10          ! (2_1) dtmp1 *= x_lo;
 729         st      %f17,[%g5+4]            ! (0_1) ((float*)pz)[1] = ((float*)&res)[1];
 730         fcmped  %fcc1,%f40,%f4          ! (0_0) dmax ? y
 731 
 732         fmuld   %f60,%f22,%f12          ! (2_1) dtmp2 *= y_lo;
 733 
 734         fmovdg  %fcc1,%f40,%f4          ! (0_0) if ( dmax < y ) dmax = y;
 735 
 736         faddd   %f6,D2ON28,%f56         ! (3_1) x_hi = x + D2ON28;
 737         fcmple32        DC1,%f50,%o3    ! (2_0) c0 = vis_fcmple32(DC1,x);
 738 
 739         faddd   %f58,D2ON28,%f28        ! (3_1) y_hi = y + D2ON28;
 740         fcmple32        DC1,%f34,%o0    ! (2_0) c2 = vis_fcmple32(DC1,y);
 741 
 742         faddd   %f2,%f44,%f30           ! (2_1) res += dtmp0;
 743         fcmpgt32        DC2,%f50,%o4    ! (2_0) c1 = vis_fcmpgt32(DC2,x);
 744 
 745         faddd   %f10,%f12,%f26          ! (2_1) dtmp1 += dtmp2;
 746         fcmpgt32        DC2,%f34,%o5    ! (2_0) c3 = vis_fcmpgt32(DC2,y);
 747 
 748         fand    %f4,DC0,%f16            ! (0_0) dmax = vis_fand(dmax,DC0);
 749 
 750         or      %o3,%o0,%o3             ! (2_0) c0 |= c2;
 751         fsubd   %f56,D2ON28,%f18        ! (3_1) x_hi -= D2ON28;
 752 
 753         andcc   %o3,2,%g0               ! (2_0) c0 & 2
 754         bnz,pn  %icc,.update16          ! (2_0) if ( (c0 & 2) != 0 )
 755         fsubd   %f28,D2ON28,%f4         ! (3_1) y_hi -= D2ON28;
 756 .cont16:
 757         and     %o4,%o5,%o4             ! (2_0) c1 &= c3;
 758         faddd   %f30,%f26,%f12          ! (2_1) res += dtmp1;
 759 
 760         add     %i3,stridey,%l4         ! py += stridey
 761         andcc   %o4,2,%g0               ! (2_0) c1 & 2
 762         bnz,pn  %icc,.update17          ! (2_0) if ( (c1 & 2) != 0 )
 763         fmovd   %f36,%f56               ! (1_0) dmax = x;
 764 .cont17:
 765         lda     [%l4]%asi,%f30          ! (3_0) ((float*)&y)[0] = ((float*)py)[0];
 766         add     %o1,stridex,%l2         ! px += stridex
 767         fpsub32 DC1,%f16,%f44           ! (0_0) dnorm = vis_fpsub32(DC1,dmax);
 768 
 769         fmuld   %f18,%f18,%f60          ! (3_1) res = x_hi * x_hi;
 770         add     %g5,stridez,%i5         ! pz += stridez
 771         lda     [%l4+4]%asi,%f31        ! (3_0) ((float*)&y)[1] = ((float*)py)[1];
 772         faddd   %f58,%f4,%f32           ! (3_1) dtmp2 = y + y_hi;
 773 
 774         fsqrtd  %f12,%f12               ! (2_1) res = sqrt(res);
 775         subcc   counter,4,counter       ! counter -= 4;
 776         bpos,pt %icc,.main_loop
 777         faddd   %f6,%f18,%f28           ! (3_1) dtmp1 = x + x_hi;
 778 
 779         add     counter,4,counter
 780 
 781 .tail:
     /*
      * .tail: finish the results that are still in flight once the main
      * loop stops iterating (the loop adds 4 back to counter immediately
      * before this label).  The code is a sequence of up to three stages.
      * Each stage decrements counter and leaves for .begin if it went
      * negative; otherwise it completes one pending result and stores it
      * as two 32-bit halves through the current pz pointer.
      */
 782         subcc   counter,1,counter
 783         bneg,a  .begin
 784         nop
 785
 786         fsubd   %f6,%f18,%f20           ! (3_2) x_lo = x - x_hi;
 787
 788         fmuld   %f4,%f4,%f22            ! (3_2) dtmp0 = y_hi * y_hi;
 789         fsubd   %f58,%f4,%f58           ! (3_2) y_lo = y - y_hi;
 790
 791         fmuld   %f38,%f24,%f10          ! (1_2) res = dmax * res;
 792         st      %f10,[%i5]              ! (1_2) ((float*)pz)[0] = ((float*)&res)[0];
 793
 794         st      %f11,[%i5+4]            ! (1_2) ((float*)pz)[1] = ((float*)&res)[1];
 795
     /*
      * Annulled branch: the pz advance in the delay slot is executed only
      * when the branch to .begin is taken.  On fall-through %i5 is left
      * unchanged and the advanced pointer is formed in %l6 further down.
      */
 796         subcc   counter,1,counter
 797         bneg,a  .begin
 798         add     %i5,stridez,%i5
 799
 800         fmuld   %f28,%f20,%f28          ! (3_2) dtmp1 *= x_lo;
 801
 802         fmuld   %f32,%f58,%f24          ! (3_2) dtmp2 *= y_lo;
 803
 804         faddd   %f60,%f22,%f22          ! (3_2) res += dtmp0;
 805
 806         faddd   %f28,%f24,%f26          ! (3_2) dtmp1 += dtmp2;
 807
 808         faddd   %f22,%f26,%f28          ! (3_2) res += dtmp1;
 809
 810         add     %i5,stridez,%l6         ! pz += stridez
 811
 812         fsqrtd  %f28,%f4                ! (3_2) res = sqrt(res);
 813         add     %l2,stridex,%l1         ! px += stridex
 814
 815         fmuld   %f52,%f12,%f12          ! (2_2) res = dmax * res;
 816         st      %f12,[%l6]              ! (2_2) ((float*)pz)[0] = ((float*)&res)[0];
 817
 818         st      %f13,[%l6+4]            ! (2_2) ((float*)pz)[1] = ((float*)&res)[1];
 819
     /*
      * Non-annulled branch: the delay slot runs on both paths, so %i5 is
      * %l6 + stridez whether control goes to .begin or falls through to
      * the last store.
      */
 820         subcc   counter,1,counter
 821         bneg    .begin
 822         add     %l6,stridez,%i5
 823
 824         fmuld   %f14,%f4,%f14           ! (3_2) res = dmax * res;
 825         st      %f14,[%i5]              ! (3_2) ((float*)pz)[0] = ((float*)&res)[0];
 826
 827         st      %f15,[%i5+4]            ! (3_2) ((float*)pz)[1] = ((float*)&res)[1];
 828
     /* Last pending result written; step pz past it in the delay slot. */
 829         ba      .begin
 830         add     %i5,stridez,%i5
 831 
 832         .align  16
 833 .spec0:
     /*
      * .spec0: scalar path that produces one result, advances px (%i1),
      * py (%i3) and pz (%i5) by their strides, decrements counter in the
      * delay slot of the final branch, and resumes at .begin1.
      *
      * Register roles below are taken from the inline comments; they are
      * set up before this label, outside this view: %o2 = j0,
      * %o4 = 0x7ff00000, %o0 = hx, %l4 = hy, %f26 = x, %f24 = y.
      *
      * Paths:
      *   j0 >= 0x7ff00000            -> first "1:" (Inf/NaN handling)
      *   |hx - hy| >= 0x03600000     -> store x + y
      *   otherwise                   -> scale by DC2, hi/lo split with
      *                                  D2ON28, sqrt, rescale by DC3
      * NOTE(review): if DC2/DC3 hold the .CONST_TBL entries of the same
      * name (0x00100000,0 and 0x7fd00000,0) the scale factors are 2^-1022
      * and 2^1022 -- the loads are not visible here, confirm.
      */
 834         ld      [%i1+4],%l1             ! lx = ((int*)px)[1];
 835         cmp     %o2,%o4                 ! j0 ? 0x7ff00000
 836         bge,pn  %icc,1f                 ! if ( j0 >= 0x7ff00000 )
 837         fabsd   %f26,%f26               ! x = fabs(x);
 838
     /*
      * The subtraction is %o0 - %l4, i.e. hx - hy by the register roles
      * used on the "1:" path, not hy - hx as the inline comment says.
      * Only the absolute value is used, so the result is the same.
      */
 839         sub     %o0,%l4,%o0             ! diff = hy - hx;
 840         fabsd   %f24,%f24               ! y = fabs(y);
 841
 842         sra     %o0,31,%l4              ! j0 = diff >> 31;
 843
 844         xor     %o0,%l4,%o0             ! diff ^ j0
 845
     /* %l1 (lx) is reused as a scratch constant; lx is only needed at "1:". */
 846         sethi   %hi(0x03600000),%l1
 847         sub     %o0,%l4,%o0             ! (diff ^ j0) - j0
 848
     /*
      * Annulled branch: the faddd in the delay slot runs only when the
      * branch is taken, so %f24 becomes x + y only on that path.
      */
 849         cmp     %o0,%l1                 ! ((diff ^ j0) - j0) ? 0x03600000
 850         bge,a,pn        %icc,2f         ! if ( ((diff ^ j0) - j0) >= 0x03600000 )
 851         faddd   %f26,%f24,%f24          ! *pz = x + y
 852
 853         fmuld   %f26,DC2,%f36           ! (1_1) x *= dnorm;
 854
 855         fmuld   %f24,DC2,%f56           ! (1_1) y *= dnorm;
 856
 857         faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
 858
 859         faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
 860
 861         fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;
 862
 863         fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
 864
 865         fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
 866         faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;
 867
 868         faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;
 869
 870         fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
 871
 872         fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
 873         fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;
 874
 875         fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
 876
 877         fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;
 878
 879         faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
 880
 881         faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
 882
 883         faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;
 884
 885         fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
 886
 887         fmuld   DC3,%f24,%f24           ! (1_2) res = dmax * res;
     /* Common store for the x + y shortcut and the scaled-sqrt result. */
 888 2:
 889         add     %i3,stridey,%i3
 890         add     %i1,stridex,%i1
 891         st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];
 892         st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];
 893
 894         add     %i5,stridez,%i5
 895         ba      .begin1
 896         sub     counter,1,counter
 897
     /*
      * j0 >= 0x7ff00000.  If hx == 0x7ff00000 with lx == 0, or
      * hy == 0x7ff00000 with ly == 0, go to the final "2:" which stores
      * the DC0 words; otherwise fall to the last "1:" and store x * y.
      */
 898 1:
 899         ld      [%i3+4],%l2             ! ly = ((int*)py)[1];
 900         cmp     %o0,%o4                 ! hx ? 0x7ff00000
 901         bne,pn  %icc,1f                 ! if ( hx != 0x7ff00000 )
 902         fabsd   %f24,%f24               ! y = fabs(y);
 903
 904         cmp     %l1,0                   ! lx ? 0
 905         be,pn   %icc,2f                 ! if ( lx == 0 )
 906         nop
 907 1:
 908         cmp     %l4,%o4                 ! hy ? 0x7ff00000
 909         bne,pn  %icc,1f                 ! if ( hy != 0x7ff00000 )
 910         nop
 911
 912         cmp     %l2,0                   ! ly ? 0
 913         be,pn   %icc,2f                 ! if ( ly == 0 )
 914         nop
 915 1:
 916         add     %i3,stridey,%i3
 917         add     %i1,stridex,%i1
 918         fmuld   %f26,%f24,%f24          ! res = x * y;
 919         st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];
 920
 921         st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];
 922
 923         add     %i5,stridez,%i5
 924         ba      .begin1
 925         sub     counter,1,counter
 926
     /*
      * Store the two DC0 words as the result.  The condition code written
      * by the fcmpd is not read anywhere in this block.
      * NOTE(review): presumably the compare is kept only for its
      * floating-point exception side effect -- confirm.
      */
 927 2:
 928         add     %i1,stridex,%i1
 929         add     %i3,stridey,%i3
 930         st      DC0_HI,[%i5]            ! ((int*)pz)[0] = 0x7ff00000;
 931         st      DC0_LO,[%i5+4]          ! ((int*)pz)[1] = 0;
 932         fcmpd   %f26,%f24               ! x ? y
 933
 934         add     %i5,stridez,%i5
 935         ba      .begin1
 936         sub     counter,1,counter
 937 
 938         .align  16
 939 .spec1:
     /*
      * .spec1: scalar path that computes one result from x (%f26) and
      * y (%f24) with no range checks of its own, stores it through pz
      * (%i5), advances px/py/pz by their strides, decrements counter in
      * the delay slot of the final branch, and resumes at .begin1.
      *
      * The arithmetic is the same hi/lo-split sum of squares used in
      * .spec0, but with the scale constants swapped: the inputs are
      * multiplied by DC3 and the square root is multiplied by DC2.
      * NOTE(review): if DC3/DC2 hold the .CONST_TBL entries of the same
      * name (0x7fd00000,0 and 0x00100000,0) this scales up by 2^1022 and
      * back down by 2^-1022 -- the loads are not visible here, confirm.
      */
 940         fmuld   %f26,DC3,%f36           ! (1_1) x *= dnorm;
 941
 942         fmuld   %f24,DC3,%f56           ! (1_1) y *= dnorm;
 943
     /* Adding then subtracting D2ON28 leaves the rounded high part. */
 944         faddd   %f36,D2ON28,%f58        ! (1_1) x_hi = x + D2ON28;
 945
 946         faddd   %f56,D2ON28,%f22        ! (1_1) y_hi = y + D2ON28;
 947
 948         fsubd   %f58,D2ON28,%f58        ! (1_1) x_hi -= D2ON28;
 949
 950         fsubd   %f22,D2ON28,%f22        ! (1_1) y_hi -= D2ON28;
 951
 952         fmuld   %f58,%f58,%f60          ! (1_1) res = x_hi * x_hi;
 953         faddd   %f56,%f22,%f28          ! (1_1) dtmp2 = y + y_hi;
 954
 955         faddd   %f36,%f58,%f6           ! (1_1) dtmp1 = x + x_hi;
 956
 957         fsubd   %f36,%f58,%f58          ! (1_1) x_lo = x - x_hi;
 958
 959         fmuld   %f22,%f22,%f2           ! (1_1) dtmp0 = y_hi * y_hi;
 960         fsubd   %f56,%f22,%f56          ! (1_1) y_lo = y - y_hi;
 961
 962         fmuld   %f6,%f58,%f10           ! (1_1) dtmp1 *= x_lo;
 963
 964         fmuld   %f28,%f56,%f26          ! (1_1) dtmp2 *= y_lo;
 965
 966         faddd   %f60,%f2,%f24           ! (1_1) res += dtmp0;
 967
 968         faddd   %f10,%f26,%f28          ! (1_1) dtmp1 += dtmp2;
 969
 970         faddd   %f24,%f28,%f26          ! (1_1) res += dtmp1;
 971
 972         fsqrtd  %f26,%f24               ! (1_1) res = sqrt(res);
 973
 974         fmuld   DC2,%f24,%f24           ! (1_2) res = dmax * res;
 975
 976         add     %i3,stridey,%i3
 977         add     %i1,stridex,%i1
 978         st      %f24,[%i5]              ! ((float*)pz)[0] = ((float*)&res)[0];
 979
 980         st      %f25,[%i5+4]            ! ((float*)pz)[1] = ((float*)&res)[1];
 981         add     %i5,stridez,%i5
 982         ba      .begin1
 983         sub     counter,1,counter
 984 
 985         .align  16
 986 .update0:
     /*
      * .update0: out-of-line fix-up that resumes at .cont0.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 1 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 1 and sets
      * counter = 1 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
 987         fzero   %f50
 988         cmp     counter,1
 989         ble     .cont0
 990         fzero   %f34
 991
 992         mov     %o1,tmp_px
 993         mov     %i3,tmp_py
 994
 995         sub     counter,1,tmp_counter
 996         ba      .cont0
 997         mov     1,counter
 998 
 999         .align  16
1000 .update1:
     /*
      * .update1: out-of-line fix-up that resumes at .cont1.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 1 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 1 and sets
      * counter = 1 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1001         fzero   %f50
1002         cmp     counter,1
1003         ble     .cont1
1004         fzero   %f34
1005
1006         mov     %o1,tmp_px
1007         mov     %i3,tmp_py
1008
1009         sub     counter,1,tmp_counter
1010         ba      .cont1
1011         mov     1,counter
1012 
1013         .align  16
1014 .update2:
     /*
      * .update2: out-of-line fix-up that resumes at .cont2.
      * Always clears %f18 and %f30 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 2 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l2, tmp_py = %l4, tmp_counter = counter - 2 and sets
      * counter = 2 in the delay slot of the ba.
      *
      * Both exits return to .cont2.  The unconditional exit previously
      * targeted .cont1, which disagreed with the early exit of this same
      * handler and with every other .updateN handler (each resumes at its
      * own .contN) and would have re-entered the pipeline at an earlier
      * point than the one that branched here.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1015         fzero   %f18
1016         cmp     counter,2
1017         ble     .cont2
1018         fzero   %f30
1019
1020         mov     %l2,tmp_px
1021         mov     %l4,tmp_py
1022
1023         sub     counter,2,tmp_counter
1024         ba      .cont2
1025         mov     2,counter
1026 
1027         .align  16
1028 .update3:
     /*
      * .update3: out-of-line fix-up that resumes at .cont3.
      * Always clears %f18 and %f30 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 2 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l2, tmp_py = %l4, tmp_counter = counter - 2 and sets
      * counter = 2 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1029         fzero   %f18
1030         cmp     counter,2
1031         ble     .cont3
1032         fzero   %f30
1033
1034         mov     %l2,tmp_px
1035         mov     %l4,tmp_py
1036
1037         sub     counter,2,tmp_counter
1038         ba      .cont3
1039         mov     2,counter
1040 
1041         .align  16
1042 .update4:
     /*
      * .update4: out-of-line fix-up that resumes at .cont4.
      * Always clears %f20 and %f40 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 3 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l1, tmp_py = %i3, tmp_counter = counter - 3 and sets
      * counter = 3 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1043         fzero   %f20
1044         cmp     counter,3
1045         ble     .cont4
1046         fzero   %f40
1047
1048         mov     %l1,tmp_px
1049         mov     %i3,tmp_py
1050
1051         sub     counter,3,tmp_counter
1052         ba      .cont4
1053         mov     3,counter
1054 
1055         .align  16
1056 .update5:
     /*
      * .update5: out-of-line fix-up that resumes at .cont5.
      * Always clears %f20 and %f40 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 3 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l1, tmp_py = %i3, tmp_counter = counter - 3 and sets
      * counter = 3 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1057         fzero   %f20
1058         cmp     counter,3
1059         ble     .cont5
1060         fzero   %f40
1061
1062         mov     %l1,tmp_px
1063         mov     %i3,tmp_py
1064
1065         sub     counter,3,tmp_counter
1066         ba      .cont5
1067         mov     3,counter
1068 
1069         .align  16
1070 .update6:
     /*
      * .update6: out-of-line fix-up that resumes at .cont6.
      * Always clears %f36 and %f54 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 4 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l7, tmp_py = %l2, tmp_counter = counter - 4 and sets
      * counter = 4 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1071         fzero   %f36
1072         cmp     counter,4
1073         ble     .cont6
1074         fzero   %f54
1075
1076         mov     %l7,tmp_px
1077         mov     %l2,tmp_py
1078
1079         sub     counter,4,tmp_counter
1080         ba      .cont6
1081         mov     4,counter
1082 
1083         .align  16
1084 .update7:
     /*
      * .update7: out-of-line fix-up that resumes at .cont7.
      * Always clears %f36 and %f54 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 4 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l7, tmp_py = %l2, tmp_counter = counter - 4 and sets
      * counter = 4 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1085         fzero   %f36
1086         cmp     counter,4
1087         ble     .cont7
1088         fzero   %f54
1089
1090         mov     %l7,tmp_px
1091         mov     %l2,tmp_py
1092
1093         sub     counter,4,tmp_counter
1094         ba      .cont7
1095         mov     4,counter
1096 
1097         .align  16
1098 .update8:
     /*
      * .update8: out-of-line fix-up that resumes at .cont8.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 5 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 5 and sets
      * counter = 5 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1099         fzero   %f50
1100         cmp     counter,5
1101         ble     .cont8
1102         fzero   %f34
1103
1104         mov     %o1,tmp_px
1105         mov     %i3,tmp_py
1106
1107         sub     counter,5,tmp_counter
1108         ba      .cont8
1109         mov     5,counter
1110 
1111         .align  16
1112 .update9:
     /*
      * .update9: out-of-line fix-up that resumes at .cont9.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 5 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 5 and sets
      * counter = 5 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1113         fzero   %f50
1114         cmp     counter,5
1115         ble     .cont9
1116         fzero   %f34
1117
1118         mov     %o1,tmp_px
1119         mov     %i3,tmp_py
1120
1121         sub     counter,5,tmp_counter
1122         ba      .cont9
1123         mov     5,counter
1124 
1125 
1126         .align  16
1127 .update10:
     /*
      * .update10: out-of-line fix-up that resumes at .cont10.
      * Always clears %f18 and %f30 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 2 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l2, tmp_py = %l4, tmp_counter = counter - 2 and sets
      * counter = 2 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1128         fzero   %f18
1129         cmp     counter,2
1130         ble     .cont10
1131         fzero   %f30
1132
1133         mov     %l2,tmp_px
1134         mov     %l4,tmp_py
1135
1136         sub     counter,2,tmp_counter
1137         ba      .cont10
1138         mov     2,counter
1139 
1140         .align  16
1141 .update11:
     /*
      * .update11: out-of-line fix-up that resumes at .cont11.
      * Always clears %f18 and %f30 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 2 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l2, tmp_py = %l4, tmp_counter = counter - 2 and sets
      * counter = 2 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1142         fzero   %f18
1143         cmp     counter,2
1144         ble     .cont11
1145         fzero   %f30
1146
1147         mov     %l2,tmp_px
1148         mov     %l4,tmp_py
1149
1150         sub     counter,2,tmp_counter
1151         ba      .cont11
1152         mov     2,counter
1153 
1154         .align  16
1155 .update12:
     /*
      * .update12: out-of-line fix-up that resumes at .cont12.
      * Always clears %f20 and %f40 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 3 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l1, tmp_py = %i3, tmp_counter = counter - 3 and sets
      * counter = 3 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1156         fzero   %f20
1157         cmp     counter,3
1158         ble     .cont12
1159         fzero   %f40
1160
1161         mov     %l1,tmp_px
1162         mov     %i3,tmp_py
1163
1164         sub     counter,3,tmp_counter
1165         ba      .cont12
1166         mov     3,counter
1167 
1168         .align  16
1169 .update13:
     /*
      * .update13: out-of-line fix-up that resumes at .cont13.
      * Always clears %f20 and %f40 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 3 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l1, tmp_py = %i3, tmp_counter = counter - 3 and sets
      * counter = 3 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1170         fzero   %f20
1171         cmp     counter,3
1172         ble     .cont13
1173         fzero   %f40
1174
1175         mov     %l1,tmp_px
1176         mov     %i3,tmp_py
1177
1178         sub     counter,3,tmp_counter
1179         ba      .cont13
1180         mov     3,counter
1181 
1182         .align  16
1183 .update14:
     /*
      * .update14: out-of-line fix-up that resumes at .cont14.
      * Always clears %f54 and %f36 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 4 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l7, tmp_py = %l2, tmp_counter = counter - 4 and sets
      * counter = 4 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1184         fzero   %f54
1185         cmp     counter,4
1186         ble     .cont14
1187         fzero   %f36
1188
1189         mov     %l7,tmp_px
1190         mov     %l2,tmp_py
1191
1192         sub     counter,4,tmp_counter
1193         ba      .cont14
1194         mov     4,counter
1195 
1196         .align  16
1197 .update15:
     /*
      * .update15: out-of-line fix-up that resumes at .cont15.
      * Always clears %f54 and %f36 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 4 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %l7, tmp_py = %l2, tmp_counter = counter - 4 and sets
      * counter = 4 in the delay slot of the ba.
      * NOTE(review): the branch into this handler is outside this view;
      * presumably it mirrors the visible .update16/.update17 sites, taken
      * when a VIS range compare flags the element -- confirm.
      */
1198         fzero   %f54
1199         cmp     counter,4
1200         ble     .cont15
1201         fzero   %f36
1202
1203         mov     %l7,tmp_px
1204         mov     %l2,tmp_py
1205
1206         sub     counter,4,tmp_counter
1207         ba      .cont15
1208         mov     4,counter
1209 
1210         .align  16
1211 .update16:
     /*
      * .update16: out-of-line fix-up that resumes at .cont16.
      * Entered from the main loop when (c0 & 2) != 0, where c0 is the OR
      * of fcmple32(DC1,x) and fcmple32(DC1,y) for the element whose
      * |x| is in %f50 and |y| in %f34.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 5 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 5 and sets
      * counter = 5 in the delay slot of the ba.
      * NOTE(review): presumably the saved pointers and count let the
      * flagged element be redone later on a special-case path -- the
      * consumer of tmp_px/tmp_py/tmp_counter is outside this view.
      */
1212         fzero   %f50
1213         cmp     counter,5
1214         ble     .cont16
1215         fzero   %f34
1216
1217         mov     %o1,tmp_px
1218         mov     %i3,tmp_py
1219
1220         sub     counter,5,tmp_counter
1221         ba      .cont16
1222         mov     5,counter
1223 
1224         .align  16
1225 .update17:
     /*
      * .update17: out-of-line fix-up that resumes at .cont17.
      * Entered from the main loop when (c1 & 2) != 0, where c1 is the AND
      * of fcmpgt32(DC2,x) and fcmpgt32(DC2,y) for the element whose
      * |x| is in %f50 and |y| in %f34.
      * Always clears %f50 and %f34 (the second fzero sits in the delay
      * slot of a non-annulled ble, so it runs on both paths).
      * If counter <= 5 (signed) nothing else changes.  Otherwise it saves
      * tmp_px = %o1, tmp_py = %i3, tmp_counter = counter - 5 and sets
      * counter = 5 in the delay slot of the ba.
      * NOTE(review): presumably the saved pointers and count let the
      * flagged element be redone later on a special-case path -- the
      * consumer of tmp_px/tmp_py/tmp_counter is outside this view.
      */
1226         fzero   %f50
1227         cmp     counter,5
1228         ble     .cont17
1229         fzero   %f34
1230
1231         mov     %o1,tmp_px
1232         mov     %i3,tmp_py
1233
1234         sub     counter,5,tmp_counter
1235         ba      .cont17
1236         mov     5,counter
1237 
1238         .align  16
1239 .exit:
     /*
      * .exit: return to the caller.  The restore executes in the delay
      * slot of ret and releases this routine's register window.
      * NOTE(review): SET_SIZE is a macro defined outside this view;
      * presumably it records the symbol size of __vhypot -- confirm.
      */
1240         ret
1241         restore
1242         SET_SIZE(__vhypot)
1243