1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24 /* 25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 .file "__vhypot.S" 30 31 #include "libm.h" 32 33 RO_DATA 34 .align 64 35 36 .CONST_TBL: 37 .word 0x7ff00000, 0 ! DC0 38 .word 0x7fe00000, 0 ! DC1 39 .word 0x00100000, 0 ! DC2 40 .word 0x41b00000, 0 ! D2ON28 = 268435456.0 41 .word 0x7fd00000, 0 ! DC3 42 43 #define counter %i0 44 #define tmp_counter %l3 45 #define tmp_px %l5 46 #define tmp_py %o7 47 #define stridex %i2 48 #define stridey %i4 49 #define stridez %l0 50 51 #define DC0 %f8 52 #define DC0_HI %f8 53 #define DC0_LO %f9 54 #define DC1 %f46 55 #define DC2 %f48 56 #define DC3 %f0 57 #define D2ON28 %f62 58 59 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 60 ! !!!!! algorithm !!!!! 61 ! ((float*)&x)[0] = ((float*)px)[0]; 62 ! ((float*)&x)[1] = ((float*)px)[1]; 63 ! 64 ! ((float*)&y)[0] = ((float*)py)[0]; 65 ! ((float*)&y)[1] = ((float*)py)[1]; 66 ! 67 ! x = fabs(x); 68 ! y = fabs(y); 69 ! 70 ! c0 = vis_fcmple32(DC1,x); 71 ! c2 = vis_fcmple32(DC1,y); 72 ! 
c1 = vis_fcmpgt32(DC2,x);
!		c3 = vis_fcmpgt32(DC2,y);
!
!		c0 |= c2;
!		c1 &= c3;
!		if ( (c0 & 2) != 0 )
!		{
!			lx = ((int*)px)[1];
!			ly = ((int*)py)[1];
!			hx = *(int*)px;
!			hy = *(int*)py;
!
!			hx &= 0x7fffffff;
!			hy &= 0x7fffffff;
!
!			j0 = hx;
!			if ( j0 < hy ) j0 = hy;
!			j0 &= 0x7ff00000;
!			if ( j0 >= 0x7ff00000 )
!			{
!				if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x;
!				else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y;
!				else res = x * y;
!
!				((float*)pz)[0] = ((float*)&res)[0];
!				((float*)pz)[1] = ((float*)&res)[1];
!			}
!			else
!			{
!				diff = hy - hx;
!				j0 = diff >> 31;
!				if ( ((diff ^ j0) - j0) < 0x03600000 )
!				{
!					x *= D2ONM1022;
!					y *= D2ONM1022;
!
!					x_hi = ( x + two28 ) - two28;
!					x_lo = x - x_hi;
!					y_hi = ( y + two28 ) - two28;
!					y_lo = y - y_hi;
!					res = (x_hi * x_hi + y_hi * y_hi);
!					res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
!
!					res = sqrt(res);
!
!					res = D2ONP1022 * res;
!					((float*)pz)[0] = ((float*)&res)[0];
!					((float*)pz)[1] = ((float*)&res)[1];
!				}
!				else
!				{
!					res = x + y;
!					((float*)pz)[0] = ((float*)&res)[0];
!					((float*)pz)[1] = ((float*)&res)[1];
!				}
!			}
!			px += stridex;
!			py += stridey;
!			pz += stridez;
!			continue;
!		}
!		if ( (c1 & 2) != 0 )
!		{
!			x *= D2ONP1022;
!			y *= D2ONP1022;
!
!			x_hi = ( x + two28 ) - two28;
!			x_lo = x - x_hi;
!			y_hi = ( y + two28 ) - two28;
!			y_lo = y - y_hi;
!			res = (x_hi * x_hi + y_hi * y_hi);
!			res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo);
!
!			res = sqrt(res);
!
!			res = D2ONM1022 * res;
!			((float*)pz)[0] = ((float*)&res)[0];
!			((float*)pz)[1] = ((float*)&res)[1];
!			px += stridex;
!			py += stridey;
!			pz += stridez;
!			continue;
!		}
!
!		dmax = x;
!		if ( dmax < y ) dmax = y;
!
!
dmax = vis_fand(dmax,DC0); 160 ! dnorm = vis_fpsub32(DC1,dmax); 161 ! 162 ! x *= dnorm; 163 ! y *= dnorm; 164 ! 165 ! x_hi = x + D2ON28; 166 ! x_hi -= D2ON28; 167 ! x_lo = x - x_hi; 168 ! 169 ! y_hi = y + D2ON28; 170 ! y_hi -= D2ON28; 171 ! y_lo = y - y_hi; 172 ! 173 ! res = x_hi * x_hi; 174 ! dtmp1 = x + x_hi; 175 ! dtmp0 = y_hi * y_hi; 176 ! dtmp2 = y + y_hi; 177 ! 178 ! res += dtmp0; 179 ! dtmp1 *= x_lo; 180 ! dtmp2 *= y_lo; 181 ! dtmp1 += dtmp2; 182 ! res += dtmp1; 183 ! 184 ! res = sqrt(res); 185 ! 186 ! res = dmax * res; 187 ! ((float*)pz)[0] = ((float*)&res)[0]; 188 ! ((float*)pz)[1] = ((float*)&res)[1]; 189 ! 190 ! px += stridex; 191 ! py += stridey; 192 ! pz += stridez; 193 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 194 195 ENTRY(__vhypot) 196 save %sp,-SA(MINFRAME),%sp 197 PIC_SETUP(l7) 198 PIC_SET(l7,.CONST_TBL,o3) 199 wr %g0,0x82,%asi 200 201 #ifdef __sparcv9 202 ldx [%fp+STACK_BIAS+176],%l0 203 #else 204 ld [%fp+STACK_BIAS+92],%l0 205 #endif 206 ldd [%o3],DC0 207 sll %i2,3,stridex 208 mov %i0,tmp_counter 209 210 ldd [%o3+8],DC1 211 sll %i4,3,stridey 212 mov %i1,tmp_px 213 214 ldd [%o3+16],DC2 215 sll %l0,3,stridez 216 mov %i3,tmp_py 217 218 ldd [%o3+24],D2ON28 219 220 ldd [%o3+32],DC3 221 222 .begin: 223 mov tmp_counter,counter 224 mov tmp_px,%i1 225 mov tmp_py,%i3 226 clr tmp_counter 227 .begin1: 228 cmp counter,0 229 ble,pn %icc,.exit 230 nop 231 232 lda [%i1]%asi,%o0 233 sethi %hi(0x7ffffc00),%o5 234 235 lda [%i3]%asi,%o2 236 add %o5,1023,%o5 237 238 lda [%i1]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 239 240 lda [%i1+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 241 add %i1,stridex,%o1 ! px += stridex 242 243 lda [%i3]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 244 sethi %hi(0x00100000),%l7 245 and %o0,%o5,%o0 246 247 lda [%i3+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 248 and %o2,%o5,%o2 249 sethi %hi(0x7fe00000),%l6 250 251 fabsd %f26,%f36 ! 
(1_0) x = fabs(x); 252 cmp %o0,%o2 253 mov %o2,%l4 254 255 fabsd %f24,%f54 ! (1_0) y = fabs(y); 256 add %i3,stridey,%o5 ! py += stridey 257 movg %icc,%o0,%o2 258 lda [%o5]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; 259 260 cmp %o2,%l6 261 sethi %hi(0x7ff00000),%o4 262 bge,pn %icc,.spec0 263 lda [%o5+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; 264 265 cmp %o2,%l7 266 bl,pn %icc,.spec1 267 nop 268 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 269 270 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 271 add %i3,stridey,%i3 ! py += stridey 272 273 fabsd %f28,%f34 ! (2_0) y = fabs(y); 274 275 fabsd %f26,%f50 ! (2_0) x = fabs(x); 276 277 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 278 279 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 280 281 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 282 283 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 284 285 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 286 287 andcc %o3,2,%g0 ! (2_0) c0 & 2 288 bnz,pn %icc,.update0 ! (2_0) if ( (c0 & 2) != 0 ) 289 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 290 .cont0: 291 add %i3,stridey,%l4 ! py += stridey 292 andcc %o4,2,%g0 ! (2_0) c1 & 2 293 bnz,pn %icc,.update1 ! (2_0) if ( (c1 & 2) != 0 ) 294 fmovd %f36,%f56 ! (1_0) dmax = x; 295 .cont1: 296 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 297 add %o1,stridex,%l2 ! px += stridex 298 299 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 300 301 lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; 302 303 lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; 304 305 fabsd %f30,%f30 ! (3_1) y = fabs(y); 306 307 fabsd %f18,%f18 ! (3_1) x = fabs(x); 308 309 fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y 310 311 fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; 312 313 fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); 314 315 fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); 316 317 fcmpgt32 DC2,%f18,%o4 ! 
(3_1) c1 = vis_fcmpgt32(DC2,x); 318 319 fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); 320 321 fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); 322 323 or %o3,%o0,%o3 ! (3_1) c0 |= c2; 324 325 andcc %o3,2,%g0 ! (3_1) c0 & 2 326 bnz,pn %icc,.update2 ! (3_1) if ( (c0 & 2) != 0 ) 327 and %o4,%o1,%o4 ! (3_1) c1 &= c3; 328 .cont2: 329 add %l4,stridey,%i3 ! py += stridey 330 andcc %o4,2,%g0 ! (3_1) c1 & 2 331 bnz,pn %icc,.update3 ! (3_1) if ( (c1 & 2) != 0 ) 332 fmovd %f50,%f32 ! (2_1) dmax = x; 333 .cont3: 334 fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); 335 lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; 336 337 lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; 338 339 add %l2,stridex,%l1 ! px += stridex 340 341 fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; 342 lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0] 343 344 lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; 345 346 fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; 347 fabsd %f20,%f40 ! (0_0) y = fabs(y); 348 349 fabsd %f22,%f20 ! (0_0) x = fabs(x); 350 351 fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y 352 353 354 fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; 355 356 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 357 fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); 358 359 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 360 fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); 361 362 fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); 363 364 fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); 365 366 fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); 367 368 or %g5,%o2,%g5 ! (0_0) c0 |= c2; 369 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 370 371 andcc %g5,2,%g0 ! (0_0) c0 & 2 372 bnz,pn %icc,.update4 ! (0_0) if ( (c0 & 2) != 0 ) 373 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 374 .cont4: 375 and %g1,%o4,%g1 ! (0_0) c1 &= c3; 376 377 add %i3,stridey,%l2 ! py += stridey 378 andcc %g1,2,%g0 ! 
(0_0) c1 & 2 379 bnz,pn %icc,.update5 ! (0_0) if ( (c1 & 2) != 0 ) 380 fmovd %f18,%f44 ! (3_1) dmax = x; 381 .cont5: 382 fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); 383 lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 384 385 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 386 lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 387 add %l1,stridex,%l7 ! px += stridex 388 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 389 390 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 391 lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 392 393 fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; 394 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 395 lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 396 397 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 398 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 399 400 fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; 401 fabsd %f24,%f54 ! (1_0) y = fabs(y); 402 403 fabsd %f26,%f36 ! (1_0) x = fabs(x); 404 405 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 406 fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y 407 408 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 409 410 fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; 411 412 faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; 413 fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); 414 415 faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; 416 fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); 417 418 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 419 fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); 420 421 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 422 fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); 423 424 fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); 425 426 or %g1,%g5,%g1 ! (1_0) c0 |= c2; 427 fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; 428 429 andcc %g1,2,%g0 ! (1_0) c0 & 2 430 bnz,pn %icc,.update6 ! (1_0) if ( (c0 & 2) != 0 ) 431 fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; 432 .cont6: 433 and %o5,%o1,%o5 ! 
(1_0) c1 &= c3; 434 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 435 436 add %l2,stridey,%i3 ! py += stridey 437 andcc %o5,2,%g0 ! (1_0) c1 & 2 438 bnz,pn %icc,.update7 ! (1_0) if ( (c1 & 2) != 0 ) 439 fmovd %f20,%f4 ! (0_0) dmax = x; 440 .cont7: 441 fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); 442 lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; 443 444 fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; 445 lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; 446 add %l7,stridex,%o1 ! px += stridex 447 faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; 448 449 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 450 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 451 faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; 452 453 fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; 454 fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; 455 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 456 457 fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; 458 fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; 459 460 fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; 461 fabsd %f28,%f34 ! (2_0) y = fabs(y); 462 463 fabsd %f26,%f50 ! (2_0) x = fabs(x); 464 465 fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; 466 fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y 467 468 fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; 469 470 fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; 471 472 faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; 473 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 474 475 faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; 476 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 477 478 faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; 479 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 480 481 faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; 482 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 483 484 fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); 485 486 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 487 fsubd %f56,D2ON28,%f18 ! 
(3_1) x_hi -= D2ON28; 488 489 andcc %o3,2,%g0 ! (2_0) c0 & 2 490 bnz,pn %icc,.update8 ! (2_0) if ( (c0 & 2) != 0 ) 491 fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; 492 .cont8: 493 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 494 faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; 495 496 add %i3,stridey,%l4 ! py += stridey 497 andcc %o4,2,%g0 ! (2_0) c1 & 2 498 bnz,pn %icc,.update9 ! (2_0) if ( (c1 & 2) != 0 ) 499 fmovd %f36,%f56 ! (1_0) dmax = x; 500 .cont9: 501 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 502 add %o1,stridex,%l2 ! px += stridex 503 fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); 504 505 fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; 506 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 507 faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; 508 509 fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); 510 faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; 511 512 cmp counter,4 513 bl,pn %icc,.tail 514 nop 515 516 ba .main_loop 517 sub counter,4,counter 518 519 .align 16 520 .main_loop: 521 fmuld %f20,%f44,%f2 ! (0_1) x *= dnorm; 522 fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; 523 lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; 524 525 fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; 526 lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; 527 fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; 528 529 fmuld %f40,%f44,%f44 ! (0_1) y *= dnorm; 530 fabsd %f30,%f30 ! (3_1) y = fabs(y); 531 532 fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; 533 fabsd %f18,%f18 ! (3_1) x = fabs(x); 534 st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; 535 536 fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; 537 st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; 538 fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y 539 540 fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; 541 542 fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; 543 544 faddd %f2,D2ON28,%f10 ! (0_1) x_hi = x + D2ON28; 545 fcmple32 DC1,%f18,%o3 ! 
(3_1) c0 = vis_fcmple32(DC1,x); 546 547 faddd %f44,D2ON28,%f20 ! (0_1) y_hi = y + D2ON28; 548 fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); 549 550 faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; 551 fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); 552 553 faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; 554 fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); 555 556 fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); 557 558 or %o3,%o0,%o3 ! (3_1) c0 |= c2; 559 fsubd %f10,D2ON28,%f58 ! (0_1) x_hi -= D2ON28; 560 561 andcc %o3,2,%g0 ! (3_1) c0 & 2 562 bnz,pn %icc,.update10 ! (3_1) if ( (c0 & 2) != 0 ) 563 fsubd %f20,D2ON28,%f56 ! (0_1) y_hi -= D2ON28; 564 .cont10: 565 faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; 566 and %o4,%o1,%o4 ! (3_1) c1 &= c3; 567 568 add %l4,stridey,%i3 ! py += stridey 569 andcc %o4,2,%g0 ! (3_1) c1 & 2 570 bnz,pn %icc,.update11 ! (3_1) if ( (c1 & 2) != 0 ) 571 fmovd %f50,%f32 ! (2_1) dmax = x; 572 .cont11: 573 fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); 574 add %l2,stridex,%l1 ! px += stridex 575 lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; 576 577 fmuld %f58,%f58,%f6 ! (0_1) res = x_hi * x_hi; 578 lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; 579 add %i5,stridez,%l6 ! pz += stridez 580 faddd %f44,%f56,%f60 ! (0_1) dtmp2 = y + y_hi; 581 582 fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); 583 lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0]; 584 faddd %f2,%f58,%f24 ! (0_1) dtmp1 = x + x_hi; 585 586 fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; 587 fsubd %f2,%f58,%f26 ! (0_1) x_lo = x - x_hi; 588 lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; 589 590 fmuld %f56,%f56,%f28 ! (0_1) dtmp0 = y_hi * y_hi; 591 fsubd %f44,%f56,%f44 ! (0_1) y_lo = y - y_hi; 592 593 fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; 594 fabsd %f20,%f40 ! (0_0) y = fabs(y); 595 596 fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; 597 fabsd %f22,%f20 ! (0_0) x = fabs(x); 598 st %f12,[%l6] ! 
(2_2) ((float*)pz)[0] = ((float*)&res)[0]; 599 600 fmuld %f24,%f26,%f10 ! (0_1) dtmp1 *= x_lo; 601 st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; 602 fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y 603 604 fmuld %f60,%f44,%f12 ! (0_1) dtmp2 *= y_lo; 605 606 fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; 607 608 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 609 fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); 610 611 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 612 fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); 613 614 faddd %f6,%f28,%f24 ! (0_1) res += dtmp0; 615 fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); 616 617 faddd %f10,%f12,%f26 ! (0_1) dtmp1 += dtmp2; 618 fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); 619 620 fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); 621 622 or %g5,%o2,%g5 ! (0_0) c0 |= c2; 623 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 624 625 andcc %g5,2,%g0 ! (0_0) c0 & 2 626 bnz,pn %icc,.update12 ! (0_0) if ( (c0 & 2) != 0 ) 627 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 628 .cont12: 629 and %g1,%o4,%g1 ! (0_0) c1 &= c3; 630 faddd %f24,%f26,%f12 ! (0_1) res += dtmp1; 631 632 add %i3,stridey,%l2 ! py += stridey 633 andcc %g1,2,%g0 ! (0_0) c1 & 2 634 bnz,pn %icc,.update13 ! (0_0) if ( (c1 & 2) != 0 ) 635 fmovd %f18,%f44 ! (3_1) dmax = x; 636 .cont13: 637 fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); 638 add %l1,stridex,%l7 ! px += stridex 639 lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 640 641 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 642 add %l6,stridez,%i5 ! pz += stridez 643 lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 644 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 645 646 fsqrtd %f12,%f12 ! (0_1) res = sqrt(res); 647 lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 648 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 649 650 fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; 651 fsubd %f36,%f58,%f58 ! 
(1_1) x_lo = x - x_hi; 652 lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 653 654 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 655 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 656 657 fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; 658 fabsd %f24,%f54 ! (1_0) y = fabs(y); 659 660 fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; 661 fabsd %f26,%f36 ! (1_0) x = fabs(x); 662 st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; 663 664 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 665 st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; 666 fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y 667 668 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 669 670 fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; 671 672 faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; 673 fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); 674 675 faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; 676 fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); 677 678 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 679 fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); 680 681 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 682 fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); 683 684 fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); 685 686 or %g1,%g5,%g1 ! (1_0) c0 |= c2; 687 fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; 688 689 andcc %g1,2,%g0 ! (1_0) c0 & 2 690 bnz,pn %icc,.update14 ! (1_0) if ( (c0 & 2) != 0 ) 691 fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; 692 .cont14: 693 and %o5,%o1,%o5 ! (1_0) c1 &= c3; 694 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 695 696 add %l2,stridey,%i3 ! py += stridey 697 andcc %o5,2,%g0 ! (1_0) c1 & 2 698 bnz,pn %icc,.update15 ! (1_0) if ( (c1 & 2) != 0 ) 699 fmovd %f20,%f4 ! (0_0) dmax = x; 700 .cont15: 701 fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); 702 add %l7,stridex,%o1 ! px += stridex 703 lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; 704 705 fmuld %f44,%f44,%f2 ! 
(2_1) res = x_hi * x_hi; 706 add %i5,stridez,%g5 ! pz += stridez 707 lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; 708 faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; 709 710 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 711 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 712 faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; 713 714 fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; 715 fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; 716 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 717 718 fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; 719 fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; 720 721 fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; 722 fabsd %f28,%f34 ! (2_0) y = fabs(y); 723 724 fmuld %f16,%f12,%f16 ! (0_1) res = dmax * res; 725 fabsd %f26,%f50 ! (2_0) x = fabs(x); 726 st %f16,[%g5] ! (0_1) ((float*)pz)[0] = ((float*)&res)[0]; 727 728 fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; 729 st %f17,[%g5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res)[1]; 730 fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y 731 732 fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; 733 734 fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; 735 736 faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; 737 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 738 739 faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; 740 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 741 742 faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; 743 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 744 745 faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; 746 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 747 748 fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); 749 750 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 751 fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; 752 753 andcc %o3,2,%g0 ! (2_0) c0 & 2 754 bnz,pn %icc,.update16 ! (2_0) if ( (c0 & 2) != 0 ) 755 fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; 756 .cont16: 757 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 758 faddd %f30,%f26,%f12 ! 
(2_1) res += dtmp1; 759 760 add %i3,stridey,%l4 ! py += stridey 761 andcc %o4,2,%g0 ! (2_0) c1 & 2 762 bnz,pn %icc,.update17 ! (2_0) if ( (c1 & 2) != 0 ) 763 fmovd %f36,%f56 ! (1_0) dmax = x; 764 .cont17: 765 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 766 add %o1,stridex,%l2 ! px += stridex 767 fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); 768 769 fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; 770 add %g5,stridez,%i5 ! pz += stridez 771 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 772 faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; 773 774 fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); 775 subcc counter,4,counter ! counter -= 4; 776 bpos,pt %icc,.main_loop 777 faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; 778 779 add counter,4,counter 780 781 .tail: 782 subcc counter,1,counter 783 bneg,a .begin 784 nop 785 786 fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; 787 788 fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; 789 fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; 790 791 fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; 792 st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; 793 794 st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; 795 796 subcc counter,1,counter 797 bneg,a .begin 798 add %i5,stridez,%i5 799 800 fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; 801 802 fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; 803 804 faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; 805 806 faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; 807 808 faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; 809 810 add %i5,stridez,%l6 ! pz += stridez 811 812 fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); 813 add %l2,stridex,%l1 ! px += stridex 814 815 fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; 816 st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; 817 818 st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; 819 820 subcc counter,1,counter 821 bneg .begin 822 add %l6,stridez,%i5 823 824 fmuld %f14,%f4,%f14 ! 
(3_2) res = dmax * res; 825 st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; 826 827 st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; 828 829 ba .begin 830 add %i5,stridez,%i5 831 832 .align 16 833 .spec0: 834 ld [%i1+4],%l1 ! lx = ((int*)px)[1]; 835 cmp %o2,%o4 ! j0 ? 0x7ff00000 836 bge,pn %icc,1f ! if ( j0 >= 0x7ff00000 ) 837 fabsd %f26,%f26 ! x = fabs(x); 838 839 sub %o0,%l4,%o0 ! diff = hy - hx; 840 fabsd %f24,%f24 ! y = fabs(y); 841 842 sra %o0,31,%l4 ! j0 = diff >> 31; 843 844 xor %o0,%l4,%o0 ! diff ^ j0 845 846 sethi %hi(0x03600000),%l1 847 sub %o0,%l4,%o0 ! (diff ^ j0) - j0 848 849 cmp %o0,%l1 ! ((diff ^ j0) - j0) ? 0x03600000 850 bge,a,pn %icc,2f ! if ( ((diff ^ j0) - j0) >= 0x03600000 ) 851 faddd %f26,%f24,%f24 ! *pz = x + y 852 853 fmuld %f26,DC2,%f36 ! (1_1) x *= dnorm; 854 855 fmuld %f24,DC2,%f56 ! (1_1) y *= dnorm; 856 857 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 858 859 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 860 861 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 862 863 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 864 865 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 866 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 867 868 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 869 870 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 871 872 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 873 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 874 875 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 876 877 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 878 879 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 880 881 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 882 883 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 884 885 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 886 887 fmuld DC3,%f24,%f24 ! (1_2) res = dmax * res; 888 2: 889 add %i3,stridey,%i3 890 add %i1,stridex,%i1 891 st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; 892 st %f25,[%i5+4] ! 
((float*)pz)[1] = ((float*)&res)[1]; 893 894 add %i5,stridez,%i5 895 ba .begin1 896 sub counter,1,counter 897 898 1: 899 ld [%i3+4],%l2 ! ly = ((int*)py)[1]; 900 cmp %o0,%o4 ! hx ? 0x7ff00000 901 bne,pn %icc,1f ! if ( hx != 0x7ff00000 ) 902 fabsd %f24,%f24 ! y = fabs(y); 903 904 cmp %l1,0 ! lx ? 0 905 be,pn %icc,2f ! if ( lx == 0 ) 906 nop 907 1: 908 cmp %l4,%o4 ! hy ? 0x7ff00000 909 bne,pn %icc,1f ! if ( hy != 0x7ff00000 ) 910 nop 911 912 cmp %l2,0 ! ly ? 0 913 be,pn %icc,2f ! if ( ly == 0 ) 914 nop 915 1: 916 add %i3,stridey,%i3 917 add %i1,stridex,%i1 918 fmuld %f26,%f24,%f24 ! res = x * y; 919 st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; 920 921 st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; 922 923 add %i5,stridez,%i5 924 ba .begin1 925 sub counter,1,counter 926 927 2: 928 add %i1,stridex,%i1 929 add %i3,stridey,%i3 930 st DC0_HI,[%i5] ! ((int*)pz)[0] = 0x7ff00000; 931 st DC0_LO,[%i5+4] ! ((int*)pz)[1] = 0; 932 fcmpd %f26,%f24 ! x ? y 933 934 add %i5,stridez,%i5 935 ba .begin1 936 sub counter,1,counter 937 938 .align 16 939 .spec1: 940 fmuld %f26,DC3,%f36 ! (1_1) x *= dnorm; 941 942 fmuld %f24,DC3,%f56 ! (1_1) y *= dnorm; 943 944 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 945 946 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 947 948 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 949 950 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 951 952 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 953 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 954 955 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 956 957 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 958 959 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 960 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 961 962 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 963 964 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 965 966 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 967 968 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 969 970 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 971 972 fsqrtd %f26,%f24 ! 
(1_1) res = sqrt(res); 973 974 fmuld DC2,%f24,%f24 ! (1_2) res = dmax * res; 975 976 add %i3,stridey,%i3 977 add %i1,stridex,%i1 978 st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; 979 980 st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; 981 add %i5,stridez,%i5 982 ba .begin1 983 sub counter,1,counter 984 985 .align 16 986 .update0: 987 fzero %f50 988 cmp counter,1 989 ble .cont0 990 fzero %f34 991 992 mov %o1,tmp_px 993 mov %i3,tmp_py 994 995 sub counter,1,tmp_counter 996 ba .cont0 997 mov 1,counter 998 999 .align 16 1000 .update1: 1001 fzero %f50 1002 cmp counter,1 1003 ble .cont1 1004 fzero %f34 1005 1006 mov %o1,tmp_px 1007 mov %i3,tmp_py 1008 1009 sub counter,1,tmp_counter 1010 ba .cont1 1011 mov 1,counter 1012 1013 .align 16 1014 .update2: 1015 fzero %f18 1016 cmp counter,2 1017 ble .cont2 1018 fzero %f30 1019 1020 mov %l2,tmp_px 1021 mov %l4,tmp_py 1022 1023 sub counter,2,tmp_counter 1024 ba .cont1 1025 mov 2,counter 1026 1027 .align 16 1028 .update3: 1029 fzero %f18 1030 cmp counter,2 1031 ble .cont3 1032 fzero %f30 1033 1034 mov %l2,tmp_px 1035 mov %l4,tmp_py 1036 1037 sub counter,2,tmp_counter 1038 ba .cont3 1039 mov 2,counter 1040 1041 .align 16 1042 .update4: 1043 fzero %f20 1044 cmp counter,3 1045 ble .cont4 1046 fzero %f40 1047 1048 mov %l1,tmp_px 1049 mov %i3,tmp_py 1050 1051 sub counter,3,tmp_counter 1052 ba .cont4 1053 mov 3,counter 1054 1055 .align 16 1056 .update5: 1057 fzero %f20 1058 cmp counter,3 1059 ble .cont5 1060 fzero %f40 1061 1062 mov %l1,tmp_px 1063 mov %i3,tmp_py 1064 1065 sub counter,3,tmp_counter 1066 ba .cont5 1067 mov 3,counter 1068 1069 .align 16 1070 .update6: 1071 fzero %f36 1072 cmp counter,4 1073 ble .cont6 1074 fzero %f54 1075 1076 mov %l7,tmp_px 1077 mov %l2,tmp_py 1078 1079 sub counter,4,tmp_counter 1080 ba .cont6 1081 mov 4,counter 1082 1083 .align 16 1084 .update7: 1085 fzero %f36 1086 cmp counter,4 1087 ble .cont7 1088 fzero %f54 1089 1090 mov %l7,tmp_px 1091 mov %l2,tmp_py 1092 1093 sub counter,4,tmp_counter 
1094 ba .cont7 1095 mov 4,counter 1096 1097 .align 16 1098 .update8: 1099 fzero %f50 1100 cmp counter,5 1101 ble .cont8 1102 fzero %f34 1103 1104 mov %o1,tmp_px 1105 mov %i3,tmp_py 1106 1107 sub counter,5,tmp_counter 1108 ba .cont8 1109 mov 5,counter 1110 1111 .align 16 1112 .update9: 1113 fzero %f50 1114 cmp counter,5 1115 ble .cont9 1116 fzero %f34 1117 1118 mov %o1,tmp_px 1119 mov %i3,tmp_py 1120 1121 sub counter,5,tmp_counter 1122 ba .cont9 1123 mov 5,counter 1124 1125 1126 .align 16 1127 .update10: 1128 fzero %f18 1129 cmp counter,2 1130 ble .cont10 1131 fzero %f30 1132 1133 mov %l2,tmp_px 1134 mov %l4,tmp_py 1135 1136 sub counter,2,tmp_counter 1137 ba .cont10 1138 mov 2,counter 1139 1140 .align 16 1141 .update11: 1142 fzero %f18 1143 cmp counter,2 1144 ble .cont11 1145 fzero %f30 1146 1147 mov %l2,tmp_px 1148 mov %l4,tmp_py 1149 1150 sub counter,2,tmp_counter 1151 ba .cont11 1152 mov 2,counter 1153 1154 .align 16 1155 .update12: 1156 fzero %f20 1157 cmp counter,3 1158 ble .cont12 1159 fzero %f40 1160 1161 mov %l1,tmp_px 1162 mov %i3,tmp_py 1163 1164 sub counter,3,tmp_counter 1165 ba .cont12 1166 mov 3,counter 1167 1168 .align 16 1169 .update13: 1170 fzero %f20 1171 cmp counter,3 1172 ble .cont13 1173 fzero %f40 1174 1175 mov %l1,tmp_px 1176 mov %i3,tmp_py 1177 1178 sub counter,3,tmp_counter 1179 ba .cont13 1180 mov 3,counter 1181 1182 .align 16 1183 .update14: 1184 fzero %f54 1185 cmp counter,4 1186 ble .cont14 1187 fzero %f36 1188 1189 mov %l7,tmp_px 1190 mov %l2,tmp_py 1191 1192 sub counter,4,tmp_counter 1193 ba .cont14 1194 mov 4,counter 1195 1196 .align 16 1197 .update15: 1198 fzero %f54 1199 cmp counter,4 1200 ble .cont15 1201 fzero %f36 1202 1203 mov %l7,tmp_px 1204 mov %l2,tmp_py 1205 1206 sub counter,4,tmp_counter 1207 ba .cont15 1208 mov 4,counter 1209 1210 .align 16 1211 .update16: 1212 fzero %f50 1213 cmp counter,5 1214 ble .cont16 1215 fzero %f34 1216 1217 mov %o1,tmp_px 1218 mov %i3,tmp_py 1219 1220 sub counter,5,tmp_counter 1221 ba .cont16 1222 
mov 5,counter 1223 1224 .align 16 1225 .update17: 1226 fzero %f50 1227 cmp counter,5 1228 ble .cont17 1229 fzero %f34 1230 1231 mov %o1,tmp_px 1232 mov %i3,tmp_py 1233 1234 sub counter,5,tmp_counter 1235 ba .cont17 1236 mov 5,counter 1237 1238 .align 16 1239 .exit: 1240 ret 1241 restore 1242 SET_SIZE(__vhypot) 1243