/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vhypotf.S"

#include "libm.h"

	RO_DATA
	.align	64

! Constant pool.  The byte offsets noted below are the ones the function
! prologue uses when it loads each constant into its register alias.
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! [+0]  K1 = 5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! [+8]  K2 = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! [+16] DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! [+24] DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! [+32] DC2 = 0x7ffff00000000000
	.word	0x7fe00000, 0x00000000	! [+40] DA0 = 0x7fe0000000000000
	.word	0x47efffff, 0xe0000000	! [+48] DFMAX = 3.402823e+38
	.word	0x7f7fffff, 0x80808080	! [+56] FMAX = 3.402823e+38 , [+60] SCALE = 0x80808080
	.word	0x20000000, 0x00000000	! [+64] DA1 = 0x2000000000000000

! Floating-point registers that hold the constants above for the whole call.
! (No trailing comments on the define lines: cpp would paste them into
! every expansion.)
#define DC0		%f12
#define DC1		%f10
#define DC2		%f42
#define DA0		%f6
#define DA1		%f4
#define K2		%f26
#define K1		%f28
#define SCALE		%f3
#define FMAX		%f2
#define DFMAX		%f50

! Integer registers with a fixed role.  The y stride stays in its incoming
! argument register and is scaled in place; the other two strides are
! moved to locals.
#define stridex		%l6
#define stridey		%i4
#define stridez		%l5
#define _0x7fffffff	%o1
#define _0x7f3504f3	%o2
#define _0x1ff0		%l2
#define TBL		%l1

#define counter		%l0

! Scratch slots below the frame pointer.  The first three carry the restart
! state (x pointer, y pointer, remaining element count) between batches;
! the last five receive the high word of a double sum so that it can be
! reloaded into an integer register.
#define tmp_px		STACK_BIAS-0x30
#define tmp_py		STACK_BIAS-0x28
#define tmp_counter	STACK_BIAS-0x20
#define tmp0		STACK_BIAS-0x18
#define tmp1		STACK_BIAS-0x10
#define tmp2		STACK_BIAS-0x0c
#define tmp3		STACK_BIAS-0x08
#define tmp4		STACK_BIAS-0x04

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x30

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!  hx0 = *(int*)px;
!  x0 = *px;
!  px += stridex;
!
!  hy0 = *(int*)py;
!  y0 = *py;
!  py += stridey;
!
!  hx0 &= 0x7fffffff;
!  hy0 &= 0x7fffffff;
!
!  if ( hx0 >= 0x7f3504f3 || hy0 >= 0x7f3504f3 )
!  {
!    if ( hx0 >= 0x7f800000 || hy0 >= 0x7f800000 )
!    {
!      if ( hx0 == 0x7f800000 || hy0 == 0x7f800000 )
!        *(int*)pz = 0x7f800000;
!      else *pz = x0 * y0;
!    }
!    else
!    {
!      hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
!      if ( hyp <= DFMAX ) ftmp0 = (float)hyp;
!      else ftmp0 = FMAX * FMAX;
!      *pz = ftmp0;
!    }
!    pz += stridez;
!    continue;
!  }
!  if ( (hx0 | hy0) == 0 )
!  {
!    *pz = 0;
!    pz += stridez;
!    continue;
!  }
!  dx0 = x0 * (double)x0;
!  dy0 = y0 * (double)y0;
!  db0 = dx0 + dy0;
!
!  iexp0 = ((int*)&db0)[0];
!
!  h0 = vis_fand(db0,DC0);
!  h0 = vis_for(h0,DC1);
!  h_hi0 = vis_fand(h0,DC2);
!
!  db0 = vis_fand(db0,DA0);
!  db0 = vis_fmul8x16(SCALE, db0);
!  db0 = vis_fpadd32(db0,DA1);
!
!  iexp0 >>= 8;
!  di0 = iexp0 & 0x1ff0;
!  si0 = (char*)sqrt_arr + di0;
!
!  dtmp0 = ((double*)((char*)div_arr + di0))[0];
!  xx0 = h0 - h_hi0;
!  xx0 *= dtmp0;
!
!  dtmp0 = ((double*)si0)[1];
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  res0 *= db0;
!  ftmp0 = (float)res0;
!  *pz = ftmp0;
!  pz += stridez;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

!
! __vhypotf - strided single-precision hypot over n element pairs,
! following the algorithm outlined above.
!
! Incoming arguments, as consumed by the prologue:
!	%i0	  element count n (parked in the tmp_counter slot)
!	%i1	  px, pointer to the first x operand
!	%i2	  x stride in elements (multiplied by 4 into %l6)
!	%i3	  py, pointer to the first y operand
!	%i4	  y stride in elements (multiplied by 4 in place)
!	%i5	  pz, pointer to the first result (copied to %l7)
!	[stack]	  z stride in elements (multiplied by 4 into %l5)
! NOTE(review): this looks like
!	void __vhypotf(int n, float *x, int stridex, float *y, int stridey,
!	    float *z, int stridez)
! - confirm against the libmvec prototype.
!
! Structure:
!   .begin/.begin1  start (or restart) a batch; the first element of a
!		    batch is classified here and sent to .spec/.spec1
!		    when it needs special treatment.
!   prologue	    (.cont0 - .cont10) fills the software pipeline.
!   .main_loop	    stores five results per iteration while loading the
!		    operands of later elements.
!   .tail	    drains up to four pending results, then restarts.
!   .updateN	    reached when a look-ahead element needs special
!		    treatment; see the comment above .update0.
!
! Both table operands come from __vlibm_TBL_sqrtf: di0 indexes 16-byte
! entries, word [0] is the div_arr value and word [1] the sqrt_arr value
! of the algorithm comment.
!
! The (a_b) tags in the instruction comments appear to name the pipeline
! slot a (0..4) and the age b of the element being worked on.
!
! All operand loads use ASI 0x82 (ASI_PRIMARY_NOFAULT in SPARC V9), and the
! pipeline reads operands several elements ahead of the one being stored
! without first checking counter - presumably the no-fault ASI is what
! keeps those look-ahead reads from trapping past the end of the arrays.
!
	ENTRY(__vhypotf)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o3)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l1)

! Seventh argument (z stride) lives in the caller frame.
#ifdef __sparcv9
	ldx	[%fp+STACK_BIAS+176],stridez
#else
	ld	[%fp+STACK_BIAS+92],stridez
#endif
	st	%i0,[%fp+tmp_counter]	! restart state: all n elements pending

	stx	%i1,[%fp+tmp_px]	! restart state: px

	stx	%i3,[%fp+tmp_py]	! restart state: py

	ldd	[%o3],K1
	sethi	%hi(0x7ffffc00),%o1	! upper 22 bits of 0x7fffffff

	ldd	[%o3+8],K2
	sethi	%hi(0x7f350400),%o2	! upper 22 bits of 0x7f3504f3

	ldd	[%o3+16],DC0
	add	%o1,1023,_0x7fffffff	! 0x7ffffc00 + 0x3ff
	add	%o2,0xf3,_0x7f3504f3	! 0x7f350400 + 0xf3

	ldd	[%o3+24],DC1
	sll	%i2,2,stridex		! x stride in bytes (4-byte elements)

	ld	[%o3+56],FMAX

	ldd	[%o3+32],DC2
	sll	%i4,2,stridey		! y stride in bytes

	ldd	[%o3+40],DA0
	sll	stridez,2,stridez	! z stride in bytes

	ldd	[%o3+48],DFMAX

	ld	[%o3+60],SCALE
	or	%g0,0xff8,%l2		! 0x1ff0 exceeds a 13-bit immediate ...

	ldd	[%o3+64],DA1
	sll	%l2,1,_0x1ff0		! ... so build it as 0xff8 << 1
	or	%g0,%i5,%l7		! %l7 = pz

! Start of a batch: pick up the saved restart state and clear the saved
! count, so that this batch is the last one unless an .updateN handler
! stores a remainder.
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	ldx	[%fp+tmp_py],%i2
	st	%g0,[%fp+tmp_counter]
! Re-entry point after .spec/.spec1 have consumed one element themselves.
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit
	lda	[%i1]0x82,%l3	! (3_0) hx0 = *(int*)px; (delay slot)

	lda	[%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;

	lda	[%i1]0x82,%f17	! (3_0) x0 = *px;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;

	cmp	%l3,_0x7f3504f3	! (3_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.spec	! (3_0) if ( hx0 >= 0x7f3504f3 )
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;

	cmp	%l4,_0x7f3504f3	! (3_0) hy0 ? 0x7f3504f3
	bge,pn	%icc,.spec	! (3_0) if ( hy0 >= 0x7f3504f3 )
	or	%g0,%i2,%o7	! %o7 = py of the current element

	orcc	%l3,%l4,%g0	! (3_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.spec1

	add	%i1,stridex,%i1	! px += stridex (delay slot: also done on the way to .spec1)
	fsmuld	%f17,%f17,%f44	! (3_0) dx0 = x0 * (double)x0;
	lda	[%i2]0x82,%f17	! (3_0) y0 = *py;

	lda	[%i1]0x82,%l3	! (4_0) hx0 = *(int*)px;

	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;

	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f24	! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3	! (4_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update0	! (4_0) if ( hx0 >= 0x7f3504f3 )
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;

	orcc	%l3,%l4,%g0	! (4_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update0
	lda	[%i1]0x82,%f17	! (4_0) x0 = *px;
.cont0:
	faddd	%f44,%f24,%f24	! (3_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f40	! (4_1) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3	! (4_1) hy0 ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%f17	! (4_1) y0 = *py;

	add	%o7,stridey,%i5	! py += stridey
	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;

	bge,pn	%icc,.update1	! (4_1) if ( hy0 >= 0x7f3504f3 )
	st	%f24,[%fp+tmp0]	! (3_1) iexp0 = ((int*)&db0)[0];
.cont1:
	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f48	! (4_1) dy0 = y0 * (double)y0;
	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;

	add	%i1,stridex,%i1	! px += stridex

	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
	cmp	%l3,_0x7f3504f3	! (0_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update2	! (0_0) if ( hx0 >= 0x7f3504f3 )
	add	%i5,stridey,%o4	! py += stridey
.cont2:
	faddd	%f40,%f48,%f20	! (4_1) db0 = dx0 + dy0;

	fsmuld	%f8,%f8,%f40	! (0_0) dx0 = x0 * (double)x0;
	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
	lda	[%i5+stridey]0x82,%f17	! (0_0) y0 = *py;

	cmp	%l4,_0x7f3504f3	! (0_0) hy0 ? 0x7f3504f3
	bge,pn	%icc,.update3	! (0_0) if ( hy0 >= 0x7f3504f3 )
	st	%f20,[%fp+tmp1]	! (4_1) iexp0 = ((int*)&db0)[0];

	orcc	%l3,%l4,%g0	! (0_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update3
.cont3:
	lda	[%i1+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px; (also the delay slot of the bz above)

	fand	%f24,DC0,%f60	! (3_1) h0 = vis_fand(db0,DC0);

	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;

	fsmuld	%f17,%f17,%f34	! (0_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3	! (1_0) hx0 ? 0x7f3504f3
	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;

	add	%i1,stridex,%i1	! px += stridex

	lda	[%i1]0x82,%f17	! (1_0) x0 = *px;
	bge,pn	%icc,.update4	! (1_0) if ( hx0 >= 0x7f3504f3 )
	add	%o4,stridey,%i5	! py += stridey
.cont4:
	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46	! (3_1) h0 = vis_for(h0,DC1);

	cmp	%l4,_0x7f3504f3	! (1_0) hy0 ? 0x7f3504f3
	ld	[%fp+tmp0],%o0	! (3_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f34,%f0	! (0_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f40	! (1_0) dx0 = x0 * (double)x0;
	add	%i1,stridex,%i1	! px += stridex
	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;

	srax	%o0,8,%o0	! (3_1) iexp0 >>= 8;
	bge,pn	%icc,.update5	! (1_0) if ( hy0 >= 0x7f3504f3 )
	fand	%f46,DC2,%f38	! (3_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0	! (1_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update5
.cont5:
	lda	[%i1]0x82,%l3	! (2_0) hx0 = *(int*)px; (also the delay slot of the bz above)

	and	%o0,_0x1ff0,%o0	! (3_1) di0 = iexp0 & 0x1ff0;
	st	%f0,[%fp+tmp2]	! (0_0) iexp0 = ((int*)&db0)[0];
	fand	%f20,DC0,%f60	! (4_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o0],%f22	! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	fsubd	%f46,%f38,%f38	! (3_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f32	! (1_0) dy0 = y0 * (double)y0;
	add	%i5,stridey,%i2	! py += stridey
	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;

	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;

	lda	[%i1]0x82,%f17	! (2_0) x0 = *px;
	cmp	%l3,_0x7f3504f3	! (2_0) hx0 ? 0x7f3504f3

	fmuld	%f38,%f22,%f38	! (3_1) xx0 *= dtmp0;
	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46	! (4_1) h0 = vis_for(h0,DC1);

	bge,pn	%icc,.update6	! (2_0) if ( hx0 >= 0x7f3504f3 )
	ld	[%fp+tmp1],%o3	! (4_1) iexp0 = ((int*)&db0)[0];
.cont6:
	faddd	%f40,%f32,%f18	! (1_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44	! (2_0) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3	! (2_0) hy0 ? 0x7f3504f3
	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;

	add	%i1,stridex,%i1	! px += stridex
	bge,pn	%icc,.update7	! (2_0) if ( hy0 >= 0x7f3504f3 )
	fand	%f46,DC2,%f58	! (4_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0	! (2_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update7
	nop
.cont7:
	fmuld	K2,%f38,%f56	! (3_1) res0 = K2 * xx0;
	srax	%o3,8,%o3	! (4_1) iexp0 >>= 8;
	lda	[%i1]0x82,%l3	! (3_0) hx0 = *(int*)px;

	and	%o3,_0x1ff0,%o3	! (4_1) di0 = iexp0 & 0x1ff0;
	st	%f18,[%fp+tmp3]	! (1_0) iexp0 = ((int*)&db0)[0];
	fand	%f0,DC0,%f60	! (0_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o3],%f22	! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%i2,stridey,%o7	! py += stridey
	fsubd	%f46,%f58,%f58	! (4_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f30	! (2_0) dy0 = y0 * (double)y0;
	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;

	faddd	%f56,K1,%f54	! (3_1) res0 += K1;
	cmp	%l3,_0x7f3504f3	! (3_0) hx0 ? 0x7f3504f3

	lda	[%i1]0x82,%f17	! (3_0) x0 = *px;
	add	%i1,stridex,%i1	! px += stridex
	bge,pn	%icc,.update8	! (3_0) if ( hx0 >= 0x7f3504f3 )

	fmuld	%f58,%f22,%f58	! (4_1) xx0 *= dtmp0; (delay slot)
.cont8:
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
	for	%f60,DC1,%f46	! (0_0) h0 = vis_for(h0,DC1);

	cmp	%l4,_0x7f3504f3	! (3_0) hy0 ? 0x7f3504f3
	ld	[%fp+tmp2],%g1	! (0_0) iexp0 = ((int*)&db0)[0];
	faddd	%f44,%f30,%f30	! (2_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44	! (3_0) dx0 = x0 * (double)x0;
	bge,pn	%icc,.update9	! (3_0) if ( hy0 >= 0x7f3504f3 )
	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;

	orcc	%l3,%l4,%g0	! (3_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update9
	nop
.cont9:
	fmuld	%f54,%f38,%f40	! (3_1) res0 *= xx0;
	lda	[%i1]0x82,%l3	! (4_0) hx0 = *(int*)px;
	fand	%f46,DC2,%f38	! (0_0) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f58,%f54	! (4_1) res0 = K2 * xx0;
	srax	%g1,8,%o5	! (0_0) iexp0 >>= 8;
	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
	fand	%f24,DA0,%f56	! (3_1) db0 = vis_fand(db0,DA0);

	and	%o5,_0x1ff0,%o5	! (0_0) di0 = iexp0 & 0x1ff0;
	st	%f30,[%fp+tmp4]	! (2_0) iexp0 = ((int*)&db0)[0];
	fand	%f18,DC0,%f60	! (1_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o5],%f22	! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o0,TBL,%g1	! (3_1) si0 = (char*)sqrt_arr + di0;
	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
	fsubd	%f46,%f38,%f38	! (0_0) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f24	! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3	! (4_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update10	! (4_0) if ( hx0 >= 0x7f3504f3 )
	faddd	%f40,DC1,%f40	! (3_1) res0 += DC1;

! The next four instructions are repeated at the top of .update10, which
! rejoins at .cont10 and therefore skips them.
	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	ldd	[%g1+8],%f56	! (3_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54	! (4_1) res0 += K1;

	lda	[%i1]0x82,%f17	! (4_0) x0 = *px;
.cont10:
	fmuld	%f38,%f22,%f38	! (0_0) xx0 *= dtmp0;
	cmp	counter,5
	for	%f60,DC1,%f46	! (1_0) h0 = vis_for(h0,DC1);

	ld	[%fp+tmp3],%g1	! (1_0) iexp0 = ((int*)&db0)[0];
	fmuld	%f56,%f40,%f62	! (3_1) res0 = dtmp0 * res0;
	faddd	%f44,%f24,%f24	! (3_0) db0 = dx0 + dy0;

	bl,pn	%icc,.tail	! fewer than five results wanted: drain only
	nop

	ba	.main_loop
	sub	counter,5,counter	! inside the loop counter is biased by -5

! Steady state.  Each pass stores five results (through %l7, %o7, %i0, %i3
! and %o4 in that order) and classifies the operands of five later
! elements.  Several registers change role within one pass: %l7 and %o7
! each serve both as a result pointer and as an operand pointer, and %i0
! and %o4 switch between operand pointer and result pointer.
	.align	16
.main_loop:
	fsmuld	%f17,%f17,%f40	! (4_1) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3	! (4_1) hy0 ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%f17	! (4_1) y0 = *py;
	fpadd32	%f36,DA1,%f36	! (3_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f58	! (4_2) res0 *= xx0;
	add	%o7,stridey,%i5	! py += stridey
	st	%f24,[%fp+tmp0]	! (3_1) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f44	! (1_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f38,%f56	! (0_1) res0 = K2 * xx0;
	srax	%g1,8,%g5	! (1_1) iexp0 >>= 8;
	bge,pn	%icc,.update11	! (4_1) if ( hy0 >= 0x7f3504f3 )
	fand	%f20,DA0,%f54	! (4_2) db0 = vis_fand(db0,DA0);

	orcc	%l3,%l4,%g0	! (4_1) (hx0 | hy0) == 0 ?
	nop
	bz,pn	%icc,.update11
	fzero	%f52	! NOTE(review): %f52 is not read in this file; looks like a filler - confirm
.cont11:
	fmuld	%f62,%f36,%f62	! (3_2) res0 *= db0;
	and	%g5,_0x1ff0,%g5	! (1_1) di0 = iexp0 & 0x1ff0;
	lda	[%i1+stridex]0x82,%l3	! (0_0) hx0 = *(int*)px;
	fand	%f30,DC0,%f60	! (2_1) h0 = vis_fand(db0,DC0);

	ldd	[%g5+TBL],%f22	! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o3,TBL,%g1	! (4_2) si0 = (char*)sqrt_arr + di0;
	add	%i1,stridex,%i0	! px += stridex
	fsubd	%f46,%f44,%f44	! (1_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f48	! (4_1) dy0 = y0 * (double)y0;
	nop
	lda	[%i1+stridex]0x82,%f8	! (0_0) x0 = *px;
	faddd	%f58,DC1,%f36	! (4_2) res0 += DC1;

	faddd	%f56,K1,%f58	! (0_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (0_0) hx0 &= 0x7fffffff;
	ldd	[%g1+8],%f56	! (4_2) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%i5+stridey]0x82,%l4	! (0_0) hy0 = *(int*)py;
	cmp	%l3,_0x7f3504f3	! (0_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update12	! (0_0) if ( hx0 >= 0x7f3504f3 )
	fdtos	%f62,%f14	! (3_2) ftmp0 = (float)res0;
.cont12:
	fmuld	%f44,%f22,%f44	! (1_1) xx0 *= dtmp0;
	add	%l7,stridez,%o7	! pz += stridez
	st	%f14,[%l7]	! (3_2) *pz = ftmp0;
	for	%f60,DC1,%f46	! (2_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36	! (4_2) res0 = dtmp0 * res0;
	add	%i5,stridey,%o4	! py += stridey
	ld	[%fp+tmp4],%g1	! (2_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f48,%f20	! (4_1) db0 = dx0 + dy0;

	fsmuld	%f8,%f8,%f40	! (0_0) dx0 = x0 * (double)x0;
	and	%l4,_0x7fffffff,%l4	! (0_0) hy0 &= 0x7fffffff;
	lda	[%i5+stridey]0x82,%f17	! (0_0) y0 = *py;
	fpadd32	%f54,DA1,%f62	! (4_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f58,%f38,%f38	! (0_1) res0 *= xx0;
	cmp	%l4,_0x7f3504f3	! (0_0) hy0 ? 0x7f3504f3
	st	%f20,[%fp+tmp1]	! (4_1) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f58	! (2_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f44,%f56	! (1_1) res0 = K2 * xx0;
	srax	%g1,8,%g1	! (2_1) iexp0 >>= 8;
	bge,pn	%icc,.update13	! (0_0) if ( hy0 >= 0x7f3504f3 )
	fand	%f0,DA0,%f54	! (0_1) db0 = vis_fand(db0,DA0);

	orcc	%l3,%l4,%g0	! (0_0) (hx0 | hy0) == 0 ?
	nop
	bz,pn	%icc,.update13
	fzero	%f52	! NOTE(review): %f52 is not read in this file; looks like a filler - confirm
.cont13:
	fmuld	%f36,%f62,%f62	! (4_2) res0 *= db0;
	and	%g1,_0x1ff0,%g1	! (2_1) di0 = iexp0 & 0x1ff0;
	lda	[%i0+stridex]0x82,%l3	! (1_0) hx0 = *(int*)px;
	fand	%f24,DC0,%f60	! (3_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%g1],%f22	! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o5,TBL,%o0	! (0_1) si0 = (char*)sqrt_arr + di0;
	add	%i0,stridex,%i1	! px += stridex
	fsubd	%f46,%f58,%f58	! (2_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f34	! (0_0) dy0 = y0 * (double)y0;
	add	%o7,stridez,%i0	! pz += stridez
	lda	[%o4+stridey]0x82,%l4	! (1_0) hy0 = *(int*)py;
	faddd	%f38,DC1,%f36	! (0_1) res0 += DC1;

	faddd	%f56,K1,%f38	! (1_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (1_0) hx0 &= 0x7fffffff;
	ldd	[%o0+8],%f56	! (0_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%i1]0x82,%f17	! (1_0) x0 = *px;
	cmp	%l3,_0x7f3504f3	! (1_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update14	! (1_0) if ( hx0 >= 0x7f3504f3 )
	fdtos	%f62,%f14	! (4_2) ftmp0 = (float)res0;
.cont14:
	fmuld	%f58,%f22,%f58	! (2_1) xx0 *= dtmp0;
	and	%l4,_0x7fffffff,%l4	! (1_0) hy0 &= 0x7fffffff;
	add	%o4,stridey,%i5	! py += stridey
	for	%f60,DC1,%f46	! (3_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36	! (0_1) res0 = dtmp0 * res0;
	cmp	%l4,_0x7f3504f3	! (1_0) hy0 ? 0x7f3504f3
	ld	[%fp+tmp0],%o0	! (3_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f34,%f0	! (0_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f40	! (1_0) dx0 = x0 * (double)x0;
	add	%i1,stridex,%i1	! px += stridex
	lda	[%o4+stridey]0x82,%f17	! (1_0) y0 = *py;
	fpadd32	%f54,DA1,%f62	! (0_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f38,%f44,%f44	! (1_1) res0 *= xx0;
	st	%f14,[%o7]	! (4_2) *pz = ftmp0;
	bge,pn	%icc,.update15	! (1_0) if ( hy0 >= 0x7f3504f3 )
	fand	%f46,DC2,%f38	! (3_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0	! (1_0) (hx0 | hy0) == 0 ?
	bz,pn	%icc,.update15
	nop
.cont15:
	fmuld	K2,%f58,%f54	! (2_1) res0 = K2 * xx0;
	srax	%o0,8,%o0	! (3_1) iexp0 >>= 8;
	st	%f0,[%fp+tmp2]	! (0_0) iexp0 = ((int*)&db0)[0];
	fand	%f18,DA0,%f56	! (1_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62	! (0_1) res0 *= db0;
	and	%o0,_0x1ff0,%o0	! (3_1) di0 = iexp0 & 0x1ff0;
	lda	[%i1]0x82,%l3	! (2_0) hx0 = *(int*)px;
	fand	%f20,DC0,%f60	! (4_1) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o0],%f22	! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%g5,TBL,%o3	! (1_1) si0 = (char*)sqrt_arr + di0;
	add	%i0,stridez,%i3	! pz += stridez
	fsubd	%f46,%f38,%f38	! (3_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f32	! (1_0) dy0 = y0 * (double)y0;
	add	%i5,stridey,%i2	! py += stridey
	lda	[stridey+%i5]0x82,%l4	! (2_0) hy0 = *(int*)py;
	faddd	%f44,DC1,%f44	! (1_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l3,_0x7fffffff,%l3	! (2_0) hx0 &= 0x7fffffff;
	ldd	[%o3+8],%f56	! (1_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54	! (2_1) res0 += K1;

	lda	[%i1]0x82,%f17	! (2_0) x0 = *px;
	cmp	%l3,_0x7f3504f3	! (2_0) hx0 ? 0x7f3504f3
	add	%i3,stridez,%o4	! pz += stridez
	fdtos	%f62,%f14	! (0_1) ftmp0 = (float)res0;

	fmuld	%f38,%f22,%f38	! (3_1) xx0 *= dtmp0;
	and	%l4,_0x7fffffff,%l4	! (2_0) hy0 &= 0x7fffffff;
	st	%f14,[%i0]	! (0_1) *pz = ftmp0;
	for	%f60,DC1,%f46	! (4_1) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f44,%f62	! (1_1) res0 = dtmp0 * res0;
	bge,pn	%icc,.update16	! (2_0) if ( hx0 >= 0x7f3504f3 )
	ld	[%fp+tmp1],%o3	! (4_1) iexp0 = ((int*)&db0)[0];
	faddd	%f40,%f32,%f18	! (1_0) db0 = dx0 + dy0; (repeated in .update16)
.cont16:
	fsmuld	%f17,%f17,%f44	! (2_0) dx0 = x0 * (double)x0;
	cmp	%l4,_0x7f3504f3	! (2_0) hy0 ? 0x7f3504f3
	lda	[stridey+%i5]0x82,%f17	! (2_0) y0 = *py;
	fpadd32	%f36,DA1,%f36	! (1_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f54	! (2_1) res0 *= xx0;
	add	%i1,stridex,%l7	! px += stridex
	bge,pn	%icc,.update17	! (2_0) if ( hy0 >= 0x7f3504f3 )
	fand	%f46,DC2,%f58	! (4_1) h_hi0 = vis_fand(h0,DC2);

	orcc	%l3,%l4,%g0	! (2_0) (hx0 | hy0) == 0 ?
	nop
	bz,pn	%icc,.update17
	fzero	%f52	! NOTE(review): %f52 is not read in this file; looks like a filler - confirm
.cont17:
	fmuld	K2,%f38,%f56	! (3_1) res0 = K2 * xx0;
	srax	%o3,8,%o3	! (4_1) iexp0 >>= 8;
	st	%f18,[%fp+tmp3]	! (1_0) iexp0 = ((int*)&db0)[0];
	fand	%f30,DA0,%f40	! (2_1) db0 = vis_fand(db0,DA0);

	fmuld	%f62,%f36,%f62	! (1_1) res0 *= db0;
	and	%o3,_0x1ff0,%o3	! (4_1) di0 = iexp0 & 0x1ff0;
	lda	[%l7]0x82,%l3	! (3_0) hx0 = *(int*)px;
	fand	%f0,DC0,%f60	! (0_0) h0 = vis_fand(db0,DC0);

	ldd	[TBL+%o3],%f22	! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%g1,TBL,%g1	! (2_1) si0 = (char*)sqrt_arr + di0;
	add	%i2,stridey,%o7	! py += stridey
	fsubd	%f46,%f58,%f58	! (4_1) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f30	! (2_0) dy0 = y0 * (double)y0;
	lda	[stridey+%i2]0x82,%l4	! (3_0) hy0 = *(int*)py;
	add	%l7,stridex,%i1	! px += stridex
	faddd	%f54,DC1,%f36	! (2_1) res0 += DC1;

	faddd	%f56,K1,%f54	! (3_1) res0 += K1;
	and	%l3,_0x7fffffff,%l3	! (3_0) hx0 &= 0x7fffffff;
	ldd	[%g1+8],%f56	! (2_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f40,%f40	! (2_1) db0 = vis_fmul8x16(SCALE, db0);

	lda	[%l7]0x82,%f17	! (3_0) x0 = *px;
	cmp	%l3,_0x7f3504f3	! (3_0) hx0 ? 0x7f3504f3
	bge,pn	%icc,.update18	! (3_0) if ( hx0 >= 0x7f3504f3 )
	fdtos	%f62,%f14	! (1_1) ftmp0 = (float)res0;
.cont18:
	fmuld	%f58,%f22,%f58	! (4_1) xx0 *= dtmp0;
	and	%l4,_0x7fffffff,%l4	! (3_0) hy0 &= 0x7fffffff;
	st	%f14,[%i3]	! (1_1) *pz = ftmp0;
	for	%f60,DC1,%f46	! (0_0) h0 = vis_for(h0,DC1);

	fmuld	%f56,%f36,%f36	! (2_1) res0 = dtmp0 * res0;
	cmp	%l4,_0x7f3504f3	! (3_0) hy0 ? 0x7f3504f3
	ld	[%fp+tmp2],%g1	! (0_0) iexp0 = ((int*)&db0)[0];
	faddd	%f44,%f30,%f30	! (2_0) db0 = dx0 + dy0;

	fsmuld	%f17,%f17,%f44	! (3_0) dx0 = x0 * (double)x0;
	bge,pn	%icc,.update19	! (3_0) if ( hy0 >= 0x7f3504f3 )
	lda	[stridey+%i2]0x82,%f17	! (3_0) y0 = *py;
	fpadd32	%f40,DA1,%f62	! (2_1) db0 = vis_fpadd32(db0,DA1); (repeated in .update19)

.cont19:
	fmuld	%f54,%f38,%f40	! (3_1) res0 *= xx0;
	orcc	%l3,%l4,%g0	! (3_0) (hx0 | hy0) == 0 ?  tested by the bz below
	st	%f30,[%fp+tmp4]	! (2_0) iexp0 = ((int*)&db0)[0];
	fand	%f46,DC2,%f38	! (0_0) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f58,%f54	! (4_1) res0 = K2 * xx0;
	srax	%g1,8,%o5	! (0_0) iexp0 >>= 8;
	lda	[%i1]0x82,%l3	! (4_0) hx0 = *(int*)px;
	fand	%f24,DA0,%f56	! (3_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62	! (2_1) res0 *= db0;
	and	%o5,_0x1ff0,%o5	! (0_0) di0 = iexp0 & 0x1ff0;
	bz,pn	%icc,.update19a	! condition codes still come from the orcc above
	fand	%f18,DC0,%f60	! (1_0) h0 = vis_fand(db0,DC0);
.cont19a:
	ldd	[TBL+%o5],%f22	! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o0,TBL,%g1	! (3_1) si0 = (char*)sqrt_arr + di0;
	and	%l3,_0x7fffffff,%l3	! (4_0) hx0 &= 0x7fffffff;
	fsubd	%f46,%f38,%f38	! (0_0) xx0 = h0 - h_hi0;

	fsmuld	%f17,%f17,%f24	! (3_0) dy0 = y0 * (double)y0;
	cmp	%l3,_0x7f3504f3	! (4_0) hx0 ? 0x7f3504f3
	lda	[stridey+%o7]0x82,%l4	! (4_0) hy0 = *(int*)py;
	faddd	%f40,DC1,%f40	! (3_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	bge,pn	%icc,.update20	! (4_0) if ( hx0 >= 0x7f3504f3 )
	ldd	[%g1+8],%f56	! (3_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54	! (4_1) res0 += K1; (repeated in .update20)

	lda	[%i1]0x82,%f17	! (4_0) x0 = *px;
.cont20:
	subcc	counter,5,counter	! counter -= 5
	add	%o4,stridez,%l7	! pz += stridez
	fdtos	%f62,%f14	! (2_1) ftmp0 = (float)res0;

	fmuld	%f38,%f22,%f38	! (0_0) xx0 *= dtmp0;
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	st	%f14,[%o4]	! (2_1) *pz = ftmp0;
	for	%f60,DC1,%f46	! (1_0) h0 = vis_for(h0,DC1);

	ld	[%fp+tmp3],%g1	! (1_0) iexp0 = ((int*)&db0)[0];
	fmuld	%f56,%f40,%f62	! (3_1) res0 = dtmp0 * res0;
	bpos,pt	%icc,.main_loop	! at least five more results wanted
	faddd	%f44,%f24,%f24	! (3_0) db0 = dx0 + dy0;

	add	counter,5,counter	! undo the -5 bias

! Drain: finish and store up to four results that are already in flight.
! counter is tested before each store; once it goes negative the batch is
! complete and control returns to .begin with %l7 set to the next pz.
.tail:
	subcc	counter,1,counter
	bneg	.begin
	nop

	fpadd32	%f36,DA1,%f36	! (3_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f54,%f58,%f58	! (4_2) res0 *= xx0;
	fand	%f46,DC2,%f44	! (1_1) h_hi0 = vis_fand(h0,DC2);

	fmuld	K2,%f38,%f56	! (0_1) res0 = K2 * xx0;
	srax	%g1,8,%g5	! (1_1) iexp0 >>= 8;
	fand	%f20,DA0,%f54	! (4_2) db0 = vis_fand(db0,DA0);

	fmuld	%f62,%f36,%f62	! (3_2) res0 *= db0;
	and	%g5,_0x1ff0,%g5	! (1_1) di0 = iexp0 & 0x1ff0;

	ldd	[%g5+TBL],%f22	! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0];
	add	%o3,TBL,%g1	! (4_2) si0 = (char*)sqrt_arr + di0;
	fsubd	%f46,%f44,%f44	! (1_1) xx0 = h0 - h_hi0;

	faddd	%f58,DC1,%f36	! (4_2) res0 += DC1;

	faddd	%f56,K1,%f58	! (0_1) res0 += K1;
	ldd	[%g1+8],%f56	! (4_2) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (4_2) db0 = vis_fmul8x16(SCALE, db0);

	fdtos	%f62,%f14	! (3_2) ftmp0 = (float)res0;

	fmuld	%f44,%f22,%f44	! (1_1) xx0 *= dtmp0;
	add	%l7,stridez,%o7	! pz += stridez
	st	%f14,[%l7]	! (3_2) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	or	%g0,%o7,%l7	! delay slot: %l7 = next pz

	fmuld	%f56,%f36,%f36	! (4_2) res0 = dtmp0 * res0;

	fpadd32	%f54,DA1,%f62	! (4_2) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f58,%f38,%f38	! (0_1) res0 *= xx0;

	fmuld	K2,%f44,%f56	! (1_1) res0 = K2 * xx0;
	fand	%f0,DA0,%f54	! (0_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62	! (4_2) res0 *= db0;

	add	%o5,TBL,%o0	! (0_1) si0 = (char*)sqrt_arr + di0;

	faddd	%f38,DC1,%f36	! (0_1) res0 += DC1;

	faddd	%f56,K1,%f38	! (1_1) res0 += K1;
	ldd	[%o0+8],%f56	! (0_1) dtmp0 = ((double*)si0)[1];
	fmul8x16	SCALE,%f54,%f54	! (0_1) db0 = vis_fmul8x16(SCALE, db0);

	add	%o7,stridez,%i0	! pz += stridez
	fdtos	%f62,%f14	! (4_2) ftmp0 = (float)res0;

	fmuld	%f56,%f36,%f36	! (0_1) res0 = dtmp0 * res0;

	fpadd32	%f54,DA1,%f62	! (0_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f38,%f44,%f44	! (1_1) res0 *= xx0;
	add	%i0,stridez,%i3	! pz += stridez
	st	%f14,[%o7]	! (4_2) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	or	%g0,%i0,%l7	! delay slot: %l7 = next pz

	fand	%f18,DA0,%f56	! (1_1) db0 = vis_fand(db0,DA0);

	fmuld	%f36,%f62,%f62	! (0_1) res0 *= db0;

	add	%g5,TBL,%o3	! (1_1) si0 = (char*)sqrt_arr + di0;

	faddd	%f44,DC1,%f44	! (1_1) res0 += DC1;

	fmul8x16	SCALE,%f56,%f36	! (1_1) db0 = vis_fmul8x16(SCALE, db0);
	ldd	[%o3+8],%f56	! (1_1) dtmp0 = ((double*)si0)[1];

	add	%i3,stridez,%o4	! pz += stridez
	fdtos	%f62,%f14	! (0_1) ftmp0 = (float)res0;

	st	%f14,[%i0]	! (0_1) *pz = ftmp0;

	subcc	counter,1,counter
	bneg	.begin
	or	%g0,%i3,%l7	! delay slot: %l7 = next pz

	fmuld	%f56,%f44,%f62	! (1_1) res0 = dtmp0 * res0;

	fpadd32	%f36,DA1,%f36	! (1_1) db0 = vis_fpadd32(db0,DA1);

	fmuld	%f62,%f36,%f62	! (1_1) res0 *= db0;

	fdtos	%f62,%f14	! (1_1) ftmp0 = (float)res0;

	st	%f14,[%i3]	! (1_1) *pz = ftmp0;

	ba	.begin
	or	%g0,%o4,%l7	! delay slot: %l7 = next pz

! First element of a batch has x0 == 0 and y0 == 0 (sign ignored): the
! result is +0.  px was already advanced in the delay slot of the branch
! that came here, so only py and pz are stepped.
	.align	16
.spec1:
	st	%g0,[%l7]	! *pz = 0;
	add	%l7,stridez,%l7	! pz += stridez

	add	%i2,stridey,%i2	! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--

! First element of a batch has |x0| or |y0| at or above the bit pattern
! 0x7f3504f3 (about 1.41421354 * 2**127).  On entry %l3/%l4 hold the masked
! hx0/hy0, %f17 holds x0, and px/py have not been advanced yet.
	.align	16
.spec:
	sethi	%hi(0x7f800000),%i0
	cmp	%l3,%i0	! hx0 ? 0x7f800000
	bge,pt	%icc,2f	! if ( hx0 >= 0x7f800000 )
	ld	[%i2],%f8	! y0 = *py (delay slot, needed on both paths)

	cmp	%l4,%i0	! hy0 ? 0x7f800000
	bge,pt	%icc,2f	! if ( hy0 >= 0x7f800000 )
	nop

! Both operands finite: use the hardware double square root.
	fsmuld	%f17,%f17,%f44	! x0 * (double)x0
	fsmuld	%f8,%f8,%f24	! y0 * (double)y0
	faddd	%f44,%f24,%f24	! x0 * (double)x0 + y0 * (double)y0
	fsqrtd	%f24,%f24	! hyp = sqrt(x0 * (double)x0 + y0 * (double)y0);
	fcmped	%f24,DFMAX	! hyp ? DFMAX
	fbug,a	1f	! if ( hyp > DFMAX )
	fmuls	FMAX,FMAX,%f20	! ftmp0 = FMAX * FMAX; (annulled unless the branch is taken)

	fdtos	%f24,%f20	! ftmp0 = (float)hyp;
1:
	st	%f20,[%l7]	! *pz = ftmp0;
	add	%l7,stridez,%l7	! pz += stridez
	add	%i1,stridex,%i1	! px += stridex

	add	%i2,stridey,%i2	! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--
! At least one operand is Inf or NaN.
2:
	fcmps	%f17,%f8	! exceptions
	cmp	%l3,%i0	! hx0 ? 0x7f800000
	be,a	%icc,1f	! if ( hx0 == 0x7f800000 )
	st	%i0,[%l7]	! *(int*)pz = 0x7f800000;

	cmp	%l4,%i0	! hy0 ? 0x7f800000
	be,a	%icc,1f	! if ( hy0 == 0x7f800000 )
	st	%i0,[%l7]	! *(int*)pz = 0x7f800000;

	fmuls	%f17,%f8,%f8	! x0 * y0
	st	%f8,[%l7]	! *pz = x0 * y0;

1:
	add	%l7,stridez,%l7	! pz += stridez
	add	%i1,stridex,%i1	! px += stridex

	add	%i2,stridey,%i2	! py += stridey
	ba	.begin1
	sub	counter,1,counter	! counter--

! .updateN handlers.
!
! Each one is entered from the prologue or the main loop when a look-ahead
! element fails the fast-path test (operand at or above 0x7f3504f3, or
! both operands zero).  With k being the constant compared against counter:
!
!   - The offending operand register is cleared (fzeros, in the delay slot,
!     so it happens on both paths).  The pipeline then keeps running on a
!     zero in place of the operand that was loaded.
!   - If counter <= k the element lies beyond the results still wanted from
!     this batch and nothing else is done.
!   - Otherwise the px/py of the offending element and the count of
!     elements from it onward (counter - k) are saved in the restart slots
!     and counter is cut down to k.  The pipeline then drains the results
!     before the offending element, .tail branches to .begin, and the new
!     batch starts at that element, where .begin1 sends it to .spec or
!     .spec1.
!
! Handlers 10, 16, 19 and 20 first repeat the instructions that sit
! between their branch site and their .contN label.
!
! Prologue handlers (.update0 - .update10).
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]

	add	%o7,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f8

	stx	%i1,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	sub	%i1,stridex,%i2	! px already points one element further; %i2 is a temporary here
	stx	%i2,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f17

	sub	%i1,stridex,%o7	! px already points one element further; %o7 is a temporary here
	stx	%o7,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f17

	sub	%i1,stridex,%o5	! px already points one element further; %o5 is a temporary here
	stx	%o5,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f17

	sub	%i1,stridex,%o5	! px already points one element further; %o5 is a temporary here
	stx	%o5,[%fp+tmp_px]
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
! Instructions skipped between the branch site and .cont10.
	fmul8x16	SCALE,%f56,%f36	! (3_1) db0 = vis_fmul8x16(SCALE, db0);
	and	%l4,_0x7fffffff,%l4	! (4_0) hy0 &= 0x7fffffff;
	ldd	[%g1+8],%f56	! (3_1) dtmp0 = ((double*)si0)[1];
	faddd	%f54,K1,%f54	! (4_1) res0 += K1;

	cmp	counter,6
	ble	.cont10
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o7,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

! Main-loop handlers (.update11 - .update20).  counter carries the -5 bias
! of the main loop here, which is why the constants start again at 1.
	.align	16
.update11:
	cmp	counter,1
	ble	.cont11
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,1,counter

	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f8

	stx	%i0,[%fp+tmp_px]
	add	%i5,stridey,%o4
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f17

	stx	%i0,[%fp+tmp_px]
	stx	%o4,[%fp+tmp_py]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o4,stridey,%i5
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	sub	%i1,stridex,%i2	! px already points one element further; %i2 is a temporary here
	stx	%i2,[%fp+tmp_px]
	stx	%i5,[%fp+tmp_py]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
! Instruction skipped between the branch site and .cont16.
	faddd	%f40,%f32,%f18	! (1_0) db0 = dx0 + dy0;
	cmp	counter,4
	ble	.cont16
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	stx	%i2,[%fp+tmp_py]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]	! %l7 holds px of this element at this point
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
! Instruction skipped between the branch site and .cont19.
	fpadd32	%f40,DA1,%f62	! (2_1) db0 = vis_fpadd32(db0,DA1);
	cmp	counter,5
	ble	.cont19
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]	! %l7 holds px of this element at this point
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update19a:
	cmp	counter,5
	ble	.cont19a
	fzeros	%f17

	stx	%l7,[%fp+tmp_px]	! %l7 holds px of this element at this point
	stx	%o7,[%fp+tmp_py]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19a
	or	%g0,5,counter

	.align	16
.update20:
! Instruction skipped between the branch site and .cont20.
	faddd	%f54,K1,%f54	! (4_1) res0 += K1;
	cmp	counter,6
	ble	.cont20
	fzeros	%f17

	stx	%i1,[%fp+tmp_px]
	add	%o7,stridey,%g1	! %g1 is a temporary here; it is reloaded after .cont20
	stx	%g1,[%fp+tmp_py]

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

.exit:
	ret
	restore
	SET_SIZE(__vhypotf)