/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsqrtf_ultra3.S"

#include "libm.h"
	.weak	__vsqrtf
	.type	__vsqrtf,#function
__vsqrtf = __vsqrtf_ultra3

	RO_DATA
	.align	64

! Loop-invariant constants, loaded once at entry into FP registers:
! K1/K2 are the polynomial coefficients of the sqrt approximation;
! DC0/DC1/DC2 are bit masks applied with the VIS fand/for instructions
! (mantissa mask, exponent-1.0 pattern, and high-part mask respectively).
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1 = 5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2 = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

! Symbolic names for registers that hold loop-invariant values for the
! whole routine.  TBL is the base of __vlibm_TBL_sqrtf (pairs of doubles,
! indexed by si0 = (ax >> 11) & 0x1ff0).
#define DC0 %f6
#define DC1 %f4
#define DC2 %f2
#define K2 %f38
#define K1 %f36
#define TBL %l2
#define stridex %l3
#define stridey %l4
#define _0x1ff0 %l5
#define counter %l6
#define _0x00800000 %l7
#define _0x7f800000 %o0

! Stack-frame temporaries (relative to %fp).  tmp_px/tmp_counter record
! where to restart when a pass is truncated by a special-case operand;
! tmp0..tmp4 stage the per-element scale words 2^(lexp0) for reload as
! doubles.
#define tmp_px STACK_BIAS-0x40
#define tmp_counter STACK_BIAS-0x38
#define tmp0 STACK_BIAS-0x30
#define tmp1 STACK_BIAS-0x28
#define tmp2 STACK_BIAS-0x20
#define tmp3 STACK_BIAS-0x18
#define tmp4 STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps 0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! !!!!! algorithm !!!!!
!
! x0 = *px;
! ax = *(int*)px;
! px += stridex;
!
! if( ax >= 0x7f800000 )
! {
! *py = sqrtf(x0);
! py += stridey;
! continue;
! }
! if( ax < 0x00800000 )
! {
! *py = sqrtf(x0);
! py += stridey;
! continue;
! }
!
! db0 = (double)x0;
! iexp0 = ax >> 24;
! iexp0 += 0x3c0;
! lexp0 = (long long)iexp0 << 52;
!
! db0 = vis_fand(db0,DC0);
! db0 = vis_for(db0,DC1);
! hi0 = vis_fand(db0,DC2);
!
! ax >>= 11;
! si0 = ax & 0x1ff0;
! dtmp0 = ((double*)((char*)TBL + si0))[0];
! xx0 = (db0 - hi0);
! xx0 *= dtmp0;
! dtmp0 = ((double*)((char*)TBL + si0))[1]
! res0 = K2 * xx0;
! res0 += K1;
! res0 *= xx0;
! res0 += DC1;
! res0 = dtmp0 * res0;
! dtmp1 = *((double*)&lexp0);
! res0 *= dtmp1;
! fres0 = (float)res0;
! *py = fres0;
! py += stridey;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!-----------------------------------------------------------------------
! __vsqrtf_ultra3 — vectorized single-precision square root.
!
! C-equivalent signature (presumably the libm vsqrtf entry — confirm
! against the caller):
!	void __vsqrtf(int n, float *x, int stridex, float *y, int stridey);
! In (SPARC V9 register windows, after save):
!	%i0 = n (element count)     -> spilled to tmp_counter
!	%i1 = x (input pointer)     -> spilled to tmp_px
!	%i2 = stridex (in floats)   -> scaled *4 to bytes in `stridex`
!	%i3 = y (output pointer)    -> kept in %g5 / %i3
!	%i4 = stridey (in floats)   -> scaled *4 to bytes in `stridey`
!
! Structure: a software pipeline five elements deep.  The numeric tag
! "(i_j)" on each instruction appears to name pipeline lane i (0..4) of
! loop generation j, so consecutive instructions interleave work for up
! to five different array elements.  .main_loop retires five results per
! iteration; .tail drains the partially filled pipeline; the prologue
! (from .begin1 to the `ba .main_loop`) fills it.
!
! The `lda [..]0x82` loads use ASI 0x82 (primary no-fault on SPARC V9 —
! NOTE(review): verify against the architecture manual), so the
! pipeline's look-ahead reads past the last requested element cannot
! trap.
!
! Special operands (ax >= 0x7f800000: +Inf/NaN/negative; or
! ax < 0x00800000: zero/subnormal) are detected per lane and diverted:
! the current pass is truncated just before the offending element
! (.updateN handlers), and the element itself is later handled by .spec
! with the hardware fsqrts instruction.
!-----------------------------------------------------------------------

	ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)		! %o2 = &.CONST_TBL
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)	! TBL = &__vlibm_TBL_sqrtf

	st	%i0,[%fp+tmp_counter]		! spill n; .begin reloads it
	sll	%i2,2,stridex			! stridex in bytes
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]		! spill px; .begin reloads it
	sll	%l5,1,_0x1ff0			! 0xff8 << 1 = 0x1ff0

	ldd	[%o2],K1
	sll	%i4,2,stridey			! stridey in bytes

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5			! %g5 = py

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0		! _0x7f800000 (low bits are zero)

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7		! _0x00800000 (low bits are zero)

	ldd	[%o2+32],DC2

! Restart point after a truncated pass: reload the saved position and the
! remaining count, then refill the pipeline.
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]		! consumed; zero until next truncation
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit

	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;

	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop

	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	! Pipeline is now primed.  If fewer than 5 elements remain, drain
	! via .tail; otherwise enter the steady-state loop.
	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter

! Steady state: each trip completes 5 results (fdtos + st per lane) while
! fetching and classifying the next 5 inputs.
	.align	16
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

! Drain: up to 4 results are still in flight; finish and store only as
! many as `counter` allows, re-checking after each store.
	add	counter,5,counter
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5

! Special-case scalar path: the element at the head of a restarted pass
! is Inf/NaN/negative (ax >= 0x7f800000) or zero/subnormal
! (ax < 0x00800000); let the hardware fsqrts produce the IEEE result
! (and any required exception), then resume the vector path.
	.align	16
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5

! .updateN handlers — one bge/bl pair per pipeline fetch site.  Common
! pattern: a special value was fetched for the lane that is K elements
! ahead of the last committed result.  If fewer than K results are still
! owed (counter <= K) the bad lane's result will never be stored, so just
! substitute a harmless operand (fzeros in the delay slot; bge handlers
! also force ax = 0x7f800000, bl handlers force ax = 0) and continue.
! Otherwise truncate this pass to K elements, saving the restart pointer
! (tmp_px) and the remaining count (tmp_counter) so .begin/.spec will
! reprocess the special element.
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	or	%g0,1,counter

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter

.exit:
	ret				! restore executes in the delay slot
	restore
	SET_SIZE(__vsqrtf_ultra3)