1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #if defined(ELFOBJ) 31 #pragma weak fma = __fma 32 #endif 33 34 #include "libm.h" 35 #include "fma.h" 36 #include "fenv_inlines.h" 37 38 #if defined(__sparc) 39 40 static const union { 41 unsigned i[2]; 42 double d; 43 } C[] = { 44 { 0x3fe00000u, 0 }, 45 { 0x40000000u, 0 }, 46 { 0x43300000u, 0 }, 47 { 0x41a00000u, 0 }, 48 { 0x3e500000u, 0 }, 49 { 0x3df00000u, 0 }, 50 { 0x3bf00000u, 0 }, 51 { 0x7fe00000u, 0 }, 52 { 0x00100000u, 0 }, 53 { 0x00100001u, 0 } 54 }; 55 56 #define half C[0].d 57 #define two C[1].d 58 #define two52 C[2].d 59 #define two27 C[3].d 60 #define twom26 C[4].d 61 #define twom32 C[5].d 62 #define twom64 C[6].d 63 #define huge C[7].d 64 #define tiny C[8].d 65 #define tiny2 C[9].d 66 67 static const unsigned int fsr_rm = 0xc0000000u; 68 69 /* 70 * fma for SPARC: 64-bit double precision, big-endian 71 */ 72 double 73 __fma(double x, double y, double z) { 74 union { 75 unsigned i[2]; 76 double d; 77 } xx, yy, zz; 78 double xhi, yhi, xlo, ylo, t; 79 unsigned xy0, xy1, xy2, xy3, z0, z1, z2, z3, rm, sticky; 80 unsigned int fsr; 81 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit; 82 volatile double dummy; 83 84 /* extract the high order words of the arguments */ 85 xx.d = x; 86 yy.d = y; 87 zz.d = z; 88 hx = xx.i[0] & ~0x80000000; 89 hy = yy.i[0] & ~0x80000000; 90 hz = zz.i[0] & ~0x80000000; 91 92 /* dispense with inf, nan, and zero cases */ 93 if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 || 94 (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */ 95 return (x * y + z); 96 97 if (hz >= 0x7ff00000) /* z is inf or nan */ 98 return (x + z); /* avoid spurious under/overflow in x * y */ 99 100 if ((hz | zz.i[1]) == 0) /* z is zero */ 101 /* 102 * x * y isn't zero but could underflow to zero, 103 * so don't add z, lest we perturb the sign 104 */ 105 return (x * y); 106 107 /* 108 * now x, y, and z are all finite and nonzero; save the fsr and 109 * set round-to-negative-infinity mode (and clear nonstandard 110 * mode before we try to scale subnormal operands) 111 */ 112 __fenv_getfsr32(&fsr); 113 __fenv_setfsr32(&fsr_rm); 114 115 /* extract signs and exponents, and normalize subnormals */ 116 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000; 117 sz = zz.i[0] & 0x80000000; 118 ex = hx >> 20; 119 if (!ex) { 120 xx.d = x * two52; 121 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52; 122 } 123 ey = hy >> 20; 124 if (!ey) { 125 yy.d = y * two52; 126 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52; 127 } 128 ez = hz >> 20; 129 if (!ez) { 130 zz.d = z * two52; 131 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52; 132 } 133 134 /* multiply x*y to 106 bits */ 135 exy = ex + ey - 0x3ff; 136 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000; 137 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000; 138 x = xx.d; 139 y = yy.d; 140 xhi = ((x + twom26) + two27) - two27; 141 yhi = ((y + twom26) + two27) - two27; 142 xlo = x - xhi; 143 ylo = y - yhi; 144 x *= y; 145 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo; 146 if (x >= two) { 147 x *= half; 148 y *= half; 149 exy++; 150 } 151 152 /* extract the significands */ 153 xx.d = x; 154 xy0 = (xx.i[0] & 0xfffff) | 0x100000; 155 xy1 = xx.i[1]; 156 yy.d = t = y + twom32; 157 xy2 = yy.i[1]; 158 yy.d = (y - (t - twom32)) + twom64; 159 xy3 = yy.i[1]; 160 z0 = (zz.i[0] & 0xfffff) | 0x100000; 161 z1 = zz.i[1]; 162 z2 = z3 = 0; 163 164 /* 165 * now x*y is represented by sxy, exy, and xy[0-3], and z is 166 * represented likewise; swap if need be so |xy| <= |z| 167 */ 168 if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && 169 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) { 170 e = sxy; sxy = sz; sz = e; 171 e = exy; exy = ez; ez = e; 172 e = xy0; xy0 = z0; z0 = e; 173 e = xy1; xy1 = z1; z1 = e; 174 z2 = xy2; xy2 = 0; 175 z3 = xy3; xy3 = 0; 176 } 177 178 /* shift the significand of xy keeping a sticky bit */ 179 e = ez - exy; 180 if (e > 116) { 181 xy0 = xy1 = xy2 = 0; 182 xy3 = 1; 183 } else if (e >= 96) { 184 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e)); 185 xy3 = xy0 >> (e - 96); 186 if (sticky) 187 xy3 |= 1; 188 xy0 = xy1 = xy2 = 0; 189 } else if (e >= 64) { 190 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e)); 191 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e)); 192 if (sticky) 193 xy3 |= 1; 194 xy2 = xy0 >> (e - 64); 195 xy0 = xy1 = 0; 196 } else if (e >= 32) { 197 sticky = xy3 | ((xy2 << 1) << (63 - e)); 198 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e)); 199 if (sticky) 200 xy3 |= 1; 201 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e)); 202 xy1 = xy0 >> (e - 32); 203 xy0 = 0; 204 } else if (e) { 205 sticky = (xy3 << 1) << (31 - e); 206 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e)); 207 if (sticky) 208 xy3 |= 1; 209 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e)); 210 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e)); 211 xy0 >>= e; 212 } 213 214 /* if this is a magnitude subtract, negate the significand of xy */ 215 if (sxy ^ sz) { 216 xy0 = ~xy0; 217 xy1 = ~xy1; 218 xy2 = ~xy2; 219 xy3 = -xy3; 220 if (xy3 == 0) 221 if (++xy2 == 0) 222 if (++xy1 == 0) 223 xy0++; 224 } 225 226 /* add, propagating carries */ 227 z3 += xy3; 228 e = (z3 < xy3); 229 z2 += xy2; 230 if (e) { 231 z2++; 232 e = (z2 <= xy2); 233 } else 234 e = (z2 < xy2); 235 z1 += xy1; 236 if (e) { 237 z1++; 238 e = (z1 <= xy1); 239 } else 240 e = (z1 < xy1); 241 z0 += xy0; 242 if (e) 243 z0++; 244 245 /* postnormalize and collect rounding information into z2 */ 246 if (ez < 1) { 247 /* result is tiny; shift right until exponent is within range */ 248 e = 1 - ez; 249 if (e > 56) { 250 z2 = 1; /* result can't be exactly zero */ 251 z0 = z1 = 0; 252 } else if (e >= 32) { 253 sticky = z3 | z2 | ((z1 << 1) << (63 - e)); 254 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e)); 255 if (sticky) 256 z2 |= 1; 257 z1 = z0 >> (e - 32); 258 z0 = 0; 259 } else { 260 sticky = z3 | (z2 << 1) << (31 - e); 261 z2 = (z2 >> e) | ((z1 << 1) << (31 - e)); 262 if (sticky) 263 z2 |= 1; 264 z1 = (z1 >> e) | ((z0 << 1) << (31 - e)); 265 z0 >>= e; 266 } 267 ez = 1; 268 } else if (z0 >= 0x200000) { 269 /* carry out; shift right by one */ 270 sticky = (z2 & 1) | z3; 271 z2 = (z2 >> 1) | (z1 << 31); 272 if (sticky) 273 z2 |= 1; 274 z1 = (z1 >> 1) | (z0 << 31); 275 z0 >>= 1; 276 ez++; 277 } else { 278 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) { 279 /* 280 * borrow/cancellation; shift left as much as 281 * exponent allows 282 */ 283 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) { 284 z0 = z1; 285 z1 = z2; 286 z2 = z3; 287 z3 = 0; 288 ez -= 32; 289 } 290 while (z0 < 0x100000 && ez > 1) { 291 z0 = (z0 << 1) | (z1 >> 31); 292 z1 = (z1 << 1) | (z2 >> 31); 293 z2 = (z2 << 1) | (z3 >> 31); 294 z3 <<= 1; 295 ez--; 296 } 297 } 298 if (z3) 299 z2 |= 1; 300 } 301 302 /* get the rounding mode and clear current exceptions */ 303 rm = fsr >> 30; 304 fsr &= ~FSR_CEXC; 305 306 /* strip off the integer bit, if there is one */ 307 ibit = z0 & 0x100000; 308 if (ibit) 309 z0 -= 0x100000; 310 else { 311 ez = 0; 312 if (!(z0 | z1 | z2)) { /* exact zero */ 313 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0; 314 zz.i[1] = 0; 315 __fenv_setfsr32(&fsr); 316 return (zz.d); 317 } 318 } 319 320 /* 321 * flip the sense of directed roundings if the result is negative; 322 * the logic below applies to a positive result 323 */ 324 if (sz) 325 rm ^= rm >> 1; 326 327 /* round and raise exceptions */ 328 if (z2) { 329 fsr |= FSR_NXC; 330 331 /* decide whether to round the fraction up */ 332 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u || 333 (z2 == 0x80000000u && (z1 & 1))))) { 334 /* round up and renormalize if necessary */ 335 if (++z1 == 0) { 336 if (++z0 == 0x100000) { 337 z0 = 0; 338 ez++; 339 } 340 } 341 } 342 } 343 344 /* check for under/overflow */ 345 if (ez >= 0x7ff) { 346 if (rm == FSR_RN || rm == FSR_RP) { 347 zz.i[0] = sz | 0x7ff00000; 348 zz.i[1] = 0; 349 } else { 350 zz.i[0] = sz | 0x7fefffff; 351 zz.i[1] = 0xffffffff; 352 } 353 fsr |= FSR_OFC | FSR_NXC; 354 } else { 355 zz.i[0] = sz | (ez << 20) | z0; 356 zz.i[1] = z1; 357 358 /* 359 * !ibit => exact result was tiny before rounding, 360 * z2 nonzero => result delivered is inexact 361 */ 362 if (!ibit) { 363 if (z2) 364 fsr |= FSR_UFC | FSR_NXC; 365 else if (fsr & FSR_UFM) 366 fsr |= FSR_UFC; 367 } 368 } 369 370 /* restore the fsr and emulate exceptions as needed */ 371 if ((fsr & FSR_CEXC) & (fsr >> 23)) { 372 __fenv_setfsr32(&fsr); 373 if (fsr & FSR_OFC) { 374 dummy = huge; 375 dummy *= huge; 376 } else if (fsr & FSR_UFC) { 377 dummy = tiny; 378 if (fsr & FSR_NXC) 379 dummy *= tiny; 380 else 381 dummy -= tiny2; 382 } else { 383 dummy = huge; 384 dummy += tiny; 385 } 386 } else { 387 fsr |= (fsr & 0x1f) << 5; 388 __fenv_setfsr32(&fsr); 389 } 390 return (zz.d); 391 } 392 393 #elif defined(__x86) 394 395 #if defined(__amd64) 396 #define NI 4 397 #else 398 #define NI 3 399 #endif 400 401 /* 402 * fma for x86: 64-bit double precision, little-endian 403 */ 404 double 405 __fma(double x, double y, double z) { 406 union { 407 unsigned i[NI]; 408 long double e; 409 } xx, yy, zz; 410 long double xe, ye, xhi, xlo, yhi, ylo; 411 int ex, ey, ez; 412 unsigned cwsw, oldcwsw, rm; 413 414 /* convert the operands to double extended */ 415 xx.e = (long double) x; 416 yy.e = (long double) y; 417 zz.e = (long double) z; 418 419 /* extract the exponents of the arguments */ 420 ex = xx.i[2] & 0x7fff; 421 ey = yy.i[2] & 0x7fff; 422 ez = zz.i[2] & 0x7fff; 423 424 /* dispense with inf, nan, and zero cases */ 425 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0) 426 /* x or y is inf, nan, or zero */ 427 return ((double) (xx.e * yy.e + zz.e)); 428 429 if (ez >= 0x7fff) /* z is inf or nan */ 430 return ((double) (xx.e + zz.e)); 431 /* avoid spurious inexact in x * y */ 432 433 /* 434 * save the control and status words, mask all exceptions, and 435 * set rounding to 64-bit precision and to-nearest 436 */ 437 __fenv_getcwsw(&oldcwsw); 438 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000; 439 __fenv_setcwsw(&cwsw); 440 441 /* multiply x*y to 106 bits */ 442 xe = xx.e; 443 xx.i[0] = 0; 444 xhi = xx.e; /* hi 32 bits */ 445 xlo = xe - xhi; /* lo 21 bits */ 446 ye = yy.e; 447 yy.i[0] = 0; 448 yhi = yy.e; 449 ylo = ye - yhi; 450 xe = xe * ye; 451 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo; 452 453 /* distill the sum of xe, ye, and z */ 454 xhi = ye + zz.e; 455 yhi = xhi - ye; 456 xlo = (zz.e - yhi) + (ye - (xhi - yhi)); 457 /* now (xhi,xlo) = ye + z */ 458 459 yhi = xe + xhi; 460 ye = yhi - xe; 461 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */ 462 463 xhi = xlo + ylo; 464 xe = xhi - xlo; 465 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */ 466 467 yy.e = yhi + xhi; 468 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */ 469 470 if (yy.i[1] != 0) { /* yy.e is nonzero */ 471 /* perturb yy.e if its least significant 10 bits are zero */ 472 if (!(yy.i[0] & 0x3ff)) { 473 xx.e = ylo + xlo; 474 if (xx.i[1] != 0) { 475 xx.i[2] = (xx.i[2] & 0x8000) | 476 ((yy.i[2] & 0x7fff) - 63); 477 xx.i[1] = 0x80000000; 478 xx.i[0] = 0; 479 yy.e += xx.e; 480 } 481 } 482 } else { 483 /* set sign of zero result according to rounding direction */ 484 rm = oldcwsw & 0x0c000000; 485 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0); 486 } 487 488 /* 489 * restore the control and status words and convert the result 490 * to double 491 */ 492 __fenv_setcwsw(&oldcwsw); 493 return ((double) yy.e); 494 } 495 496 #if 0 497 /* 498 * another fma for x86: assumes return value will be left in 499 * long double (80-bit double extended) precision 500 */ 501 long double 502 __fma(double x, double y, double z) { 503 union { 504 unsigned i[3]; 505 long double e; 506 } xx, yy, zz, tt; 507 long double xe, ye, xhi, xlo, yhi, ylo, zhi, zlo; 508 int ex, ey, ez; 509 unsigned cwsw, oldcwsw, s; 510 511 /* convert the operands to double extended */ 512 xx.e = (long double) x; 513 yy.e = (long double) y; 514 zz.e = (long double) z; 515 516 /* extract the exponents of the arguments */ 517 ex = xx.i[2] & 0x7fff; 518 ey = yy.i[2] & 0x7fff; 519 ez = zz.i[2] & 0x7fff; 520 521 /* dispense with inf, nan, and zero cases */ 522 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0) 523 /* x or y is inf, nan, or zero */ 524 return (xx.e * yy.e + zz.e); 525 526 if (ez >= 0x7fff) /* z is inf or nan */ 527 return (xx.e + zz.e); /* avoid spurious inexact in x * y */ 528 529 if (ez == 0) /* z is zero */ 530 return (xx.e * yy.e); /* x * y isn't zero; no need to add z */ 531 532 /* 533 * save the control and status words, mask all exceptions, and 534 * set rounding to 64-bit precision and to-nearest 535 */ 536 __fenv_getcwsw(&oldcwsw); 537 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000; 538 __fenv_setcwsw(&cwsw); 539 540 /* multiply x*y to 106 bits */ 541 xe = xx.e; 542 xx.i[0] = 0; 543 xhi = xx.e; /* hi 32 bits */ 544 xlo = xe - xhi; /* lo 21 bits */ 545 ye = yy.e; 546 yy.i[0] = 0; 547 yhi = yy.e; 548 ylo = ye - yhi; 549 xx.e = xe * ye; 550 xx.i[0] &= ~0x7ff; /* 53 bits of x*y */ 551 yy.e = ((xhi * yhi - xx.e) + xhi * ylo + xlo * yhi) + xlo * ylo; 552 553 /* reduce to a sum of two terms */ 554 if (yy.e != 0.0) { 555 ex = xx.i[2] & 0x7fff; 556 if (ez - ex > 10) { 557 /* collapse y into a single bit and add to x */ 558 yy.i[0] = 0; 559 yy.i[1] = 0x80000000; 560 yy.i[2] = (yy.i[2] & 0x8000) | (ex - 60); 561 xx.e += yy.e; 562 } else if (ex - ez <= 10) { 563 xx.e += zz.e; /* exact */ 564 zz.e = yy.e; 565 } else if (ex - ez <= 42) { 566 /* split z into two pieces */ 567 tt.i[0] = 0; 568 tt.i[1] = 0x80000000; 569 tt.i[2] = ex + 11; 570 zhi = (zz.e + tt.e) - tt.e; 571 zlo = zz.e - zhi; 572 xx.e += zhi; 573 zz.e = yy.e + zlo; 574 } else if (ex - ez <= 63) { 575 zz.e += yy.e; /* exact */ 576 } else if (ex - ez <= 106) { 577 /* 578 * collapse the tail of z into a sticky bit and add z 579 * to y without error 580 */ 581 if (ex - ez <= 81) { 582 s = 1 << (ex - ez - 50); 583 if (zz.i[0] & (s - 1)) 584 zz.i[0] |= s; 585 zz.i[0] &= ~(s - 1); 586 } else { 587 s = 1 << (ex - ez - 82); 588 if ((zz.i[1] & (s - 1)) | zz.i[0]) 589 zz.i[1] |= s; 590 zz.i[1] &= ~(s - 1); 591 zz.i[0] = 0; 592 } 593 zz.e += yy.e; 594 } else { 595 /* collapse z into a single bit and add to y */ 596 zz.i[0] = 0; 597 zz.i[1] = 0x80000000; 598 zz.i[2] = (zz.i[2] & 0x8000) | (ex - 113); 599 zz.e += yy.e; 600 } 601 } 602 603 /* restore the control and status words, and sum */ 604 __fenv_setcwsw(&oldcwsw); 605 return (xx.e + zz.e); 606 } 607 #endif 608 609 #else 610 #error Unknown architecture 611 #endif