webrev_693-libm-all-changes-since-original-2012-v02 Old usr/src/lib/libm/common/m9x/fma.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #if defined(ELFOBJ)
  31 #pragma weak fma = __fma
  32 #endif
  33 
  34 #include "libm.h"
  35 #include "fma.h"
  36 #include "fenv_inlines.h"
  37 
  38 #if defined(__sparc)
  39 
  40 static const union {
  41         unsigned i[2];
  42         double d;
  43 } C[] = {
  44         { 0x3fe00000u, 0 },
  45         { 0x40000000u, 0 },
  46         { 0x43300000u, 0 },
  47         { 0x41a00000u, 0 },
  48         { 0x3e500000u, 0 },
  49         { 0x3df00000u, 0 },
  50         { 0x3bf00000u, 0 },
  51         { 0x7fe00000u, 0 },
  52         { 0x00100000u, 0 },
  53         { 0x00100001u, 0 }
  54 };
  55 
  56 #define half    C[0].d
  57 #define two     C[1].d
  58 #define two52   C[2].d
  59 #define two27   C[3].d
  60 #define twom26  C[4].d
  61 #define twom32  C[5].d
  62 #define twom64  C[6].d
  63 #define huge    C[7].d
  64 #define tiny    C[8].d
  65 #define tiny2   C[9].d
  66 
  67 static const unsigned int fsr_rm = 0xc0000000u;
  68 
  69 /*
  70  * fma for SPARC: 64-bit double precision, big-endian
  71  */
  72 double
  73 __fma(double x, double y, double z) {
  74         union {
  75                 unsigned i[2];
  76                 double d;
  77         } xx, yy, zz;
  78         double xhi, yhi, xlo, ylo, t;
  79         unsigned xy0, xy1, xy2, xy3, z0, z1, z2, z3, rm, sticky;
  80         unsigned int fsr;
  81         int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
  82         volatile double dummy;
  83 
  84         /* extract the high order words of the arguments */
  85         xx.d = x;
  86         yy.d = y;
  87         zz.d = z;
  88         hx = xx.i[0] & ~0x80000000;
  89         hy = yy.i[0] & ~0x80000000;
  90         hz = zz.i[0] & ~0x80000000;
  91 
  92         /* dispense with inf, nan, and zero cases */
  93         if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
  94                 (hy | yy.i[1]) == 0)    /* x or y is inf, nan, or zero */
  95                 return (x * y + z);
  96 
  97         if (hz >= 0x7ff00000)        /* z is inf or nan */
  98                 return (x + z); /* avoid spurious under/overflow in x * y */
  99 
 100         if ((hz | zz.i[1]) == 0)        /* z is zero */
 101                 /*
 102                  * x * y isn't zero but could underflow to zero,
 103                  * so don't add z, lest we perturb the sign
 104                  */
 105                 return (x * y);
 106 
 107         /*
 108          * now x, y, and z are all finite and nonzero; save the fsr and
 109          * set round-to-negative-infinity mode (and clear nonstandard
 110          * mode before we try to scale subnormal operands)
 111          */
 112         __fenv_getfsr32(&fsr);
 113         __fenv_setfsr32(&fsr_rm);
 114 
 115         /* extract signs and exponents, and normalize subnormals */
 116         sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
 117         sz = zz.i[0] & 0x80000000;
 118         ex = hx >> 20;
 119         if (!ex) {
 120                 xx.d = x * two52;
 121                 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
 122         }
 123         ey = hy >> 20;
 124         if (!ey) {
 125                 yy.d = y * two52;
 126                 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
 127         }
 128         ez = hz >> 20;
 129         if (!ez) {
 130                 zz.d = z * two52;
 131                 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
 132         }
 133 
 134         /* multiply x*y to 106 bits */
 135         exy = ex + ey - 0x3ff;
 136         xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
 137         yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
 138         x = xx.d;
 139         y = yy.d;
 140         xhi = ((x + twom26) + two27) - two27;
 141         yhi = ((y + twom26) + two27) - two27;
 142         xlo = x - xhi;
 143         ylo = y - yhi;
 144         x *= y;
 145         y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
 146         if (x >= two) {
 147                 x *= half;
 148                 y *= half;
 149                 exy++;
 150         }
 151 
 152         /* extract the significands */
 153         xx.d = x;
 154         xy0 = (xx.i[0] & 0xfffff) | 0x100000;
 155         xy1 = xx.i[1];
 156         yy.d = t = y + twom32;
 157         xy2 = yy.i[1];
 158         yy.d = (y - (t - twom32)) + twom64;
 159         xy3 = yy.i[1];
 160         z0 = (zz.i[0] & 0xfffff) | 0x100000;
 161         z1 = zz.i[1];
 162         z2 = z3 = 0;
 163 
 164         /*
 165          * now x*y is represented by sxy, exy, and xy[0-3], and z is
 166          * represented likewise; swap if need be so |xy| <= |z|
 167          */
 168         if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
 169                 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
 170                 e = sxy; sxy = sz; sz = e;
 171                 e = exy; exy = ez; ez = e;
 172                 e = xy0; xy0 = z0; z0 = e;
 173                 e = xy1; xy1 = z1; z1 = e;
 174                 z2 = xy2; xy2 = 0;
 175                 z3 = xy3; xy3 = 0;
 176         }
 177 
 178         /* shift the significand of xy keeping a sticky bit */
 179         e = ez - exy;
 180         if (e > 116) {
 181                 xy0 = xy1 = xy2 = 0;
 182                 xy3 = 1;
 183         } else if (e >= 96) {
 184                 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
 185                 xy3 = xy0 >> (e - 96);
 186                 if (sticky)
 187                         xy3 |= 1;
 188                 xy0 = xy1 = xy2 = 0;
 189         } else if (e >= 64) {
 190                 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
 191                 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
 192                 if (sticky)
 193                         xy3 |= 1;
 194                 xy2 = xy0 >> (e - 64);
 195                 xy0 = xy1 = 0;
 196         } else if (e >= 32) {
 197                 sticky = xy3 | ((xy2 << 1) << (63 - e));
 198                 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
 199                 if (sticky)
 200                         xy3 |= 1;
 201                 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
 202                 xy1 = xy0 >> (e - 32);
 203                 xy0 = 0;
 204         } else if (e) {
 205                 sticky = (xy3 << 1) << (31 - e);
 206                 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
 207                 if (sticky)
 208                         xy3 |= 1;
 209                 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
 210                 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
 211                 xy0 >>= e;
 212         }
 213 
 214         /* if this is a magnitude subtract, negate the significand of xy */
 215         if (sxy ^ sz) {
 216                 xy0 = ~xy0;
 217                 xy1 = ~xy1;
 218                 xy2 = ~xy2;
 219                 xy3 = -xy3;
 220                 if (xy3 == 0)
 221                         if (++xy2 == 0)
 222                                 if (++xy1 == 0)
 223                                         xy0++;
 224         }
 225 
 226         /* add, propagating carries */
 227         z3 += xy3;
 228         e = (z3 < xy3);
 229         z2 += xy2;
 230         if (e) {
 231                 z2++;
 232                 e = (z2 <= xy2);
 233         } else
 234                 e = (z2 < xy2);
 235         z1 += xy1;
 236         if (e) {
 237                 z1++;
 238                 e = (z1 <= xy1);
 239         } else
 240                 e = (z1 < xy1);
 241         z0 += xy0;
 242         if (e)
 243                 z0++;
 244 
 245         /* postnormalize and collect rounding information into z2 */
 246         if (ez < 1) {
 247                 /* result is tiny; shift right until exponent is within range */
 248                 e = 1 - ez;
 249                 if (e > 56) {
 250                         z2 = 1; /* result can't be exactly zero */
 251                         z0 = z1 = 0;
 252                 } else if (e >= 32) {
 253                         sticky = z3 | z2 | ((z1 << 1) << (63 - e));
 254                         z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
 255                         if (sticky)
 256                                 z2 |= 1;
 257                         z1 = z0 >> (e - 32);
 258                         z0 = 0;
 259                 } else {
 260                         sticky = z3 | (z2 << 1) << (31 - e);
 261                         z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
 262                         if (sticky)
 263                                 z2 |= 1;
 264                         z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
 265                         z0 >>= e;
 266                 }
 267                 ez = 1;
 268         } else if (z0 >= 0x200000) {
 269                 /* carry out; shift right by one */
 270                 sticky = (z2 & 1) | z3;
 271                 z2 = (z2 >> 1) | (z1 << 31);
 272                 if (sticky)
 273                         z2 |= 1;
 274                 z1 = (z1 >> 1) | (z0 << 31);
 275                 z0 >>= 1;
 276                 ez++;
 277         } else {
 278                 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
 279                         /*
 280                          * borrow/cancellation; shift left as much as
 281                          * exponent allows
 282                          */
 283                         while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
 284                                 z0 = z1;
 285                                 z1 = z2;
 286                                 z2 = z3;
 287                                 z3 = 0;
 288                                 ez -= 32;
 289                         }
 290                         while (z0 < 0x100000 && ez > 1) {
 291                                 z0 = (z0 << 1) | (z1 >> 31);
 292                                 z1 = (z1 << 1) | (z2 >> 31);
 293                                 z2 = (z2 << 1) | (z3 >> 31);
 294                                 z3 <<= 1;
 295                                 ez--;
 296                         }
 297                 }
 298                 if (z3)
 299                         z2 |= 1;
 300         }
 301 
 302         /* get the rounding mode and clear current exceptions */
 303         rm = fsr >> 30;
 304         fsr &= ~FSR_CEXC;
 305 
 306         /* strip off the integer bit, if there is one */
 307         ibit = z0 & 0x100000;
 308         if (ibit)
 309                 z0 -= 0x100000;
 310         else {
 311                 ez = 0;
 312                 if (!(z0 | z1 | z2)) { /* exact zero */
 313                         zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
 314                         zz.i[1] = 0;
 315                         __fenv_setfsr32(&fsr);
 316                         return (zz.d);
 317                 }
 318         }
 319 
 320         /*
 321          * flip the sense of directed roundings if the result is negative;
 322          * the logic below applies to a positive result
 323          */
 324         if (sz)
 325                 rm ^= rm >> 1;
 326 
 327         /* round and raise exceptions */
 328         if (z2) {
 329                 fsr |= FSR_NXC;
 330 
 331                 /* decide whether to round the fraction up */
 332                 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
 333                         (z2 == 0x80000000u && (z1 & 1))))) {
 334                         /* round up and renormalize if necessary */
 335                         if (++z1 == 0) {
 336                                 if (++z0 == 0x100000) {
 337                                         z0 = 0;
 338                                         ez++;
 339                                 }
 340                         }
 341                 }
 342         }
 343 
 344         /* check for under/overflow */
 345         if (ez >= 0x7ff) {
 346                 if (rm == FSR_RN || rm == FSR_RP) {
 347                         zz.i[0] = sz | 0x7ff00000;
 348                         zz.i[1] = 0;
 349                 } else {
 350                         zz.i[0] = sz | 0x7fefffff;
 351                         zz.i[1] = 0xffffffff;
 352                 }
 353                 fsr |= FSR_OFC | FSR_NXC;
 354         } else {
 355                 zz.i[0] = sz | (ez << 20) | z0;
 356                 zz.i[1] = z1;
 357 
 358                 /*
 359                  * !ibit => exact result was tiny before rounding,
 360                  * z2 nonzero => result delivered is inexact
 361                  */
 362                 if (!ibit) {
 363                         if (z2)
 364                                 fsr |= FSR_UFC | FSR_NXC;
 365                         else if (fsr & FSR_UFM)
 366                                 fsr |= FSR_UFC;
 367                 }
 368         }
 369 
 370         /* restore the fsr and emulate exceptions as needed */
 371         if ((fsr & FSR_CEXC) & (fsr >> 23)) {
 372                 __fenv_setfsr32(&fsr);
 373                 if (fsr & FSR_OFC) {
 374                         dummy = huge;
 375                         dummy *= huge;
 376                 } else if (fsr & FSR_UFC) {
 377                         dummy = tiny;
 378                         if (fsr & FSR_NXC)
 379                                 dummy *= tiny;
 380                         else
 381                                 dummy -= tiny2;
 382                 } else {
 383                         dummy = huge;
 384                         dummy += tiny;
 385                 }
 386         } else {
 387                 fsr |= (fsr & 0x1f) << 5;
 388                 __fenv_setfsr32(&fsr);
 389         }
 390         return (zz.d);
 391 }
 392 
 393 #elif defined(__x86)
 394 
 395 #if defined(__amd64)
 396 #define NI      4
 397 #else
 398 #define NI      3
 399 #endif
 400 
 401 /*
 402  *  fma for x86: 64-bit double precision, little-endian
 403  */
 404 double
 405 __fma(double x, double y, double z) {
 406         union {
 407                 unsigned i[NI];
 408                 long double e;
 409         } xx, yy, zz;
 410         long double xe, ye, xhi, xlo, yhi, ylo;
 411         int ex, ey, ez;
 412         unsigned cwsw, oldcwsw, rm;
 413 
 414         /* convert the operands to double extended */
 415         xx.e = (long double) x;
 416         yy.e = (long double) y;
 417         zz.e = (long double) z;
 418 
 419         /* extract the exponents of the arguments */
 420         ex = xx.i[2] & 0x7fff;
 421         ey = yy.i[2] & 0x7fff;
 422         ez = zz.i[2] & 0x7fff;
 423 
 424         /* dispense with inf, nan, and zero cases */
 425         if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
 426                 /* x or y is inf, nan, or zero */
 427                 return ((double) (xx.e * yy.e + zz.e));
 428 
 429         if (ez >= 0x7fff) /* z is inf or nan */
 430                 return ((double) (xx.e + zz.e));
 431                                         /* avoid spurious inexact in x * y */
 432 
 433         /*
 434          * save the control and status words, mask all exceptions, and
 435          * set rounding to 64-bit precision and to-nearest
 436          */
 437         __fenv_getcwsw(&oldcwsw);
 438         cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
 439         __fenv_setcwsw(&cwsw);
 440 
 441         /* multiply x*y to 106 bits */
 442         xe = xx.e;
 443         xx.i[0] = 0;
 444         xhi = xx.e; /* hi 32 bits */
 445         xlo = xe - xhi; /* lo 21 bits */
 446         ye = yy.e;
 447         yy.i[0] = 0;
 448         yhi = yy.e;
 449         ylo = ye - yhi;
 450         xe = xe * ye;
 451         ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
 452 
 453         /* distill the sum of xe, ye, and z */
 454         xhi = ye + zz.e;
 455         yhi = xhi - ye;
 456         xlo = (zz.e - yhi) + (ye - (xhi - yhi));
 457                                                 /* now (xhi,xlo) = ye + z */
 458 
 459         yhi = xe + xhi;
 460         ye = yhi - xe;
 461         ylo = (xhi - ye) + (xe - (yhi - ye));   /* now (yhi,ylo) = xe + xhi */
 462 
 463         xhi = xlo + ylo;
 464         xe = xhi - xlo;
 465         xlo = (ylo - xe) + (xlo - (xhi - xe));  /* now (xhi,xlo) = xlo + ylo */
 466 
 467         yy.e = yhi + xhi;
 468         ylo = (yhi - yy.e) + xhi;               /* now (yy.e,ylo) = xhi + yhi */
 469 
 470         if (yy.i[1] != 0) {     /* yy.e is nonzero */
 471                 /* perturb yy.e if its least significant 10 bits are zero */
 472                 if (!(yy.i[0] & 0x3ff)) {
 473                         xx.e = ylo + xlo;
 474                         if (xx.i[1] != 0) {
 475                                 xx.i[2] = (xx.i[2] & 0x8000) |
 476                                         ((yy.i[2] & 0x7fff) - 63);
 477                                 xx.i[1] = 0x80000000;
 478                                 xx.i[0] = 0;
 479                                 yy.e += xx.e;
 480                         }
 481                 }
 482         } else {
 483                 /* set sign of zero result according to rounding direction */
 484                 rm = oldcwsw & 0x0c000000;
 485                 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
 486         }
 487 
 488         /*
 489          * restore the control and status words and convert the result
 490          * to double
 491          */
 492         __fenv_setcwsw(&oldcwsw);
 493         return ((double) yy.e);
 494 }
 495 
 496 #if 0
 497 /*
 498  * another fma for x86: assumes return value will be left in
 499  * long double (80-bit double extended) precision
 500  */
 501 long double
 502 __fma(double x, double y, double z) {
 503         union {
 504                 unsigned i[3];
 505                 long double e;
 506         } xx, yy, zz, tt;
 507         long double xe, ye, xhi, xlo, yhi, ylo, zhi, zlo;
 508         int ex, ey, ez;
 509         unsigned cwsw, oldcwsw, s;
 510 
 511         /* convert the operands to double extended */
 512         xx.e = (long double) x;
 513         yy.e = (long double) y;
 514         zz.e = (long double) z;
 515 
 516         /* extract the exponents of the arguments */
 517         ex = xx.i[2] & 0x7fff;
 518         ey = yy.i[2] & 0x7fff;
 519         ez = zz.i[2] & 0x7fff;
 520 
 521         /* dispense with inf, nan, and zero cases */
 522         if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
 523                 /* x or y is inf, nan, or zero */
 524                 return (xx.e * yy.e + zz.e);
 525 
 526         if (ez >= 0x7fff) /* z is inf or nan */
 527                 return (xx.e + zz.e);   /* avoid spurious inexact in x * y */
 528 
 529         if (ez == 0) /* z is zero */
 530                 return (xx.e * yy.e);   /* x * y isn't zero; no need to add z */
 531 
 532         /*
 533          * save the control and status words, mask all exceptions, and
 534          * set rounding to 64-bit precision and to-nearest
 535          */
 536         __fenv_getcwsw(&oldcwsw);
 537         cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
 538         __fenv_setcwsw(&cwsw);
 539 
 540         /* multiply x*y to 106 bits */
 541         xe = xx.e;
 542         xx.i[0] = 0;
 543         xhi = xx.e; /* hi 32 bits */
 544         xlo = xe - xhi; /* lo 21 bits */
 545         ye = yy.e;
 546         yy.i[0] = 0;
 547         yhi = yy.e;
 548         ylo = ye - yhi;
 549         xx.e = xe * ye;
 550         xx.i[0] &= ~0x7ff; /* 53 bits of x*y */
 551         yy.e = ((xhi * yhi - xx.e) + xhi * ylo + xlo * yhi) + xlo * ylo;
 552 
 553         /* reduce to a sum of two terms */
 554         if (yy.e != 0.0) {
 555                 ex = xx.i[2] & 0x7fff;
 556                 if (ez - ex > 10) {
 557                         /* collapse y into a single bit and add to x */
 558                         yy.i[0] = 0;
 559                         yy.i[1] = 0x80000000;
 560                         yy.i[2] = (yy.i[2] & 0x8000) | (ex - 60);
 561                         xx.e += yy.e;
 562                 } else if (ex - ez <= 10) {
 563                         xx.e += zz.e; /* exact */
 564                         zz.e = yy.e;
 565                 } else if (ex - ez <= 42) {
 566                         /* split z into two pieces */
 567                         tt.i[0] = 0;
 568                         tt.i[1] = 0x80000000;
 569                         tt.i[2] = ex + 11;
 570                         zhi = (zz.e + tt.e) - tt.e;
 571                         zlo = zz.e - zhi;
 572                         xx.e += zhi;
 573                         zz.e = yy.e + zlo;
 574                 } else if (ex - ez <= 63) {
 575                         zz.e += yy.e; /* exact */
 576                 } else if (ex - ez <= 106) {
 577                         /*
 578                          * collapse the tail of z into a sticky bit and add z
 579                          * to y without error
 580                          */
 581                         if (ex - ez <= 81) {
 582                                 s = 1 << (ex - ez - 50);
 583                                 if (zz.i[0] & (s - 1))
 584                                         zz.i[0] |= s;
 585                                 zz.i[0] &= ~(s - 1);
 586                         } else {
 587                                 s = 1 << (ex - ez - 82);
 588                                 if ((zz.i[1] & (s - 1)) | zz.i[0])
 589                                         zz.i[1] |= s;
 590                                 zz.i[1] &= ~(s - 1);
 591                                 zz.i[0] = 0;
 592                         }
 593                         zz.e += yy.e;
 594                 } else {
 595                         /* collapse z into a single bit and add to y */
 596                         zz.i[0] = 0;
 597                         zz.i[1] = 0x80000000;
 598                         zz.i[2] = (zz.i[2] & 0x8000) | (ex - 113);
 599                         zz.e += yy.e;
 600                 }
 601         }
 602 
 603         /* restore the control and status words, and sum */
 604         __fenv_setcwsw(&oldcwsw);
 605         return (xx.e + zz.e);
 606 }
 607 #endif
 608 
 609 #else
 610 #error Unknown architecture
 611 #endif