il_11210 Sdiff usr/src/lib/libm/common/m9x/fma.c

Print this page

11210 libm should be cstyle(1ONBLD) clean

   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */

  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #pragma weak fma = __fma
  31 
  32 #include "libm.h"
  33 #include "fma.h"
  34 #include "fenv_inlines.h"
  35 
  36 #if defined(__sparc)
  37 
  38 static const union {
             /*
              * Each entry is written as two 32-bit words and read back as a
              * double through the macros below; i[0] is the word holding the
              * sign and exponent (this branch is big-endian SPARC only).
              */
  39         unsigned i[2];
  40         double d;
  41 } C[] = {
  42         { 0x3fe00000u, 0 },     /* 2^-1 */
  43         { 0x40000000u, 0 },     /* 2^1 */
  44         { 0x43300000u, 0 },     /* 2^52 */
  45         { 0x41a00000u, 0 },     /* 2^27 */
  46         { 0x3e500000u, 0 },     /* 2^-26 */
  47         { 0x3df00000u, 0 },     /* 2^-32 */
  48         { 0x3bf00000u, 0 },     /* 2^-64 */
  49         { 0x7fe00000u, 0 },     /* 2^1023 */
  50         { 0x00100000u, 0 },     /* 2^-1022, the smallest normal */
  51         { 0x00100001u, 0 }      /* 2^-1022 * (1 + 2^-20) */
  52 };
  53 
     /* names for the table entries; the index must match the row above */
  54 #define half    C[0].d
  55 #define two     C[1].d
  56 #define two52   C[2].d
  57 #define two27   C[3].d
  58 #define twom26  C[4].d
  59 #define twom32  C[5].d
  60 #define twom64  C[6].d
  61 #define huge    C[7].d
  62 #define tiny    C[8].d
  63 #define tiny2   C[9].d
  64 
     /*
      * fsr image installed by __fma while it does its own arithmetic; per
      * the comment in __fma it selects round-to-negative-infinity and
      * leaves nonstandard mode off (only the top two bits are set).
      */
  65 static const unsigned int fsr_rm = 0xc0000000u;
  66 
  67 /*
  68  * fma for SPARC: 64-bit double precision, big-endian
      *
      * Returns x * y + z.  When x or y is inf, nan or zero, or z is inf,
      * nan or zero, the result comes from ordinary floating-point
      * arithmetic.  Otherwise the product is formed to 106 bits, z is
      * added to it in four 32-bit integer words, and the sum is rounded
      * once using the rounding direction found in the caller's fsr.
      * Exception flags are written into the saved fsr image by hand; when
      * a flag raised here matches the mask bits 23 positions higher in the
      * fsr (presumably the trap-enable field), a dummy operation is
      * executed instead so that the hardware raises the exception.
  69  */
  70 double
  71 __fma(double x, double y, double z) {

  72         union {
  73                 unsigned i[2];
  74                 double d;
  75         } xx, yy, zz;

  76         double xhi, yhi, xlo, ylo, t;
  77         unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
  78         int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
             /*
              * volatile: presumably keeps the dummy operations at the end of
              * the function from being optimized away
              */
  79         volatile double dummy;
  80 
  81         /* extract the high order words of the arguments */
  82         xx.d = x;
  83         yy.d = y;
  84         zz.d = z;
  85         hx = xx.i[0] & ~0x80000000;
  86         hy = yy.i[0] & ~0x80000000;
  87         hz = zz.i[0] & ~0x80000000;
  88 
  89         /* dispense with inf, nan, and zero cases */
  90         if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
  91                 (hy | yy.i[1]) == 0)    /* x or y is inf, nan, or zero */
  92                 return (x * y + z);
  93 
  94         if (hz >= 0x7ff00000)        /* z is inf or nan */
  95                 return (x + z); /* avoid spurious under/overflow in x * y */
  96 
  97         if ((hz | zz.i[1]) == 0)        /* z is zero */
  98                 /*
  99                  * x * y isn't zero but could underflow to zero,
 100                  * so don't add z, lest we perturb the sign
 101                  */
 102                 return (x * y);
 103 
 104         /*
 105          * now x, y, and z are all finite and nonzero; save the fsr and
 106          * set round-to-negative-infinity mode (and clear nonstandard
 107          * mode before we try to scale subnormal operands)
 108          */
 109         __fenv_getfsr32(&fsr);
 110         __fenv_setfsr32(&fsr_rm);
 111 
 112         /* extract signs and exponents, and normalize subnormals */
 113         sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
 114         sz = zz.i[0] & 0x80000000;
 115         ex = hx >> 20;

 116         if (!ex) {
 117                 xx.d = x * two52;
 118                 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
 119         }

 120         ey = hy >> 20;

 121         if (!ey) {
 122                 yy.d = y * two52;
 123                 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
 124         }

 125         ez = hz >> 20;

 126         if (!ez) {
 127                 zz.d = z * two52;
 128                 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
 129         }
 130 
 131         /* multiply x*y to 106 bits */
 132         exy = ex + ey - 0x3ff;
 133         xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
 134         yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
 135         x = xx.d;
 136         y = yy.d;
             /*
              * Split each significand (now scaled into [1,2)) into a short
              * leading part and a remainder: adding and then subtracting
              * two27 drops the bits below 2^-25, and twom26 is half of that
              * unit, so the truncating rounding mode set above behaves like
              * round-to-nearest for this step.  After x *= y, the partial
              * products are combined into y, the low-order half of the
              * product.
              */
 137         xhi = ((x + twom26) + two27) - two27;
 138         yhi = ((y + twom26) + two27) - two27;
 139         xlo = x - xhi;
 140         ylo = y - yhi;
 141         x *= y;
 142         y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;

 143         if (x >= two) {
 144                 x *= half;
 145                 y *= half;
 146                 exy++;
 147         }
 148 
 149         /* extract the significands */
             /*
              * xy0/xy1 are taken directly from the words of the high-order
              * half.  The low-order half is added to twom32, and what that
              * sum did not capture is added to twom64, so that successive
              * 32-bit slices of it land in the low word of a double, where
              * they are read out as xy2 and xy3.
              */
 150         xx.d = x;
 151         xy0 = (xx.i[0] & 0xfffff) | 0x100000;
 152         xy1 = xx.i[1];
 153         yy.d = t = y + twom32;
 154         xy2 = yy.i[1];
 155         yy.d = (y - (t - twom32)) + twom64;
 156         xy3 = yy.i[1];
 157         z0 = (zz.i[0] & 0xfffff) | 0x100000;
 158         z1 = zz.i[1];
 159         z2 = z3 = 0;
 160 
 161         /*
 162          * now x*y is represented by sxy, exy, and xy[0-3], and z is
 163          * represented likewise; swap if need be so |xy| <= |z|
 164          */
 165         if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
 166                 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
 167                 e = sxy; sxy = sz; sz = e;
 168                 e = exy; exy = ez; ez = e;
 169                 e = xy0; xy0 = z0; z0 = e;
 170                 e = xy1; xy1 = z1; z1 = e;
 171                 z2 = xy2; xy2 = 0;
 172                 z3 = xy3; xy3 = 0;










 173         }
 174 
 175         /* shift the significand of xy keeping a sticky bit */
 176         e = ez - exy;

 177         if (e > 116) {
 178                 xy0 = xy1 = xy2 = 0;
 179                 xy3 = 1;
 180         } else if (e >= 96) {
 181                 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
 182                 xy3 = xy0 >> (e - 96);

 183                 if (sticky)
 184                         xy3 |= 1;

 185                 xy0 = xy1 = xy2 = 0;
 186         } else if (e >= 64) {
 187                 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
 188                 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));

 189                 if (sticky)
 190                         xy3 |= 1;

 191                 xy2 = xy0 >> (e - 64);
 192                 xy0 = xy1 = 0;
 193         } else if (e >= 32) {
 194                 sticky = xy3 | ((xy2 << 1) << (63 - e));
 195                 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));

 196                 if (sticky)
 197                         xy3 |= 1;

 198                 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
 199                 xy1 = xy0 >> (e - 32);
 200                 xy0 = 0;
 201         } else if (e) {
 202                 sticky = (xy3 << 1) << (31 - e);
 203                 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));

 204                 if (sticky)
 205                         xy3 |= 1;

 206                 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
 207                 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
 208                 xy0 >>= e;
 209         }
 210 
 211         /* if this is a magnitude subtract, negate the significand of xy */
 212         if (sxy ^ sz) {
 213                 xy0 = ~xy0;
 214                 xy1 = ~xy1;
 215                 xy2 = ~xy2;
 216                 xy3 = -xy3;

 217                 if (xy3 == 0)
 218                         if (++xy2 == 0)
 219                                 if (++xy1 == 0)
 220                                         xy0++;
 221         }
 222 
 223         /* add, propagating carries */
 224         z3 += xy3;
 225         e = (z3 < xy3);
 226         z2 += xy2;

 227         if (e) {
 228                 z2++;
 229                 e = (z2 <= xy2);
 230         } else
 231                 e = (z2 < xy2);


 232         z1 += xy1;

 233         if (e) {
 234                 z1++;
 235                 e = (z1 <= xy1);
 236         } else
 237                 e = (z1 < xy1);


 238         z0 += xy0;

 239         if (e)
 240                 z0++;
 241 
 242         /* postnormalize and collect rounding information into z2 */
 243         if (ez < 1) {
 244                 /* result is tiny; shift right until exponent is within range */
 245                 e = 1 - ez;

 246                 if (e > 56) {
 247                         z2 = 1; /* result can't be exactly zero */
 248                         z0 = z1 = 0;
 249                 } else if (e >= 32) {
 250                         sticky = z3 | z2 | ((z1 << 1) << (63 - e));
 251                         z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));

 252                         if (sticky)
 253                                 z2 |= 1;

 254                         z1 = z0 >> (e - 32);
 255                         z0 = 0;
 256                 } else {
 257                         sticky = z3 | (z2 << 1) << (31 - e);
 258                         z2 = (z2 >> e) | ((z1 << 1) << (31 - e));

 259                         if (sticky)
 260                                 z2 |= 1;

 261                         z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
 262                         z0 >>= e;
 263                 }

 264                 ez = 1;
 265         } else if (z0 >= 0x200000) {
 266                 /* carry out; shift right by one */
 267                 sticky = (z2 & 1) | z3;
 268                 z2 = (z2 >> 1) | (z1 << 31);

 269                 if (sticky)
 270                         z2 |= 1;

 271                 z1 = (z1 >> 1) | (z0 << 31);
 272                 z0 >>= 1;
 273                 ez++;
 274         } else {
 275                 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
 276                         /*
 277                          * borrow/cancellation; shift left as much as
 278                          * exponent allows
 279                          */
 280                         while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
 281                                 z0 = z1;
 282                                 z1 = z2;
 283                                 z2 = z3;
 284                                 z3 = 0;
 285                                 ez -= 32;
 286                         }

 287                         while (z0 < 0x100000 && ez > 1) {
 288                                 z0 = (z0 << 1) | (z1 >> 31);
 289                                 z1 = (z1 << 1) | (z2 >> 31);
 290                                 z2 = (z2 << 1) | (z3 >> 31);
 291                                 z3 <<= 1;
 292                                 ez--;
 293                         }
 294                 }

 295                 if (z3)
 296                         z2 |= 1;
 297         }
 298 
 299         /* get the rounding mode and clear current exceptions */
 300         rm = fsr >> 30;
 301         fsr &= ~FSR_CEXC;
 302 
 303         /* strip off the integer bit, if there is one */
 304         ibit = z0 & 0x100000;
 305         if (ibit)

 306                 z0 -= 0x100000;
 307         else {
 308                 ez = 0;

 309                 if (!(z0 | z1 | z2)) { /* exact zero */
 310                         zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
 311                         zz.i[1] = 0;
 312                         __fenv_setfsr32(&fsr);
 313                         return (zz.d);
 314                 }
 315         }
 316 
 317         /*
 318          * flip the sense of directed roundings if the result is negative;
 319          * the logic below applies to a positive result
 320          */
 321         if (sz)
 322                 rm ^= rm >> 1;
 323 
 324         /* round and raise exceptions */
 325         if (z2) {
 326                 fsr |= FSR_NXC;
 327 
 328                 /* decide whether to round the fraction up */
 329                 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
 330                         (z2 == 0x80000000u && (z1 & 1))))) {
 331                         /* round up and renormalize if necessary */
 332                         if (++z1 == 0) {
 333                                 if (++z0 == 0x100000) {
 334                                         z0 = 0;
 335                                         ez++;
 336                                 }
 337                         }
 338                 }
 339         }
 340 
 341         /* check for under/overflow */
 342         if (ez >= 0x7ff) {
 343                 if (rm == FSR_RN || rm == FSR_RP) {
 344                         zz.i[0] = sz | 0x7ff00000;
 345                         zz.i[1] = 0;
 346                 } else {
 347                         zz.i[0] = sz | 0x7fefffff;
 348                         zz.i[1] = 0xffffffff;
 349                 }

 350                 fsr |= FSR_OFC | FSR_NXC;
 351         } else {
 352                 zz.i[0] = sz | (ez << 20) | z0;
 353                 zz.i[1] = z1;
 354 
 355                 /*
 356                  * !ibit => exact result was tiny before rounding,
 357                  * z2 nonzero => result delivered is inexact
 358                  */
 359                 if (!ibit) {
 360                         if (z2)
 361                                 fsr |= FSR_UFC | FSR_NXC;
 362                         else if (fsr & FSR_UFM)
 363                                 fsr |= FSR_UFC;
 364                 }
 365         }
 366 
 367         /* restore the fsr and emulate exceptions as needed */
 368         if ((fsr & FSR_CEXC) & (fsr >> 23)) {
 369                 __fenv_setfsr32(&fsr);

 370                 if (fsr & FSR_OFC) {
 371                         dummy = huge;
 372                         dummy *= huge;
 373                 } else if (fsr & FSR_UFC) {
 374                         dummy = tiny;

 375                         if (fsr & FSR_NXC)
 376                                 dummy *= tiny;
 377                         else
 378                                 dummy -= tiny2;
 379                 } else {
 380                         dummy = huge;
 381                         dummy += tiny;
 382                 }
 383         } else {
                     /*
                      * nothing to emulate: OR the low five bits of the fsr
                      * image (where the flags set above presumably live)
                      * into the field five bits higher -- presumably the
                      * accrued-exception bits -- and install the result
                      */
 384                 fsr |= (fsr & 0x1f) << 5;
 385                 __fenv_setfsr32(&fsr);
 386         }

 387         return (zz.d);
 388 }
 389 
 390 #elif defined(__x86)
 391 
 392 #if defined(__amd64)
 393 #define NI      4
 394 #else
 395 #define NI      3
 396 #endif
 397 
 398 /*
 399  *  fma for x86: 64-bit double precision, little-endian
      *
      *  Returns x * y + z rounded to double.  The operands are widened to
      *  long double.  If x or y is inf, nan or zero, or z is inf or nan, the
      *  result is computed with ordinary long double arithmetic.  Otherwise
      *  the control/status word is saved, exceptions are masked and rounding
      *  is set to 64-bit precision, to-nearest; the product is formed as a
      *  high part xe plus a low part ye, the three terms xe, ye and z are
      *  summed with error compensation, and the final conversion to double
      *  is done after the caller's control/status word has been restored.
 400  */
 401 double
 402 __fma(double x, double y, double z) {

 403         union {
 404                 unsigned i[NI];
 405                 long double e;
 406         } xx, yy, zz;

 407         long double xe, ye, xhi, xlo, yhi, ylo;
 408         int ex, ey, ez;
 409         unsigned cwsw, oldcwsw, rm;
 410 
 411         /* convert the operands to double extended */
 412         xx.e = (long double) x;
 413         yy.e = (long double) y;
 414         zz.e = (long double) z;
 415 
 416         /* extract the exponents of the arguments */
 417         ex = xx.i[2] & 0x7fff;
 418         ey = yy.i[2] & 0x7fff;
 419         ez = zz.i[2] & 0x7fff;
 420 
 421         /* dispense with inf, nan, and zero cases */
 422         if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
 423                 /* x or y is inf, nan, or zero */
 424                 return ((double) (xx.e * yy.e + zz.e));
 425 
 426         if (ez >= 0x7fff) /* z is inf or nan */
 427                 return ((double) (xx.e + zz.e));

 428                                         /* avoid spurious inexact in x * y */
 429 
 430         /*
 431          * save the control and status words, mask all exceptions, and
 432          * set rounding to 64-bit precision and to-nearest
 433          */
 434         __fenv_getcwsw(&oldcwsw);
 435         cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
 436         __fenv_setcwsw(&cwsw);
 437 
 438         /* multiply x*y to 106 bits */
 439         xe = xx.e;
 440         xx.i[0] = 0;
 441         xhi = xx.e; /* hi 32 bits */
 442         xlo = xe - xhi; /* lo 21 bits */
 443         ye = yy.e;
 444         yy.i[0] = 0;
 445         yhi = yy.e;
 446         ylo = ye - yhi;
 447         xe = xe * ye;
             /*
              * ye becomes the low-order half of the product: what the
              * rounded xe left out, rebuilt from the partial products of
              * the split halves.  Without this step the split above is
              * unused and ye would still hold y itself.
              */
 448         ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
 449 
 450         /* distill the sum of xe, ye, and z */
 451         xhi = ye + zz.e;
 452         yhi = xhi - ye;
 453         xlo = (zz.e - yhi) + (ye - (xhi - yhi));
 454                                                 /* now (xhi,xlo) = ye + z */
 455 
 456         yhi = xe + xhi;
 457         ye = yhi - xe;
 458         ylo = (xhi - ye) + (xe - (yhi - ye));   /* now (yhi,ylo) = xe + xhi */
 459 
 460         xhi = xlo + ylo;
 461         xe = xhi - xlo;
 462         xlo = (ylo - xe) + (xlo - (xhi - xe));  /* now (xhi,xlo) = xlo + ylo */
 463 
 464         yy.e = yhi + xhi;
 465         ylo = (yhi - yy.e) + xhi;               /* now (yy.e,ylo) = xhi + yhi */
 466 
 467         if (yy.i[1] != 0) {     /* yy.e is nonzero */
 468                 /* perturb yy.e if its least significant 10 bits are zero */
 469                 if (!(yy.i[0] & 0x3ff)) {
 470                         xx.e = ylo + xlo;

 471                         if (xx.i[1] != 0) {
                                     /*
                                      * replace the residual by a value with
                                      * its sign, a single significand bit and
                                      * an exponent 63 below that of yy.e, then
                                      * add it in so the later rounding to
                                      * double sees that something remains
                                      */
 472                                 xx.i[2] = (xx.i[2] & 0x8000) |
 473                                         ((yy.i[2] & 0x7fff) - 63);
 474                                 xx.i[1] = 0x80000000;
 475                                 xx.i[0] = 0;
 476                                 yy.e += xx.e;
 477                         }
 478                 }
 479         } else {
 480                 /* set sign of zero result according to rounding direction */
 481                 rm = oldcwsw & 0x0c000000;
 482                 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
 483         }
 484 
 485         /*
 486          * restore the control and status words and convert the result
 487          * to double
 488          */
 489         __fenv_setcwsw(&oldcwsw);
 490         return ((double) yy.e);
 491 }
 492 
 493 #else
 494 #error Unknown architecture
 495 #endif

   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 
  26 /*
  27  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  28  * Use is subject to license terms.
  29  */
  30 
  31 #pragma weak fma = __fma
  32 
  33 #include "libm.h"
  34 #include "fma.h"
  35 #include "fenv_inlines.h"
  36 
  37 #if defined(__sparc)

  38 static const union {
             /*
              * Each entry is written as two 32-bit words and read back as a
              * double through the macros below; i[0] is the word holding the
              * sign and exponent (this branch is big-endian SPARC only).
              */
  39         unsigned i[2];
  40         double d;
  41 } C[] = {
  42         { 0x3fe00000u, 0 },     /* 2^-1 */
  43         { 0x40000000u, 0 },     /* 2^1 */
  44         { 0x43300000u, 0 },     /* 2^52 */
  45         { 0x41a00000u, 0 },     /* 2^27 */
  46         { 0x3e500000u, 0 },     /* 2^-26 */
  47         { 0x3df00000u, 0 },     /* 2^-32 */
  48         { 0x3bf00000u, 0 },     /* 2^-64 */
  49         { 0x7fe00000u, 0 },     /* 2^1023 */
  50         { 0x00100000u, 0 },     /* 2^-1022, the smallest normal */
  51         { 0x00100001u, 0 }      /* 2^-1022 * (1 + 2^-20) */
  52 };
  53 
     /* names for the table entries; the index must match the row above */
  54 #define half            C[0].d
  55 #define two             C[1].d
  56 #define two52           C[2].d
  57 #define two27           C[3].d
  58 #define twom26          C[4].d
  59 #define twom32          C[5].d
  60 #define twom64          C[6].d
  61 #define huge            C[7].d
  62 #define tiny            C[8].d
  63 #define tiny2           C[9].d
  64 
     /*
      * fsr image installed by __fma while it does its own arithmetic; per
      * the comment in __fma it selects round-to-negative-infinity and
      * leaves nonstandard mode off (only the top two bits are set).
      */
  65 static const unsigned int fsr_rm = 0xc0000000u;
  66 
  67 /*
  68  * fma for SPARC: 64-bit double precision, big-endian
      *
      * Returns x * y + z.  When x or y is inf, nan or zero, or z is inf,
      * nan or zero, the result comes from ordinary floating-point
      * arithmetic.  Otherwise the product is formed to 106 bits, z is
      * added to it in four 32-bit integer words, and the sum is rounded
      * once using the rounding direction found in the caller's fsr.
      * Exception flags are written into the saved fsr image by hand; when
      * a flag raised here matches the mask bits 23 positions higher in the
      * fsr (presumably the trap-enable field), a dummy operation is
      * executed instead so that the hardware raises the exception.
  69  */
  70 double
  71 __fma(double x, double y, double z)
  72 {
  73         union {
  74                 unsigned i[2];
  75                 double d;
  76         } xx, yy, zz;
  77 
  78         double xhi, yhi, xlo, ylo, t;
  79         unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
  80         int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
             /*
              * volatile: presumably keeps the dummy operations at the end of
              * the function from being optimized away
              */
  81         volatile double dummy;
  82 
  83         /* extract the high order words of the arguments */
  84         xx.d = x;
  85         yy.d = y;
  86         zz.d = z;
  87         hx = xx.i[0] & ~0x80000000;
  88         hy = yy.i[0] & ~0x80000000;
  89         hz = zz.i[0] & ~0x80000000;
  90 
  91         /* dispense with inf, nan, and zero cases */
  92         if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 || (hy |
  93             yy.i[1]) == 0)              /* x or y is inf, nan, or zero */
  94                 return (x * y + z);
  95 
  96         if (hz >= 0x7ff00000)        /* z is inf or nan */
  97                 return (x + z); /* avoid spurious under/overflow in x * y */
  98 
  99         if ((hz | zz.i[1]) == 0) /* z is zero */
 100                 /*
 101                  * x * y isn't zero but could underflow to zero,
 102                  * so don't add z, lest we perturb the sign
 103                  */
 104                 return (x * y);
 105 
 106         /*
 107          * now x, y, and z are all finite and nonzero; save the fsr and
 108          * set round-to-negative-infinity mode (and clear nonstandard
 109          * mode before we try to scale subnormal operands)
 110          */
 111         __fenv_getfsr32(&fsr);
 112         __fenv_setfsr32(&fsr_rm);
 113 
 114         /* extract signs and exponents, and normalize subnormals */
 115         sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
 116         sz = zz.i[0] & 0x80000000;
 117         ex = hx >> 20;
 118 
 119         if (!ex) {
 120                 xx.d = x * two52;
 121                 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
 122         }
 123 
 124         ey = hy >> 20;
 125 
 126         if (!ey) {
 127                 yy.d = y * two52;
 128                 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
 129         }
 130 
 131         ez = hz >> 20;
 132 
 133         if (!ez) {
 134                 zz.d = z * two52;
 135                 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
 136         }
 137 
 138         /* multiply x*y to 106 bits */
 139         exy = ex + ey - 0x3ff;
 140         xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
 141         yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
 142         x = xx.d;
 143         y = yy.d;
             /*
              * Split each significand (now scaled into [1,2)) into a short
              * leading part and a remainder: adding and then subtracting
              * two27 drops the bits below 2^-25, and twom26 is half of that
              * unit, so the truncating rounding mode set above behaves like
              * round-to-nearest for this step.  After x *= y, the partial
              * products are combined into y, the low-order half of the
              * product.
              */
 144         xhi = ((x + twom26) + two27) - two27;
 145         yhi = ((y + twom26) + two27) - two27;
 146         xlo = x - xhi;
 147         ylo = y - yhi;
 148         x *= y;
 149         y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
 150 
 151         if (x >= two) {
 152                 x *= half;
 153                 y *= half;
 154                 exy++;
 155         }
 156 
 157         /* extract the significands */
             /*
              * xy0/xy1 are taken directly from the words of the high-order
              * half.  The low-order half is added to twom32, and what that
              * sum did not capture is added to twom64, so that successive
              * 32-bit slices of it land in the low word of a double, where
              * they are read out as xy2 and xy3.
              */
 158         xx.d = x;
 159         xy0 = (xx.i[0] & 0xfffff) | 0x100000;
 160         xy1 = xx.i[1];
 161         yy.d = t = y + twom32;
 162         xy2 = yy.i[1];
 163         yy.d = (y - (t - twom32)) + twom64;
 164         xy3 = yy.i[1];
 165         z0 = (zz.i[0] & 0xfffff) | 0x100000;
 166         z1 = zz.i[1];
 167         z2 = z3 = 0;
 168 
 169         /*
 170          * now x*y is represented by sxy, exy, and xy[0-3], and z is
 171          * represented likewise; swap if need be so |xy| <= |z|
 172          */
 173         if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && (xy1 > z1 ||
 174             (xy1 == z1 && (xy2 | xy3) != 0)))))) {
 175                 e = sxy;
 176                 sxy = sz;
 177                 sz = e;
 178                 e = exy;
 179                 exy = ez;
 180                 ez = e;
 181                 e = xy0;
 182                 xy0 = z0;
 183                 z0 = e;
 184                 e = xy1;
 185                 xy1 = z1;
 186                 z1 = e;
 187                 z2 = xy2;
 188                 xy2 = 0;
 189                 z3 = xy3;
 190                 xy3 = 0;
 191         }
 192 
 193         /* shift the significand of xy keeping a sticky bit */
 194         e = ez - exy;
 195 
 196         if (e > 116) {
 197                 xy0 = xy1 = xy2 = 0;
 198                 xy3 = 1;
 199         } else if (e >= 96) {
 200                 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
 201                 xy3 = xy0 >> (e - 96);
 202 
 203                 if (sticky)
 204                         xy3 |= 1;
 205 
 206                 xy0 = xy1 = xy2 = 0;
 207         } else if (e >= 64) {
 208                 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
 209                 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
 210 
 211                 if (sticky)
 212                         xy3 |= 1;
 213 
 214                 xy2 = xy0 >> (e - 64);
 215                 xy0 = xy1 = 0;
 216         } else if (e >= 32) {
 217                 sticky = xy3 | ((xy2 << 1) << (63 - e));
 218                 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
 219 
 220                 if (sticky)
 221                         xy3 |= 1;
 222 
 223                 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
 224                 xy1 = xy0 >> (e - 32);
 225                 xy0 = 0;
 226         } else if (e) {
 227                 sticky = (xy3 << 1) << (31 - e);
 228                 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
 229 
 230                 if (sticky)
 231                         xy3 |= 1;
 232 
 233                 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
 234                 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
 235                 xy0 >>= e;
 236         }
 237 
 238         /* if this is a magnitude subtract, negate the significand of xy */
 239         if (sxy ^ sz) {
 240                 xy0 = ~xy0;
 241                 xy1 = ~xy1;
 242                 xy2 = ~xy2;
 243                 xy3 = -xy3;
 244 
 245                 if (xy3 == 0)
 246                         if (++xy2 == 0)
 247                                 if (++xy1 == 0)
 248                                         xy0++;
 249         }
 250 
 251         /* add, propagating carries */
 252         z3 += xy3;
 253         e = (z3 < xy3);
 254         z2 += xy2;
 255 
 256         if (e) {
 257                 z2++;
 258                 e = (z2 <= xy2);
 259         } else {
 260                 e = (z2 < xy2);
 261         }
 262 
 263         z1 += xy1;
 264 
 265         if (e) {
 266                 z1++;
 267                 e = (z1 <= xy1);
 268         } else {
 269                 e = (z1 < xy1);
 270         }
 271 
 272         z0 += xy0;
 273 
 274         if (e)
 275                 z0++;
 276 
 277         /* postnormalize and collect rounding information into z2 */
 278         if (ez < 1) {
 279                 /* result is tiny; shift right until exponent is within range */
 280                 e = 1 - ez;
 281 
 282                 if (e > 56) {
 283                         z2 = 1;         /* result can't be exactly zero */
 284                         z0 = z1 = 0;
 285                 } else if (e >= 32) {
 286                         sticky = z3 | z2 | ((z1 << 1) << (63 - e));
 287                         z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
 288 
 289                         if (sticky)
 290                                 z2 |= 1;
 291 
 292                         z1 = z0 >> (e - 32);
 293                         z0 = 0;
 294                 } else {
 295                         sticky = z3 | (z2 << 1) << (31 - e);
 296                         z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
 297 
 298                         if (sticky)
 299                                 z2 |= 1;
 300 
 301                         z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
 302                         z0 >>= e;
 303                 }
 304 
 305                 ez = 1;
 306         } else if (z0 >= 0x200000) {
 307                 /* carry out; shift right by one */
 308                 sticky = (z2 & 1) | z3;
 309                 z2 = (z2 >> 1) | (z1 << 31);
 310 
 311                 if (sticky)
 312                         z2 |= 1;
 313 
 314                 z1 = (z1 >> 1) | (z0 << 31);
 315                 z0 >>= 1;
 316                 ez++;
 317         } else {
 318                 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
 319                         /*
 320                          * borrow/cancellation; shift left as much as
 321                          * exponent allows
 322                          */
 323                         while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
 324                                 z0 = z1;
 325                                 z1 = z2;
 326                                 z2 = z3;
 327                                 z3 = 0;
 328                                 ez -= 32;
 329                         }
 330 
 331                         while (z0 < 0x100000 && ez > 1) {
 332                                 z0 = (z0 << 1) | (z1 >> 31);
 333                                 z1 = (z1 << 1) | (z2 >> 31);
 334                                 z2 = (z2 << 1) | (z3 >> 31);
 335                                 z3 <<= 1;
 336                                 ez--;
 337                         }
 338                 }
 339 
 340                 if (z3)
 341                         z2 |= 1;
 342         }
 343 
 344         /* get the rounding mode and clear current exceptions */
 345         rm = fsr >> 30;
 346         fsr &= ~FSR_CEXC;
 347 
 348         /* strip off the integer bit, if there is one */
 349         ibit = z0 & 0x100000;
 350 
 351         if (ibit) {
 352                 z0 -= 0x100000;
 353         } else {
 354                 ez = 0;
 355 
 356                 if (!(z0 | z1 | z2)) {  /* exact zero */
 357                         zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
 358                         zz.i[1] = 0;
 359                         __fenv_setfsr32(&fsr);
 360                         return (zz.d);
 361                 }
 362         }
 363 
 364         /*
 365          * flip the sense of directed roundings if the result is negative;
 366          * the logic below applies to a positive result
 367          */
 368         if (sz)
 369                 rm ^= rm >> 1;
 370 
 371         /* round and raise exceptions */
 372         if (z2) {
 373                 fsr |= FSR_NXC;
 374 
 375                 /* decide whether to round the fraction up */
 376                 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u || (z2 ==
 377                     0x80000000u && (z1 & 1))))) {
 378                         /* round up and renormalize if necessary */
 379                         if (++z1 == 0) {
 380                                 if (++z0 == 0x100000) {
 381                                         z0 = 0;
 382                                         ez++;
 383                                 }
 384                         }
 385                 }
 386         }
 387 
 388         /* check for under/overflow */
 389         if (ez >= 0x7ff) {
 390                 if (rm == FSR_RN || rm == FSR_RP) {
 391                         zz.i[0] = sz | 0x7ff00000;
 392                         zz.i[1] = 0;
 393                 } else {
 394                         zz.i[0] = sz | 0x7fefffff;
 395                         zz.i[1] = 0xffffffff;
 396                 }
 397 
 398                 fsr |= FSR_OFC | FSR_NXC;
 399         } else {
 400                 zz.i[0] = sz | (ez << 20) | z0;
 401                 zz.i[1] = z1;
 402 
 403                 /*
 404                  * !ibit => exact result was tiny before rounding,
 405                  * z2 nonzero => result delivered is inexact
 406                  */
 407                 if (!ibit) {
 408                         if (z2)
 409                                 fsr |= FSR_UFC | FSR_NXC;
 410                         else if (fsr & FSR_UFM)
 411                                 fsr |= FSR_UFC;
 412                 }
 413         }
 414 
 415         /* restore the fsr and emulate exceptions as needed */
 416         if ((fsr & FSR_CEXC) & (fsr >> 23)) {
 417                 __fenv_setfsr32(&fsr);
 418 
 419                 if (fsr & FSR_OFC) {
 420                         dummy = huge;
 421                         dummy *= huge;
 422                 } else if (fsr & FSR_UFC) {
 423                         dummy = tiny;
 424 
 425                         if (fsr & FSR_NXC)
 426                                 dummy *= tiny;
 427                         else
 428                                 dummy -= tiny2;
 429                 } else {
 430                         dummy = huge;
 431                         dummy += tiny;
 432                 }
 433         } else {
                     /*
                      * nothing to emulate: OR the low five bits of the fsr
                      * image (where the flags set above presumably live)
                      * into the field five bits higher -- presumably the
                      * accrued-exception bits -- and install the result
                      */
 434                 fsr |= (fsr & 0x1f) << 5;
 435                 __fenv_setfsr32(&fsr);
 436         }
 437 
 438         return (zz.d);
 439 }

 440 #elif defined(__x86)

/*
 * NI is the number of unsigned words in the union that __fma (below)
 * overlays on a long double: 4 when building for amd64, 3 otherwise.
 * NOTE(review): presumably this matches sizeof (long double) on each
 * target (16 vs. 12 bytes with a 4-byte unsigned) -- confirm against
 * the ABI.
 */
#if defined(__amd64)
#define NI              4
#else
#define NI              3
#endif
 446 
 447 /*
 448  *  fma for x86: 64-bit double precision, little-endian
 449  */
 450 double
 451 __fma(double x, double y, double z)
 452 {
 453         union {
 454                 unsigned i[NI];
 455                 long double e;
 456         } xx, yy, zz;
 457 
 458         long double xe, ye, xhi, xlo, yhi, ylo;
 459         int ex, ey, ez;
 460         unsigned cwsw, oldcwsw, rm;
 461 
 462         /* convert the operands to double extended */
 463         xx.e = (long double)x;
 464         yy.e = (long double)y;
 465         zz.e = (long double)z;
 466 
 467         /* extract the exponents of the arguments */
 468         ex = xx.i[2] & 0x7fff;
 469         ey = yy.i[2] & 0x7fff;
 470         ez = zz.i[2] & 0x7fff;
 471 
 472         /* dispense with inf, nan, and zero cases */
 473         if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
 474                 /* x or y is inf, nan, or zero */
 475                 return ((double)(xx.e * yy.e + zz.e));
 476 
 477         if (ez >= 0x7fff)            /* z is inf or nan */
 478                 return ((double)(xx.e + zz.e));
 479 
 480         /* avoid spurious inexact in x * y */
 481 
 482         /*
 483          * save the control and status words, mask all exceptions, and
 484          * set rounding to 64-bit precision and to-nearest
 485          */
 486         __fenv_getcwsw(&oldcwsw);
 487         cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
 488         __fenv_setcwsw(&cwsw);
 489 
 490         /* multiply x*y to 106 bits */
 491         xe = xx.e;
 492         xx.i[0] = 0;
 493         xhi = xx.e;                     /* hi 32 bits */
 494         xlo = xe - xhi;                 /* lo 21 bits */
 495         ye = yy.e;
 496         yy.i[0] = 0;
 497         yhi = yy.e;
 498         ylo = ye - yhi;
 499         xe = xe * ye;

 503         xhi = ye + zz.e;
 504         yhi = xhi - ye;
 505         xlo = (zz.e - yhi) + (ye - (xhi - yhi));
 506         /* now (xhi,xlo) = ye + z */
 507 
 508         yhi = xe + xhi;
 509         ye = yhi - xe;
 510         ylo = (xhi - ye) + (xe - (yhi - ye));   /* now (yhi,ylo) = xe + xhi */
 511 
 512         xhi = xlo + ylo;
 513         xe = xhi - xlo;
 514         xlo = (ylo - xe) + (xlo - (xhi - xe));  /* now (xhi,xlo) = xlo + ylo */
 515 
 516         yy.e = yhi + xhi;
 517         ylo = (yhi - yy.e) + xhi;               /* now (yy.e,ylo) = xhi + yhi */
 518 
 519         if (yy.i[1] != 0) {                     /* yy.e is nonzero */
 520                 /* perturb yy.e if its least significant 10 bits are zero */
 521                 if (!(yy.i[0] & 0x3ff)) {
 522                         xx.e = ylo + xlo;
 523 
 524                         if (xx.i[1] != 0) {
 525                                 xx.i[2] = (xx.i[2] & 0x8000) | ((yy.i[2] &
 526                                     0x7fff) - 63);
 527                                 xx.i[1] = 0x80000000;
 528                                 xx.i[0] = 0;
 529                                 yy.e += xx.e;
 530                         }
 531                 }
 532         } else {
 533                 /* set sign of zero result according to rounding direction */
 534                 rm = oldcwsw & 0x0c000000;
 535                 yy.i[2] = ((rm == FCW_RM) ? 0x8000 : 0);
 536         }
 537 
 538         /*
 539          * restore the control and status words and convert the result
 540          * to double
 541          */
 542         __fenv_setcwsw(&oldcwsw);
 543         return ((double)yy.e);
 544 }

 545 #else
 546 #error Unknown architecture
 547 #endif