il_5262 Wdiff usr/src/lib/libm/common/m9x/fma.c

Print this page

5262 libm needs to be carefully unifdef'd
5268 libm doesn't need to hide symbols which are already local

Split	Close
Expand all
Collapse all

          --- old/usr/src/lib/libm/common/m9x/fma.c
          +++ new/usr/src/lib/libm/common/m9x/fma.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END

↓ open down ↓

19 lines elided

↑ open up ↑

  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24   24   */
  25   25  /*
  26   26   * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27   27   * Use is subject to license terms.
  28   28   */
  29   29  
  30      -#if defined(ELFOBJ)
  31   30  #pragma weak fma = __fma
  32      -#endif
  33   31  
  34   32  #include "libm.h"
  35   33  #include "fma.h"
  36   34  #include "fenv_inlines.h"
  37   35  
  38   36  #if defined(__sparc)
  39   37  
  40   38  static const union {
  41   39          unsigned i[2];  /* i[0] is the high-order word (big-endian layout) */
  42   40          double d;

  43   41  } C[] = {
  44   42          { 0x3fe00000u, 0 },     /* half:   2^-1 */
  45   43          { 0x40000000u, 0 },     /* two:    2^1 */
  46   44          { 0x43300000u, 0 },     /* two52:  2^52 */
  47   45          { 0x41a00000u, 0 },     /* two27:  2^27 */
  48   46          { 0x3e500000u, 0 },     /* twom26: 2^-26 */
  49   47          { 0x3df00000u, 0 },     /* twom32: 2^-32 */
  50   48          { 0x3bf00000u, 0 },     /* twom64: 2^-64 */
  51   49          { 0x7fe00000u, 0 },     /* huge:   2^1023 */
  52   50          { 0x00100000u, 0 },     /* tiny:   2^-1022 */
  53   51          { 0x00100001u, 0 }      /* tiny2:  2^-1022 * (1 + 2^-20) */
  54   52  };
  55   53  
           /* names for the entries of C[], in table order */
  56   54  #define half    C[0].d
  57   55  #define two     C[1].d
  58   56  #define two52   C[2].d
  59   57  #define two27   C[3].d
  60   58  #define twom26  C[4].d
  61   59  #define twom32  C[5].d
  62   60  #define twom64  C[6].d
  63   61  #define huge    C[7].d
  64   62  #define tiny    C[8].d
  65   63  #define tiny2   C[9].d
  66   64  
           /*
            * fsr image that __fma loads while it forms the exact product: only
            * the top two bits are set, which __fma's own comment describes as
            * round-to-negative-infinity with nonstandard mode cleared.
            */
  67   65  static const unsigned int fsr_rm = 0xc0000000u;
  68   66  
  69   67  /*
  70   68   * fma for SPARC: 64-bit double precision, big-endian
            *
            * Returns x * y + z.  When x or y is inf, nan, or zero, or z is inf,
            * nan, or zero, the result comes straight from ordinary floating-point
            * arithmetic.  Otherwise the product is formed to 106 bits, product and
            * addend are held as sign / exponent / four-word integer significands,
            * added in integer arithmetic, and rounded once according to the
            * rounding direction found in the fsr on entry.  The fsr is switched to
            * fsr_rm while the product is formed and is written back, with the
            * exception bits computed here, before returning.
  71   69   */
  72   70  double
  73   71  __fma(double x, double y, double z) {
  74   72          union {
  75   73                  unsigned i[2];
  76   74                  double d;
  77   75          } xx, yy, zz;
  78   76          double xhi, yhi, xlo, ylo, t;
  79   77          unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
  80   78          int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
  81   79          volatile double dummy;  /* target of the exception-raising arithmetic at the end */
  82   80  
  83   81          /* extract the high order words of the arguments */
  84   82          xx.d = x;
  85   83          yy.d = y;
  86   84          zz.d = z;
                   /* i[0] holds sign, exponent, and the top 20 fraction bits; drop the sign */
  87   85          hx = xx.i[0] & ~0x80000000;
  88   86          hy = yy.i[0] & ~0x80000000;
  89   87          hz = zz.i[0] & ~0x80000000;
  90   88  
  91   89          /* dispense with inf, nan, and zero cases */
  92   90          if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
  93   91                  (hy | yy.i[1]) == 0)    /* x or y is inf, nan, or zero */
  94   92                  return (x * y + z);
  95   93  
  96   94          if (hz >= 0x7ff00000)   /* z is inf or nan */
  97   95                  return (x + z); /* avoid spurious under/overflow in x * y */
  98   96  
  99   97          if ((hz | zz.i[1]) == 0)        /* z is zero */
 100   98                  /*
 101   99                   * x * y isn't zero but could underflow to zero,
 102  100                   * so don't add z, lest we perturb the sign
 103  101                   */
 104  102                  return (x * y);
 105  103  
 106  104          /*
 107  105           * now x, y, and z are all finite and nonzero; save the fsr and
 108  106           * set round-to-negative-infinity mode (and clear nonstandard
 109  107           * mode before we try to scale subnormal operands)
 110  108           */
 111  109          __fenv_getfsr32(&fsr);
 112  110          __fenv_setfsr32(&fsr_rm);
 113  111  
 114  112          /* extract signs and exponents, and normalize subnormals */
 115  113          sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
 116  114          sz = zz.i[0] & 0x80000000;
 117  115          ex = hx >> 20;
 118  116          if (!ex) {
                           /* scale by 2^52 and take the 52 back out of the exponent */
 119  117                  xx.d = x * two52;
 120  118                  ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
 121  119          }
 122  120          ey = hy >> 20;
 123  121          if (!ey) {
 124  122                  yy.d = y * two52;
 125  123                  ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
 126  124          }
 127  125          ez = hz >> 20;
 128  126          if (!ez) {
 129  127                  zz.d = z * two52;
 130  128                  ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
 131  129          }
 132  130  
 133  131          /* multiply x*y to 106 bits */
 134  132          exy = ex + ey - 0x3ff;
                   /* force sign 0 and exponent 0x3ff so that x and y lie in [1, 2) */
 135  133          xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
 136  134          yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
 137  135          x = xx.d;
 138  136          y = yy.d;
                   /*
                    * split each operand into a high part and a low remainder;
                    * NOTE(review): this relies on the rounding mode set above and
                    * appears intended to make the partial products below exact
                    * -- confirm
                    */
 139  137          xhi = ((x + twom26) + two27) - two27;
 140  138          yhi = ((y + twom26) + two27) - two27;
 141  139          xlo = x - xhi;
 142  140          ylo = y - yhi;
                   /* x becomes the rounded product, y the low-order correction to it */
 143  141          x *= y;
 144  142          y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
                   /* the product of two values in [1, 2) lies in [1, 4); bring it back to [1, 2) */
 145  143          if (x >= two) {
 146  144                  x *= half;
 147  145                  y *= half;
 148  146                  exy++;
 149  147          }
 150  148  
 151  149          /* extract the significands */
 152  150          xx.d = x;
                   /* 0x100000 is the integer bit that the stored format leaves implicit */
 153  151          xy0 = (xx.i[0] & 0xfffff) | 0x100000;
 154  152          xy1 = xx.i[1];
                   /*
                    * xy2 and xy3 are taken from the low words of y + 2^-32 and of
                    * the remaining residue + 2^-64;
                    * NOTE(review): presumably the added constants align the wanted
                    * bits with the low word -- confirm
                    */
 155  153          yy.d = t = y + twom32;
 156  154          xy2 = yy.i[1];
 157  155          yy.d = (y - (t - twom32)) + twom64;
 158  156          xy3 = yy.i[1];
 159  157          z0 = (zz.i[0] & 0xfffff) | 0x100000;
 160  158          z1 = zz.i[1];
 161  159          z2 = z3 = 0;
 162  160  
 163  161          /*
 164  162           * now x*y is represented by sxy, exy, and xy[0-3], and z is
 165  163           * represented likewise; swap if need be so |xy| <= |z|
 166  164           */
 167  165          if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
 168  166                  (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
                           /* e serves as the swap temporary; z2 and z3 were zero */
 169  167                  e = sxy; sxy = sz; sz = e;
 170  168                  e = exy; exy = ez; ez = e;
 171  169                  e = xy0; xy0 = z0; z0 = e;
 172  170                  e = xy1; xy1 = z1; z1 = e;
 173  171                  z2 = xy2; xy2 = 0;
 174  172                  z3 = xy3; xy3 = 0;
 175  173          }
 176  174  
 177  175          /* shift the significand of xy keeping a sticky bit */
                   /*
                    * e is the exponent difference (nonnegative after the swap).
                    * Left shifts are written (v << 1) << k so that a total shift
                    * of k + 1 may reach 32 without using a shift count of 32.
                    */
 178  176          e = ez - exy;
 179  177          if (e > 116) {
                           /* xy is shifted out entirely; it survives only as a sticky bit */
 180  178                  xy0 = xy1 = xy2 = 0;
 181  179                  xy3 = 1;
 182  180          } else if (e >= 96) {
 183  181                  sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
 184  182                  xy3 = xy0 >> (e - 96);
 185  183                  if (sticky)
 186  184                          xy3 |= 1;
 187  185                  xy0 = xy1 = xy2 = 0;
 188  186          } else if (e >= 64) {
 189  187                  sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
 190  188                  xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
 191  189                  if (sticky)
 192  190                          xy3 |= 1;
 193  191                  xy2 = xy0 >> (e - 64);
 194  192                  xy0 = xy1 = 0;
 195  193          } else if (e >= 32) {
 196  194                  sticky = xy3 | ((xy2 << 1) << (63 - e));
 197  195                  xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
 198  196                  if (sticky)
 199  197                          xy3 |= 1;
 200  198                  xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
 201  199                  xy1 = xy0 >> (e - 32);
 202  200                  xy0 = 0;
 203  201          } else if (e) {
 204  202                  sticky = (xy3 << 1) << (31 - e);
 205  203                  xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
 206  204                  if (sticky)
 207  205                          xy3 |= 1;
 208  206                  xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
 209  207                  xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
 210  208                  xy0 >>= e;
 211  209          }
 212  210  
 213  211          /* if this is a magnitude subtract, negate the significand of xy */
 214  212          if (sxy ^ sz) {
                           /*
                            * two's complement over four words: complement the upper
                            * three, negate the lowest, and carry upward only when
                            * the lowest word is zero
                            */
 215  213                  xy0 = ~xy0;
 216  214                  xy1 = ~xy1;
 217  215                  xy2 = ~xy2;
 218  216                  xy3 = -xy3;
 219  217                  if (xy3 == 0)
 220  218                          if (++xy2 == 0)
 221  219                                  if (++xy1 == 0)
 222  220                                          xy0++;
 223  221          }
 224  222  
 225  223          /* add, propagating carries */
                   /* e is reused as the carry out of each word, lowest word first */
 226  224          z3 += xy3;
 227  225          e = (z3 < xy3);
 228  226          z2 += xy2;
 229  227          if (e) {
 230  228                  z2++;
 231  229                  e = (z2 <= xy2);
 232  230          } else
 233  231                  e = (z2 < xy2);
 234  232          z1 += xy1;
 235  233          if (e) {
 236  234                  z1++;
 237  235                  e = (z1 <= xy1);
 238  236          } else
 239  237                  e = (z1 < xy1);
 240  238          z0 += xy0;
 241  239          if (e)
 242  240                  z0++;
 243  241  
 244  242          /* postnormalize and collect rounding information into z2 */
 245  243          if (ez < 1) {
 246  244                  /* result is tiny; shift right until exponent is within range */
 247  245                  e = 1 - ez;
 248  246                  if (e > 56) {
 249  247                          z2 = 1; /* result can't be exactly zero */
 250  248                          z0 = z1 = 0;
 251  249                  } else if (e >= 32) {
 252  250                          sticky = z3 | z2 | ((z1 << 1) << (63 - e));
 253  251                          z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
 254  252                          if (sticky)
 255  253                                  z2 |= 1;
 256  254                          z1 = z0 >> (e - 32);
 257  255                          z0 = 0;
 258  256                  } else {
 259  257                          sticky = z3 | (z2 << 1) << (31 - e);
 260  258                          z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
 261  259                          if (sticky)
 262  260                                  z2 |= 1;
 263  261                          z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
 264  262                          z0 >>= e;
 265  263                  }
 266  264                  ez = 1;
 267  265          } else if (z0 >= 0x200000) {
 268  266                  /* carry out; shift right by one */
 269  267                  sticky = (z2 & 1) | z3;
 270  268                  z2 = (z2 >> 1) | (z1 << 31);
 271  269                  if (sticky)
 272  270                          z2 |= 1;
 273  271                  z1 = (z1 >> 1) | (z0 << 31);
 274  272                  z0 >>= 1;
 275  273                  ez++;
 276  274          } else {
 277  275                  if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
 278  276                          /*
 279  277                           * borrow/cancellation; shift left as much as
 280  278                           * exponent allows
 281  279                           */
                                   /* first by whole words, then one bit at a time */
 282  280                          while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
 283  281                                  z0 = z1;
 284  282                                  z1 = z2;
 285  283                                  z2 = z3;
 286  284                                  z3 = 0;
 287  285                                  ez -= 32;
 288  286                          }
 289  287                          while (z0 < 0x100000 && ez > 1) {
 290  288                                  z0 = (z0 << 1) | (z1 >> 31);
 291  289                                  z1 = (z1 << 1) | (z2 >> 31);
 292  290                                  z2 = (z2 << 1) | (z3 >> 31);
 293  291                                  z3 <<= 1;
 294  292                                  ez--;
 295  293                          }
 296  294                  }
                           /* fold whatever is left in z3 into the sticky bit of z2 */
 297  295                  if (z3)
 298  296                          z2 |= 1;
 299  297          }
 300  298  
 301  299          /* get the rounding mode and clear current exceptions */
                   /* the rounding direction is the top two bits of the saved fsr */
 302  300          rm = fsr >> 30;
 303  301          fsr &= ~FSR_CEXC;
 304  302  
 305  303          /* strip off the integer bit, if there is one */
 306  304          ibit = z0 & 0x100000;
 307  305          if (ibit)
 308  306                  z0 -= 0x100000;
 309  307          else {
 310  308                  ez = 0;
 311  309                  if (!(z0 | z1 | z2)) { /* exact zero */
                                   /* the zero is negative only when rm is FSR_RM */
 312  310                          zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
 313  311                          zz.i[1] = 0;
 314  312                          __fenv_setfsr32(&fsr);
 315  313                          return (zz.d);
 316  314                  }
 317  315          }
 318  316  
 319  317          /*
 320  318           * flip the sense of directed roundings if the result is negative;
 321  319           * the logic below applies to a positive result
 322  320           */
                   /* the xor swaps encodings 2 and 3 and leaves 0 and 1 unchanged */
 323  321          if (sz)
 324  322                  rm ^= rm >> 1;
 325  323  
 326  324          /* round and raise exceptions */
 327  325          if (z2) {
 328  326                  fsr |= FSR_NXC;
 329  327  
 330  328                  /* decide whether to round the fraction up */
                           /*
                            * z2 holds the discarded bits: above 0x80000000 is more
                            * than half an ulp, and exactly 0x80000000 is a tie,
                            * which rounds up only when z1 is odd
                            */
 331  329                  if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
 332  330                          (z2 == 0x80000000u && (z1 & 1))))) {
 333  331                          /* round up and renormalize if necessary */
 334  332                          if (++z1 == 0) {
 335  333                                  if (++z0 == 0x100000) {
 336  334                                          z0 = 0;
 337  335                                          ez++;
 338  336                                  }
 339  337                          }
 340  338                  }
 341  339          }
 342  340  
 343  341          /* check for under/overflow */
 344  342          if (ez >= 0x7ff) {
                           /* deliver infinity or the largest finite value, by rounding direction */
 345  343                  if (rm == FSR_RN || rm == FSR_RP) {
 346  344                          zz.i[0] = sz | 0x7ff00000;
 347  345                          zz.i[1] = 0;
 348  346                  } else {
 349  347                          zz.i[0] = sz | 0x7fefffff;
 350  348                          zz.i[1] = 0xffffffff;
 351  349                  }
 352  350                  fsr |= FSR_OFC | FSR_NXC;
 353  351          } else {
 354  352                  zz.i[0] = sz | (ez << 20) | z0;
 355  353                  zz.i[1] = z1;
 356  354  
 357  355                  /*
 358  356                   * !ibit => exact result was tiny before rounding,
 359  357                   * z2 nonzero => result delivered is inexact
 360  358                   */
 361  359                  if (!ibit) {
 362  360                          if (z2)
 363  361                                  fsr |= FSR_UFC | FSR_NXC;
 364  362                          else if (fsr & FSR_UFM)
 365  363                                  fsr |= FSR_UFC;
 366  364                  }
 367  365          }
 368  366  
 369  367          /* restore the fsr and emulate exceptions as needed */
                   /*
                    * NOTE(review): FSR_CEXC and the other FSR_* masks are defined
                    * outside this file.  The test below lines up bits 23 and up of
                    * the fsr (presumably the trap enable mask) with the current
                    * exception bits, and the 0x1f << 5 in the else branch
                    * presumably copies current exceptions into the accrued field
                    * -- confirm against the FSR layout.
                    */
 370  368          if ((fsr & FSR_CEXC) & (fsr >> 23)) {
 371  369                  __fenv_setfsr32(&fsr);
                           /* perform an operation on dummy chosen to raise the matching exception */
 372  370                  if (fsr & FSR_OFC) {
 373  371                          dummy = huge;
 374  372                          dummy *= huge;
 375  373                  } else if (fsr & FSR_UFC) {
 376  374                          dummy = tiny;
 377  375                          if (fsr & FSR_NXC)
 378  376                                  dummy *= tiny;
 379  377                          else
 380  378                                  dummy -= tiny2;
 381  379                  } else {
 382  380                          dummy = huge;
 383  381                          dummy += tiny;
 384  382                  }
 385  383          } else {
 386  384                  fsr |= (fsr & 0x1f) << 5;
 387  385                  __fenv_setfsr32(&fsr);
 388  386          }
 389  387          return (zz.d);
 390  388  }
 391  389  
 392  390  #elif defined(__x86)
 393  391  
 394  392  #if defined(__amd64)
 395  393  #define NI      4       /* length of the unsigned array overlaying a long double in __fma */
 396  394  #else
 397  395  #define NI      3       /* same overlay, one word shorter when __amd64 is not defined */
 398  396  #endif
 399  397  
 400  398  /*
 401  399   *  fma for x86: 64-bit double precision, little-endian
            *
            *  Returns x * y + z.  The operands are widened to long double.  When
            *  x or y is inf, nan, or zero, or z is inf or nan, the result comes
            *  straight from long double arithmetic.  Otherwise the product is
            *  split into a rounded value and a correction term, the two are
            *  summed with z using error-compensated additions, and the total is
            *  converted to double once at the end.  The control and status words
            *  are saved on entry to the main path and restored before returning.
 402  400   */
 403  401  double
 404  402  __fma(double x, double y, double z) {
 405  403          union {
 406  404                  unsigned i[NI];
 407  405                  long double e;
 408  406          } xx, yy, zz;
 409  407          long double xe, ye, xhi, xlo, yhi, ylo;
 410  408          int ex, ey, ez;
 411  409          unsigned cwsw, oldcwsw, rm;
 412  410  
 413  411          /* convert the operands to double extended */
 414  412          xx.e = (long double) x;
 415  413          yy.e = (long double) y;
 416  414          zz.e = (long double) z;
 417  415  
 418  416          /* extract the exponents of the arguments */
                   /* i[2] carries the exponent in its low 15 bits (the sign is bit 15) */
 419  417          ex = xx.i[2] & 0x7fff;
 420  418          ey = yy.i[2] & 0x7fff;
 421  419          ez = zz.i[2] & 0x7fff;
 422  420  
 423  421          /* dispense with inf, nan, and zero cases */
 424  422          if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
 425  423                  /* x or y is inf, nan, or zero */
 426  424                  return ((double) (xx.e * yy.e + zz.e));
 427  425  
 428  426          if (ez >= 0x7fff) /* z is inf or nan */
 429  427                  return ((double) (xx.e + zz.e));
 430  428                                          /* avoid spurious inexact in x * y */
 431  429  
 432  430          /*
 433  431           * save the control and status words, mask all exceptions, and
 434  432           * set rounding to 64-bit precision and to-nearest
 435  433           */
 436  434          __fenv_getcwsw(&oldcwsw);
 437  435          cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
 438  436          __fenv_setcwsw(&cwsw);
 439  437  
 440  438          /* multiply x*y to 106 bits */
                   /*
                    * zeroing i[0], the low word of the significand, leaves the high
                    * part of each operand; the low part is obtained by subtraction
                    */
 441  439          xe = xx.e;
 442  440          xx.i[0] = 0;
 443  441          xhi = xx.e; /* hi 32 bits */
 444  442          xlo = xe - xhi; /* lo 21 bits */
 445  443          ye = yy.e;
 446  444          yy.i[0] = 0;
 447  445          yhi = yy.e;
 448  446          ylo = ye - yhi;
                   /* xe becomes the rounded product, ye the low-order correction to it */
 449  447          xe = xe * ye;
 450  448          ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
 451  449  
 452  450          /* distill the sum of xe, ye, and z */
                   /* each step below yields a rounded sum together with its rounding error */
 453  451          xhi = ye + zz.e;
 454  452          yhi = xhi - ye;
 455  453          xlo = (zz.e - yhi) + (ye - (xhi - yhi));
 456  454                                                  /* now (xhi,xlo) = ye + z */
 457  455  
 458  456          yhi = xe + xhi;
 459  457          ye = yhi - xe;
 460  458          ylo = (xhi - ye) + (xe - (yhi - ye));   /* now (yhi,ylo) = xe + xhi */
 461  459  
 462  460          xhi = xlo + ylo;
 463  461          xe = xhi - xlo;
 464  462          xlo = (ylo - xe) + (xlo - (xhi - xe));  /* now (xhi,xlo) = xlo + ylo */
 465  463  
 466  464          yy.e = yhi + xhi;
 467  465          ylo = (yhi - yy.e) + xhi;               /* now (yy.e,ylo) = xhi + yhi */
 468  466  
                   /* i[1] is the high word of the significand */
 469  467          if (yy.i[1] != 0) {     /* yy.e is nonzero */
 470  468                  /* perturb yy.e if its least significant 10 bits are zero */
                           /*
                            * when the leftover ylo + xlo is nonzero, it is replaced
                            * by a power of two with the same sign and an exponent 63
                            * below that of yy.e, and that value is added to yy.e;
                            * NOTE(review): presumably this keeps the conversion to
                            * double below from rounding a second time -- confirm
                            */
 471  469                  if (!(yy.i[0] & 0x3ff)) {
 472  470                          xx.e = ylo + xlo;
 473  471                          if (xx.i[1] != 0) {
 474  472                                  xx.i[2] = (xx.i[2] & 0x8000) |
 475  473                                          ((yy.i[2] & 0x7fff) - 63);
 476  474                                  xx.i[1] = 0x80000000;
 477  475                                  xx.i[0] = 0;
 478  476                                  yy.e += xx.e;
 479  477                          }
 480  478                  }
 481  479          } else {
 482  480                  /* set sign of zero result according to rounding direction */
                           /* rm is taken from the caller's saved word, not the temporary one */
 483  481                  rm = oldcwsw & 0x0c000000;
 484  482                  yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
 485  483          }
 486  484  
 487  485          /*
 488  486           * restore the control and status words and convert the result
 489  487           * to double
 490  488           */
 491  489          __fenv_setcwsw(&oldcwsw);
 492  490          return ((double) yy.e);
 493  491  }
 494  492  
 495  493  #else
 496  494  #error Unknown architecture
 497  495  #endif

↓ open down ↓

455 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX