1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #pragma weak fmaf = __fmaf
  31 
  32 #include "libm.h"
  33 #include "fma.h"
  34 #include "fenv_inlines.h"
  35 
  36 #if defined(__sparc)
  37 
  38 /*
  39  * fmaf for SPARC: 32-bit single precision, big-endian
  40  */
  41 float
  42 __fmaf(float x, float y, float z) {
  43         union {
  44                 unsigned i[2];
  45                 double d;
  46         } xy, zz;
  47         unsigned u, s;
  48         int exy, ez;
  49 
  50         /*
  51          * the following operations can only raise the invalid exception,
  52          * and then only if either x*y is of the form Inf*0 or one of x,
  53          * y, or z is a signaling NaN
  54          */
  55         xy.d = (double) x * y;
  56         zz.d = (double) z;
  57 
  58         /*
  59          * if the sum xy + z will be exact, just compute it and cast the
  60          * result to float
  61          */
  62         exy = (xy.i[0] >> 20) & 0x7ff;
  63         ez = (zz.i[0] >> 20) & 0x7ff;
  64         if ((ez - exy <= 4 && exy - ez <= 28) || exy == 0x7ff || exy == 0 ||
  65                 ez == 0x7ff || ez == 0) {
  66                 return ((float) (xy.d + zz.d));
  67         }
  68 
  69         /*
  70          * collapse the tail of the smaller summand into a "sticky bit"
  71          * so that the sum can be computed without error
  72          */
  73         if (ez > exy) {
  74                 if (ez - exy < 31) {
  75                         u = xy.i[1];
  76                         s = 2 << (ez - exy);
  77                         if (u & (s - 1))
  78                                 u |= s;
  79                         xy.i[1] = u & ~(s - 1);
  80                 } else if (ez - exy < 51) {
  81                         u = xy.i[0];
  82                         s = 1 << (ez - exy - 31);
  83                         if ((u & (s - 1)) | xy.i[1])
  84                                 u |= s;
  85                         xy.i[0] = u & ~(s - 1);
  86                         xy.i[1] = 0;
  87                 } else {
  88                         /* collapse all of xy into a single bit */
  89                         xy.i[0] = (xy.i[0] & 0x80000000) | ((ez - 51) << 20);
  90                         xy.i[1] = 0;
  91                 }
  92         } else {
  93                 if (exy - ez < 31) {
  94                         u = zz.i[1];
  95                         s = 2 << (exy - ez);
  96                         if (u & (s - 1))
  97                                 u |= s;
  98                         zz.i[1] = u & ~(s - 1);
  99                 } else if (exy - ez < 51) {
 100                         u = zz.i[0];
 101                         s = 1 << (exy - ez - 31);
 102                         if ((u & (s - 1)) | zz.i[1])
 103                                 u |= s;
 104                         zz.i[0] = u & ~(s - 1);
 105                         zz.i[1] = 0;
 106                 } else {
 107                         /* collapse all of zz into a single bit */
 108                         zz.i[0] = (zz.i[0] & 0x80000000) | ((exy - 51) << 20);
 109                         zz.i[1] = 0;
 110                 }
 111         }
 112 
 113         return ((float) (xy.d + zz.d));
 114 }
 115 
 116 #elif defined(__x86)
 117 
 118 #if defined(__amd64)
 119 #define NI      4
 120 #else
 121 #define NI      3
 122 #endif
 123 
 124 /*
 125  * fmaf for x86: 32-bit single precision, little-endian
 126  */
 127 float
 128 __fmaf(float x, float y, float z) {
 129         union {
 130                 unsigned i[NI];
 131                 long double e;
 132         } xy, zz;
 133         unsigned u, s, cwsw, oldcwsw;
 134         int exy, ez;
 135 
 136         /* set rounding precision to 64 bits */
 137         __fenv_getcwsw(&oldcwsw);
 138         cwsw = (oldcwsw & 0xfcffffff) | 0x03000000;
 139         __fenv_setcwsw(&cwsw);
 140 
 141         /*
 142          * the following operations can only raise the invalid exception,
 143          * and then only if either x*y is of the form Inf*0 or one of x,
 144          * y, or z is a signaling NaN
 145          */
 146         xy.e = (long double) x * y;
 147         zz.e = (long double) z;
 148 
 149         /*
 150          * if the sum xy + z will be exact, just compute it and cast the
 151          * result to float
 152          */
 153         exy = xy.i[2] & 0x7fff;
 154         ez = zz.i[2] & 0x7fff;
 155         if ((ez - exy <= 15 && exy - ez <= 39) || exy == 0x7fff || exy == 0 ||
 156                 ez == 0x7fff || ez == 0) {
 157                 goto cont;
 158         }
 159 
 160         /*
 161          * collapse the tail of the smaller summand into a "sticky bit"
 162          * so that the sum can be computed without error
 163          */
 164         if (ez > exy) {
 165                 if (ez - exy < 31) {
 166                         u = xy.i[0];
 167                         s = 2 << (ez - exy);
 168                         if (u & (s - 1))
 169                                 u |= s;
 170                         xy.i[0] = u & ~(s - 1);
 171                 } else if (ez - exy < 62) {
 172                         u = xy.i[1];
 173                         s = 1 << (ez - exy - 31);
 174                         if ((u & (s - 1)) | xy.i[0])
 175                                 u |= s;
 176                         xy.i[1] = u & ~(s - 1);
 177                         xy.i[0] = 0;
 178                 } else {
 179                         /* collapse all of xy into a single bit */
 180                         xy.i[0] = 0;
 181                         xy.i[1] = 0x80000000;
 182                         xy.i[2] = (xy.i[2] & 0x8000) | (ez - 62);
 183                 }
 184         } else {
 185                 if (exy - ez < 62) {
 186                         u = zz.i[1];
 187                         s = 1 << (exy - ez - 31);
 188                         if ((u & (s - 1)) | zz.i[0])
 189                                 u |= s;
 190                         zz.i[1] = u & ~(s - 1);
 191                         zz.i[0] = 0;
 192                 } else {
 193                         /* collapse all of zz into a single bit */
 194                         zz.i[0] = 0;
 195                         zz.i[1] = 0x80000000;
 196                         zz.i[2] = (zz.i[2] & 0x8000) | (exy - 62);
 197                 }
 198         }
 199 
 200 cont:
 201         xy.e += zz.e;
 202 
 203         /* restore the rounding precision */
 204         __fenv_getcwsw(&cwsw);
 205         cwsw = (cwsw & 0xfcffffff) | (oldcwsw & 0x03000000);
 206         __fenv_setcwsw(&cwsw);
 207 
 208         return ((float) xy.e);
 209 }
 210 
 211 #if 0
 212 /*
 213  * another fmaf for x86: assumes return value will be left in
 214  * long double (80-bit double extended) precision
 215  */
 216 long double
 217 __fmaf(float x, float y, float z) {
 218         /*
 219          * Note: This implementation assumes the rounding precision mode
 220          * is set to the default, rounding to 64 bit precision.  If this
 221          * routine must work in non-default rounding precision modes, do
 222          * the following instead:
 223          *
 224          *   long double t;
 225          *
 226          *   <set rp mode to round to 64 bit precision>
 227          *   t = x * y;
 228          *   <restore rp mode>
 229          *   return t + z;
 230          *
 231          * Note that the code to change rounding precision must not alter
 232          * the exception masks or flags, since the product x * y may raise
 233          * an invalid operation exception.
 234          */
 235         return ((long double) x * y + z);
 236 }
 237 #endif
 238 
 239 #else
 240 #error Unknown architecture
 241 #endif