1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25 /*
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
28 */
29
30 #if defined(ELFOBJ)
31 #pragma weak fma = __fma
32 #endif
33
34 #include "libm.h"
35 #include "fma.h"
36 #include "fenv_inlines.h"
37
38 #if defined(__sparc)
39
40 static const union {
41 unsigned i[2];
42 double d;
43 } C[] = {
44 { 0x3fe00000u, 0 },
45 { 0x40000000u, 0 },
46 { 0x43300000u, 0 },
47 { 0x41a00000u, 0 },
48 { 0x3e500000u, 0 },
49 { 0x3df00000u, 0 },
50 { 0x3bf00000u, 0 },
51 { 0x7fe00000u, 0 },
52 { 0x00100000u, 0 },
53 { 0x00100001u, 0 }
54 };
55
56 #define half C[0].d
57 #define two C[1].d
58 #define two52 C[2].d
59 #define two27 C[3].d
60 #define twom26 C[4].d
61 #define twom32 C[5].d
62 #define twom64 C[6].d
63 #define huge C[7].d
64 #define tiny C[8].d
65 #define tiny2 C[9].d
66
67 static const unsigned int fsr_rm = 0xc0000000u;
68
69 /*
70 * fma for SPARC: 64-bit double precision, big-endian
71 */
72 double
73 __fma(double x, double y, double z) {
74 union {
75 unsigned i[2];
76 double d;
77 } xx, yy, zz;
78 double xhi, yhi, xlo, ylo, t;
79 unsigned xy0, xy1, xy2, xy3, z0, z1, z2, z3, rm, sticky;
80 unsigned int fsr;
81 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
82 volatile double dummy;
83
84 /* extract the high order words of the arguments */
85 xx.d = x;
86 yy.d = y;
87 zz.d = z;
88 hx = xx.i[0] & ~0x80000000;
89 hy = yy.i[0] & ~0x80000000;
90 hz = zz.i[0] & ~0x80000000;
91
92 /* dispense with inf, nan, and zero cases */
93 if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
94 (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */
95 return (x * y + z);
96
97 if (hz >= 0x7ff00000) /* z is inf or nan */
98 return (x + z); /* avoid spurious under/overflow in x * y */
99
100 if ((hz | zz.i[1]) == 0) /* z is zero */
101 /*
102 * x * y isn't zero but could underflow to zero,
103 * so don't add z, lest we perturb the sign
104 */
105 return (x * y);
106
107 /*
108 * now x, y, and z are all finite and nonzero; save the fsr and
109 * set round-to-negative-infinity mode (and clear nonstandard
110 * mode before we try to scale subnormal operands)
111 */
112 __fenv_getfsr32(&fsr);
113 __fenv_setfsr32(&fsr_rm);
114
115 /* extract signs and exponents, and normalize subnormals */
116 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
117 sz = zz.i[0] & 0x80000000;
118 ex = hx >> 20;
119 if (!ex) {
120 xx.d = x * two52;
121 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
122 }
123 ey = hy >> 20;
124 if (!ey) {
125 yy.d = y * two52;
126 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
127 }
128 ez = hz >> 20;
129 if (!ez) {
130 zz.d = z * two52;
131 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
132 }
133
134 /* multiply x*y to 106 bits */
135 exy = ex + ey - 0x3ff;
136 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
137 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
138 x = xx.d;
139 y = yy.d;
140 xhi = ((x + twom26) + two27) - two27;
141 yhi = ((y + twom26) + two27) - two27;
142 xlo = x - xhi;
143 ylo = y - yhi;
144 x *= y;
145 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
146 if (x >= two) {
147 x *= half;
148 y *= half;
149 exy++;
150 }
151
152 /* extract the significands */
153 xx.d = x;
154 xy0 = (xx.i[0] & 0xfffff) | 0x100000;
155 xy1 = xx.i[1];
156 yy.d = t = y + twom32;
157 xy2 = yy.i[1];
158 yy.d = (y - (t - twom32)) + twom64;
159 xy3 = yy.i[1];
160 z0 = (zz.i[0] & 0xfffff) | 0x100000;
161 z1 = zz.i[1];
162 z2 = z3 = 0;
163
164 /*
165 * now x*y is represented by sxy, exy, and xy[0-3], and z is
166 * represented likewise; swap if need be so |xy| <= |z|
167 */
168 if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
169 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
170 e = sxy; sxy = sz; sz = e;
171 e = exy; exy = ez; ez = e;
172 e = xy0; xy0 = z0; z0 = e;
173 e = xy1; xy1 = z1; z1 = e;
174 z2 = xy2; xy2 = 0;
175 z3 = xy3; xy3 = 0;
176 }
177
178 /* shift the significand of xy keeping a sticky bit */
179 e = ez - exy;
180 if (e > 116) {
181 xy0 = xy1 = xy2 = 0;
182 xy3 = 1;
183 } else if (e >= 96) {
184 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
185 xy3 = xy0 >> (e - 96);
186 if (sticky)
187 xy3 |= 1;
188 xy0 = xy1 = xy2 = 0;
189 } else if (e >= 64) {
190 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
191 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
192 if (sticky)
193 xy3 |= 1;
194 xy2 = xy0 >> (e - 64);
195 xy0 = xy1 = 0;
196 } else if (e >= 32) {
197 sticky = xy3 | ((xy2 << 1) << (63 - e));
198 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
199 if (sticky)
200 xy3 |= 1;
201 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
202 xy1 = xy0 >> (e - 32);
203 xy0 = 0;
204 } else if (e) {
205 sticky = (xy3 << 1) << (31 - e);
206 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
207 if (sticky)
208 xy3 |= 1;
209 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
210 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
211 xy0 >>= e;
212 }
213
214 /* if this is a magnitude subtract, negate the significand of xy */
215 if (sxy ^ sz) {
216 xy0 = ~xy0;
217 xy1 = ~xy1;
218 xy2 = ~xy2;
219 xy3 = -xy3;
220 if (xy3 == 0)
221 if (++xy2 == 0)
222 if (++xy1 == 0)
223 xy0++;
224 }
225
226 /* add, propagating carries */
227 z3 += xy3;
228 e = (z3 < xy3);
229 z2 += xy2;
230 if (e) {
231 z2++;
232 e = (z2 <= xy2);
233 } else
234 e = (z2 < xy2);
235 z1 += xy1;
236 if (e) {
237 z1++;
238 e = (z1 <= xy1);
239 } else
240 e = (z1 < xy1);
241 z0 += xy0;
242 if (e)
243 z0++;
244
245 /* postnormalize and collect rounding information into z2 */
246 if (ez < 1) {
247 /* result is tiny; shift right until exponent is within range */
248 e = 1 - ez;
249 if (e > 56) {
250 z2 = 1; /* result can't be exactly zero */
251 z0 = z1 = 0;
252 } else if (e >= 32) {
253 sticky = z3 | z2 | ((z1 << 1) << (63 - e));
254 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
255 if (sticky)
256 z2 |= 1;
257 z1 = z0 >> (e - 32);
258 z0 = 0;
259 } else {
260 sticky = z3 | (z2 << 1) << (31 - e);
261 z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
262 if (sticky)
263 z2 |= 1;
264 z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
265 z0 >>= e;
266 }
267 ez = 1;
268 } else if (z0 >= 0x200000) {
269 /* carry out; shift right by one */
270 sticky = (z2 & 1) | z3;
271 z2 = (z2 >> 1) | (z1 << 31);
272 if (sticky)
273 z2 |= 1;
274 z1 = (z1 >> 1) | (z0 << 31);
275 z0 >>= 1;
276 ez++;
277 } else {
278 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
279 /*
280 * borrow/cancellation; shift left as much as
281 * exponent allows
282 */
283 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
284 z0 = z1;
285 z1 = z2;
286 z2 = z3;
287 z3 = 0;
288 ez -= 32;
289 }
290 while (z0 < 0x100000 && ez > 1) {
291 z0 = (z0 << 1) | (z1 >> 31);
292 z1 = (z1 << 1) | (z2 >> 31);
293 z2 = (z2 << 1) | (z3 >> 31);
294 z3 <<= 1;
295 ez--;
296 }
297 }
298 if (z3)
299 z2 |= 1;
300 }
301
302 /* get the rounding mode and clear current exceptions */
303 rm = fsr >> 30;
304 fsr &= ~FSR_CEXC;
305
306 /* strip off the integer bit, if there is one */
307 ibit = z0 & 0x100000;
308 if (ibit)
309 z0 -= 0x100000;
310 else {
311 ez = 0;
312 if (!(z0 | z1 | z2)) { /* exact zero */
313 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
314 zz.i[1] = 0;
315 __fenv_setfsr32(&fsr);
316 return (zz.d);
317 }
318 }
319
320 /*
321 * flip the sense of directed roundings if the result is negative;
322 * the logic below applies to a positive result
323 */
324 if (sz)
325 rm ^= rm >> 1;
326
327 /* round and raise exceptions */
328 if (z2) {
329 fsr |= FSR_NXC;
330
331 /* decide whether to round the fraction up */
332 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
333 (z2 == 0x80000000u && (z1 & 1))))) {
334 /* round up and renormalize if necessary */
335 if (++z1 == 0) {
336 if (++z0 == 0x100000) {
337 z0 = 0;
338 ez++;
339 }
340 }
341 }
342 }
343
344 /* check for under/overflow */
345 if (ez >= 0x7ff) {
346 if (rm == FSR_RN || rm == FSR_RP) {
347 zz.i[0] = sz | 0x7ff00000;
348 zz.i[1] = 0;
349 } else {
350 zz.i[0] = sz | 0x7fefffff;
351 zz.i[1] = 0xffffffff;
352 }
353 fsr |= FSR_OFC | FSR_NXC;
354 } else {
355 zz.i[0] = sz | (ez << 20) | z0;
356 zz.i[1] = z1;
357
358 /*
359 * !ibit => exact result was tiny before rounding,
360 * z2 nonzero => result delivered is inexact
361 */
362 if (!ibit) {
363 if (z2)
364 fsr |= FSR_UFC | FSR_NXC;
365 else if (fsr & FSR_UFM)
366 fsr |= FSR_UFC;
367 }
368 }
369
370 /* restore the fsr and emulate exceptions as needed */
371 if ((fsr & FSR_CEXC) & (fsr >> 23)) {
372 __fenv_setfsr32(&fsr);
373 if (fsr & FSR_OFC) {
374 dummy = huge;
375 dummy *= huge;
376 } else if (fsr & FSR_UFC) {
377 dummy = tiny;
378 if (fsr & FSR_NXC)
379 dummy *= tiny;
380 else
381 dummy -= tiny2;
382 } else {
383 dummy = huge;
384 dummy += tiny;
385 }
386 } else {
387 fsr |= (fsr & 0x1f) << 5;
388 __fenv_setfsr32(&fsr);
389 }
390 return (zz.d);
391 }
392
393 #elif defined(__x86)
394
395 #if defined(__amd64)
396 #define NI 4
397 #else
398 #define NI 3
399 #endif
400
401 /*
402 * fma for x86: 64-bit double precision, little-endian
403 */
404 double
405 __fma(double x, double y, double z) {
406 union {
407 unsigned i[NI];
408 long double e;
409 } xx, yy, zz;
410 long double xe, ye, xhi, xlo, yhi, ylo;
411 int ex, ey, ez;
412 unsigned cwsw, oldcwsw, rm;
413
414 /* convert the operands to double extended */
415 xx.e = (long double) x;
416 yy.e = (long double) y;
417 zz.e = (long double) z;
418
419 /* extract the exponents of the arguments */
420 ex = xx.i[2] & 0x7fff;
421 ey = yy.i[2] & 0x7fff;
422 ez = zz.i[2] & 0x7fff;
423
424 /* dispense with inf, nan, and zero cases */
425 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
426 /* x or y is inf, nan, or zero */
427 return ((double) (xx.e * yy.e + zz.e));
428
429 if (ez >= 0x7fff) /* z is inf or nan */
430 return ((double) (xx.e + zz.e));
431 /* avoid spurious inexact in x * y */
432
433 /*
434 * save the control and status words, mask all exceptions, and
435 * set rounding to 64-bit precision and to-nearest
436 */
437 __fenv_getcwsw(&oldcwsw);
438 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
439 __fenv_setcwsw(&cwsw);
440
441 /* multiply x*y to 106 bits */
442 xe = xx.e;
443 xx.i[0] = 0;
444 xhi = xx.e; /* hi 32 bits */
445 xlo = xe - xhi; /* lo 21 bits */
446 ye = yy.e;
447 yy.i[0] = 0;
448 yhi = yy.e;
449 ylo = ye - yhi;
450 xe = xe * ye;
451 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
452
453 /* distill the sum of xe, ye, and z */
454 xhi = ye + zz.e;
455 yhi = xhi - ye;
456 xlo = (zz.e - yhi) + (ye - (xhi - yhi));
457 /* now (xhi,xlo) = ye + z */
458
459 yhi = xe + xhi;
460 ye = yhi - xe;
461 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */
462
463 xhi = xlo + ylo;
464 xe = xhi - xlo;
465 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */
466
467 yy.e = yhi + xhi;
468 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */
469
470 if (yy.i[1] != 0) { /* yy.e is nonzero */
471 /* perturb yy.e if its least significant 10 bits are zero */
472 if (!(yy.i[0] & 0x3ff)) {
473 xx.e = ylo + xlo;
474 if (xx.i[1] != 0) {
475 xx.i[2] = (xx.i[2] & 0x8000) |
476 ((yy.i[2] & 0x7fff) - 63);
477 xx.i[1] = 0x80000000;
478 xx.i[0] = 0;
479 yy.e += xx.e;
480 }
481 }
482 } else {
483 /* set sign of zero result according to rounding direction */
484 rm = oldcwsw & 0x0c000000;
485 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
486 }
487
488 /*
489 * restore the control and status words and convert the result
490 * to double
491 */
492 __fenv_setcwsw(&oldcwsw);
493 return ((double) yy.e);
494 }
495
496 #if 0
497 /*
498 * another fma for x86: assumes return value will be left in
499 * long double (80-bit double extended) precision
500 */
501 long double
502 __fma(double x, double y, double z) {
503 union {
504 unsigned i[3];
505 long double e;
506 } xx, yy, zz, tt;
507 long double xe, ye, xhi, xlo, yhi, ylo, zhi, zlo;
508 int ex, ey, ez;
509 unsigned cwsw, oldcwsw, s;
510
511 /* convert the operands to double extended */
512 xx.e = (long double) x;
513 yy.e = (long double) y;
514 zz.e = (long double) z;
515
516 /* extract the exponents of the arguments */
517 ex = xx.i[2] & 0x7fff;
518 ey = yy.i[2] & 0x7fff;
519 ez = zz.i[2] & 0x7fff;
520
521 /* dispense with inf, nan, and zero cases */
522 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
523 /* x or y is inf, nan, or zero */
524 return (xx.e * yy.e + zz.e);
525
526 if (ez >= 0x7fff) /* z is inf or nan */
527 return (xx.e + zz.e); /* avoid spurious inexact in x * y */
528
529 if (ez == 0) /* z is zero */
530 return (xx.e * yy.e); /* x * y isn't zero; no need to add z */
531
532 /*
533 * save the control and status words, mask all exceptions, and
534 * set rounding to 64-bit precision and to-nearest
535 */
536 __fenv_getcwsw(&oldcwsw);
537 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
538 __fenv_setcwsw(&cwsw);
539
540 /* multiply x*y to 106 bits */
541 xe = xx.e;
542 xx.i[0] = 0;
543 xhi = xx.e; /* hi 32 bits */
544 xlo = xe - xhi; /* lo 21 bits */
545 ye = yy.e;
546 yy.i[0] = 0;
547 yhi = yy.e;
548 ylo = ye - yhi;
549 xx.e = xe * ye;
550 xx.i[0] &= ~0x7ff; /* 53 bits of x*y */
551 yy.e = ((xhi * yhi - xx.e) + xhi * ylo + xlo * yhi) + xlo * ylo;
552
553 /* reduce to a sum of two terms */
554 if (yy.e != 0.0) {
555 ex = xx.i[2] & 0x7fff;
556 if (ez - ex > 10) {
557 /* collapse y into a single bit and add to x */
558 yy.i[0] = 0;
559 yy.i[1] = 0x80000000;
560 yy.i[2] = (yy.i[2] & 0x8000) | (ex - 60);
561 xx.e += yy.e;
562 } else if (ex - ez <= 10) {
563 xx.e += zz.e; /* exact */
564 zz.e = yy.e;
565 } else if (ex - ez <= 42) {
566 /* split z into two pieces */
567 tt.i[0] = 0;
568 tt.i[1] = 0x80000000;
569 tt.i[2] = ex + 11;
570 zhi = (zz.e + tt.e) - tt.e;
571 zlo = zz.e - zhi;
572 xx.e += zhi;
573 zz.e = yy.e + zlo;
574 } else if (ex - ez <= 63) {
575 zz.e += yy.e; /* exact */
576 } else if (ex - ez <= 106) {
577 /*
578 * collapse the tail of z into a sticky bit and add z
579 * to y without error
580 */
581 if (ex - ez <= 81) {
582 s = 1 << (ex - ez - 50);
583 if (zz.i[0] & (s - 1))
584 zz.i[0] |= s;
585 zz.i[0] &= ~(s - 1);
586 } else {
587 s = 1 << (ex - ez - 82);
588 if ((zz.i[1] & (s - 1)) | zz.i[0])
589 zz.i[1] |= s;
590 zz.i[1] &= ~(s - 1);
591 zz.i[0] = 0;
592 }
593 zz.e += yy.e;
594 } else {
595 /* collapse z into a single bit and add to y */
596 zz.i[0] = 0;
597 zz.i[1] = 0x80000000;
598 zz.i[2] = (zz.i[2] & 0x8000) | (ex - 113);
599 zz.e += yy.e;
600 }
601 }
602
603 /* restore the control and status words, and sum */
604 __fenv_setcwsw(&oldcwsw);
605 return (xx.e + zz.e);
606 }
607 #endif
608
609 #else
610 #error Unknown architecture
611 #endif