Print this page
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libm/common/m9x/fma.c
+++ new/usr/src/lib/libm/common/m9x/fma.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 27 * Use is subject to license terms.
28 28 */
29 29
30 30 #if defined(ELFOBJ)
31 31 #pragma weak fma = __fma
32 32 #endif
33 33
34 34 #include "libm.h"
35 35 #include "fma.h"
36 36 #include "fenv_inlines.h"
37 37
38 38 #if defined(__sparc)
39 39
40 40 static const union {
41 41 unsigned i[2]; /* i[0] = high-order word, i[1] = low-order word (big-endian SPARC) */
42 42 double d;
43 43 } C[] = {
44 44 { 0x3fe00000u, 0 }, /* 2^-1 */
45 45 { 0x40000000u, 0 }, /* 2^1 */
46 46 { 0x43300000u, 0 }, /* 2^52 */
47 47 { 0x41a00000u, 0 }, /* 2^27 */
48 48 { 0x3e500000u, 0 }, /* 2^-26 */
49 49 { 0x3df00000u, 0 }, /* 2^-32 */
50 50 { 0x3bf00000u, 0 }, /* 2^-64 */
51 51 { 0x7fe00000u, 0 }, /* 2^1023 */
52 52 { 0x00100000u, 0 }, /* 2^-1022, the smallest normal double */
53 53 { 0x00100001u, 0 } /* 2^-1022 * (1 + 2^-20) */
54 54 };
55 55
56 56 #define half C[0].d /* scales the product back into [1, 2) */
57 57 #define two C[1].d /* threshold for that renormalization */
58 58 #define two52 C[2].d /* scale factor used to normalize subnormal operands */
59 59 #define two27 C[3].d /* used with twom26 to split a significand into hi/lo parts */
60 60 #define twom26 C[4].d
61 61 #define twom32 C[5].d /* used to extract the low-order product bits */
62 62 #define twom64 C[6].d /* likewise, for the last word of the product */
63 63 #define huge C[7].d /* operand for raising overflow/inexact by real arithmetic */
64 64 #define tiny C[8].d /* operand for raising underflow/inexact by real arithmetic */
65 65 #define tiny2 C[9].d /* subtracted from tiny to raise underflow without inexact */
66 66
67 67 static const unsigned int fsr_rm = 0xc0000000u; /* fsr image with only the top two bits set; loaded to select round-to-negative-infinity */
68 68
↓ open down ↓ |
68 lines elided |
↑ open up ↑ |
69 69 /*
70 70 * fma for SPARC: 64-bit double precision, big-endian
 *
 * Returns x * y + z.  When x or y is inf, NaN, or zero, or z is inf, NaN,
 * or zero, the result is produced by ordinary floating-point arithmetic.
 * Otherwise the product is formed to 106 bits, aligned with z, and added
 * as a four-word integer (xy0..xy3 / z0..z3, most significant word first);
 * the sum is then rounded once using the rounding mode found in the saved
 * fsr (rm = fsr >> 30).  Exception bits are assembled in the local copy
 * of the fsr and written back on exit; if an exception that occurred has
 * its enable bit set, it is raised by performing a real arithmetic
 * operation instead.
 *
 * The fsr is switched to round-to-negative-infinity for the duration of
 * the computation, so the statement order below matters.
71 71 */
72 72 double
73 73 __fma(double x, double y, double z) {
74 74 union {
75 75 unsigned i[2];
76 76 double d;
77 77 } xx, yy, zz; /* i[0] is the high-order word (sign, exponent, top 20 fraction bits) */
78 78 double xhi, yhi, xlo, ylo, t;
79 - unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, rm, sticky;
80 - unsigned int fsr;
79 + unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
81 80 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
82 81 volatile double dummy; /* presumably volatile so the exception-raising arithmetic at the end is not optimized away */
83 82
84 83 /* extract the high order words of the arguments */
85 84 xx.d = x;
86 85 yy.d = y;
87 86 zz.d = z;
88 87 hx = xx.i[0] & ~0x80000000; /* high word with the sign bit cleared */
89 88 hy = yy.i[0] & ~0x80000000;
90 89 hz = zz.i[0] & ~0x80000000;
91 90
92 91 /* dispense with inf, nan, and zero cases */
93 92 if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
94 93 (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */
95 94 return (x * y + z);
96 95
97 96 if (hz >= 0x7ff00000) /* z is inf or nan */
98 97 return (x + z); /* avoid spurious under/overflow in x * y */
99 98
100 99 if ((hz | zz.i[1]) == 0) /* z is zero */
101 100 /*
102 101 * x * y isn't zero but could underflow to zero,
103 102 * so don't add z, lest we perturb the sign
104 103 */
105 104 return (x * y);
106 105
107 106 /*
108 107 * now x, y, and z are all finite and nonzero; save the fsr and
109 108 * set round-to-negative-infinity mode (and clear nonstandard
110 109 * mode before we try to scale subnormal operands)
111 110 */
112 111 __fenv_getfsr32(&fsr);
113 112 __fenv_setfsr32(&fsr_rm);
114 113
115 114 /* extract signs and exponents, and normalize subnormals */
116 115 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000; /* sign of the product */
117 116 sz = zz.i[0] & 0x80000000;
118 117 ex = hx >> 20; /* biased exponent field */
119 118 if (!ex) {
120 119 xx.d = x * two52;
121 120 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52; /* compensate for the 2^52 scaling */
122 121 }
123 122 ey = hy >> 20;
124 123 if (!ey) {
125 124 yy.d = y * two52;
126 125 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
127 126 }
128 127 ez = hz >> 20;
129 128 if (!ez) {
130 129 zz.d = z * two52;
131 130 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
132 131 }
133 132
134 133 /* multiply x*y to 106 bits */
135 134 exy = ex + ey - 0x3ff; /* biased exponent of the product */
136 135 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000; /* force the exponent field to 0x3ff: x becomes its significand in [1, 2) */
137 136 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
138 137 x = xx.d;
139 138 y = yy.d;
140 139 xhi = ((x + twom26) + two27) - two27; /* high part of x: a multiple of 2^-25 */
141 140 yhi = ((y + twom26) + two27) - two27;
142 141 xlo = x - xhi; /* remainder of x */
143 142 ylo = y - yhi;
144 143 x *= y; /* x = leading part of the product, as rounded by the hardware */
145 144 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo; /* y = trailing part: partial products minus x (presumably exact; TODO confirm) */
146 145 if (x >= two) {
147 146 x *= half; /* significand product is in [1, 4); bring it back to [1, 2) */
148 147 y *= half;
149 148 exy++;
150 149 }
151 150
152 151 /* extract the significands */
153 152 xx.d = x;
154 153 xy0 = (xx.i[0] & 0xfffff) | 0x100000; /* top 20 fraction bits plus the integer bit */
155 154 xy1 = xx.i[1];
156 155 yy.d = t = y + twom32; /* presumably lines up the next 32 product bits in the low word; TODO confirm */
157 156 xy2 = yy.i[1];
158 157 yy.d = (y - (t - twom32)) + twom64; /* same idea for what is left of y after removing the xy2 bits */
159 158 xy3 = yy.i[1];
160 159 z0 = (zz.i[0] & 0xfffff) | 0x100000;
161 160 z1 = zz.i[1];
162 161 z2 = z3 = 0; /* z has no bits below its low-order word */
163 162
164 163 /*
165 164 * now x*y is represented by sxy, exy, and xy[0-3], and z is
166 165 * represented likewise; swap if need be so |xy| <= |z|
167 166 */
168 167 if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
169 168 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
170 169 e = sxy; sxy = sz; sz = e;
171 170 e = exy; exy = ez; ez = e;
172 171 e = xy0; xy0 = z0; z0 = e;
173 172 e = xy1; xy1 = z1; z1 = e;
174 173 z2 = xy2; xy2 = 0; /* the old z2, z3 were zero, so a plain move suffices */
175 174 z3 = xy3; xy3 = 0;
176 175 }
177 176
178 177 /* shift the significand of xy keeping a sticky bit */
179 178 e = ez - exy; /* right-shift count; nonnegative after the swap above */
180 179 if (e > 116) {
181 180 xy0 = xy1 = xy2 = 0; /* shifted out entirely: only a sticky bit remains */
182 181 xy3 = 1;
183 182 } else if (e >= 96) {
184 183 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e)); /* shift done in two steps so each count stays below 32 */
185 184 xy3 = xy0 >> (e - 96);
186 185 if (sticky)
187 186 xy3 |= 1;
188 187 xy0 = xy1 = xy2 = 0;
189 188 } else if (e >= 64) {
190 189 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
191 190 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
192 191 if (sticky)
193 192 xy3 |= 1;
194 193 xy2 = xy0 >> (e - 64);
195 194 xy0 = xy1 = 0;
196 195 } else if (e >= 32) {
197 196 sticky = xy3 | ((xy2 << 1) << (63 - e));
198 197 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
199 198 if (sticky)
200 199 xy3 |= 1;
201 200 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
202 201 xy1 = xy0 >> (e - 32);
203 202 xy0 = 0;
204 203 } else if (e) {
205 204 sticky = (xy3 << 1) << (31 - e);
206 205 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
207 206 if (sticky)
208 207 xy3 |= 1;
209 208 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
210 209 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
211 210 xy0 >>= e;
212 211 }
213 212
214 213 /* if this is a magnitude subtract, negate the significand of xy */
215 214 if (sxy ^ sz) {
216 215 xy0 = ~xy0; /* four-word two's complement: invert the upper words, negate the lowest ... */
217 216 xy1 = ~xy1;
218 217 xy2 = ~xy2;
219 218 xy3 = -xy3;
220 219 if (xy3 == 0) /* ... and ripple the +1 upward while the lower words are zero */
221 220 if (++xy2 == 0)
222 221 if (++xy1 == 0)
223 222 xy0++;
224 223 }
225 224
226 225 /* add, propagating carries */
227 226 z3 += xy3;
228 227 e = (z3 < xy3); /* unsigned wraparound means a carry out of this word */
229 228 z2 += xy2;
230 229 if (e) {
231 230 z2++;
232 231 e = (z2 <= xy2); /* with a carry in, equality also indicates a carry out */
233 232 } else
234 233 e = (z2 < xy2);
235 234 z1 += xy1;
236 235 if (e) {
237 236 z1++;
238 237 e = (z1 <= xy1);
239 238 } else
240 239 e = (z1 < xy1);
241 240 z0 += xy0;
242 241 if (e)
243 242 z0++;
244 243
245 244 /* postnormalize and collect rounding information into z2 */
246 245 if (ez < 1) {
247 246 /* result is tiny; shift right until exponent is within range */
248 247 e = 1 - ez;
249 248 if (e > 56) {
250 249 z2 = 1; /* result can't be exactly zero */
251 250 z0 = z1 = 0;
252 251 } else if (e >= 32) {
253 252 sticky = z3 | z2 | ((z1 << 1) << (63 - e));
254 253 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
255 254 if (sticky)
256 255 z2 |= 1;
257 256 z1 = z0 >> (e - 32);
258 257 z0 = 0;
259 258 } else {
260 259 sticky = z3 | (z2 << 1) << (31 - e);
261 260 z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
262 261 if (sticky)
263 262 z2 |= 1;
264 263 z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
265 264 z0 >>= e;
266 265 }
267 266 ez = 1;
268 267 } else if (z0 >= 0x200000) {
269 268 /* carry out; shift right by one */
270 269 sticky = (z2 & 1) | z3;
271 270 z2 = (z2 >> 1) | (z1 << 31);
272 271 if (sticky)
273 272 z2 |= 1;
274 273 z1 = (z1 >> 1) | (z0 << 31);
275 274 z0 >>= 1;
276 275 ez++;
277 276 } else {
278 277 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
279 278 /*
280 279 * borrow/cancellation; shift left as much as
281 280 * exponent allows
282 281 */
283 282 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) { /* a whole word at a time while the top 43 bits are zero */
284 283 z0 = z1;
285 284 z1 = z2;
286 285 z2 = z3;
287 286 z3 = 0;
288 287 ez -= 32;
289 288 }
290 289 while (z0 < 0x100000 && ez > 1) { /* then bit by bit until the integer bit is set or the exponent bottoms out */
291 290 z0 = (z0 << 1) | (z1 >> 31);
292 291 z1 = (z1 << 1) | (z2 >> 31);
293 292 z2 = (z2 << 1) | (z3 >> 31);
294 293 z3 <<= 1;
295 294 ez--;
296 295 }
297 296 }
298 297 if (z3)
299 298 z2 |= 1; /* fold whatever remains in z3 into the sticky bit of z2 */
300 299 }
301 300
302 301 /* get the rounding mode and clear current exceptions */
303 302 rm = fsr >> 30; /* taken from the fsr saved on entry, i.e. the caller's mode */
304 303 fsr &= ~FSR_CEXC;
305 304
306 305 /* strip off the integer bit, if there is one */
307 306 ibit = z0 & 0x100000;
308 307 if (ibit)
309 308 z0 -= 0x100000;
310 309 else {
311 310 ez = 0; /* no integer bit: encode as subnormal or zero */
312 311 if (!(z0 | z1 | z2)) { /* exact zero */
313 312 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0; /* negative zero only when rm equals FSR_RM */
314 313 zz.i[1] = 0;
315 314 __fenv_setfsr32(&fsr);
316 315 return (zz.d);
317 316 }
318 317 }
319 318
320 319 /*
321 320 * flip the sense of directed roundings if the result is negative;
322 321 * the logic below applies to a positive result
323 322 */
324 323 if (sz)
325 324 rm ^= rm >> 1; /* maps 2 <-> 3 and leaves 0 and 1 unchanged */
326 325
327 326 /* round and raise exceptions */
328 327 if (z2) {
329 328 fsr |= FSR_NXC;
330 329
331 330 /* decide whether to round the fraction up */
332 331 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
333 332 (z2 == 0x80000000u && (z1 & 1))))) { /* above half, or exactly half with an odd low bit */
334 333 /* round up and renormalize if necessary */
335 334 if (++z1 == 0) {
336 335 if (++z0 == 0x100000) { /* fraction overflowed into the integer-bit position */
337 336 z0 = 0;
338 337 ez++;
339 338 }
340 339 }
341 340 }
342 341 }
343 342
344 343 /* check for under/overflow */
345 344 if (ez >= 0x7ff) {
346 345 if (rm == FSR_RN || rm == FSR_RP) {
347 346 zz.i[0] = sz | 0x7ff00000; /* infinity with the sign of the result */
348 347 zz.i[1] = 0;
349 348 } else {
350 349 zz.i[0] = sz | 0x7fefffff; /* largest finite magnitude */
351 350 zz.i[1] = 0xffffffff;
352 351 }
353 352 fsr |= FSR_OFC | FSR_NXC;
354 353 } else {
355 354 zz.i[0] = sz | (ez << 20) | z0;
356 355 zz.i[1] = z1;
357 356
358 357 /*
359 358 * !ibit => exact result was tiny before rounding,
360 359 * z2 nonzero => result delivered is inexact
361 360 */
362 361 if (!ibit) {
363 362 if (z2)
364 363 fsr |= FSR_UFC | FSR_NXC;
365 364 else if (fsr & FSR_UFM) /* exact but tiny: flag underflow only when FSR_UFM is set */
366 365 fsr |= FSR_UFC;
367 366 }
368 367 }
369 368
370 369 /* restore the fsr and emulate exceptions as needed */
371 370 if ((fsr & FSR_CEXC) & (fsr >> 23)) { /* presumably: a current exception whose trap-enable bit is set (TODO confirm fsr layout) */
372 371 __fenv_setfsr32(&fsr);
373 372 if (fsr & FSR_OFC) {
374 373 dummy = huge;
375 374 dummy *= huge; /* huge * huge exceeds the double range */
376 375 } else if (fsr & FSR_UFC) {
377 376 dummy = tiny;
378 377 if (fsr & FSR_NXC)
379 378 dummy *= tiny; /* tiny * tiny is far below the subnormal range */
380 379 else
381 380 dummy -= tiny2; /* tiny - tiny2 is an exactly representable subnormal */
382 381 } else {
383 382 dummy = huge;
384 383 dummy += tiny; /* huge + tiny cannot be represented exactly */
385 384 }
386 385 } else {
387 386 fsr |= (fsr & 0x1f) << 5; /* OR the low five bits into bits 5-9 (presumably current -> accrued exceptions) */
388 387 __fenv_setfsr32(&fsr);
389 388 }
390 389 return (zz.d);
391 390 }
392 391
393 392 #elif defined(__x86)
394 393
395 394 #if defined(__amd64)
396 395 #define NI 4 /* number of unsigned words overlaid on a long double in the union below */
397 396 #else
398 397 #define NI 3
399 398 #endif
400 399
401 400 /*
402 401 * fma for x86: 64-bit double precision, little-endian
 *
 * Returns x * y + z.  The operands are widened to long double; when x or
 * y is inf, NaN, or zero, or z is inf or NaN, the result comes from
 * ordinary extended arithmetic.  Otherwise the product is formed as a
 * pair (xe, ye), the three-term sum xe + ye + z is distilled into a
 * leading value yy.e plus residuals, and yy.e is nudged when needed so
 * that the final conversion to double is not misled by the earlier
 * rounding to extended precision.  The control and status words are
 * saved on entry and restored just before that conversion.
403 402 */
404 403 double
405 404 __fma(double x, double y, double z) {
406 405 union {
407 406 unsigned i[NI];
408 407 long double e;
409 408 } xx, yy, zz; /* i[0], i[1]: low and high significand words; i[2]: sign (0x8000) and exponent (0x7fff) */
410 409 long double xe, ye, xhi, xlo, yhi, ylo;
411 410 int ex, ey, ez;
412 411 unsigned cwsw, oldcwsw, rm;
413 412
414 413 /* convert the operands to double extended */
415 414 xx.e = (long double) x;
416 415 yy.e = (long double) y;
417 416 zz.e = (long double) z;
418 417
419 418 /* extract the exponents of the arguments */
420 419 ex = xx.i[2] & 0x7fff;
421 420 ey = yy.i[2] & 0x7fff;
422 421 ez = zz.i[2] & 0x7fff;
423 422
424 423 /* dispense with inf, nan, and zero cases */
425 424 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
426 425 /* x or y is inf, nan, or zero */
427 426 return ((double) (xx.e * yy.e + zz.e));
428 427
429 428 if (ez >= 0x7fff) /* z is inf or nan */
430 429 return ((double) (xx.e + zz.e));
431 430 /* avoid spurious inexact in x * y */
432 431
433 432 /*
434 433 * save the control and status words, mask all exceptions, and
435 434 * set rounding to 64-bit precision and to-nearest
436 435 */
437 436 __fenv_getcwsw(&oldcwsw);
438 437 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000; /* presumably the control word sits in the upper 16 bits of cwsw; TODO confirm */
439 438 __fenv_setcwsw(&cwsw);
440 439
441 440 /* multiply x*y to 106 bits */
442 441 xe = xx.e;
443 442 xx.i[0] = 0; /* clear the low significand word, leaving the high part */
444 443 xhi = xx.e; /* hi 32 bits */
445 444 xlo = xe - xhi; /* lo 21 bits */
446 445 ye = yy.e;
447 446 yy.i[0] = 0;
448 447 yhi = yy.e;
449 448 ylo = ye - yhi;
450 449 xe = xe * ye; /* xe = leading part of the product, rounded to extended precision */
451 450 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo; /* ye = trailing part: partial products minus xe */
452 451
453 452 /* distill the sum of xe, ye, and z */
454 453 xhi = ye + zz.e;
455 454 yhi = xhi - ye;
456 455 xlo = (zz.e - yhi) + (ye - (xhi - yhi));
457 456 /* now (xhi,xlo) = ye + z */
458 457
459 458 yhi = xe + xhi;
460 459 ye = yhi - xe;
461 460 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */
462 461
463 462 xhi = xlo + ylo;
464 463 xe = xhi - xlo;
465 464 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */
466 465
467 466 yy.e = yhi + xhi;
468 467 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */
469 468
470 469 if (yy.i[1] != 0) { /* yy.e is nonzero */
471 470 /* perturb yy.e if its least significant 10 bits are zero */
472 471 if (!(yy.i[0] & 0x3ff)) {
473 472 xx.e = ylo + xlo; /* residual; only its sign and nonzero-ness are used below */
474 473 if (xx.i[1] != 0) {
475 474 xx.i[2] = (xx.i[2] & 0x8000) |
476 475 ((yy.i[2] & 0x7fff) - 63); /* keep the residual's sign; exponent = exponent of yy.e minus 63 */
477 476 xx.i[1] = 0x80000000; /* significand = 1.0, so xx.e is one unit in the last place of yy.e */
478 477 xx.i[0] = 0;
479 478 yy.e += xx.e;
480 479 }
481 480 }
482 481 } else {
483 482 /* set sign of zero result according to rounding direction */
484 483 rm = oldcwsw & 0x0c000000; /* rounding-direction bits of the caller's saved mode */
485 484 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0); /* negative zero only when rm equals FCW_RM */
486 485 }
487 486
488 487 /*
489 488 * restore the control and status words and convert the result
490 489 * to double
491 490 */
492 491 __fenv_setcwsw(&oldcwsw);
493 492 return ((double) yy.e);
494 493 }
495 494
496 495 #else
497 496 #error Unknown architecture
498 497 #endif
↓ open down ↓ |
408 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX