Print this page
5262 libm needs to be carefully unifdef'd
5268 libm doesn't need to hide symbols which are already local
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libm/common/m9x/fma.c
+++ new/usr/src/lib/libm/common/m9x/fma.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
↓ open down ↓ |
19 lines elided |
↑ open up ↑ |
20 20 */
21 21
22 22 /*
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 27 * Use is subject to license terms.
28 28 */
29 29
30 -#if defined(ELFOBJ)
31 30 #pragma weak fma = __fma
32 -#endif
33 31
34 32 #include "libm.h"
35 33 #include "fma.h"
36 34 #include "fenv_inlines.h"
37 35
38 36 #if defined(__sparc)
39 37
/*
 * Table of double-precision constants spelled out as two 32-bit words.
 * The code below reads i[0] as the word holding the sign and exponent
 * (big-endian layout), so only the high word is nonzero here.  Each
 * entry is accessed through the macros that follow the table.
 */
40 38 static const union {
41 39 unsigned i[2];
42 40 double d;
43 41 } C[] = {
44 42 { 0x3fe00000u, 0 }, /* 0.5 */
45 43 { 0x40000000u, 0 }, /* 2.0 */
46 44 { 0x43300000u, 0 }, /* 2^52 */
47 45 { 0x41a00000u, 0 }, /* 2^27 */
48 46 { 0x3e500000u, 0 }, /* 2^-26 */
49 47 { 0x3df00000u, 0 }, /* 2^-32 */
50 48 { 0x3bf00000u, 0 }, /* 2^-64 */
51 49 { 0x7fe00000u, 0 }, /* 2^1023 */
52 50 { 0x00100000u, 0 }, /* 2^-1022, the smallest normal double */
53 51 { 0x00100001u, 0 } /* 2^-1022 * (1 + 2^-20) */
54 52 };
55 53
/* symbolic names for the table entries above */
56 54 #define half C[0].d
57 55 #define two C[1].d
58 56 #define two52 C[2].d
59 57 #define two27 C[3].d
60 58 #define twom26 C[4].d
61 59 #define twom32 C[5].d
62 60 #define twom64 C[6].d
63 61 #define huge C[7].d
64 62 #define tiny C[8].d
65 63 #define tiny2 C[9].d
66 64
/*
 * fsr image loaded by __fma while it works: only the top two bits are
 * set, which __fma's own comment describes as round-to-negative-infinity
 * mode with nonstandard mode cleared.
 */
67 65 static const unsigned int fsr_rm = 0xc0000000u;
68 66
69 67 /*
70 68 * fma for SPARC: 64-bit double precision, big-endian
71 69 */
/*
 * Returns x * y + z.
 *
 * Inf, NaN and zero operands are handed to ordinary floating-point
 * expressions.  For finite nonzero operands the product is formed to
 * 106 bits, held as four unsigned words (xy0..xy3) plus a sign and an
 * exponent, added to or subtracted from z in integer arithmetic with a
 * sticky bit, and then rounded according to the rounding mode found in
 * the caller's fsr.  The caller's fsr is saved on entry and written
 * back (with updated exception bits) before returning.
 */
72 70 double
73 71 __fma(double x, double y, double z) {
74 72 union {
75 73 unsigned i[2];
76 74 double d;
77 75 } xx, yy, zz;
78 76 double xhi, yhi, xlo, ylo, t;
79 77 unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
80 78 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
/*
 * dummy is only used at the end to perform a floating-point operation
 * that raises an exception; presumably it is volatile so the compiler
 * cannot discard that operation.
 */
81 79 volatile double dummy;
82 80
83 81 /* extract the high order words of the arguments */
84 82 xx.d = x;
85 83 yy.d = y;
86 84 zz.d = z;
/* hx, hy, hz: high words with the sign bit cleared */
87 85 hx = xx.i[0] & ~0x80000000;
88 86 hy = yy.i[0] & ~0x80000000;
89 87 hz = zz.i[0] & ~0x80000000;
90 88
91 89 /* dispense with inf, nan, and zero cases */
92 90 if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
93 91 (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */
94 92 return (x * y + z);
95 93
96 94 if (hz >= 0x7ff00000) /* z is inf or nan */
97 95 return (x + z); /* avoid spurious under/overflow in x * y */
98 96
99 97 if ((hz | zz.i[1]) == 0) /* z is zero */
100 98 /*
101 99 * x * y isn't zero but could underflow to zero,
102 100 * so don't add z, lest we perturb the sign
103 101 */
104 102 return (x * y);
105 103
106 104 /*
107 105 * now x, y, and z are all finite and nonzero; save the fsr and
108 106 * set round-to-negative-infinity mode (and clear nonstandard
109 107 * mode before we try to scale subnormal operands)
110 108 */
111 109 __fenv_getfsr32(&fsr);
112 110 __fenv_setfsr32(&fsr_rm);
113 111
114 112 /* extract signs and exponents, and normalize subnormals */
/* sxy is the sign of the product: the xor of the operand sign bits */
115 113 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
116 114 sz = zz.i[0] & 0x80000000;
/*
 * A zero exponent field here means a subnormal (zero was excluded
 * above): scale it by 2^52 and subtract 52 from the exponent read
 * back from the scaled value.  The same is done for y and z.
 */
117 115 ex = hx >> 20;
118 116 if (!ex) {
119 117 xx.d = x * two52;
120 118 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
121 119 }
122 120 ey = hy >> 20;
123 121 if (!ey) {
124 122 yy.d = y * two52;
125 123 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
126 124 }
127 125 ez = hz >> 20;
128 126 if (!ez) {
129 127 zz.d = z * two52;
130 128 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
131 129 }
132 130
133 131 /* multiply x*y to 106 bits */
/* biased exponent of the product; one copy of the bias 0x3ff is removed */
134 132 exy = ex + ey - 0x3ff;
/*
 * Force the exponent fields to 0x3ff and drop the sign bits, so that
 * x and y become the operands' significands as values in [1, 2).
 */
135 133 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
136 134 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
137 135 x = xx.d;
138 136 y = yy.d;
/*
 * Split each significand into a high part and a remainder: adding and
 * then subtracting two27 discards the bits of x below the rounding
 * position of a value in [2^27, 2^28); xlo and ylo are what is left.
 */
139 137 xhi = ((x + twom26) + two27) - two27;
140 138 yhi = ((y + twom26) + two27) - two27;
141 139 xlo = x - xhi;
142 140 ylo = y - yhi;
/*
 * x becomes the rounded product and y the part of the product that x
 * does not hold, reconstructed from the four partial products.
 */
141 141 x *= y;
142 142 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
/* renormalize so the leading part of the product is below 2 */
143 143 if (x >= two) {
144 144 x *= half;
145 145 y *= half;
146 146 exy++;
147 147 }
148 148
149 149 /* extract the significands */
150 150 xx.d = x;
/* xy0: 20 fraction bits plus the explicit integer bit 0x100000 */
151 151 xy0 = (xx.i[0] & 0xfffff) | 0x100000;
152 152 xy1 = xx.i[1];
/*
 * The remainder y is added to twom32 and then to twom64 so that its
 * successive 32-bit groups can be read from the low word of the sum
 * (presumably relying on the rounding mode set above to truncate).
 */
153 153 yy.d = t = y + twom32;
154 154 xy2 = yy.i[1];
155 155 yy.d = (y - (t - twom32)) + twom64;
156 156 xy3 = yy.i[1];
/* z has only 53 significant bits, so its two low words start as zero */
157 157 z0 = (zz.i[0] & 0xfffff) | 0x100000;
158 158 z1 = zz.i[1];
159 159 z2 = z3 = 0;
160 160
161 161 /*
162 162 * now x*y is represented by sxy, exy, and xy[0-3], and z is
163 163 * represented likewise; swap if need be so |xy| <= |z|
164 164 */
165 165 if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
166 166 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
167 167 e = sxy; sxy = sz; sz = e;
168 168 e = exy; exy = ez; ez = e;
169 169 e = xy0; xy0 = z0; z0 = e;
170 170 e = xy1; xy1 = z1; z1 = e;
171 171 z2 = xy2; xy2 = 0;
172 172 z3 = xy3; xy3 = 0;
173 173 }
174 174
175 175 /* shift the significand of xy keeping a sticky bit */
/*
 * e is the number of bit positions to shift right.  The cases below
 * handle the shift in 32-bit steps; any nonzero bits shifted out are
 * recorded by setting the low bit of xy3.  Left shifts are written as
 * (w << 1) << (k - 1 - e) so that the shift count never reaches 32.
 */
176 176 e = ez - exy;
177 177 if (e > 116) {
178 178 xy0 = xy1 = xy2 = 0;
179 179 xy3 = 1;
180 180 } else if (e >= 96) {
181 181 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
182 182 xy3 = xy0 >> (e - 96);
183 183 if (sticky)
184 184 xy3 |= 1;
185 185 xy0 = xy1 = xy2 = 0;
186 186 } else if (e >= 64) {
187 187 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
188 188 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
189 189 if (sticky)
190 190 xy3 |= 1;
191 191 xy2 = xy0 >> (e - 64);
192 192 xy0 = xy1 = 0;
193 193 } else if (e >= 32) {
194 194 sticky = xy3 | ((xy2 << 1) << (63 - e));
195 195 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
196 196 if (sticky)
197 197 xy3 |= 1;
198 198 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
199 199 xy1 = xy0 >> (e - 32);
200 200 xy0 = 0;
201 201 } else if (e) {
202 202 sticky = (xy3 << 1) << (31 - e);
203 203 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
204 204 if (sticky)
205 205 xy3 |= 1;
206 206 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
207 207 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
208 208 xy0 >>= e;
209 209 }
210 210
211 211 /* if this is a magnitude subtract, negate the significand of xy */
/*
 * Two's-complement negation of the four-word value: the upper words
 * are inverted, the lowest is negated, and a carry is propagated
 * upward whenever a word wraps to zero.
 */
212 212 if (sxy ^ sz) {
213 213 xy0 = ~xy0;
214 214 xy1 = ~xy1;
215 215 xy2 = ~xy2;
216 216 xy3 = -xy3;
217 217 if (xy3 == 0)
218 218 if (++xy2 == 0)
219 219 if (++xy1 == 0)
220 220 xy0++;
221 221 }
222 222
223 223 /* add, propagating carries */
/*
 * e is reused as the carry flag between words: after an unsigned add
 * the sum is smaller than the addend exactly when it wrapped (or not
 * larger, when a carry-in of one was also added).
 */
224 224 z3 += xy3;
225 225 e = (z3 < xy3);
226 226 z2 += xy2;
227 227 if (e) {
228 228 z2++;
229 229 e = (z2 <= xy2);
230 230 } else
231 231 e = (z2 < xy2);
232 232 z1 += xy1;
233 233 if (e) {
234 234 z1++;
235 235 e = (z1 <= xy1);
236 236 } else
237 237 e = (z1 < xy1);
238 238 z0 += xy0;
239 239 if (e)
240 240 z0++;
241 241
242 242 /* postnormalize and collect rounding information into z2 */
/*
 * After this step z0:z1 hold the result significand and z2 holds the
 * bits below it, with its low bit acting as a sticky bit for anything
 * further down (including z3).
 */
243 243 if (ez < 1) {
244 244 /* result is tiny; shift right until exponent is within range */
245 245 e = 1 - ez;
246 246 if (e > 56) {
247 247 z2 = 1; /* result can't be exactly zero */
248 248 z0 = z1 = 0;
249 249 } else if (e >= 32) {
250 250 sticky = z3 | z2 | ((z1 << 1) << (63 - e));
251 251 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
252 252 if (sticky)
253 253 z2 |= 1;
254 254 z1 = z0 >> (e - 32);
255 255 z0 = 0;
256 256 } else {
257 257 sticky = z3 | (z2 << 1) << (31 - e);
258 258 z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
259 259 if (sticky)
260 260 z2 |= 1;
261 261 z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
262 262 z0 >>= e;
263 263 }
264 264 ez = 1;
265 265 } else if (z0 >= 0x200000) {
266 266 /* carry out; shift right by one */
267 267 sticky = (z2 & 1) | z3;
268 268 z2 = (z2 >> 1) | (z1 << 31);
269 269 if (sticky)
270 270 z2 |= 1;
271 271 z1 = (z1 >> 1) | (z0 << 31);
272 272 z0 >>= 1;
273 273 ez++;
274 274 } else {
275 275 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
276 276 /*
277 277 * borrow/cancellation; shift left as much as
278 278 * exponent allows
279 279 */
/* first by whole 32-bit words while the top 43 bits are all zero ... */
280 280 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
281 281 z0 = z1;
282 282 z1 = z2;
283 283 z2 = z3;
284 284 z3 = 0;
285 285 ez -= 32;
286 286 }
/* ... then bit by bit until the integer bit is set or ez reaches 1 */
287 287 while (z0 < 0x100000 && ez > 1) {
288 288 z0 = (z0 << 1) | (z1 >> 31);
289 289 z1 = (z1 << 1) | (z2 >> 31);
290 290 z2 = (z2 << 1) | (z3 >> 31);
291 291 z3 <<= 1;
292 292 ez--;
293 293 }
294 294 }
295 295 if (z3)
296 296 z2 |= 1;
297 297 }
298 298
299 299 /* get the rounding mode and clear current exceptions */
/* the rounding mode is taken from the top two bits of the saved fsr */
300 300 rm = fsr >> 30;
301 301 fsr &= ~FSR_CEXC;
302 302
303 303 /* strip off the integer bit, if there is one */
304 304 ibit = z0 & 0x100000;
305 305 if (ibit)
306 306 z0 -= 0x100000;
307 307 else {
/* no integer bit: the result is delivered with a zero exponent field */
308 308 ez = 0;
309 309 if (!(z0 | z1 | z2)) { /* exact zero */
/* the zero is negative only when the caller's mode is FSR_RM */
310 310 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
311 311 zz.i[1] = 0;
312 312 __fenv_setfsr32(&fsr);
313 313 return (zz.d);
314 314 }
315 315 }
316 316
317 317 /*
318 318 * flip the sense of directed roundings if the result is negative;
319 319 * the logic below applies to a positive result
320 320 */
/* rm ^= rm >> 1 exchanges the values 2 and 3 and leaves 0 and 1 alone */
321 321 if (sz)
322 322 rm ^= rm >> 1;
323 323
324 324 /* round and raise exceptions */
325 325 if (z2) {
326 326 fsr |= FSR_NXC;
327 327
328 328 /* decide whether to round the fraction up */
/*
 * Round up in FSR_RP mode, or in FSR_RN mode when z2 is above the
 * halfway value 0x80000000, or exactly halfway with z1 odd.
 */
329 329 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
330 330 (z2 == 0x80000000u && (z1 & 1))))) {
331 331 /* round up and renormalize if necessary */
332 332 if (++z1 == 0) {
333 333 if (++z0 == 0x100000) {
334 334 z0 = 0;
335 335 ez++;
336 336 }
337 337 }
338 338 }
339 339 }
340 340
341 341 /* check for under/overflow */
342 342 if (ez >= 0x7ff) {
/*
 * Overflow: deliver infinity in the FSR_RN and FSR_RP cases, otherwise
 * the largest finite magnitude, with the sign of the result.
 */
343 343 if (rm == FSR_RN || rm == FSR_RP) {
344 344 zz.i[0] = sz | 0x7ff00000;
345 345 zz.i[1] = 0;
346 346 } else {
347 347 zz.i[0] = sz | 0x7fefffff;
348 348 zz.i[1] = 0xffffffff;
349 349 }
350 350 fsr |= FSR_OFC | FSR_NXC;
351 351 } else {
/* assemble sign, exponent field and fraction into the result */
352 352 zz.i[0] = sz | (ez << 20) | z0;
353 353 zz.i[1] = z1;
354 354
355 355 /*
356 356 * !ibit => exact result was tiny before rounding,
357 357 * z2 nonzero => result delivered is inexact
358 358 */
/*
 * An exact tiny result sets FSR_UFC only when FSR_UFM is set in the
 * saved fsr (presumably the underflow trap-enable bit -- confirm).
 */
359 359 if (!ibit) {
360 360 if (z2)
361 361 fsr |= FSR_UFC | FSR_NXC;
362 362 else if (fsr & FSR_UFM)
363 363 fsr |= FSR_UFC;
364 364 }
365 365 }
366 366
367 367 /* restore the fsr and emulate exceptions as needed */
/*
 * NOTE(review): fsr >> 23 presumably aligns the trap-enable bits with
 * the current-exception bits, so the test asks whether any exception
 * recorded above has its trap enabled -- confirm against the FSR
 * layout.  If so, the fsr is written back and an operation on dummy is
 * performed: huge * huge when FSR_OFC is set; otherwise, when FSR_UFC
 * is set, tiny * tiny if FSR_NXC is also set or tiny - tiny2 if not;
 * otherwise huge + tiny.
 */
368 368 if ((fsr & FSR_CEXC) & (fsr >> 23)) {
369 369 __fenv_setfsr32(&fsr);
370 370 if (fsr & FSR_OFC) {
371 371 dummy = huge;
372 372 dummy *= huge;
373 373 } else if (fsr & FSR_UFC) {
374 374 dummy = tiny;
375 375 if (fsr & FSR_NXC)
376 376 dummy *= tiny;
377 377 else
378 378 dummy -= tiny2;
379 379 } else {
380 380 dummy = huge;
381 381 dummy += tiny;
382 382 }
383 383 } else {
/*
 * Otherwise copy the low five bits of fsr into bits 5-9 (presumably
 * current exceptions into the accrued field -- confirm) and restore.
 */
384 384 fsr |= (fsr & 0x1f) << 5;
385 385 __fenv_setfsr32(&fsr);
386 386 }
387 387 return (zz.d);
388 388 }
391 389
392 390 #elif defined(__x86)
393 391
/*
 * NI is the number of unsigned words in the union that __fma below
 * overlays on a long double: 4 when __amd64 is defined, 3 otherwise
 * (presumably matching the storage size of long double on each
 * target -- confirm).
 */
394 392 #if defined(__amd64)
395 393 #define NI 4
396 394 #else
397 395 #define NI 3
398 396 #endif
399 397
400 398 /*
401 399 * fma for x86: 64-bit double precision, little-endian
402 400 */
/*
 * Returns x * y + z.
 *
 * The operands are widened to long double.  Inf, NaN and zero cases
 * are handed to ordinary expressions.  Otherwise the product is split
 * into a leading part and a remainder, the three terms are summed with
 * their rounding errors tracked, and the extended-precision sum is
 * nudged where necessary before the single final conversion to double.
 * The caller's control and status words are saved on entry and
 * restored before returning.
 */
403 401 double
404 402 __fma(double x, double y, double z) {
405 403 union {
406 404 unsigned i[NI];
407 405 long double e;
408 406 } xx, yy, zz;
409 407 long double xe, ye, xhi, xlo, yhi, ylo;
410 408 int ex, ey, ez;
411 409 unsigned cwsw, oldcwsw, rm;
412 410
413 411 /* convert the operands to double extended */
414 412 xx.e = (long double) x;
415 413 yy.e = (long double) y;
416 414 zz.e = (long double) z;
417 415
418 416 /* extract the exponents of the arguments */
/* word 2 of the overlay holds the exponent in its low 15 bits */
419 417 ex = xx.i[2] & 0x7fff;
420 418 ey = yy.i[2] & 0x7fff;
421 419 ez = zz.i[2] & 0x7fff;
422 420
423 421 /* dispense with inf, nan, and zero cases */
424 422 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
425 423 /* x or y is inf, nan, or zero */
426 424 return ((double) (xx.e * yy.e + zz.e));
427 425
428 426 if (ez >= 0x7fff) /* z is inf or nan */
429 427 return ((double) (xx.e + zz.e));
430 428 /* avoid spurious inexact in x * y */
431 429
432 430 /*
433 431 * save the control and status words, mask all exceptions, and
434 432 * set rounding to 64-bit precision and to-nearest
435 433 */
436 434 __fenv_getcwsw(&oldcwsw);
437 435 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
438 436 __fenv_setcwsw(&cwsw);
439 437
440 438 /* multiply x*y to 106 bits */
/*
 * Clearing word 0 of the overlay leaves the leading part of each
 * operand in xhi/yhi; xlo/ylo are the differences.  xe then becomes
 * the rounded product and ye the remainder reconstructed from the
 * four partial products.
 */
441 439 xe = xx.e;
442 440 xx.i[0] = 0;
443 441 xhi = xx.e; /* hi 32 bits */
444 442 xlo = xe - xhi; /* lo 21 bits */
445 443 ye = yy.e;
446 444 yy.i[0] = 0;
447 445 yhi = yy.e;
448 446 ylo = ye - yhi;
449 447 xe = xe * ye;
450 448 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
451 449
452 450 /* distill the sum of xe, ye, and z */
/*
 * Each step below forms a sum and then recovers its rounding error
 * from the operands, as the trailing "now (hi,lo) = ..." comments
 * record.
 */
453 451 xhi = ye + zz.e;
454 452 yhi = xhi - ye;
455 453 xlo = (zz.e - yhi) + (ye - (xhi - yhi));
456 454 /* now (xhi,xlo) = ye + z */
457 455
458 456 yhi = xe + xhi;
459 457 ye = yhi - xe;
460 458 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */
461 459
462 460 xhi = xlo + ylo;
463 461 xe = xhi - xlo;
464 462 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */
465 463
466 464 yy.e = yhi + xhi;
467 465 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */
468 466
469 467 if (yy.i[1] != 0) { /* yy.e is nonzero */
470 468 /* perturb yy.e if its least significant 10 bits are zero */
/*
 * When the leftover ylo + xlo is nonzero, xx is rebuilt as a value
 * with the leftover's sign, an exponent 63 below that of yy.e, and
 * only the top significand bit set, and is added to yy.e --
 * presumably so that the final conversion to double rounds in the
 * direction of the leftover.
 */
471 469 if (!(yy.i[0] & 0x3ff)) {
472 470 xx.e = ylo + xlo;
473 471 if (xx.i[1] != 0) {
474 472 xx.i[2] = (xx.i[2] & 0x8000) |
475 473 ((yy.i[2] & 0x7fff) - 63);
476 474 xx.i[1] = 0x80000000;
477 475 xx.i[0] = 0;
478 476 yy.e += xx.e;
479 477 }
480 478 }
481 479 } else {
482 480 /* set sign of zero result according to rounding direction */
/* the zero is negative only when the caller's rounding field is FCW_RM */
483 481 rm = oldcwsw & 0x0c000000;
484 482 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
485 483 }
486 484
487 485 /*
488 486 * restore the control and status words and convert the result
489 487 * to double
490 488 */
491 489 __fenv_setcwsw(&oldcwsw);
492 490 return ((double) yy.e);
493 491 }
494 492
495 493 #else
496 494 #error Unknown architecture
497 495 #endif
↓ open down ↓ |
455 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX