Print this page
11210 libm should be cstyle(1ONBLD) clean
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libm/common/m9x/fma.c
+++ new/usr/src/lib/libm/common/m9x/fma.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
↓ open down ↓ |
14 lines elided |
↑ open up ↑ |
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 +
25 26 /*
26 27 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 28 * Use is subject to license terms.
28 29 */
29 30
30 31 #pragma weak fma = __fma
31 32
32 33 #include "libm.h"
33 34 #include "fma.h"
34 35 #include "fenv_inlines.h"
35 36
36 37 #if defined(__sparc)
37 -
38 38 static const union {
39 39 unsigned i[2];
40 40 double d;
41 41 } C[] = {
42 42 { 0x3fe00000u, 0 },
43 43 { 0x40000000u, 0 },
44 44 { 0x43300000u, 0 },
45 45 { 0x41a00000u, 0 },
46 46 { 0x3e500000u, 0 },
47 47 { 0x3df00000u, 0 },
48 48 { 0x3bf00000u, 0 },
49 49 { 0x7fe00000u, 0 },
50 50 { 0x00100000u, 0 },
51 51 { 0x00100001u, 0 }
52 52 };
53 53
54 -#define half C[0].d
55 -#define two C[1].d
56 -#define two52 C[2].d
57 -#define two27 C[3].d
58 -#define twom26 C[4].d
59 -#define twom32 C[5].d
60 -#define twom64 C[6].d
61 -#define huge C[7].d
62 -#define tiny C[8].d
63 -#define tiny2 C[9].d
54 +#define half C[0].d
55 +#define two C[1].d
56 +#define two52 C[2].d
57 +#define two27 C[3].d
58 +#define twom26 C[4].d
59 +#define twom32 C[5].d
60 +#define twom64 C[6].d
61 +#define huge C[7].d
62 +#define tiny C[8].d
63 +#define tiny2 C[9].d
64 64
65 65 static const unsigned int fsr_rm = 0xc0000000u;
66 66
67 67 /*
68 68 * fma for SPARC: 64-bit double precision, big-endian
69 69 */
70 70 double
71 -__fma(double x, double y, double z) {
71 +__fma(double x, double y, double z)
72 +{
72 73 union {
73 74 unsigned i[2];
74 75 double d;
75 76 } xx, yy, zz;
77 +
76 78 double xhi, yhi, xlo, ylo, t;
77 79 unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky;
78 80 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit;
79 - volatile double dummy;
81 + volatile double dummy;
80 82
81 83 /* extract the high order words of the arguments */
82 84 xx.d = x;
83 85 yy.d = y;
84 86 zz.d = z;
85 87 hx = xx.i[0] & ~0x80000000;
86 88 hy = yy.i[0] & ~0x80000000;
87 89 hz = zz.i[0] & ~0x80000000;
88 90
89 91 /* dispense with inf, nan, and zero cases */
90 - if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 ||
91 - (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */
92 + if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 || (hy |
93 + yy.i[1]) == 0) /* x or y is inf, nan, or zero */
92 94 return (x * y + z);
93 95
94 96 if (hz >= 0x7ff00000) /* z is inf or nan */
95 97 return (x + z); /* avoid spurious under/overflow in x * y */
96 98
97 - if ((hz | zz.i[1]) == 0) /* z is zero */
99 + if ((hz | zz.i[1]) == 0) /* z is zero */
98 100 /*
99 101 * x * y isn't zero but could underflow to zero,
100 102 * so don't add z, lest we perturb the sign
101 103 */
102 104 return (x * y);
103 105
104 106 /*
105 107 * now x, y, and z are all finite and nonzero; save the fsr and
106 108 * set round-to-negative-infinity mode (and clear nonstandard
107 109 * mode before we try to scale subnormal operands)
108 110 */
109 111 __fenv_getfsr32(&fsr);
110 112 __fenv_setfsr32(&fsr_rm);
111 113
112 114 /* extract signs and exponents, and normalize subnormals */
113 115 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000;
114 116 sz = zz.i[0] & 0x80000000;
115 117 ex = hx >> 20;
118 +
116 119 if (!ex) {
117 120 xx.d = x * two52;
118 121 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52;
119 122 }
123 +
120 124 ey = hy >> 20;
125 +
121 126 if (!ey) {
122 127 yy.d = y * two52;
123 128 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52;
124 129 }
130 +
125 131 ez = hz >> 20;
132 +
126 133 if (!ez) {
127 134 zz.d = z * two52;
128 135 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52;
129 136 }
130 137
131 138 /* multiply x*y to 106 bits */
132 139 exy = ex + ey - 0x3ff;
133 140 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000;
134 141 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000;
135 142 x = xx.d;
136 143 y = yy.d;
137 144 xhi = ((x + twom26) + two27) - two27;
138 145 yhi = ((y + twom26) + two27) - two27;
139 146 xlo = x - xhi;
140 147 ylo = y - yhi;
141 148 x *= y;
142 149 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo;
150 +
143 151 if (x >= two) {
144 152 x *= half;
145 153 y *= half;
146 154 exy++;
147 155 }
148 156
149 157 /* extract the significands */
150 158 xx.d = x;
151 159 xy0 = (xx.i[0] & 0xfffff) | 0x100000;
152 160 xy1 = xx.i[1];
153 161 yy.d = t = y + twom32;
154 162 xy2 = yy.i[1];
↓ open down ↓ |
2 lines elided |
↑ open up ↑ |
155 163 yy.d = (y - (t - twom32)) + twom64;
156 164 xy3 = yy.i[1];
157 165 z0 = (zz.i[0] & 0xfffff) | 0x100000;
158 166 z1 = zz.i[1];
159 167 z2 = z3 = 0;
160 168
161 169 /*
162 170 * now x*y is represented by sxy, exy, and xy[0-3], and z is
163 171 * represented likewise; swap if need be so |xy| <= |z|
164 172 */
165 - if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 &&
166 - (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) {
167 - e = sxy; sxy = sz; sz = e;
168 - e = exy; exy = ez; ez = e;
169 - e = xy0; xy0 = z0; z0 = e;
170 - e = xy1; xy1 = z1; z1 = e;
171 - z2 = xy2; xy2 = 0;
172 - z3 = xy3; xy3 = 0;
173 + if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && (xy1 > z1 ||
174 + (xy1 == z1 && (xy2 | xy3) != 0)))))) {
175 + e = sxy;
176 + sxy = sz;
177 + sz = e;
178 + e = exy;
179 + exy = ez;
180 + ez = e;
181 + e = xy0;
182 + xy0 = z0;
183 + z0 = e;
184 + e = xy1;
185 + xy1 = z1;
186 + z1 = e;
187 + z2 = xy2;
188 + xy2 = 0;
189 + z3 = xy3;
190 + xy3 = 0;
173 191 }
174 192
175 193 /* shift the significand of xy keeping a sticky bit */
176 194 e = ez - exy;
195 +
177 196 if (e > 116) {
178 197 xy0 = xy1 = xy2 = 0;
179 198 xy3 = 1;
180 199 } else if (e >= 96) {
181 200 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e));
182 201 xy3 = xy0 >> (e - 96);
202 +
183 203 if (sticky)
184 204 xy3 |= 1;
205 +
185 206 xy0 = xy1 = xy2 = 0;
186 207 } else if (e >= 64) {
187 208 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e));
188 209 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e));
210 +
189 211 if (sticky)
190 212 xy3 |= 1;
213 +
191 214 xy2 = xy0 >> (e - 64);
192 215 xy0 = xy1 = 0;
193 216 } else if (e >= 32) {
194 217 sticky = xy3 | ((xy2 << 1) << (63 - e));
195 218 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e));
219 +
196 220 if (sticky)
197 221 xy3 |= 1;
222 +
198 223 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e));
199 224 xy1 = xy0 >> (e - 32);
200 225 xy0 = 0;
201 226 } else if (e) {
202 227 sticky = (xy3 << 1) << (31 - e);
203 228 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e));
229 +
204 230 if (sticky)
205 231 xy3 |= 1;
232 +
206 233 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e));
207 234 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e));
208 235 xy0 >>= e;
209 236 }
210 237
211 238 /* if this is a magnitude subtract, negate the significand of xy */
212 239 if (sxy ^ sz) {
213 240 xy0 = ~xy0;
214 241 xy1 = ~xy1;
215 242 xy2 = ~xy2;
216 243 xy3 = -xy3;
244 +
217 245 if (xy3 == 0)
218 246 if (++xy2 == 0)
219 247 if (++xy1 == 0)
220 248 xy0++;
221 249 }
222 250
223 251 /* add, propagating carries */
224 252 z3 += xy3;
225 253 e = (z3 < xy3);
226 254 z2 += xy2;
255 +
227 256 if (e) {
228 257 z2++;
229 258 e = (z2 <= xy2);
230 - } else
259 + } else {
231 260 e = (z2 < xy2);
261 + }
262 +
232 263 z1 += xy1;
264 +
233 265 if (e) {
234 266 z1++;
235 267 e = (z1 <= xy1);
236 - } else
268 + } else {
237 269 e = (z1 < xy1);
270 + }
271 +
238 272 z0 += xy0;
273 +
239 274 if (e)
240 275 z0++;
241 276
242 277 /* postnormalize and collect rounding information into z2 */
243 278 if (ez < 1) {
244 279 /* result is tiny; shift right until exponent is within range */
245 280 e = 1 - ez;
281 +
246 282 if (e > 56) {
247 - z2 = 1; /* result can't be exactly zero */
283 + z2 = 1; /* result can't be exactly zero */
248 284 z0 = z1 = 0;
249 285 } else if (e >= 32) {
250 286 sticky = z3 | z2 | ((z1 << 1) << (63 - e));
251 287 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e));
288 +
252 289 if (sticky)
253 290 z2 |= 1;
291 +
254 292 z1 = z0 >> (e - 32);
255 293 z0 = 0;
256 294 } else {
257 295 sticky = z3 | (z2 << 1) << (31 - e);
258 296 z2 = (z2 >> e) | ((z1 << 1) << (31 - e));
297 +
259 298 if (sticky)
260 299 z2 |= 1;
300 +
261 301 z1 = (z1 >> e) | ((z0 << 1) << (31 - e));
262 302 z0 >>= e;
263 303 }
304 +
264 305 ez = 1;
265 306 } else if (z0 >= 0x200000) {
266 307 /* carry out; shift right by one */
267 308 sticky = (z2 & 1) | z3;
268 309 z2 = (z2 >> 1) | (z1 << 31);
310 +
269 311 if (sticky)
270 312 z2 |= 1;
313 +
271 314 z1 = (z1 >> 1) | (z0 << 31);
272 315 z0 >>= 1;
273 316 ez++;
274 317 } else {
275 318 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) {
276 319 /*
277 320 * borrow/cancellation; shift left as much as
278 321 * exponent allows
279 322 */
280 323 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) {
281 324 z0 = z1;
282 325 z1 = z2;
283 326 z2 = z3;
284 327 z3 = 0;
285 328 ez -= 32;
286 329 }
330 +
287 331 while (z0 < 0x100000 && ez > 1) {
288 332 z0 = (z0 << 1) | (z1 >> 31);
289 333 z1 = (z1 << 1) | (z2 >> 31);
290 334 z2 = (z2 << 1) | (z3 >> 31);
291 335 z3 <<= 1;
292 336 ez--;
293 337 }
294 338 }
339 +
295 340 if (z3)
296 341 z2 |= 1;
297 342 }
298 343
299 344 /* get the rounding mode and clear current exceptions */
300 345 rm = fsr >> 30;
301 346 fsr &= ~FSR_CEXC;
302 347
303 348 /* strip off the integer bit, if there is one */
304 349 ibit = z0 & 0x100000;
305 - if (ibit)
350 +
351 + if (ibit) {
306 352 z0 -= 0x100000;
307 - else {
353 + } else {
308 354 ez = 0;
309 - if (!(z0 | z1 | z2)) { /* exact zero */
355 +
356 + if (!(z0 | z1 | z2)) { /* exact zero */
310 357 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0;
311 358 zz.i[1] = 0;
312 359 __fenv_setfsr32(&fsr);
313 360 return (zz.d);
314 361 }
315 362 }
316 363
317 364 /*
318 365 * flip the sense of directed roundings if the result is negative;
319 366 * the logic below applies to a positive result
320 367 */
321 368 if (sz)
322 369 rm ^= rm >> 1;
323 370
324 371 /* round and raise exceptions */
325 372 if (z2) {
326 373 fsr |= FSR_NXC;
327 374
328 375 /* decide whether to round the fraction up */
329 - if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u ||
330 - (z2 == 0x80000000u && (z1 & 1))))) {
376 + if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u || (z2 ==
377 + 0x80000000u && (z1 & 1))))) {
331 378 /* round up and renormalize if necessary */
332 379 if (++z1 == 0) {
333 380 if (++z0 == 0x100000) {
334 381 z0 = 0;
335 382 ez++;
336 383 }
337 384 }
338 385 }
339 386 }
340 387
341 388 /* check for under/overflow */
342 389 if (ez >= 0x7ff) {
343 390 if (rm == FSR_RN || rm == FSR_RP) {
344 391 zz.i[0] = sz | 0x7ff00000;
345 392 zz.i[1] = 0;
346 393 } else {
347 394 zz.i[0] = sz | 0x7fefffff;
348 395 zz.i[1] = 0xffffffff;
349 396 }
397 +
350 398 fsr |= FSR_OFC | FSR_NXC;
351 399 } else {
352 400 zz.i[0] = sz | (ez << 20) | z0;
353 401 zz.i[1] = z1;
354 402
355 403 /*
356 404 * !ibit => exact result was tiny before rounding,
357 405 * z2 nonzero => result delivered is inexact
358 406 */
359 407 if (!ibit) {
360 408 if (z2)
361 409 fsr |= FSR_UFC | FSR_NXC;
362 410 else if (fsr & FSR_UFM)
363 411 fsr |= FSR_UFC;
364 412 }
365 413 }
366 414
367 415 /* restore the fsr and emulate exceptions as needed */
368 416 if ((fsr & FSR_CEXC) & (fsr >> 23)) {
369 417 __fenv_setfsr32(&fsr);
418 +
370 419 if (fsr & FSR_OFC) {
371 420 dummy = huge;
372 421 dummy *= huge;
373 422 } else if (fsr & FSR_UFC) {
374 423 dummy = tiny;
424 +
375 425 if (fsr & FSR_NXC)
376 426 dummy *= tiny;
377 427 else
378 428 dummy -= tiny2;
379 429 } else {
380 430 dummy = huge;
381 431 dummy += tiny;
382 432 }
383 433 } else {
384 434 fsr |= (fsr & 0x1f) << 5;
385 435 __fenv_setfsr32(&fsr);
386 436 }
437 +
387 438 return (zz.d);
388 439 }
389 -
390 440 #elif defined(__x86)
391 -
392 441 #if defined(__amd64)
393 -#define NI 4
442 +#define NI 4
394 443 #else
395 -#define NI 3
444 +#define NI 3
396 445 #endif
397 446
398 447 /*
399 448 * fma for x86: 64-bit double precision, little-endian
400 449 */
401 450 double
402 -__fma(double x, double y, double z) {
451 +__fma(double x, double y, double z)
452 +{
403 453 union {
404 454 unsigned i[NI];
405 455 long double e;
406 456 } xx, yy, zz;
457 +
407 458 long double xe, ye, xhi, xlo, yhi, ylo;
408 459 int ex, ey, ez;
409 460 unsigned cwsw, oldcwsw, rm;
410 461
411 462 /* convert the operands to double extended */
412 - xx.e = (long double) x;
413 - yy.e = (long double) y;
414 - zz.e = (long double) z;
463 + xx.e = (long double)x;
464 + yy.e = (long double)y;
465 + zz.e = (long double)z;
415 466
416 467 /* extract the exponents of the arguments */
417 468 ex = xx.i[2] & 0x7fff;
418 469 ey = yy.i[2] & 0x7fff;
419 470 ez = zz.i[2] & 0x7fff;
420 471
421 472 /* dispense with inf, nan, and zero cases */
422 473 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0)
423 474 /* x or y is inf, nan, or zero */
424 - return ((double) (xx.e * yy.e + zz.e));
475 + return ((double)(xx.e * yy.e + zz.e));
425 476
426 - if (ez >= 0x7fff) /* z is inf or nan */
427 - return ((double) (xx.e + zz.e));
428 - /* avoid spurious inexact in x * y */
477 + if (ez >= 0x7fff) /* z is inf or nan */
478 + return ((double)(xx.e + zz.e));
479 +
480 + /* returning x + z above avoids a spurious inexact from x * y */
429 481
430 482 /*
431 483 * save the control and status words, mask all exceptions, and
432 484 * set rounding to 64-bit precision and to-nearest
433 485 */
434 486 __fenv_getcwsw(&oldcwsw);
435 487 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000;
436 488 __fenv_setcwsw(&cwsw);
437 489
438 490 /* multiply x*y to 106 bits */
439 491 xe = xx.e;
440 492 xx.i[0] = 0;
441 - xhi = xx.e; /* hi 32 bits */
442 - xlo = xe - xhi; /* lo 21 bits */
493 + xhi = xx.e; /* hi 32 bits */
494 + xlo = xe - xhi; /* lo 21 bits */
443 495 ye = yy.e;
444 496 yy.i[0] = 0;
445 497 yhi = yy.e;
446 498 ylo = ye - yhi;
447 499 xe = xe * ye;
448 500 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo;
449 501
450 502 /* distill the sum of xe, ye, and z */
451 503 xhi = ye + zz.e;
452 504 yhi = xhi - ye;
453 505 xlo = (zz.e - yhi) + (ye - (xhi - yhi));
454 - /* now (xhi,xlo) = ye + z */
506 + /* now (xhi,xlo) = ye + z */
455 507
456 508 yhi = xe + xhi;
457 509 ye = yhi - xe;
458 510 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */
459 511
460 512 xhi = xlo + ylo;
461 513 xe = xhi - xlo;
462 514 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */
463 515
464 516 yy.e = yhi + xhi;
465 517 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */
466 518
467 - if (yy.i[1] != 0) { /* yy.e is nonzero */
519 + if (yy.i[1] != 0) { /* yy.e is nonzero */
468 520 /* perturb yy.e if its least significant 10 bits are zero */
469 521 if (!(yy.i[0] & 0x3ff)) {
470 522 xx.e = ylo + xlo;
523 +
471 524 if (xx.i[1] != 0) {
472 - xx.i[2] = (xx.i[2] & 0x8000) |
473 - ((yy.i[2] & 0x7fff) - 63);
525 + xx.i[2] = (xx.i[2] & 0x8000) | ((yy.i[2] &
526 + 0x7fff) - 63);
474 527 xx.i[1] = 0x80000000;
475 528 xx.i[0] = 0;
476 529 yy.e += xx.e;
477 530 }
478 531 }
479 532 } else {
480 533 /* set sign of zero result according to rounding direction */
481 534 rm = oldcwsw & 0x0c000000;
482 - yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0);
535 + yy.i[2] = ((rm == FCW_RM) ? 0x8000 : 0);
483 536 }
484 537
485 538 /*
486 539 * restore the control and status words and convert the result
487 540 * to double
488 541 */
489 542 __fenv_setcwsw(&oldcwsw);
490 - return ((double) yy.e);
543 + return ((double)yy.e);
491 544 }
492 -
493 545 #else
494 546 #error Unknown architecture
495 547 #endif
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX