Print this page
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libmvec/common/__vrhypot.c
+++ new/usr/src/lib/libmvec/common/__vrhypot.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 27 * Use is subject to license terms.
28 28 */
29 29
30 30 #include <sys/isa_defs.h>
31 31 #include "libm_synonyms.h"
32 32 #include "libm_inlines.h"
33 33
34 34 #ifdef _LITTLE_ENDIAN
35 35 #define HI(x) *(1+(int*)x)
36 36 #define LO(x) *(unsigned*)x
37 37 #else
38 38 #define HI(x) *(int*)x
39 39 #define LO(x) *(1+(unsigned*)x)
40 40 #endif
41 41
42 42 #ifdef __RESTRICT
43 43 #define restrict _Restrict
44 44 #else
45 45 #define restrict
46 46 #endif
47 47
48 48 /* double rhypot(double x, double y)
49 49 *
50 50 * Method :
51 51 * 1. Special cases:
52 52 * x or y = Inf => 0
53 53 * x or y = NaN => QNaN
54 54 * x and y = 0 => Inf + divide-by-zero
55 55 * 2. Computes rhypot(x,y):
56 56 * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
57 57 * Where:
58 58 * m = 1/max(|x|,|y|)
59 59 * xnm = x * m
60 60 * ynm = y * m
61 61 *
62 62 * Compute 1/(xnm * xnm + ynm * ynm) by simulating
63 63 * muti-precision arithmetic.
64 64 *
65 65 * Accuracy:
66 66 * Maximum error observed: less than 0.869 ulp after 1.000.000.000
67 67 * results.
68 68 */
69 69
70 70 #define sqrt __sqrt
71 71
72 72 extern double sqrt( double );
73 73
74 74 extern double fabs( double );
75 75
76 76 static const int __vlibm_TBL_rhypot[] = {
77 77 /* i = [0,127]
78 78 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
79 79 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465,
80 80 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a,
81 81 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6,
82 82 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3,
83 83 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b,
84 84 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036,
85 85 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01,
86 86 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1,
87 87 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb,
88 88 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5,
89 89 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405,
90 90 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc,
91 91 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7,
92 92 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec,
93 93 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b,
94 94 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed,
95 95 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150,
96 96 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539,
97 97 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66,
98 98 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995,
99 99 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d,
100 100 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19,
101 101 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404,
102 102 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22,
103 103 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47,
104 104 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a,
105 105 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06,
106 106 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358,
107 107 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20,
108 108 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f,
109 109 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197,
110 110 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010,
111 111 };
112 112
113 113 static const unsigned long long LCONST[] = {
114 114 0x3ff0000000000000ULL, /* DONE = 1.0 */
115 115 0x4000000000000000ULL, /* DTWO = 2.0 */
116 116 0x4230000000000000ULL, /* D2ON36 = 2**36 */
117 117 0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */
118 118 0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */
119 119 };
120 120
121 121 #define RET_SC(I) \
122 122 px += stridex; \
123 123 py += stridey; \
124 124 pz += stridez; \
125 125 if ( --n <= 0 ) \
126 126 break; \
127 127 goto start##I;
128 128
129 129 #define RETURN(I, ret) \
130 130 { \
131 131 pz[0] = (ret); \
132 132 RET_SC(I) \
133 133 }
134 134
135 135 #define PREP(I) \
136 136 hx##I = HI(px); \
137 137 hy##I = HI(py); \
138 138 hx##I &= 0x7fffffff; \
139 139 hy##I &= 0x7fffffff; \
140 140 pz##I = pz; \
141 141 if ( hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000 ) /* |X| or |Y| = Inf,NaN */ \
142 142 { \
143 143 lx = LO(px); \
144 144 ly = LO(py); \
145 145 x = *px; \
146 146 y = *py; \
147 147 if ( hx##I == 0x7ff00000 && lx == 0 ) res0 = 0.0; /* |X| = Inf */ \
148 148 else if ( hy##I == 0x7ff00000 && ly == 0 ) res0 = 0.0; /* |Y| = Inf */ \
149 149 else res0 = fabs(x) + fabs(y); \
150 150 \
151 151 RETURN (I, res0) \
152 152 } \
153 153 x##I = *px; \
154 154 y##I = *py; \
155 155 diff0 = hy##I - hx##I; \
156 156 j0 = diff0 >> 31; \
157 157 if ( hx##I < 0x00100000 && hy##I < 0x00100000 ) /* |X| and |Y| = subnormal or zero */ \
158 158 { \
159 159 lx = LO(px); \
160 160 ly = LO(py); \
161 161 x = x##I; \
162 162 y = y##I; \
163 163 \
164 164 if ( (hx##I | hy##I | lx | ly) == 0 ) /* |X| and |Y| = 0 */ \
165 165 RETURN (I, DONE / 0.0) \
166 166 \
167 167 x = fabs(x); \
168 168 y = fabs(y); \
169 169 \
170 170 x = *(long long*)&x; \
171 171 y = *(long long*)&y; \
172 172 \
173 173 x *= D2ONM52; \
174 174 y *= D2ONM52; \
175 175 \
176 176 x_hi0 = ( x + D2ON36 ) - D2ON36; \
177 177 y_hi0 = ( y + D2ON36 ) - D2ON36; \
178 178 x_lo0 = x - x_hi0; \
179 179 y_lo0 = y - y_hi0; \
180 180 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \
181 181 res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \
182 182 \
183 183 dres0 = res0_hi + res0_lo; \
184 184 \
185 185 iarr0 = HI(&dres0); \
186 186 iexp0 = iarr0 & 0xfff00000; \
187 187 \
188 188 iarr0 = (iarr0 >> 11) & 0x1fc; \
189 189 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \
190 190 itbl0 -= iexp0; \
191 191 HI(&dd0) = itbl0; \
192 192 LO(&dd0) = 0; \
193 193 \
194 194 dd0 = dd0 * (DTWO - dd0 * dres0); \
195 195 dd0 = dd0 * (DTWO - dd0 * dres0); \
196 196 dres0 = dd0 * (DTWO - dd0 * dres0); \
197 197 \
198 198 HI(&res0) = HI(&dres0) & 0xffffff00; \
199 199 LO(&res0) = 0; \
200 200 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \
201 201 res0 = sqrt ( res0 ); \
202 202 \
203 203 res0 = D2ON1022 * res0; \
204 204 RETURN (I, res0) \
205 205 } \
206 206 j0 = hy##I - (diff0 & j0); \
207 207 j0 &= 0x7ff00000; \
208 208 HI(&scl##I) = 0x7ff00000 - j0;
209 209
↓ open down ↓ |
209 lines elided |
↑ open up ↑ |
210 210 void
211 211 __vrhypot( int n, double * restrict px, int stridex, double * restrict py,
212 212 int stridey, double * restrict pz, int stridez )
213 213 {
214 214 int i = 0;
215 215 double x, y;
216 216 double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
217 217 double x0, y0, res0, dd0;
218 218 double res0_hi,res0_lo, dres0;
219 219 double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
220 - double x1, y1, res1, dd1;
220 + double x1 = 0.0L, y1 = 0.0L, res1, dd1;
221 221 double res1_hi,res1_lo, dres1;
222 222 double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
223 223 double x2, y2, res2, dd2;
224 224 double res2_hi,res2_lo, dres2;
225 225
226 226 int hx0, hy0, j0, diff0;
227 227 int iarr0, iexp0, itbl0;
228 228 int hx1, hy1;
229 229 int iarr1, iexp1, itbl1;
230 230 int hx2, hy2;
231 231 int iarr2, iexp2, itbl2;
232 232
233 233 int lx, ly;
234 234
235 235 double DONE = ((double*)LCONST)[0];
236 236 double DTWO = ((double*)LCONST)[1];
237 237 double D2ON36 = ((double*)LCONST)[2];
238 238 double D2ON1022 = ((double*)LCONST)[3];
239 239 double D2ONM52 = ((double*)LCONST)[4];
240 240
241 - double *pz0, *pz1, *pz2;
241 + double *pz0, *pz1 = 0, *pz2;
242 242
243 243 do
244 244 {
245 245 start0:
246 246 PREP(0)
247 247 px += stridex;
248 248 py += stridey;
249 249 pz += stridez;
250 250 i = 1;
251 251 if ( --n <= 0 )
252 252 break;
253 253
254 254 start1:
255 255 PREP(1)
256 256 px += stridex;
257 257 py += stridey;
258 258 pz += stridez;
259 259 i = 2;
260 260 if ( --n <= 0 )
261 261 break;
262 262
263 263 start2:
264 264 PREP(2)
265 265
266 266 x0 *= scl0;
267 267 y0 *= scl0;
268 268 x1 *= scl1;
269 269 y1 *= scl1;
270 270 x2 *= scl2;
271 271 y2 *= scl2;
272 272
273 273 x_hi0 = ( x0 + D2ON36 ) - D2ON36;
274 274 y_hi0 = ( y0 + D2ON36 ) - D2ON36;
275 275 x_hi1 = ( x1 + D2ON36 ) - D2ON36;
276 276 y_hi1 = ( y1 + D2ON36 ) - D2ON36;
277 277 x_hi2 = ( x2 + D2ON36 ) - D2ON36;
278 278 y_hi2 = ( y2 + D2ON36 ) - D2ON36;
279 279 x_lo0 = x0 - x_hi0;
280 280 y_lo0 = y0 - y_hi0;
281 281 x_lo1 = x1 - x_hi1;
282 282 y_lo1 = y1 - y_hi1;
283 283 x_lo2 = x2 - x_hi2;
284 284 y_lo2 = y2 - y_hi2;
285 285 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
286 286 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
287 287 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
288 288 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
289 289 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
290 290 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
291 291
292 292 dres0 = res0_hi + res0_lo;
293 293 dres1 = res1_hi + res1_lo;
294 294 dres2 = res2_hi + res2_lo;
295 295
296 296 iarr0 = HI(&dres0);
297 297 iarr1 = HI(&dres1);
298 298 iarr2 = HI(&dres2);
299 299 iexp0 = iarr0 & 0xfff00000;
300 300 iexp1 = iarr1 & 0xfff00000;
301 301 iexp2 = iarr2 & 0xfff00000;
302 302
303 303 iarr0 = (iarr0 >> 11) & 0x1fc;
304 304 iarr1 = (iarr1 >> 11) & 0x1fc;
305 305 iarr2 = (iarr2 >> 11) & 0x1fc;
306 306 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
307 307 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
308 308 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
309 309 itbl0 -= iexp0;
310 310 itbl1 -= iexp1;
311 311 itbl2 -= iexp2;
312 312 HI(&dd0) = itbl0;
313 313 HI(&dd1) = itbl1;
314 314 HI(&dd2) = itbl2;
315 315 LO(&dd0) = 0;
316 316 LO(&dd1) = 0;
317 317 LO(&dd2) = 0;
318 318
319 319 dd0 = dd0 * (DTWO - dd0 * dres0);
320 320 dd1 = dd1 * (DTWO - dd1 * dres1);
321 321 dd2 = dd2 * (DTWO - dd2 * dres2);
322 322 dd0 = dd0 * (DTWO - dd0 * dres0);
323 323 dd1 = dd1 * (DTWO - dd1 * dres1);
324 324 dd2 = dd2 * (DTWO - dd2 * dres2);
325 325 dres0 = dd0 * (DTWO - dd0 * dres0);
326 326 dres1 = dd1 * (DTWO - dd1 * dres1);
327 327 dres2 = dd2 * (DTWO - dd2 * dres2);
328 328
329 329 HI(&res0) = HI(&dres0) & 0xffffff00;
330 330 HI(&res1) = HI(&dres1) & 0xffffff00;
331 331 HI(&res2) = HI(&dres2) & 0xffffff00;
332 332 LO(&res0) = 0;
333 333 LO(&res1) = 0;
334 334 LO(&res2) = 0;
335 335 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
336 336 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
337 337 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
338 338 res0 = sqrt ( res0 );
339 339 res1 = sqrt ( res1 );
340 340 res2 = sqrt ( res2 );
341 341
342 342 res0 = scl0 * res0;
343 343 res1 = scl1 * res1;
344 344 res2 = scl2 * res2;
345 345
346 346 *pz0 = res0;
347 347 *pz1 = res1;
348 348 *pz2 = res2;
349 349
350 350 px += stridex;
351 351 py += stridey;
352 352 pz += stridez;
353 353 i = 0;
354 354
355 355 } while ( --n > 0 );
356 356
357 357 if ( i > 0 )
358 358 {
359 359 x0 *= scl0;
360 360 y0 *= scl0;
361 361
362 362 x_hi0 = ( x0 + D2ON36 ) - D2ON36;
363 363 y_hi0 = ( y0 + D2ON36 ) - D2ON36;
364 364 x_lo0 = x0 - x_hi0;
365 365 y_lo0 = y0 - y_hi0;
366 366 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
367 367 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
368 368
369 369 dres0 = res0_hi + res0_lo;
370 370
371 371 iarr0 = HI(&dres0);
372 372 iexp0 = iarr0 & 0xfff00000;
373 373
374 374 iarr0 = (iarr0 >> 11) & 0x1fc;
375 375 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
376 376 itbl0 -= iexp0;
377 377 HI(&dd0) = itbl0;
378 378 LO(&dd0) = 0;
379 379
380 380 dd0 = dd0 * (DTWO - dd0 * dres0);
381 381 dd0 = dd0 * (DTWO - dd0 * dres0);
382 382 dres0 = dd0 * (DTWO - dd0 * dres0);
383 383
384 384 HI(&res0) = HI(&dres0) & 0xffffff00;
385 385 LO(&res0) = 0;
386 386 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
387 387 res0 = sqrt ( res0 );
388 388
389 389 res0 = scl0 * res0;
390 390
391 391 *pz0 = res0;
392 392
393 393 if ( i > 1 )
394 394 {
395 395 x1 *= scl1;
396 396 y1 *= scl1;
397 397
398 398 x_hi1 = ( x1 + D2ON36 ) - D2ON36;
399 399 y_hi1 = ( y1 + D2ON36 ) - D2ON36;
400 400 x_lo1 = x1 - x_hi1;
401 401 y_lo1 = y1 - y_hi1;
402 402 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
403 403 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
404 404
405 405 dres1 = res1_hi + res1_lo;
406 406
407 407 iarr1 = HI(&dres1);
408 408 iexp1 = iarr1 & 0xfff00000;
409 409
410 410 iarr1 = (iarr1 >> 11) & 0x1fc;
411 411 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
412 412 itbl1 -= iexp1;
413 413 HI(&dd1) = itbl1;
414 414 LO(&dd1) = 0;
415 415
416 416 dd1 = dd1 * (DTWO - dd1 * dres1);
417 417 dd1 = dd1 * (DTWO - dd1 * dres1);
418 418 dres1 = dd1 * (DTWO - dd1 * dres1);
419 419
420 420 HI(&res1) = HI(&dres1) & 0xffffff00;
421 421 LO(&res1) = 0;
422 422 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
423 423 res1 = sqrt ( res1 );
424 424
425 425 res1 = scl1 * res1;
426 426
427 427 *pz1 = res1;
428 428 }
429 429 }
430 430 }
431 431
↓ open down ↓ |
180 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX