1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25 /*
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
28 */
29
30 #include <sys/isa_defs.h>
31 #include "libm_synonyms.h"
32 #include "libm_inlines.h"
33
34 #ifdef _LITTLE_ENDIAN
35 #define HI(x) *(1+(int*)x)
36 #define LO(x) *(unsigned*)x
37 #else
38 #define HI(x) *(int*)x
39 #define LO(x) *(1+(unsigned*)x)
40 #endif
41
42 #ifdef __RESTRICT
43 #define restrict _Restrict
44 #else
45 #define restrict
46 #endif
47
48 /* double rhypot(double x, double y)
49 *
50 * Method :
51 * 1. Special cases:
52 * x or y = Inf => 0
53 * x or y = NaN => QNaN
54 * x and y = 0 => Inf + divide-by-zero
55 * 2. Computes rhypot(x,y):
56 * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
57 * Where:
58 * m = 1/max(|x|,|y|)
59 * xnm = x * m
60 * ynm = y * m
61 *
62 * Compute 1/(xnm * xnm + ynm * ynm) by simulating
63 * muti-precision arithmetic.
64 *
65 * Accuracy:
66 * Maximum error observed: less than 0.869 ulp after 1.000.000.000
67 * results.
68 */
69
70 #define sqrt __sqrt
71
72 extern double sqrt( double );
73
74 extern double fabs( double );
75
76 static const int __vlibm_TBL_rhypot[] = {
77 /* i = [0,127]
78 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
79 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465,
80 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a,
81 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6,
82 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3,
83 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b,
84 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036,
85 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01,
86 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1,
87 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb,
88 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5,
89 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405,
90 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc,
91 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7,
92 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec,
93 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b,
94 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed,
95 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150,
96 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539,
97 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66,
98 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995,
99 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d,
100 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19,
101 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404,
102 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22,
103 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47,
104 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a,
105 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06,
106 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358,
107 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20,
108 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f,
109 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197,
110 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010,
111 };
112
113 static const unsigned long long LCONST[] = {
114 0x3ff0000000000000ULL, /* DONE = 1.0 */
115 0x4000000000000000ULL, /* DTWO = 2.0 */
116 0x4230000000000000ULL, /* D2ON36 = 2**36 */
117 0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */
118 0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */
119 };
120
121 #define RET_SC(I) \
122 px += stridex; \
123 py += stridey; \
124 pz += stridez; \
125 if ( --n <= 0 ) \
126 break; \
127 goto start##I;
128
129 #define RETURN(I, ret) \
130 { \
131 pz[0] = (ret); \
132 RET_SC(I) \
133 }
134
135 #define PREP(I) \
136 hx##I = HI(px); \
137 hy##I = HI(py); \
138 hx##I &= 0x7fffffff; \
139 hy##I &= 0x7fffffff; \
140 pz##I = pz; \
141 if ( hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000 ) /* |X| or |Y| = Inf,NaN */ \
142 { \
143 lx = LO(px); \
144 ly = LO(py); \
145 x = *px; \
146 y = *py; \
147 if ( hx##I == 0x7ff00000 && lx == 0 ) res0 = 0.0; /* |X| = Inf */ \
148 else if ( hy##I == 0x7ff00000 && ly == 0 ) res0 = 0.0; /* |Y| = Inf */ \
149 else res0 = fabs(x) + fabs(y); \
150 \
151 RETURN (I, res0) \
152 } \
153 x##I = *px; \
154 y##I = *py; \
155 diff0 = hy##I - hx##I; \
156 j0 = diff0 >> 31; \
157 if ( hx##I < 0x00100000 && hy##I < 0x00100000 ) /* |X| and |Y| = subnormal or zero */ \
158 { \
159 lx = LO(px); \
160 ly = LO(py); \
161 x = x##I; \
162 y = y##I; \
163 \
164 if ( (hx##I | hy##I | lx | ly) == 0 ) /* |X| and |Y| = 0 */ \
165 RETURN (I, DONE / 0.0) \
166 \
167 x = fabs(x); \
168 y = fabs(y); \
169 \
170 x = *(long long*)&x; \
171 y = *(long long*)&y; \
172 \
173 x *= D2ONM52; \
174 y *= D2ONM52; \
175 \
176 x_hi0 = ( x + D2ON36 ) - D2ON36; \
177 y_hi0 = ( y + D2ON36 ) - D2ON36; \
178 x_lo0 = x - x_hi0; \
179 y_lo0 = y - y_hi0; \
180 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \
181 res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \
182 \
183 dres0 = res0_hi + res0_lo; \
184 \
185 iarr0 = HI(&dres0); \
186 iexp0 = iarr0 & 0xfff00000; \
187 \
188 iarr0 = (iarr0 >> 11) & 0x1fc; \
189 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \
190 itbl0 -= iexp0; \
191 HI(&dd0) = itbl0; \
192 LO(&dd0) = 0; \
193 \
194 dd0 = dd0 * (DTWO - dd0 * dres0); \
195 dd0 = dd0 * (DTWO - dd0 * dres0); \
196 dres0 = dd0 * (DTWO - dd0 * dres0); \
197 \
198 HI(&res0) = HI(&dres0) & 0xffffff00; \
199 LO(&res0) = 0; \
200 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \
201 res0 = sqrt ( res0 ); \
202 \
203 res0 = D2ON1022 * res0; \
204 RETURN (I, res0) \
205 } \
206 j0 = hy##I - (diff0 & j0); \
207 j0 &= 0x7ff00000; \
208 HI(&scl##I) = 0x7ff00000 - j0;
209
210 void
211 __vrhypot( int n, double * restrict px, int stridex, double * restrict py,
212 int stridey, double * restrict pz, int stridez )
213 {
214 int i = 0;
215 double x, y;
216 double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
217 double x0, y0, res0, dd0;
218 double res0_hi,res0_lo, dres0;
219 double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
220 double x1 = 0.0L, y1 = 0.0L, res1, dd1;
221 double res1_hi,res1_lo, dres1;
222 double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
223 double x2, y2, res2, dd2;
224 double res2_hi,res2_lo, dres2;
225
226 int hx0, hy0, j0, diff0;
227 int iarr0, iexp0, itbl0;
228 int hx1, hy1;
229 int iarr1, iexp1, itbl1;
230 int hx2, hy2;
231 int iarr2, iexp2, itbl2;
232
233 int lx, ly;
234
235 double DONE = ((double*)LCONST)[0];
236 double DTWO = ((double*)LCONST)[1];
237 double D2ON36 = ((double*)LCONST)[2];
238 double D2ON1022 = ((double*)LCONST)[3];
239 double D2ONM52 = ((double*)LCONST)[4];
240
241 double *pz0, *pz1 = 0, *pz2;
242
243 do
244 {
245 start0:
246 PREP(0)
247 px += stridex;
248 py += stridey;
249 pz += stridez;
250 i = 1;
251 if ( --n <= 0 )
252 break;
253
254 start1:
255 PREP(1)
256 px += stridex;
257 py += stridey;
258 pz += stridez;
259 i = 2;
260 if ( --n <= 0 )
261 break;
262
263 start2:
264 PREP(2)
265
266 x0 *= scl0;
267 y0 *= scl0;
268 x1 *= scl1;
269 y1 *= scl1;
270 x2 *= scl2;
271 y2 *= scl2;
272
273 x_hi0 = ( x0 + D2ON36 ) - D2ON36;
274 y_hi0 = ( y0 + D2ON36 ) - D2ON36;
275 x_hi1 = ( x1 + D2ON36 ) - D2ON36;
276 y_hi1 = ( y1 + D2ON36 ) - D2ON36;
277 x_hi2 = ( x2 + D2ON36 ) - D2ON36;
278 y_hi2 = ( y2 + D2ON36 ) - D2ON36;
279 x_lo0 = x0 - x_hi0;
280 y_lo0 = y0 - y_hi0;
281 x_lo1 = x1 - x_hi1;
282 y_lo1 = y1 - y_hi1;
283 x_lo2 = x2 - x_hi2;
284 y_lo2 = y2 - y_hi2;
285 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
286 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
287 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
288 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
289 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
290 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
291
292 dres0 = res0_hi + res0_lo;
293 dres1 = res1_hi + res1_lo;
294 dres2 = res2_hi + res2_lo;
295
296 iarr0 = HI(&dres0);
297 iarr1 = HI(&dres1);
298 iarr2 = HI(&dres2);
299 iexp0 = iarr0 & 0xfff00000;
300 iexp1 = iarr1 & 0xfff00000;
301 iexp2 = iarr2 & 0xfff00000;
302
303 iarr0 = (iarr0 >> 11) & 0x1fc;
304 iarr1 = (iarr1 >> 11) & 0x1fc;
305 iarr2 = (iarr2 >> 11) & 0x1fc;
306 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
307 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
308 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
309 itbl0 -= iexp0;
310 itbl1 -= iexp1;
311 itbl2 -= iexp2;
312 HI(&dd0) = itbl0;
313 HI(&dd1) = itbl1;
314 HI(&dd2) = itbl2;
315 LO(&dd0) = 0;
316 LO(&dd1) = 0;
317 LO(&dd2) = 0;
318
319 dd0 = dd0 * (DTWO - dd0 * dres0);
320 dd1 = dd1 * (DTWO - dd1 * dres1);
321 dd2 = dd2 * (DTWO - dd2 * dres2);
322 dd0 = dd0 * (DTWO - dd0 * dres0);
323 dd1 = dd1 * (DTWO - dd1 * dres1);
324 dd2 = dd2 * (DTWO - dd2 * dres2);
325 dres0 = dd0 * (DTWO - dd0 * dres0);
326 dres1 = dd1 * (DTWO - dd1 * dres1);
327 dres2 = dd2 * (DTWO - dd2 * dres2);
328
329 HI(&res0) = HI(&dres0) & 0xffffff00;
330 HI(&res1) = HI(&dres1) & 0xffffff00;
331 HI(&res2) = HI(&dres2) & 0xffffff00;
332 LO(&res0) = 0;
333 LO(&res1) = 0;
334 LO(&res2) = 0;
335 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
336 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
337 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
338 res0 = sqrt ( res0 );
339 res1 = sqrt ( res1 );
340 res2 = sqrt ( res2 );
341
342 res0 = scl0 * res0;
343 res1 = scl1 * res1;
344 res2 = scl2 * res2;
345
346 *pz0 = res0;
347 *pz1 = res1;
348 *pz2 = res2;
349
350 px += stridex;
351 py += stridey;
352 pz += stridez;
353 i = 0;
354
355 } while ( --n > 0 );
356
357 if ( i > 0 )
358 {
359 x0 *= scl0;
360 y0 *= scl0;
361
362 x_hi0 = ( x0 + D2ON36 ) - D2ON36;
363 y_hi0 = ( y0 + D2ON36 ) - D2ON36;
364 x_lo0 = x0 - x_hi0;
365 y_lo0 = y0 - y_hi0;
366 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
367 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
368
369 dres0 = res0_hi + res0_lo;
370
371 iarr0 = HI(&dres0);
372 iexp0 = iarr0 & 0xfff00000;
373
374 iarr0 = (iarr0 >> 11) & 0x1fc;
375 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
376 itbl0 -= iexp0;
377 HI(&dd0) = itbl0;
378 LO(&dd0) = 0;
379
380 dd0 = dd0 * (DTWO - dd0 * dres0);
381 dd0 = dd0 * (DTWO - dd0 * dres0);
382 dres0 = dd0 * (DTWO - dd0 * dres0);
383
384 HI(&res0) = HI(&dres0) & 0xffffff00;
385 LO(&res0) = 0;
386 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
387 res0 = sqrt ( res0 );
388
389 res0 = scl0 * res0;
390
391 *pz0 = res0;
392
393 if ( i > 1 )
394 {
395 x1 *= scl1;
396 y1 *= scl1;
397
398 x_hi1 = ( x1 + D2ON36 ) - D2ON36;
399 y_hi1 = ( y1 + D2ON36 ) - D2ON36;
400 x_lo1 = x1 - x_hi1;
401 y_lo1 = y1 - y_hi1;
402 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
403 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
404
405 dres1 = res1_hi + res1_lo;
406
407 iarr1 = HI(&dres1);
408 iexp1 = iarr1 & 0xfff00000;
409
410 iarr1 = (iarr1 >> 11) & 0x1fc;
411 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
412 itbl1 -= iexp1;
413 HI(&dd1) = itbl1;
414 LO(&dd1) = 0;
415
416 dd1 = dd1 * (DTWO - dd1 * dres1);
417 dd1 = dd1 * (DTWO - dd1 * dres1);
418 dres1 = dd1 * (DTWO - dd1 * dres1);
419
420 HI(&res1) = HI(&dres1) & 0xffffff00;
421 LO(&res1) = 0;
422 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
423 res1 = sqrt ( res1 );
424
425 res1 = scl1 * res1;
426
427 *pz1 = res1;
428 }
429 }
430 }
431