1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 */
25 /*
26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 * Use is subject to license terms.
28 */
29
30 #include <sys/isa_defs.h>
31 #include "libm_synonyms.h"
32 #include "libm_inlines.h"
33
34 #ifdef _LITTLE_ENDIAN
35 #define HI(x) *(1+(int*)x)
36 #define LO(x) *(unsigned*)x
37 #else
38 #define HI(x) *(int*)x
39 #define LO(x) *(1+(unsigned*)x)
40 #endif
41
42 #ifdef __RESTRICT
43 #define restrict _Restrict
44 #else
45 #define restrict
46 #endif
47
48 /* double rsqrt(double x)
49 *
50 * Method :
51 * 1. Special cases:
52 * for x = NaN => QNaN;
53 * for x = +Inf => 0;
54 * for x is negative, -Inf => QNaN + invalid;
55 * for x = +0 => +Inf + divide-by-zero;
56 * for x = -0 => -Inf + divide-by-zero.
57 * 2. Computes reciprocal square root from:
58 * x = m * 2**n
59 * Where:
60 * m = [0.5, 2),
61 * n = ((exponent + 1) & ~1).
62 * Then:
63 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64 * 2. Computes 1/sqrt(m) from:
65 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66 * Where:
67 * m = m0 + dm,
68 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
69 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
70 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128.
71 * Then:
72 * 1/sqrt(m0) is looked up in a table,
73 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
74 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation:
75 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
76 * * z + a2) * z + a1) * z + a0
77 * where z = [-1/128, 1/128].
78 *
79 * Accuracy:
80 * The maximum relative error for the approximating
81 * polynomial is 2**(-56.26).
82 * Maximum error observed: less than 0.563 ulp after 1.500.000.000
83 * results.
84 */
85
86 #define sqrt __sqrt
87
88 extern double sqrt (double);
89 extern const double __vlibm_TBL_rsqrt[];
90
91 static void
92 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
93
94 #pragma no_inline(__vrsqrt_n)
95
96 #define RETURN(ret) \
97 { \
98 *py = (ret); \
99 py += stridey; \
100 if (n_n == 0) \
101 { \
102 spx = px; spy = py; \
103 hx = HI(px); \
104 continue; \
105 } \
106 n--; \
107 break; \
108 }
109
110 static const double
111 DONE = 1.0,
112 K1 = -5.00000000000005209867e-01,
113 K2 = 3.75000000000004884257e-01,
114 K3 = -3.12499999317136886551e-01,
115 K4 = 2.73437499359815081532e-01,
116 K5 = -2.46116125605037803130e-01,
117 K6 = 2.25606914648617522896e-01;
118
119 void
120 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
121 {
122 double *spx, *spy;
123 int ax, lx, hx, n_n;
124 double res;
125
126 while (n > 1)
127 {
128 n_n = 0;
129 spx = px;
130 spy = py;
131 hx = HI(px);
132 for (; n > 1 ; n--)
133 {
134 px += stridex;
135 if (hx >= 0x7ff00000) /* X = NaN or Inf */
136 {
137 res = *(px - stridex);
138 RETURN (DONE / res)
139 }
140
141 py += stridey;
142
143 if (hx < 0x00100000) /* X = denormal, zero or negative */
144 {
145 py -= stridey;
146 ax = hx & 0x7fffffff;
147 lx = LO((px - stridex));
148 res = *(px - stridex);
149
150 if ((ax | lx) == 0) /* |X| = zero */
151 {
152 RETURN (DONE / res)
153 }
154 else if (hx >= 0) /* X = denormal */
155 {
156 double res_c0, dsqrt_exp0;
157 int ind0, sqrt_exp0;
158 double xx0, dexp_hi0, dexp_lo0;
159 int hx0, resh0, res_ch0;
160
161 res = *(long long*)&res;
162
163 hx0 = HI(&res);
164 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
165 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
166
167 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
168 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
169 HI(&res) = resh0;
170 HI(&res_c0) = res_ch0;
171 LO(&res_c0) = 0;
172
173 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
174 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
175 xx0 = dexp_hi0 * dexp_hi0;
176 xx0 = (res - res_c0) * xx0;
177 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
178
179 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
180
181 HI(&dsqrt_exp0) = sqrt_exp0;
182 LO(&dsqrt_exp0) = 0;
183 res *= dsqrt_exp0;
184
185 RETURN (res)
186 }
187 else /* X = negative */
188 {
189 RETURN (sqrt(res))
190 }
191 }
192 n_n++;
193 hx = HI(px);
194 }
195 if (n_n > 0)
196 __vrsqrt_n(n_n, spx, stridex, spy, stridey);
197 }
198 if (n > 0)
199 {
200 hx = HI(px);
201
202 if (hx >= 0x7ff00000) /* X = NaN or Inf */
203 {
204 res = *px;
205 *py = DONE / res;
206 }
207 else if (hx < 0x00100000) /* X = denormal, zero or negative */
208 {
209 ax = hx & 0x7fffffff;
210 lx = LO(px);
211 res = *px;
212
213 if ((ax | lx) == 0) /* |X| = zero */
214 {
215 *py = DONE / res;
216 }
217 else if (hx >= 0) /* X = denormal */
218 {
219 double res_c0, dsqrt_exp0;
220 int ind0, sqrt_exp0;
221 double xx0, dexp_hi0, dexp_lo0;
222 int hx0, resh0, res_ch0;
223
224 res = *(long long*)&res;
225
226 hx0 = HI(&res);
227 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
228 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
229
230 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
231 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
232 HI(&res) = resh0;
233 HI(&res_c0) = res_ch0;
234 LO(&res_c0) = 0;
235
236 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
237 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
238 xx0 = dexp_hi0 * dexp_hi0;
239 xx0 = (res - res_c0) * xx0;
240 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
241
242 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
243
244 HI(&dsqrt_exp0) = sqrt_exp0;
245 LO(&dsqrt_exp0) = 0;
246 res *= dsqrt_exp0;
247
248 *py = res;
249 }
250 else /* X = negative */
251 {
252 *py = sqrt(res);
253 }
254 }
255 else
256 {
257 double res_c0, dsqrt_exp0;
258 int ind0, sqrt_exp0;
259 double xx0, dexp_hi0, dexp_lo0;
260 int resh0, res_ch0;
261
262 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
263 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
264
265 resh0 = (hx & 0x001fffff) | 0x3fe00000;
266 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
267 HI(&res) = resh0;
268 LO(&res) = LO(px);
269 HI(&res_c0) = res_ch0;
270 LO(&res_c0) = 0;
271
272 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
273 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
274 xx0 = dexp_hi0 * dexp_hi0;
275 xx0 = (res - res_c0) * xx0;
276 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
277
278 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
279
280 HI(&dsqrt_exp0) = sqrt_exp0;
281 LO(&dsqrt_exp0) = 0;
282 res *= dsqrt_exp0;
283
284 *py = res;
285 }
286 }
287 }
288
289 static void
290 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
291 {
292 double res0, res_c0, dsqrt_exp0;
293 double res1, res_c1, dsqrt_exp1;
294 double res2, res_c2, dsqrt_exp2;
295 int ind0, sqrt_exp0;
296 int ind1, sqrt_exp1;
297 int ind2, sqrt_exp2;
298 double xx0, dexp_hi0, dexp_lo0;
299 double xx1, dexp_hi1, dexp_lo1;
300 double xx2, dexp_hi2, dexp_lo2;
301 int hx0, resh0, res_ch0;
302 int hx1, resh1, res_ch1;
303 int hx2, resh2, res_ch2;
304
305 LO(&dsqrt_exp0) = 0;
306 LO(&dsqrt_exp1) = 0;
307 LO(&dsqrt_exp2) = 0;
308 LO(&res_c0) = 0;
309 LO(&res_c1) = 0;
310 LO(&res_c2) = 0;
311
312 for(; n > 2 ; n -= 3)
313 {
314 hx0 = HI(px);
315 LO(&res0) = LO(px);
316 px += stridex;
317
318 hx1 = HI(px);
319 LO(&res1) = LO(px);
320 px += stridex;
321
322 hx2 = HI(px);
323 LO(&res2) = LO(px);
324 px += stridex;
325
326 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
327 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
328 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
329 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
330 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
331 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
332
333 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
334 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
335 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
336 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
337 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
338 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
339 HI(&res0) = resh0;
340 HI(&res1) = resh1;
341 HI(&res2) = resh2;
342 HI(&res_c0) = res_ch0;
343 HI(&res_c1) = res_ch1;
344 HI(&res_c2) = res_ch2;
345
346 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
347 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
348 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
349 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
350 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
351 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
352 xx0 = dexp_hi0 * dexp_hi0;
353 xx1 = dexp_hi1 * dexp_hi1;
354 xx2 = dexp_hi2 * dexp_hi2;
355 xx0 = (res0 - res_c0) * xx0;
356 xx1 = (res1 - res_c1) * xx1;
357 xx2 = (res2 - res_c2) * xx2;
358 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
359 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
360 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
361
362 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
363 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
364 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
365
366 HI(&dsqrt_exp0) = sqrt_exp0;
367 HI(&dsqrt_exp1) = sqrt_exp1;
368 HI(&dsqrt_exp2) = sqrt_exp2;
369 res0 *= dsqrt_exp0;
370 res1 *= dsqrt_exp1;
371 res2 *= dsqrt_exp2;
372
373 *py = res0;
374 py += stridey;
375
376 *py = res1;
377 py += stridey;
378
379 *py = res2;
380 py += stridey;
381 }
382
383 for(; n > 0 ; n--)
384 {
385 hx0 = HI(px);
386
387 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
388 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
389
390 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
391 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
392 HI(&res0) = resh0;
393 LO(&res0) = LO(px);
394 HI(&res_c0) = res_ch0;
395 LO(&res_c0) = 0;
396
397 px += stridex;
398
399 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
400 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
401 xx0 = dexp_hi0 * dexp_hi0;
402 xx0 = (res0 - res_c0) * xx0;
403 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
404
405 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
406
407 HI(&dsqrt_exp0) = sqrt_exp0;
408 LO(&dsqrt_exp0) = 0;
409 res0 *= dsqrt_exp0;
410
411 *py = res0;
412 py += stridey;
413 }
414 }
415