```   1 /*
3  *
4  * The contents of this file are subject to the terms of the
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
18  *
20  */
21
22 /*
24  */
25 /*
27  * Use is subject to license terms.
28  */
29
30 #include <sys/isa_defs.h>
31 #include "libm_synonyms.h"
32 #include "libm_inlines.h"
33
34 #ifdef _LITTLE_ENDIAN
35 #define HI(x)   *(1+(int*)x)
36 #define LO(x)   *(unsigned*)x
37 #else
38 #define HI(x)   *(int*)x
39 #define LO(x)   *(1+(unsigned*)x)
40 #endif
41
42 #ifdef __RESTRICT
43 #define restrict _Restrict
44 #else
45 #define restrict
46 #endif
47
48 /* double rsqrt(double x)
49  *
50  * Method :
51  *      1. Special cases:
52  *              for x = NaN                             => QNaN;
53  *              for x = +Inf                            => 0;
54  *              for x is negative, -Inf                 => QNaN + invalid;
55  *              for x = +0                              => +Inf + divide-by-zero;
56  *              for x = -0                              => -Inf + divide-by-zero.
57  *      2. Computes reciprocal square root from:
58  *              x = m * 2**n
59  *      Where:
60  *              m = [0.5, 2),
61  *              n = ((exponent + 1) & ~1).
62  *      Then:
63  *              rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64  *      2. Computes 1/sqrt(m) from:
65  *              1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66  *      Where:
67  *              m = m0 + dm,
68  *              m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
69  *              m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
70  *              m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
71  *      Then:
72  *              1/sqrt(m0) is looked up in a table,
73  *              1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
74  *              1/sqrt(1 + (1/m0)*dm) is computed using approximation:
75  *                      1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
76  *                                              * z + a2) * z + a1) * z + a0
77  *                      where z = [-1/128, 1/128].
78  *
79  * Accuracy:
80  *      The maximum relative error for the approximating
81  *      polynomial is 2**(-56.26).
82  *      Maximum error observed: less than 0.563 ulp after 1.500.000.000
83  *      results.
84  */
85
86 #define sqrt __sqrt
87
88 extern double sqrt (double);
89 extern const double __vlibm_TBL_rsqrt[];
90
91 static void
92 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
93
94 #pragma no_inline(__vrsqrt_n)
95
96 #define RETURN(ret)                                             \
97 {                                                               \
98         *py = (ret);                                            \
99         py += stridey;                                          \
100         if (n_n == 0)                                           \
101         {                                                       \
102                 spx = px; spy = py;                             \
103                 hx = HI(px);                                    \
104                 continue;                                       \
105         }                                                       \
106         n--;                                                    \
107         break;                                                  \
108 }
109
110 static const double
111         DONE = 1.0,
112         K1 = -5.00000000000005209867e-01,
113         K2 =  3.75000000000004884257e-01,
114         K3 = -3.12499999317136886551e-01,
115         K4 =  2.73437499359815081532e-01,
116         K5 = -2.46116125605037803130e-01,
117         K6 =  2.25606914648617522896e-01;
118
119 void
120 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
121 {
122         double          *spx, *spy;
123         int             ax, lx, hx, n_n;
124         double          res;
125
126         while (n > 1)
127         {
128                 n_n = 0;
129                 spx = px;
130                 spy = py;
131                 hx = HI(px);
132                 for (; n > 1 ; n--)
133                 {
134                         px += stridex;
135                         if (hx >= 0x7ff00000)                /* X = NaN or Inf       */
136                         {
137                                 res = *(px - stridex);
138                                 RETURN (DONE / res)
139                         }
140
141                         py += stridey;
142
143                         if (hx < 0x00100000)         /* X = denormal, zero or negative       */
144                         {
145                                 py -= stridey;
146                                 ax = hx & 0x7fffffff;
147                                 lx = LO((px - stridex));
148                                 res = *(px - stridex);
149
150                                 if ((ax | lx) == 0)     /* |X| = zero   */
151                                 {
152                                         RETURN (DONE / res)
153                                 }
154                                 else if (hx >= 0)    /* X = denormal */
155                                 {
156                                         double          res_c0, dsqrt_exp0;
157                                         int             ind0, sqrt_exp0;
158                                         double          xx0, dexp_hi0, dexp_lo0;
159                                         int             hx0, resh0, res_ch0;
160
161                                         res = *(long long*)&res;
162
163                                         hx0 = HI(&res);
164                                         sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
165                                         ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
166
167                                         resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
168                                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
169                                         HI(&res) = resh0;
170                                         HI(&res_c0) = res_ch0;
171                                         LO(&res_c0) = 0;
172
173                                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
174                                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
175                                         xx0 = dexp_hi0 * dexp_hi0;
176                                         xx0 = (res - res_c0) * xx0;
177                                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
178
179                                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
180
181                                         HI(&dsqrt_exp0) = sqrt_exp0;
182                                         LO(&dsqrt_exp0) = 0;
183                                         res *= dsqrt_exp0;
184
185                                         RETURN (res)
186                                 }
187                                 else    /* X = negative */
188                                 {
189                                         RETURN (sqrt(res))
190                                 }
191                         }
192                         n_n++;
193                         hx = HI(px);
194                 }
195                 if (n_n > 0)
196                         __vrsqrt_n(n_n, spx, stridex, spy, stridey);
197         }
198         if (n > 0)
199         {
200                 hx = HI(px);
201
202                 if (hx >= 0x7ff00000)                /* X = NaN or Inf       */
203                 {
204                         res = *px;
205                         *py = DONE / res;
206                 }
207                 else if (hx < 0x00100000)    /* X = denormal, zero or negative       */
208                 {
209                         ax = hx & 0x7fffffff;
210                         lx = LO(px);
211                         res = *px;
212
213                         if ((ax | lx) == 0)     /* |X| = zero   */
214                         {
215                                 *py = DONE / res;
216                         }
217                         else if (hx >= 0)    /* X = denormal */
218                         {
219                                 double          res_c0, dsqrt_exp0;
220                                 int             ind0, sqrt_exp0;
221                                 double          xx0, dexp_hi0, dexp_lo0;
222                                 int             hx0, resh0, res_ch0;
223
224                                 res = *(long long*)&res;
225
226                                 hx0 = HI(&res);
227                                 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
228                                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
229
230                                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
231                                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
232                                 HI(&res) = resh0;
233                                 HI(&res_c0) = res_ch0;
234                                 LO(&res_c0) = 0;
235
236                                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
237                                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
238                                 xx0 = dexp_hi0 * dexp_hi0;
239                                 xx0 = (res - res_c0) * xx0;
240                                 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
241
242                                 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
243
244                                 HI(&dsqrt_exp0) = sqrt_exp0;
245                                 LO(&dsqrt_exp0) = 0;
246                                 res *= dsqrt_exp0;
247
248                                 *py = res;
249                         }
250                         else    /* X = negative */
251                         {
252                                 *py = sqrt(res);
253                         }
254                 }
255                 else
256                 {
257                         double          res_c0, dsqrt_exp0;
258                         int             ind0, sqrt_exp0;
259                         double          xx0, dexp_hi0, dexp_lo0;
260                         int             resh0, res_ch0;
261
262                         sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
263                         ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
264
265                         resh0 = (hx & 0x001fffff) | 0x3fe00000;
266                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
267                         HI(&res) = resh0;
268                         LO(&res) = LO(px);
269                         HI(&res_c0) = res_ch0;
270                         LO(&res_c0) = 0;
271
272                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
273                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
274                         xx0 = dexp_hi0 * dexp_hi0;
275                         xx0 = (res - res_c0) * xx0;
276                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
277
278                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
279
280                         HI(&dsqrt_exp0) = sqrt_exp0;
281                         LO(&dsqrt_exp0) = 0;
282                         res *= dsqrt_exp0;
283
284                         *py = res;
285                 }
286         }
287 }
288
289 static void
290 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
291 {
292         double          res0, res_c0, dsqrt_exp0;
293         double          res1, res_c1, dsqrt_exp1;
294         double          res2, res_c2, dsqrt_exp2;
295         int             ind0, sqrt_exp0;
296         int             ind1, sqrt_exp1;
297         int             ind2, sqrt_exp2;
298         double          xx0, dexp_hi0, dexp_lo0;
299         double          xx1, dexp_hi1, dexp_lo1;
300         double          xx2, dexp_hi2, dexp_lo2;
301         int             hx0, resh0, res_ch0;
302         int             hx1, resh1, res_ch1;
303         int             hx2, resh2, res_ch2;
304
305         LO(&dsqrt_exp0) = 0;
306         LO(&dsqrt_exp1) = 0;
307         LO(&dsqrt_exp2) = 0;
308         LO(&res_c0) = 0;
309         LO(&res_c1) = 0;
310         LO(&res_c2) = 0;
311
312         for(; n > 2 ; n -= 3)
313         {
314                 hx0 = HI(px);
315                 LO(&res0) = LO(px);
316                 px += stridex;
317
318                 hx1 = HI(px);
319                 LO(&res1) = LO(px);
320                 px += stridex;
321
322                 hx2 = HI(px);
323                 LO(&res2) = LO(px);
324                 px += stridex;
325
326                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
327                 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
328                 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
329                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
330                 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
331                 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
332
333                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
334                 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
335                 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
336                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
337                 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
338                 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
339                 HI(&res0) = resh0;
340                 HI(&res1) = resh1;
341                 HI(&res2) = resh2;
342                 HI(&res_c0) = res_ch0;
343                 HI(&res_c1) = res_ch1;
344                 HI(&res_c2) = res_ch2;
345
346                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
347                 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
348                 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
349                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
350                 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
351                 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
352                 xx0 = dexp_hi0 * dexp_hi0;
353                 xx1 = dexp_hi1 * dexp_hi1;
354                 xx2 = dexp_hi2 * dexp_hi2;
355                 xx0 = (res0 - res_c0) * xx0;
356                 xx1 = (res1 - res_c1) * xx1;
357                 xx2 = (res2 - res_c2) * xx2;
358                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
359                 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
360                 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
361
362                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
363                 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
364                 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
365
366                 HI(&dsqrt_exp0) = sqrt_exp0;
367                 HI(&dsqrt_exp1) = sqrt_exp1;
368                 HI(&dsqrt_exp2) = sqrt_exp2;
369                 res0 *= dsqrt_exp0;
370                 res1 *= dsqrt_exp1;
371                 res2 *= dsqrt_exp2;
372
373                 *py = res0;
374                 py += stridey;
375
376                 *py = res1;
377                 py += stridey;
378
379                 *py = res2;
380                 py += stridey;
381         }
382
383         for(; n > 0 ; n--)
384         {
385                 hx0 = HI(px);
386
387                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
388                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
389
390                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
391                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
392                 HI(&res0) = resh0;
393                 LO(&res0) = LO(px);
394                 HI(&res_c0) = res_ch0;
395                 LO(&res_c0) = 0;
396
397                 px += stridex;
398
399                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
400                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
401                 xx0 = dexp_hi0 * dexp_hi0;
402                 xx0 = (res0 - res_c0) * xx0;
403                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
404
405                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
406
407                 HI(&dsqrt_exp0) = sqrt_exp0;
408                 LO(&dsqrt_exp0) = 0;
409                 res0 *= dsqrt_exp0;
410
411                 *py = res0;
412                 py += stridey;
413         }
414 }
415
```