Print this page
5261 libm should stop using synonyms.h
5298 fabs is 0-sized, confuses dis(1) and others
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Gordon Ross <gwr@nexenta.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libmvec/common/__vrsqrt.c
+++ new/usr/src/lib/libmvec/common/__vrsqrt.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
21 21
22 22 /*
23 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 24 */
25 25 /*
26 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
27 27 * Use is subject to license terms.
28 28 */
29 29
30 30 #include <sys/isa_defs.h>
31 -#include "libm_synonyms.h"
32 31 #include "libm_inlines.h"
33 32
34 33 #ifdef _LITTLE_ENDIAN
35 34 #define HI(x) *(1+(int*)x)
36 35 #define LO(x) *(unsigned*)x
37 36 #else
38 37 #define HI(x) *(int*)x
39 38 #define LO(x) *(1+(unsigned*)x)
40 39 #endif
41 40
42 41 #ifdef __RESTRICT
43 42 #define restrict _Restrict
44 43 #else
45 44 #define restrict
46 45 #endif
47 46
48 47 /* double rsqrt(double x)
49 48 *
50 49 * Method :
51 50 * 1. Special cases:
52 51 * for x = NaN => QNaN;
53 52 * for x = +Inf => 0;
54 53 * for x is negative, -Inf => QNaN + invalid;
55 54 * for x = +0 => +Inf + divide-by-zero;
56 55 * for x = -0 => -Inf + divide-by-zero.
57 56 * 2. Computes reciprocal square root from:
58 57 * x = m * 2**n
59 58 * Where:
60 59 * m = [0.5, 2),
61 60 * n = ((exponent + 1) & ~1).
62 61 * Then:
63 62 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
64 63 * 2. Computes 1/sqrt(m) from:
65 64 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
66 65 * Where:
67 66 * m = m0 + dm,
68 67 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63];
69 68 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
70 69 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128.
71 70 * Then:
72 71 * 1/sqrt(m0) is looked up in a table,
73 72 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
74 73 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation:
75 74 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
↓ open down ↓ |
34 lines elided |
↑ open up ↑ |
76 75 * * z + a2) * z + a1) * z + a0
77 76 * where z = [-1/128, 1/128].
78 77 *
79 78 * Accuracy:
80 79 * The maximum relative error for the approximating
81 80 * polynomial is 2**(-56.26).
82 81 * Maximum error observed: less than 0.563 ulp after 1.500.000.000
83 82 * results.
84 83 */
85 84
86 -#define sqrt __sqrt
87 -
88 85 extern double sqrt (double);
89 86 extern const double __vlibm_TBL_rsqrt[];
90 87
91 88 static void
92 89 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
93 90
94 91 #pragma no_inline(__vrsqrt_n)
95 92
96 93 #define RETURN(ret) \
97 94 { \
98 95 *py = (ret); \
99 96 py += stridey; \
100 97 if (n_n == 0) \
101 98 { \
102 99 spx = px; spy = py; \
103 100 hx = HI(px); \
104 101 continue; \
105 102 } \
106 103 n--; \
107 104 break; \
108 105 }
109 106
110 107 static const double
111 108 DONE = 1.0,
112 109 K1 = -5.00000000000005209867e-01,
113 110 K2 = 3.75000000000004884257e-01,
114 111 K3 = -3.12499999317136886551e-01,
115 112 K4 = 2.73437499359815081532e-01,
116 113 K5 = -2.46116125605037803130e-01,
117 114 K6 = 2.25606914648617522896e-01;
118 115
119 116 void
120 117 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
121 118 {
122 119 double *spx, *spy;
123 120 int ax, lx, hx, n_n;
124 121 double res;
125 122
126 123 while (n > 1)
127 124 {
128 125 n_n = 0;
129 126 spx = px;
130 127 spy = py;
131 128 hx = HI(px);
132 129 for (; n > 1 ; n--)
133 130 {
134 131 px += stridex;
135 132 if (hx >= 0x7ff00000) /* X = NaN or Inf */
136 133 {
137 134 res = *(px - stridex);
138 135 RETURN (DONE / res)
139 136 }
140 137
141 138 py += stridey;
142 139
143 140 if (hx < 0x00100000) /* X = denormal, zero or negative */
144 141 {
145 142 py -= stridey;
146 143 ax = hx & 0x7fffffff;
147 144 lx = LO((px - stridex));
148 145 res = *(px - stridex);
149 146
150 147 if ((ax | lx) == 0) /* |X| = zero */
151 148 {
152 149 RETURN (DONE / res)
153 150 }
154 151 else if (hx >= 0) /* X = denormal */
155 152 {
156 153 double res_c0, dsqrt_exp0;
157 154 int ind0, sqrt_exp0;
158 155 double xx0, dexp_hi0, dexp_lo0;
159 156 int hx0, resh0, res_ch0;
160 157
161 158 res = *(long long*)&res;
162 159
163 160 hx0 = HI(&res);
164 161 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
165 162 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
166 163
167 164 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
168 165 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
169 166 HI(&res) = resh0;
170 167 HI(&res_c0) = res_ch0;
171 168 LO(&res_c0) = 0;
172 169
173 170 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
174 171 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
175 172 xx0 = dexp_hi0 * dexp_hi0;
176 173 xx0 = (res - res_c0) * xx0;
177 174 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
178 175
179 176 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
180 177
181 178 HI(&dsqrt_exp0) = sqrt_exp0;
182 179 LO(&dsqrt_exp0) = 0;
183 180 res *= dsqrt_exp0;
184 181
185 182 RETURN (res)
186 183 }
187 184 else /* X = negative */
188 185 {
189 186 RETURN (sqrt(res))
190 187 }
191 188 }
192 189 n_n++;
193 190 hx = HI(px);
194 191 }
195 192 if (n_n > 0)
196 193 __vrsqrt_n(n_n, spx, stridex, spy, stridey);
197 194 }
198 195 if (n > 0)
199 196 {
200 197 hx = HI(px);
201 198
202 199 if (hx >= 0x7ff00000) /* X = NaN or Inf */
203 200 {
204 201 res = *px;
205 202 *py = DONE / res;
206 203 }
207 204 else if (hx < 0x00100000) /* X = denormal, zero or negative */
208 205 {
209 206 ax = hx & 0x7fffffff;
210 207 lx = LO(px);
211 208 res = *px;
212 209
213 210 if ((ax | lx) == 0) /* |X| = zero */
214 211 {
215 212 *py = DONE / res;
216 213 }
217 214 else if (hx >= 0) /* X = denormal */
218 215 {
219 216 double res_c0, dsqrt_exp0;
220 217 int ind0, sqrt_exp0;
221 218 double xx0, dexp_hi0, dexp_lo0;
222 219 int hx0, resh0, res_ch0;
223 220
224 221 res = *(long long*)&res;
225 222
226 223 hx0 = HI(&res);
227 224 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
228 225 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
229 226
230 227 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
231 228 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
232 229 HI(&res) = resh0;
233 230 HI(&res_c0) = res_ch0;
234 231 LO(&res_c0) = 0;
235 232
236 233 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
237 234 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
238 235 xx0 = dexp_hi0 * dexp_hi0;
239 236 xx0 = (res - res_c0) * xx0;
240 237 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
241 238
242 239 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
243 240
244 241 HI(&dsqrt_exp0) = sqrt_exp0;
245 242 LO(&dsqrt_exp0) = 0;
246 243 res *= dsqrt_exp0;
247 244
248 245 *py = res;
249 246 }
250 247 else /* X = negative */
251 248 {
252 249 *py = sqrt(res);
253 250 }
254 251 }
255 252 else
256 253 {
257 254 double res_c0, dsqrt_exp0;
258 255 int ind0, sqrt_exp0;
259 256 double xx0, dexp_hi0, dexp_lo0;
260 257 int resh0, res_ch0;
261 258
262 259 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
263 260 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
264 261
265 262 resh0 = (hx & 0x001fffff) | 0x3fe00000;
266 263 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
267 264 HI(&res) = resh0;
268 265 LO(&res) = LO(px);
269 266 HI(&res_c0) = res_ch0;
270 267 LO(&res_c0) = 0;
271 268
272 269 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
273 270 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
274 271 xx0 = dexp_hi0 * dexp_hi0;
275 272 xx0 = (res - res_c0) * xx0;
276 273 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
277 274
278 275 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
279 276
280 277 HI(&dsqrt_exp0) = sqrt_exp0;
281 278 LO(&dsqrt_exp0) = 0;
282 279 res *= dsqrt_exp0;
283 280
284 281 *py = res;
285 282 }
286 283 }
287 284 }
288 285
289 286 static void
290 287 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
291 288 {
292 289 double res0, res_c0, dsqrt_exp0;
293 290 double res1, res_c1, dsqrt_exp1;
294 291 double res2, res_c2, dsqrt_exp2;
295 292 int ind0, sqrt_exp0;
296 293 int ind1, sqrt_exp1;
297 294 int ind2, sqrt_exp2;
298 295 double xx0, dexp_hi0, dexp_lo0;
299 296 double xx1, dexp_hi1, dexp_lo1;
300 297 double xx2, dexp_hi2, dexp_lo2;
301 298 int hx0, resh0, res_ch0;
302 299 int hx1, resh1, res_ch1;
303 300 int hx2, resh2, res_ch2;
304 301
305 302 LO(&dsqrt_exp0) = 0;
306 303 LO(&dsqrt_exp1) = 0;
307 304 LO(&dsqrt_exp2) = 0;
308 305 LO(&res_c0) = 0;
309 306 LO(&res_c1) = 0;
310 307 LO(&res_c2) = 0;
311 308
312 309 for(; n > 2 ; n -= 3)
313 310 {
314 311 hx0 = HI(px);
315 312 LO(&res0) = LO(px);
316 313 px += stridex;
317 314
318 315 hx1 = HI(px);
319 316 LO(&res1) = LO(px);
320 317 px += stridex;
321 318
322 319 hx2 = HI(px);
323 320 LO(&res2) = LO(px);
324 321 px += stridex;
325 322
326 323 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
327 324 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
328 325 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
329 326 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
330 327 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
331 328 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
332 329
333 330 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
334 331 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
335 332 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
336 333 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
337 334 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
338 335 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
339 336 HI(&res0) = resh0;
340 337 HI(&res1) = resh1;
341 338 HI(&res2) = resh2;
342 339 HI(&res_c0) = res_ch0;
343 340 HI(&res_c1) = res_ch1;
344 341 HI(&res_c2) = res_ch2;
345 342
346 343 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
347 344 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
348 345 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
349 346 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
350 347 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
351 348 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
352 349 xx0 = dexp_hi0 * dexp_hi0;
353 350 xx1 = dexp_hi1 * dexp_hi1;
354 351 xx2 = dexp_hi2 * dexp_hi2;
355 352 xx0 = (res0 - res_c0) * xx0;
356 353 xx1 = (res1 - res_c1) * xx1;
357 354 xx2 = (res2 - res_c2) * xx2;
358 355 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
359 356 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
360 357 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
361 358
362 359 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
363 360 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
364 361 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
365 362
366 363 HI(&dsqrt_exp0) = sqrt_exp0;
367 364 HI(&dsqrt_exp1) = sqrt_exp1;
368 365 HI(&dsqrt_exp2) = sqrt_exp2;
369 366 res0 *= dsqrt_exp0;
370 367 res1 *= dsqrt_exp1;
371 368 res2 *= dsqrt_exp2;
372 369
373 370 *py = res0;
374 371 py += stridey;
375 372
376 373 *py = res1;
377 374 py += stridey;
378 375
379 376 *py = res2;
380 377 py += stridey;
381 378 }
382 379
383 380 for(; n > 0 ; n--)
384 381 {
385 382 hx0 = HI(px);
386 383
387 384 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
388 385 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
389 386
390 387 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
391 388 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
392 389 HI(&res0) = resh0;
393 390 LO(&res0) = LO(px);
394 391 HI(&res_c0) = res_ch0;
395 392 LO(&res_c0) = 0;
396 393
397 394 px += stridex;
398 395
399 396 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
400 397 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
401 398 xx0 = dexp_hi0 * dexp_hi0;
402 399 xx0 = (res0 - res_c0) * xx0;
403 400 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
404 401
↓ open down ↓ |
307 lines elided |
↑ open up ↑ |
405 402 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
406 403
407 404 HI(&dsqrt_exp0) = sqrt_exp0;
408 405 LO(&dsqrt_exp0) = 0;
409 406 res0 *= dsqrt_exp0;
410 407
411 408 *py = res0;
412 409 py += stridey;
413 410 }
414 411 }
415 -
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX