Print this page
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/lib/libmvec/common/__vsinf.c
+++ new/usr/src/lib/libmvec/common/__vsinf.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 23 */
24 24 /*
25 25 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
26 26 * Use is subject to license terms.
27 27 */
28 28
29 29 /*
30 30 * __vsinf: single precision vector sin
31 31 *
32 32 * Algorithm:
33 33 *
34 34 * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
35 35 * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
36 36 * z*C2))), where z = x*x, all evaluated in double precision.
37 37 *
38 38 * Accuracy:
39 39 *
40 40 * The largest error is less than 0.6 ulps.
41 41 */
42 42
43 43 #include <sys/isa_defs.h>
44 44
45 45 #ifdef _LITTLE_ENDIAN
46 46 #define HI(x) *(1+(int *)&x)
47 47 #define LO(x) *(unsigned *)&x
48 48 #else
49 49 #define HI(x) *(int *)&x
50 50 #define LO(x) *(1+(unsigned *)&x)
51 51 #endif
52 52
53 53 #ifdef __RESTRICT
54 54 #define restrict _Restrict
55 55 #else
56 56 #define restrict
57 57 #endif
58 58
59 59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
60 60
61 61 static const double C[] = {
62 62 -1.66666552424430847168e-01, /* 2^ -3 * -1.5555460000000 */
63 63 8.33219196647405624390e-03, /* 2^ -7 * 1.11077E0000000 */
64 64 -1.95187909412197768688e-04, /* 2^-13 * -1.9956B60000000 */
65 65 1.0,
66 66 -0.5,
67 67 4.16666455566883087158e-02, /* 2^ -5 * 1.55554A0000000 */
68 68 -1.38873036485165357590e-03, /* 2^-10 * -1.6C0C1E0000000 */
69 69 2.44309903791872784495e-05, /* 2^-16 * 1.99E24E0000000 */
70 70 0.636619772367581343075535, /* 2^ -1 * 1.45F306DC9C883 */
71 71 6755399441055744.0, /* 2^ 52 * 1.8000000000000 */
72 72 1.570796326734125614166, /* 2^ 0 * 1.921FB54400000 */
73 73 6.077100506506192601475e-11, /* 2^-34 * 1.0B4611A626331 */
74 74 };
75 75
76 76 #define S0 C[0]
77 77 #define S1 C[1]
78 78 #define S2 C[2]
79 79 #define one C[3]
80 80 #define mhalf C[4]
81 81 #define C0 C[5]
82 82 #define C1 C[6]
83 83 #define C2 C[7]
84 84 #define invpio2 C[8]
85 85 #define c3two51 C[9]
86 86 #define pio2_1 C[10]
87 87 #define pio2_t C[11]
88 88
89 89 #define PREPROCESS(N, index, label) \
90 90 hx = *(int *)x; \
91 91 ix = hx & 0x7fffffff; \
92 92 t = *x; \
93 93 x += stridex; \
94 94 if (ix <= 0x3f490fdb) { /* |x| < pi/4 */ \
95 95 if (ix == 0) { \
96 96 y[index] = t; \
97 97 goto label; \
98 98 } \
99 99 y##N = (double)t; \
100 100 n##N = 0; \
101 101 } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */ \
102 102 y##N = (double)t; \
103 103 medium = 1; \
104 104 } else { \
105 105 if (ix >= 0x7f800000) { /* inf or nan */ \
106 106 y[index] = t / t; \
107 107 goto label; \
108 108 } \
109 109 z##N = y##N = (double)t; \
110 110 hx = HI(y##N); \
111 111 n##N = ((hx >> 20) & 0x7ff) - 1046; \
112 112 HI(z##N) = (hx & 0xfffff) | 0x41600000; \
113 113 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0); \
114 114 if (hx < 0) { \
115 115 y##N = -y##N; \
116 116 n##N = -n##N; \
117 117 } \
118 118 z##N = y##N * y##N; \
119 119 if (n##N & 1) { /* compute cos y */ \
120 120 f##N = (float)(one + z##N * (mhalf + z##N * \
121 121 (C0 + z##N * (C1 + z##N * C2)))); \
122 122 } else { /* compute sin y */ \
123 123 f##N = (float)(y##N + y##N * z##N * (S0 + \
124 124 z##N * (S1 + z##N * S2))); \
125 125 } \
126 126 y[index] = (n##N & 2)? -f##N : f##N; \
127 127 goto label; \
128 128 }
129 129
130 130 #define PROCESS(N) \
131 131 if (medium) { \
132 132 z##N = y##N * invpio2 + c3two51; \
133 133 n##N = LO(z##N); \
134 134 z##N -= c3two51; \
135 135 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t; \
136 136 } \
137 137 z##N = y##N * y##N; \
138 138 if (n##N & 1) { /* compute cos y */ \
139 139 f##N = (float)(one + z##N * (mhalf + z##N * (C0 + \
140 140 z##N * (C1 + z##N * C2)))); \
141 141 } else { /* compute sin y */ \
142 142 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + \
143 143 z##N * S2))); \
144 144 } \
↓ open down ↓ |
144 lines elided |
↑ open up ↑ |
145 145 *y = (n##N & 2)? -f##N : f##N; \
146 146 y += stridey
147 147
148 148 void
149 149 __vsinf(int n, float *restrict x, int stridex, float *restrict y,
150 150 int stridey)
151 151 {
152 152 double y0, y1, y2, y3;
153 153 double z0, z1, z2, z3;
154 154 float f0, f1, f2, f3, t;
155 - int n0, n1, n2, n3, hx, ix, medium;
155 + int n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
156 156
157 157 y -= stridey;
158 158
159 159 for (;;) {
160 160 begin:
161 161 y += stridey;
162 162
163 163 if (--n < 0)
164 164 break;
165 165
166 166 medium = 0;
167 167 PREPROCESS(0, 0, begin);
168 168
169 169 if (--n < 0)
170 170 goto process1;
171 171
172 172 PREPROCESS(1, stridey, process1);
173 173
174 174 if (--n < 0)
175 175 goto process2;
176 176
177 177 PREPROCESS(2, (stridey << 1), process2);
178 178
179 179 if (--n < 0)
180 180 goto process3;
181 181
182 182 PREPROCESS(3, (stridey << 1) + stridey, process3);
183 183
184 184 if (medium) {
185 185 z0 = y0 * invpio2 + c3two51;
186 186 z1 = y1 * invpio2 + c3two51;
187 187 z2 = y2 * invpio2 + c3two51;
188 188 z3 = y3 * invpio2 + c3two51;
189 189
190 190 n0 = LO(z0);
191 191 n1 = LO(z1);
192 192 n2 = LO(z2);
193 193 n3 = LO(z3);
194 194
195 195 z0 -= c3two51;
196 196 z1 -= c3two51;
197 197 z2 -= c3two51;
198 198 z3 -= c3two51;
199 199
200 200 y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
201 201 y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
202 202 y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
203 203 y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
204 204 }
205 205
206 206 z0 = y0 * y0;
207 207 z1 = y1 * y1;
208 208 z2 = y2 * y2;
209 209 z3 = y3 * y3;
210 210
211 211 hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
212 212 ((n3 & 1) << 3);
213 213 switch (hx) {
214 214 case 0:
215 215 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
216 216 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
217 217 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
218 218 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
219 219 break;
220 220
221 221 case 1:
222 222 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
223 223 z0 * (C1 + z0 * C2))));
224 224 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
225 225 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
226 226 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
227 227 break;
228 228
229 229 case 2:
230 230 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
231 231 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
232 232 z1 * (C1 + z1 * C2))));
233 233 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
234 234 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
235 235 break;
236 236
237 237 case 3:
238 238 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
239 239 z0 * (C1 + z0 * C2))));
240 240 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
241 241 z1 * (C1 + z1 * C2))));
242 242 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
243 243 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
244 244 break;
245 245
246 246 case 4:
247 247 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
248 248 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
249 249 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
250 250 z2 * (C1 + z2 * C2))));
251 251 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
252 252 break;
253 253
254 254 case 5:
255 255 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
256 256 z0 * (C1 + z0 * C2))));
257 257 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
258 258 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
259 259 z2 * (C1 + z2 * C2))));
260 260 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
261 261 break;
262 262
263 263 case 6:
264 264 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
265 265 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
266 266 z1 * (C1 + z1 * C2))));
267 267 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
268 268 z2 * (C1 + z2 * C2))));
269 269 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
270 270 break;
271 271
272 272 case 7:
273 273 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
274 274 z0 * (C1 + z0 * C2))));
275 275 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
276 276 z1 * (C1 + z1 * C2))));
277 277 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
278 278 z2 * (C1 + z2 * C2))));
279 279 f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
280 280 break;
281 281
282 282 case 8:
283 283 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
284 284 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
285 285 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
286 286 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
287 287 z3 * (C1 + z3 * C2))));
288 288 break;
289 289
290 290 case 9:
291 291 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
292 292 z0 * (C1 + z0 * C2))));
293 293 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
294 294 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
295 295 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
296 296 z3 * (C1 + z3 * C2))));
297 297 break;
298 298
299 299 case 10:
300 300 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
301 301 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
302 302 z1 * (C1 + z1 * C2))));
303 303 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
304 304 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
305 305 z3 * (C1 + z3 * C2))));
306 306 break;
307 307
308 308 case 11:
309 309 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
310 310 z0 * (C1 + z0 * C2))));
311 311 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
312 312 z1 * (C1 + z1 * C2))));
313 313 f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
314 314 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
315 315 z3 * (C1 + z3 * C2))));
316 316 break;
317 317
318 318 case 12:
319 319 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
320 320 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
321 321 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
322 322 z2 * (C1 + z2 * C2))));
323 323 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
324 324 z3 * (C1 + z3 * C2))));
325 325 break;
326 326
327 327 case 13:
328 328 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
329 329 z0 * (C1 + z0 * C2))));
330 330 f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
331 331 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
332 332 z2 * (C1 + z2 * C2))));
333 333 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
334 334 z3 * (C1 + z3 * C2))));
335 335 break;
336 336
337 337 case 14:
338 338 f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
339 339 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
340 340 z1 * (C1 + z1 * C2))));
341 341 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
342 342 z2 * (C1 + z2 * C2))));
343 343 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
344 344 z3 * (C1 + z3 * C2))));
345 345 break;
346 346
347 347 default:
348 348 f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
349 349 z0 * (C1 + z0 * C2))));
350 350 f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
351 351 z1 * (C1 + z1 * C2))));
352 352 f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
353 353 z2 * (C1 + z2 * C2))));
354 354 f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
355 355 z3 * (C1 + z3 * C2))));
356 356 }
357 357
358 358 *y = (n0 & 2)? -f0 : f0;
359 359 y += stridey;
360 360 *y = (n1 & 2)? -f1 : f1;
361 361 y += stridey;
362 362 *y = (n2 & 2)? -f2 : f2;
363 363 y += stridey;
364 364 *y = (n3 & 2)? -f3 : f3;
365 365 continue;
366 366
367 367 process1:
368 368 PROCESS(0);
369 369 continue;
370 370
371 371 process2:
372 372 PROCESS(0);
373 373 PROCESS(1);
374 374 continue;
375 375
376 376 process3:
377 377 PROCESS(0);
378 378 PROCESS(1);
379 379 PROCESS(2);
380 380 }
381 381 }
↓ open down ↓ |
216 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX