1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29 /*
  30  * __vsinf: single precision vector sin
  31  *
  32  * Algorithm:
  33  *
  34  * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
  35  * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
  36  * z*C2))), where z = x*x, all evaluated in double precision.
  37  *
  38  * Accuracy:
  39  *
  40  * The largest error is less than 0.6 ulps.
  41  */
  42 
  43 #include <sys/isa_defs.h>
  44 
  45 #ifdef _LITTLE_ENDIAN
  46 #define HI(x)   *(1+(int *)&x)
  47 #define LO(x)   *(unsigned *)&x
  48 #else
  49 #define HI(x)   *(int *)&x
  50 #define LO(x)   *(1+(unsigned *)&x)
  51 #endif
  52 
  53 #ifdef __RESTRICT
  54 #define restrict _Restrict
  55 #else
  56 #define restrict
  57 #endif
  58 
  59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
  60 
  61 static const double C[] = {
  62         -1.66666552424430847168e-01,    /* 2^ -3 * -1.5555460000000 */
  63         8.33219196647405624390e-03,     /* 2^ -7 *  1.11077E0000000 */
  64         -1.95187909412197768688e-04,    /* 2^-13 * -1.9956B60000000 */
  65         1.0,
  66         -0.5,
  67         4.16666455566883087158e-02,     /* 2^ -5 *  1.55554A0000000 */
  68         -1.38873036485165357590e-03,    /* 2^-10 * -1.6C0C1E0000000 */
  69         2.44309903791872784495e-05,     /* 2^-16 *  1.99E24E0000000 */
  70         0.636619772367581343075535,     /* 2^ -1  * 1.45F306DC9C883 */
  71         6755399441055744.0,             /* 2^ 52  * 1.8000000000000 */
  72         1.570796326734125614166,        /* 2^  0  * 1.921FB54400000 */
  73         6.077100506506192601475e-11,    /* 2^-34  * 1.0B4611A626331 */
  74 };
  75 
  76 #define S0      C[0]
  77 #define S1      C[1]
  78 #define S2      C[2]
  79 #define one     C[3]
  80 #define mhalf   C[4]
  81 #define C0      C[5]
  82 #define C1      C[6]
  83 #define C2      C[7]
  84 #define invpio2 C[8]
  85 #define c3two51 C[9]
  86 #define pio2_1  C[10]
  87 #define pio2_t  C[11]
  88 
  89 #define PREPROCESS(N, index, label)                                     \
  90         hx = *(int *)x;                                                 \
  91         ix = hx & 0x7fffffff;                                               \
  92         t = *x;                                                         \
  93         x += stridex;                                                   \
  94         if (ix <= 0x3f490fdb) { /* |x| < pi/4 */                  \
  95                 if (ix == 0) {                                          \
  96                         y[index] = t;                                   \
  97                         goto label;                                     \
  98                 }                                                       \
  99                 y##N = (double)t;                                       \
 100                 n##N = 0;                                               \
 101         } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */                \
 102                 y##N = (double)t;                                       \
 103                 medium = 1;                                             \
 104         } else {                                                        \
 105                 if (ix >= 0x7f800000) { /* inf or nan */             \
 106                         y[index] = t / t;                               \
 107                         goto label;                                     \
 108                 }                                                       \
 109                 z##N = y##N = (double)t;                                \
 110                 hx = HI(y##N);                                          \
 111                 n##N = ((hx >> 20) & 0x7ff) - 1046;                   \
 112                 HI(z##N) = (hx & 0xfffff) | 0x41600000;                     \
 113                 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0);     \
 114                 if (hx < 0) {                                                \
 115                         y##N = -y##N;                                   \
 116                         n##N = -n##N;                                   \
 117                 }                                                       \
 118                 z##N = y##N * y##N;                                     \
 119                 if (n##N & 1) { /* compute cos y */                 \
 120                         f##N = (float)(one + z##N * (mhalf + z##N *     \
 121                             (C0 + z##N * (C1 + z##N * C2))));           \
 122                 } else { /* compute sin y */                            \
 123                         f##N = (float)(y##N + y##N * z##N * (S0 +       \
 124                             z##N * (S1 + z##N * S2)));                  \
 125                 }                                                       \
 126                 y[index] = (n##N & 2)? -f##N : f##N;                        \
 127                 goto label;                                             \
 128         }
 129 
 130 #define PROCESS(N)                                                      \
 131         if (medium) {                                                   \
 132                 z##N = y##N * invpio2 + c3two51;                        \
 133                 n##N = LO(z##N);                                        \
 134                 z##N -= c3two51;                                        \
 135                 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;          \
 136         }                                                               \
 137         z##N = y##N * y##N;                                             \
 138         if (n##N & 1) { /* compute cos y */                         \
 139                 f##N = (float)(one + z##N * (mhalf + z##N * (C0 +       \
 140                     z##N * (C1 + z##N * C2))));                         \
 141         } else { /* compute sin y */                                    \
 142                 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 +  \
 143                     z##N * S2)));                                       \
 144         }                                                               \
 145         *y = (n##N & 2)? -f##N : f##N;                                      \
 146         y += stridey
 147 
 148 void
 149 __vsinf(int n, float *restrict x, int stridex, float *restrict y,
 150     int stridey)
 151 {
 152         double          y0, y1, y2, y3;
 153         double          z0, z1, z2, z3;
 154         float           f0, f1, f2, f3, t;
 155         int             n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
 156 
 157         y -= stridey;
 158 
 159         for (;;) {
 160 begin:
 161                 y += stridey;
 162 
 163                 if (--n < 0)
 164                         break;
 165 
 166                 medium = 0;
 167                 PREPROCESS(0, 0, begin);
 168 
 169                 if (--n < 0)
 170                         goto process1;
 171 
 172                 PREPROCESS(1, stridey, process1);
 173 
 174                 if (--n < 0)
 175                         goto process2;
 176 
 177                 PREPROCESS(2, (stridey << 1), process2);
 178 
 179                 if (--n < 0)
 180                         goto process3;
 181 
 182                 PREPROCESS(3, (stridey << 1) + stridey, process3);
 183 
 184                 if (medium) {
 185                         z0 = y0 * invpio2 + c3two51;
 186                         z1 = y1 * invpio2 + c3two51;
 187                         z2 = y2 * invpio2 + c3two51;
 188                         z3 = y3 * invpio2 + c3two51;
 189 
 190                         n0 = LO(z0);
 191                         n1 = LO(z1);
 192                         n2 = LO(z2);
 193                         n3 = LO(z3);
 194 
 195                         z0 -= c3two51;
 196                         z1 -= c3two51;
 197                         z2 -= c3two51;
 198                         z3 -= c3two51;
 199 
 200                         y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
 201                         y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
 202                         y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
 203                         y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
 204                 }
 205 
 206                 z0 = y0 * y0;
 207                 z1 = y1 * y1;
 208                 z2 = y2 * y2;
 209                 z3 = y3 * y3;
 210 
 211                 hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
 212                     ((n3 & 1) << 3);
 213                 switch (hx) {
 214                 case 0:
 215                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 216                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 217                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 218                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 219                         break;
 220 
 221                 case 1:
 222                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 223                             z0 * (C1 + z0 * C2))));
 224                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 225                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 226                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 227                         break;
 228 
 229                 case 2:
 230                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 231                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 232                             z1 * (C1 + z1 * C2))));
 233                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 234                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 235                         break;
 236 
 237                 case 3:
 238                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 239                             z0 * (C1 + z0 * C2))));
 240                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 241                             z1 * (C1 + z1 * C2))));
 242                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 243                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 244                         break;
 245 
 246                 case 4:
 247                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 248                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 249                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 250                             z2 * (C1 + z2 * C2))));
 251                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 252                         break;
 253 
 254                 case 5:
 255                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 256                             z0 * (C1 + z0 * C2))));
 257                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 258                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 259                             z2 * (C1 + z2 * C2))));
 260                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 261                         break;
 262 
 263                 case 6:
 264                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 265                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 266                             z1 * (C1 + z1 * C2))));
 267                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 268                             z2 * (C1 + z2 * C2))));
 269                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 270                         break;
 271 
 272                 case 7:
 273                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 274                             z0 * (C1 + z0 * C2))));
 275                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 276                             z1 * (C1 + z1 * C2))));
 277                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 278                             z2 * (C1 + z2 * C2))));
 279                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 280                         break;
 281 
 282                 case 8:
 283                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 284                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 285                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 286                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 287                             z3 * (C1 + z3 * C2))));
 288                         break;
 289 
 290                 case 9:
 291                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 292                             z0 * (C1 + z0 * C2))));
 293                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 294                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 295                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 296                             z3 * (C1 + z3 * C2))));
 297                         break;
 298 
 299                 case 10:
 300                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 301                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 302                             z1 * (C1 + z1 * C2))));
 303                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 304                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 305                             z3 * (C1 + z3 * C2))));
 306                         break;
 307 
 308                 case 11:
 309                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 310                             z0 * (C1 + z0 * C2))));
 311                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 312                             z1 * (C1 + z1 * C2))));
 313                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 314                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 315                             z3 * (C1 + z3 * C2))));
 316                         break;
 317 
 318                 case 12:
 319                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 320                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 321                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 322                             z2 * (C1 + z2 * C2))));
 323                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 324                             z3 * (C1 + z3 * C2))));
 325                         break;
 326 
 327                 case 13:
 328                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 329                             z0 * (C1 + z0 * C2))));
 330                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 331                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 332                             z2 * (C1 + z2 * C2))));
 333                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 334                             z3 * (C1 + z3 * C2))));
 335                         break;
 336 
 337                 case 14:
 338                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 339                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 340                             z1 * (C1 + z1 * C2))));
 341                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 342                             z2 * (C1 + z2 * C2))));
 343                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 344                             z3 * (C1 + z3 * C2))));
 345                         break;
 346 
 347                 default:
 348                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 349                             z0 * (C1 + z0 * C2))));
 350                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 351                             z1 * (C1 + z1 * C2))));
 352                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 353                             z2 * (C1 + z2 * C2))));
 354                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 355                             z3 * (C1 + z3 * C2))));
 356                 }
 357 
 358                 *y = (n0 & 2)? -f0 : f0;
 359                 y += stridey;
 360                 *y = (n1 & 2)? -f1 : f1;
 361                 y += stridey;
 362                 *y = (n2 & 2)? -f2 : f2;
 363                 y += stridey;
 364                 *y = (n3 & 2)? -f3 : f3;
 365                 continue;
 366 
 367 process1:
 368                 PROCESS(0);
 369                 continue;
 370 
 371 process2:
 372                 PROCESS(0);
 373                 PROCESS(1);
 374                 continue;
 375 
 376 process3:
 377                 PROCESS(0);
 378                 PROCESS(1);
 379                 PROCESS(2);
 380         }
 381 }