1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29 /*
  30  * __vcosf: single precision vector cos
  31  *
  32  * Algorithm:
  33  *
  34  * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
  35  * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
  36  * z*C2))), where z = x*x, all evaluated in double precision.
  37  *
  38  * Accuracy:
  39  *
  40  * The largest error is less than 0.6 ulps.
  41  */
  42 
  43 #include <sys/isa_defs.h>
  44 
  45 #ifdef _LITTLE_ENDIAN
  46 #define HI(x)   *(1+(int *)&x)
  47 #define LO(x)   *(unsigned *)&x
  48 #else
  49 #define HI(x)   *(int *)&x
  50 #define LO(x)   *(1+(unsigned *)&x)
  51 #endif
  52 
  53 #ifdef __RESTRICT
  54 #define restrict _Restrict
  55 #else
  56 #define restrict
  57 #endif
  58 
  59 extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
  60 
  61 static const double C[] = {
  62         -1.66666552424430847168e-01,    /* 2^ -3 * -1.5555460000000 */
  63         8.33219196647405624390e-03,     /* 2^ -7 *  1.11077E0000000 */
  64         -1.95187909412197768688e-04,    /* 2^-13 * -1.9956B60000000 */
  65         1.0,
  66         -0.5,
  67         4.16666455566883087158e-02,     /* 2^ -5 *  1.55554A0000000 */
  68         -1.38873036485165357590e-03,    /* 2^-10 * -1.6C0C1E0000000 */
  69         2.44309903791872784495e-05,     /* 2^-16 *  1.99E24E0000000 */
  70         0.636619772367581343075535,     /* 2^ -1  * 1.45F306DC9C883 */
  71         6755399441055744.0,             /* 2^ 52  * 1.8000000000000 */
  72         1.570796326734125614166,        /* 2^  0  * 1.921FB54400000 */
  73         6.077100506506192601475e-11,    /* 2^-34  * 1.0B4611A626331 */
  74 };
  75 
  76 #define S0      C[0]
  77 #define S1      C[1]
  78 #define S2      C[2]
  79 #define one     C[3]
  80 #define mhalf   C[4]
  81 #define C0      C[5]
  82 #define C1      C[6]
  83 #define C2      C[7]
  84 #define invpio2 C[8]
  85 #define c3two51 C[9]
  86 #define pio2_1  C[10]
  87 #define pio2_t  C[11]
  88 
  89 #define PREPROCESS(N, index, label)                                     \
  90         hx = *(int *)x;                                                 \
  91         ix = hx & 0x7fffffff;                                               \
  92         t = *x;                                                         \
  93         x += stridex;                                                   \
  94         if (ix <= 0x3f490fdb) { /* |x| < pi/4 */                  \
  95                 if (ix == 0) {                                          \
  96                         y[index] = one;                                 \
  97                         goto label;                                     \
  98                 }                                                       \
  99                 y##N = (double)t;                                       \
 100                 n##N = 1;                                               \
 101         } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */                \
 102                 y##N = (double)t;                                       \
 103                 medium = 1;                                             \
 104         } else {                                                        \
 105                 if (ix >= 0x7f800000) { /* inf or nan */             \
 106                         y[index] = t / t;                               \
 107                         goto label;                                     \
 108                 }                                                       \
 109                 z##N = y##N = (double)t;                                \
 110                 hx = HI(y##N);                                          \
 111                 n##N = ((hx >> 20) & 0x7ff) - 1046;                   \
 112                 HI(z##N) = (hx & 0xfffff) | 0x41600000;                     \
 113                 n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0) + 1; \
 114                 z##N = y##N * y##N;                                     \
 115                 if (n##N & 1) { /* compute cos y */                 \
 116                         f##N = (float)(one + z##N * (mhalf + z##N *     \
 117                             (C0 + z##N * (C1 + z##N * C2))));           \
 118                 } else { /* compute sin y */                            \
 119                         f##N = (float)(y##N + y##N * z##N * (S0 +       \
 120                             z##N * (S1 + z##N * S2)));                  \
 121                 }                                                       \
 122                 y[index] = (n##N & 2)? -f##N : f##N;                        \
 123                 goto label;                                             \
 124         }
 125 
 126 #define PROCESS(N)                                                      \
 127         if (medium) {                                                   \
 128                 z##N = y##N * invpio2 + c3two51;                        \
 129                 n##N = LO(z##N) + 1;                                    \
 130                 z##N -= c3two51;                                        \
 131                 y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;          \
 132         }                                                               \
 133         z##N = y##N * y##N;                                             \
 134         if (n##N & 1) { /* compute cos y */                         \
 135                 f##N = (float)(one + z##N * (mhalf + z##N * (C0 +       \
 136                     z##N * (C1 + z##N * C2))));                         \
 137         } else { /* compute sin y */                                    \
 138                 f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 +  \
 139                     z##N * S2)));                                       \
 140         }                                                               \
 141         *y = (n##N & 2)? -f##N : f##N;                                      \
 142         y += stridey
 143 
 144 void
 145 __vcosf(int n, float *restrict x, int stridex, float *restrict y,
 146     int stridey)
 147 {
 148         double          y0, y1, y2, y3;
 149         double          z0, z1, z2, z3;
 150         float           f0, f1, f2, f3, t;
 151         int             n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
 152 
 153         y -= stridey;
 154 
 155         for (;;) {
 156 begin:
 157                 y += stridey;
 158 
 159                 if (--n < 0)
 160                         break;
 161 
 162                 medium = 0;
 163                 PREPROCESS(0, 0, begin);
 164 
 165                 if (--n < 0)
 166                         goto process1;
 167 
 168                 PREPROCESS(1, stridey, process1);
 169 
 170                 if (--n < 0)
 171                         goto process2;
 172 
 173                 PREPROCESS(2, (stridey << 1), process2);
 174 
 175                 if (--n < 0)
 176                         goto process3;
 177 
 178                 PREPROCESS(3, (stridey << 1) + stridey, process3);
 179 
 180                 if (medium) {
 181                         z0 = y0 * invpio2 + c3two51;
 182                         z1 = y1 * invpio2 + c3two51;
 183                         z2 = y2 * invpio2 + c3two51;
 184                         z3 = y3 * invpio2 + c3two51;
 185 
 186                         n0 = LO(z0) + 1;
 187                         n1 = LO(z1) + 1;
 188                         n2 = LO(z2) + 1;
 189                         n3 = LO(z3) + 1;
 190 
 191                         z0 -= c3two51;
 192                         z1 -= c3two51;
 193                         z2 -= c3two51;
 194                         z3 -= c3two51;
 195 
 196                         y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
 197                         y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
 198                         y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
 199                         y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
 200                 }
 201 
 202                 z0 = y0 * y0;
 203                 z1 = y1 * y1;
 204                 z2 = y2 * y2;
 205                 z3 = y3 * y3;
 206 
 207                 hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
 208                     ((n3 & 1) << 3);
 209                 switch (hx) {
 210                 case 0:
 211                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 212                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 213                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 214                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 215                         break;
 216 
 217                 case 1:
 218                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 219                             z0 * (C1 + z0 * C2))));
 220                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 221                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 222                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 223                         break;
 224 
 225                 case 2:
 226                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 227                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 228                             z1 * (C1 + z1 * C2))));
 229                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 230                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 231                         break;
 232 
 233                 case 3:
 234                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 235                             z0 * (C1 + z0 * C2))));
 236                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 237                             z1 * (C1 + z1 * C2))));
 238                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 239                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 240                         break;
 241 
 242                 case 4:
 243                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 244                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 245                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 246                             z2 * (C1 + z2 * C2))));
 247                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 248                         break;
 249 
 250                 case 5:
 251                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 252                             z0 * (C1 + z0 * C2))));
 253                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 254                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 255                             z2 * (C1 + z2 * C2))));
 256                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 257                         break;
 258 
 259                 case 6:
 260                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 261                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 262                             z1 * (C1 + z1 * C2))));
 263                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 264                             z2 * (C1 + z2 * C2))));
 265                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 266                         break;
 267 
 268                 case 7:
 269                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 270                             z0 * (C1 + z0 * C2))));
 271                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 272                             z1 * (C1 + z1 * C2))));
 273                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 274                             z2 * (C1 + z2 * C2))));
 275                         f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 276                         break;
 277 
 278                 case 8:
 279                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 280                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 281                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 282                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 283                             z3 * (C1 + z3 * C2))));
 284                         break;
 285 
 286                 case 9:
 287                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 288                             z0 * (C1 + z0 * C2))));
 289                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 290                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 291                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 292                             z3 * (C1 + z3 * C2))));
 293                         break;
 294 
 295                 case 10:
 296                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 297                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 298                             z1 * (C1 + z1 * C2))));
 299                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 300                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 301                             z3 * (C1 + z3 * C2))));
 302                         break;
 303 
 304                 case 11:
 305                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 306                             z0 * (C1 + z0 * C2))));
 307                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 308                             z1 * (C1 + z1 * C2))));
 309                         f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 310                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 311                             z3 * (C1 + z3 * C2))));
 312                         break;
 313 
 314                 case 12:
 315                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 316                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 317                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 318                             z2 * (C1 + z2 * C2))));
 319                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 320                             z3 * (C1 + z3 * C2))));
 321                         break;
 322 
 323                 case 13:
 324                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 325                             z0 * (C1 + z0 * C2))));
 326                         f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 327                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 328                             z2 * (C1 + z2 * C2))));
 329                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 330                             z3 * (C1 + z3 * C2))));
 331                         break;
 332 
 333                 case 14:
 334                         f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 335                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 336                             z1 * (C1 + z1 * C2))));
 337                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 338                             z2 * (C1 + z2 * C2))));
 339                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 340                             z3 * (C1 + z3 * C2))));
 341                         break;
 342 
 343                 default:
 344                         f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 345                             z0 * (C1 + z0 * C2))));
 346                         f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 347                             z1 * (C1 + z1 * C2))));
 348                         f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 349                             z2 * (C1 + z2 * C2))));
 350                         f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 351                             z3 * (C1 + z3 * C2))));
 352                 }
 353 
 354                 *y = (n0 & 2)? -f0 : f0;
 355                 y += stridey;
 356                 *y = (n1 & 2)? -f1 : f1;
 357                 y += stridey;
 358                 *y = (n2 & 2)? -f2 : f2;
 359                 y += stridey;
 360                 *y = (n3 & 2)? -f3 : f3;
 361                 continue;
 362 
 363 process1:
 364                 PROCESS(0);
 365                 continue;
 366 
 367 process2:
 368                 PROCESS(0);
 369                 PROCESS(1);
 370                 continue;
 371 
 372 process3:
 373                 PROCESS(0);
 374                 PROCESS(1);
 375                 PROCESS(2);
 376         }
 377 }