Print this page
    
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/lib/libmvec/common/__vsinf.c
          +++ new/usr/src/lib/libmvec/common/__vsinf.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23   23   */
  24   24  /*
  25   25   * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  26   26   * Use is subject to license terms.
  27   27   */
  28   28  
  29   29  /*
  30   30   * __vsinf: single precision vector sin
  31   31   *
  32   32   * Algorithm:
  33   33   *
  34   34   * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
  35   35   * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
  36   36   * z*C2))), where z = x*x, all evaluated in double precision.
  37   37   *
  38   38   * Accuracy:
  39   39   *
  40   40   * The largest error is less than 0.6 ulps.
  41   41   */
  42   42  
  43   43  #include <sys/isa_defs.h>
  44   44  
  45   45  #ifdef _LITTLE_ENDIAN
  46   46  #define HI(x)   *(1+(int *)&x)
  47   47  #define LO(x)   *(unsigned *)&x
  48   48  #else
  49   49  #define HI(x)   *(int *)&x
  50   50  #define LO(x)   *(1+(unsigned *)&x)
  51   51  #endif
  52   52  
  53   53  #ifdef __RESTRICT
  54   54  #define restrict _Restrict
  55   55  #else
  56   56  #define restrict
  57   57  #endif
  58   58  
  59   59  extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
  60   60  
  61   61  static const double C[] = {
  62   62          -1.66666552424430847168e-01,    /* 2^ -3 * -1.5555460000000 */
  63   63          8.33219196647405624390e-03,     /* 2^ -7 *  1.11077E0000000 */
  64   64          -1.95187909412197768688e-04,    /* 2^-13 * -1.9956B60000000 */
  65   65          1.0,
  66   66          -0.5,
  67   67          4.16666455566883087158e-02,     /* 2^ -5 *  1.55554A0000000 */
  68   68          -1.38873036485165357590e-03,    /* 2^-10 * -1.6C0C1E0000000 */
  69   69          2.44309903791872784495e-05,     /* 2^-16 *  1.99E24E0000000 */
  70   70          0.636619772367581343075535,     /* 2^ -1  * 1.45F306DC9C883 */
  71   71          6755399441055744.0,             /* 2^ 52  * 1.8000000000000 */
  72   72          1.570796326734125614166,        /* 2^  0  * 1.921FB54400000 */
  73   73          6.077100506506192601475e-11,    /* 2^-34  * 1.0B4611A626331 */
  74   74  };
  75   75  
  76   76  #define S0      C[0]
  77   77  #define S1      C[1]
  78   78  #define S2      C[2]
  79   79  #define one     C[3]
  80   80  #define mhalf   C[4]
  81   81  #define C0      C[5]
  82   82  #define C1      C[6]
  83   83  #define C2      C[7]
  84   84  #define invpio2 C[8]
  85   85  #define c3two51 C[9]
  86   86  #define pio2_1  C[10]
  87   87  #define pio2_t  C[11]
  88   88  
  89   89  #define PREPROCESS(N, index, label)                                     \
  90   90          hx = *(int *)x;                                                 \
  91   91          ix = hx & 0x7fffffff;                                           \
  92   92          t = *x;                                                         \
  93   93          x += stridex;                                                   \
  94   94          if (ix <= 0x3f490fdb) { /* |x| < pi/4 */                        \
  95   95                  if (ix == 0) {                                          \
  96   96                          y[index] = t;                                   \
  97   97                          goto label;                                     \
  98   98                  }                                                       \
  99   99                  y##N = (double)t;                                       \
 100  100                  n##N = 0;                                               \
 101  101          } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */              \
 102  102                  y##N = (double)t;                                       \
 103  103                  medium = 1;                                             \
 104  104          } else {                                                        \
 105  105                  if (ix >= 0x7f800000) { /* inf or nan */                \
 106  106                          y[index] = t / t;                               \
 107  107                          goto label;                                     \
 108  108                  }                                                       \
 109  109                  z##N = y##N = (double)t;                                \
 110  110                  hx = HI(y##N);                                          \
 111  111                  n##N = ((hx >> 20) & 0x7ff) - 1046;                     \
 112  112                  HI(z##N) = (hx & 0xfffff) | 0x41600000;                 \
 113  113                  n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0);     \
 114  114                  if (hx < 0) {                                           \
 115  115                          y##N = -y##N;                                   \
 116  116                          n##N = -n##N;                                   \
 117  117                  }                                                       \
 118  118                  z##N = y##N * y##N;                                     \
 119  119                  if (n##N & 1) { /* compute cos y */                     \
 120  120                          f##N = (float)(one + z##N * (mhalf + z##N *     \
 121  121                              (C0 + z##N * (C1 + z##N * C2))));           \
 122  122                  } else { /* compute sin y */                            \
 123  123                          f##N = (float)(y##N + y##N * z##N * (S0 +       \
 124  124                              z##N * (S1 + z##N * S2)));                  \
 125  125                  }                                                       \
 126  126                  y[index] = (n##N & 2)? -f##N : f##N;                    \
 127  127                  goto label;                                             \
 128  128          }
 129  129  
 130  130  #define PROCESS(N)                                                      \
 131  131          if (medium) {                                                   \
 132  132                  z##N = y##N * invpio2 + c3two51;                        \
 133  133                  n##N = LO(z##N);                                        \
 134  134                  z##N -= c3two51;                                        \
 135  135                  y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;          \
 136  136          }                                                               \
 137  137          z##N = y##N * y##N;                                             \
 138  138          if (n##N & 1) { /* compute cos y */                             \
 139  139                  f##N = (float)(one + z##N * (mhalf + z##N * (C0 +       \
 140  140                      z##N * (C1 + z##N * C2))));                         \
 141  141          } else { /* compute sin y */                                    \
 142  142                  f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 +  \
 143  143                      z##N * S2)));                                       \
 144  144          }                                                               \
  
    | 
      ↓ open down ↓ | 
    144 lines elided | 
    
      ↑ open up ↑ | 
  
 145  145          *y = (n##N & 2)? -f##N : f##N;                                  \
 146  146          y += stridey
 147  147  
 148  148  void
 149  149  __vsinf(int n, float *restrict x, int stridex, float *restrict y,
 150  150      int stridey)
 151  151  {
 152  152          double          y0, y1, y2, y3;
 153  153          double          z0, z1, z2, z3;
 154  154          float           f0, f1, f2, f3, t;
 155      -        int             n0, n1, n2, n3, hx, ix, medium;
      155 +        int             n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
 156  156  
 157  157          y -= stridey;
 158  158  
 159  159          for (;;) {
 160  160  begin:
 161  161                  y += stridey;
 162  162  
 163  163                  if (--n < 0)
 164  164                          break;
 165  165  
 166  166                  medium = 0;
 167  167                  PREPROCESS(0, 0, begin);
 168  168  
 169  169                  if (--n < 0)
 170  170                          goto process1;
 171  171  
 172  172                  PREPROCESS(1, stridey, process1);
 173  173  
 174  174                  if (--n < 0)
 175  175                          goto process2;
 176  176  
 177  177                  PREPROCESS(2, (stridey << 1), process2);
 178  178  
 179  179                  if (--n < 0)
 180  180                          goto process3;
 181  181  
 182  182                  PREPROCESS(3, (stridey << 1) + stridey, process3);
 183  183  
 184  184                  if (medium) {
 185  185                          z0 = y0 * invpio2 + c3two51;
 186  186                          z1 = y1 * invpio2 + c3two51;
 187  187                          z2 = y2 * invpio2 + c3two51;
 188  188                          z3 = y3 * invpio2 + c3two51;
 189  189  
 190  190                          n0 = LO(z0);
 191  191                          n1 = LO(z1);
 192  192                          n2 = LO(z2);
 193  193                          n3 = LO(z3);
 194  194  
 195  195                          z0 -= c3two51;
 196  196                          z1 -= c3two51;
 197  197                          z2 -= c3two51;
 198  198                          z3 -= c3two51;
 199  199  
 200  200                          y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
 201  201                          y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
 202  202                          y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
 203  203                          y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
 204  204                  }
 205  205  
 206  206                  z0 = y0 * y0;
 207  207                  z1 = y1 * y1;
 208  208                  z2 = y2 * y2;
 209  209                  z3 = y3 * y3;
 210  210  
 211  211                  hx = (n0 & 1) | ((n1 & 1) << 1) | ((n2 & 1) << 2) |
 212  212                      ((n3 & 1) << 3);
 213  213                  switch (hx) {
 214  214                  case 0:
 215  215                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 216  216                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 217  217                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 218  218                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 219  219                          break;
 220  220  
 221  221                  case 1:
 222  222                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 223  223                              z0 * (C1 + z0 * C2))));
 224  224                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 225  225                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 226  226                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 227  227                          break;
 228  228  
 229  229                  case 2:
 230  230                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 231  231                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 232  232                              z1 * (C1 + z1 * C2))));
 233  233                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 234  234                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 235  235                          break;
 236  236  
 237  237                  case 3:
 238  238                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 239  239                              z0 * (C1 + z0 * C2))));
 240  240                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 241  241                              z1 * (C1 + z1 * C2))));
 242  242                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 243  243                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 244  244                          break;
 245  245  
 246  246                  case 4:
 247  247                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 248  248                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 249  249                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 250  250                              z2 * (C1 + z2 * C2))));
 251  251                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 252  252                          break;
 253  253  
 254  254                  case 5:
 255  255                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 256  256                              z0 * (C1 + z0 * C2))));
 257  257                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 258  258                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 259  259                              z2 * (C1 + z2 * C2))));
 260  260                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 261  261                          break;
 262  262  
 263  263                  case 6:
 264  264                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 265  265                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 266  266                              z1 * (C1 + z1 * C2))));
 267  267                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 268  268                              z2 * (C1 + z2 * C2))));
 269  269                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 270  270                          break;
 271  271  
 272  272                  case 7:
 273  273                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 274  274                              z0 * (C1 + z0 * C2))));
 275  275                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 276  276                              z1 * (C1 + z1 * C2))));
 277  277                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 278  278                              z2 * (C1 + z2 * C2))));
 279  279                          f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 280  280                          break;
 281  281  
 282  282                  case 8:
 283  283                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 284  284                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 285  285                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 286  286                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 287  287                              z3 * (C1 + z3 * C2))));
 288  288                          break;
 289  289  
 290  290                  case 9:
 291  291                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 292  292                              z0 * (C1 + z0 * C2))));
 293  293                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 294  294                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 295  295                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 296  296                              z3 * (C1 + z3 * C2))));
 297  297                          break;
 298  298  
 299  299                  case 10:
 300  300                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 301  301                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 302  302                              z1 * (C1 + z1 * C2))));
 303  303                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 304  304                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 305  305                              z3 * (C1 + z3 * C2))));
 306  306                          break;
 307  307  
 308  308                  case 11:
 309  309                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 310  310                              z0 * (C1 + z0 * C2))));
 311  311                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 312  312                              z1 * (C1 + z1 * C2))));
 313  313                          f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 314  314                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 315  315                              z3 * (C1 + z3 * C2))));
 316  316                          break;
 317  317  
 318  318                  case 12:
 319  319                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 320  320                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 321  321                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 322  322                              z2 * (C1 + z2 * C2))));
 323  323                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 324  324                              z3 * (C1 + z3 * C2))));
 325  325                          break;
 326  326  
 327  327                  case 13:
 328  328                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 329  329                              z0 * (C1 + z0 * C2))));
 330  330                          f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 331  331                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 332  332                              z2 * (C1 + z2 * C2))));
 333  333                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 334  334                              z3 * (C1 + z3 * C2))));
 335  335                          break;
 336  336  
 337  337                  case 14:
 338  338                          f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 339  339                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 340  340                              z1 * (C1 + z1 * C2))));
 341  341                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 342  342                              z2 * (C1 + z2 * C2))));
 343  343                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 344  344                              z3 * (C1 + z3 * C2))));
 345  345                          break;
 346  346  
 347  347                  default:
 348  348                          f0 = (float)(one + z0 * (mhalf + z0 * (C0 +
 349  349                              z0 * (C1 + z0 * C2))));
 350  350                          f1 = (float)(one + z1 * (mhalf + z1 * (C0 +
 351  351                              z1 * (C1 + z1 * C2))));
 352  352                          f2 = (float)(one + z2 * (mhalf + z2 * (C0 +
 353  353                              z2 * (C1 + z2 * C2))));
 354  354                          f3 = (float)(one + z3 * (mhalf + z3 * (C0 +
 355  355                              z3 * (C1 + z3 * C2))));
 356  356                  }
 357  357  
 358  358                  *y = (n0 & 2)? -f0 : f0;
 359  359                  y += stridey;
 360  360                  *y = (n1 & 2)? -f1 : f1;
 361  361                  y += stridey;
 362  362                  *y = (n2 & 2)? -f2 : f2;
 363  363                  y += stridey;
 364  364                  *y = (n3 & 2)? -f3 : f3;
 365  365                  continue;
 366  366  
 367  367  process1:
 368  368                  PROCESS(0);
 369  369                  continue;
 370  370  
 371  371  process2:
 372  372                  PROCESS(0);
 373  373                  PROCESS(1);
 374  374                  continue;
 375  375  
 376  376  process3:
 377  377                  PROCESS(0);
 378  378                  PROCESS(1);
 379  379                  PROCESS(2);
 380  380          }
 381  381  }
  
    | 
      ↓ open down ↓ | 
    216 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX