webrev_693-libm-all-changes-since-richlowe Wdiff usr/src/lib/libmvec/common/__vsincosf.c

Print this page

Split	Close
Expand all
Collapse all

          --- old/usr/src/lib/libmvec/common/__vsincosf.c
          +++ new/usr/src/lib/libmvec/common/__vsincosf.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23   23   */
  24   24  /*
  25   25   * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  26   26   * Use is subject to license terms.
  27   27   */
  28   28  
  29   29  /*
  30   30   * __vsincosf: single precision vector sincos
  31   31   *
  32   32   * Algorithm:
  33   33   *
  34   34   * For |x| < pi/4, approximate sin(x) by a polynomial x+x*z*(S0+
  35   35   * z*(S1+z*S2)) and cos(x) by a polynomial 1+z*(-1/2+z*(C0+z*(C1+
  36   36   * z*C2))), where z = x*x, all evaluated in double precision.
  37   37   *
  38   38   * Accuracy:
  39   39   *
  40   40   * The largest error is less than 0.6 ulps.
  41   41   */
  42   42  
  43   43  #include <sys/isa_defs.h>
  44   44  
  45   45  #ifdef _LITTLE_ENDIAN
  46   46  #define HI(x)   *(1+(int *)&x)
  47   47  #define LO(x)   *(unsigned *)&x
  48   48  #else
  49   49  #define HI(x)   *(int *)&x
  50   50  #define LO(x)   *(1+(unsigned *)&x)
  51   51  #endif
  52   52  
  53   53  #ifdef __RESTRICT
  54   54  #define restrict _Restrict
  55   55  #else
  56   56  #define restrict
  57   57  #endif
  58   58  
  59   59  extern int __vlibm_rem_pio2m(double *, double *, int, int, int);
  60   60  
  61   61  static const double C[] = {
  62   62          -1.66666552424430847168e-01,    /* 2^ -3 * -1.5555460000000 */
  63   63          8.33219196647405624390e-03,     /* 2^ -7 *  1.11077E0000000 */
  64   64          -1.95187909412197768688e-04,    /* 2^-13 * -1.9956B60000000 */
  65   65          1.0,
  66   66          -0.5,
  67   67          4.16666455566883087158e-02,     /* 2^ -5 *  1.55554A0000000 */
  68   68          -1.38873036485165357590e-03,    /* 2^-10 * -1.6C0C1E0000000 */
  69   69          2.44309903791872784495e-05,     /* 2^-16 *  1.99E24E0000000 */
  70   70          0.636619772367581343075535,     /* 2^ -1  * 1.45F306DC9C883 */
  71   71          6755399441055744.0,             /* 2^ 52  * 1.8000000000000 */
  72   72          1.570796326734125614166,        /* 2^  0  * 1.921FB54400000 */
  73   73          6.077100506506192601475e-11,    /* 2^-34  * 1.0B4611A626331 */
  74   74  };
  75   75  
  76   76  #define S0      C[0]
  77   77  #define S1      C[1]
  78   78  #define S2      C[2]
  79   79  #define one     C[3]
  80   80  #define mhalf   C[4]
  81   81  #define C0      C[5]
  82   82  #define C1      C[6]
  83   83  #define C2      C[7]
  84   84  #define invpio2 C[8]
  85   85  #define c3two51 C[9]
  86   86  #define pio2_1  C[10]
  87   87  #define pio2_t  C[11]
  88   88  
  89   89  #define PREPROCESS(N, sindex, cindex, label)                            \
  90   90          hx = *(int *)x;                                                 \
  91   91          ix = hx & 0x7fffffff;                                           \
  92   92          t = *x;                                                         \
  93   93          x += stridex;                                                   \
  94   94          if (ix <= 0x3f490fdb) { /* |x| < pi/4 */                        \
  95   95                  if (ix == 0) {                                          \
  96   96                          s[sindex] = t;                                  \
  97   97                          c[cindex] = one;                                \
  98   98                          goto label;                                     \
  99   99                  }                                                       \
 100  100                  y##N = (double)t;                                       \
 101  101                  n##N = 0;                                               \
 102  102          } else if (ix <= 0x49c90fdb) { /* |x| < 2^19*pi */              \
 103  103                  y##N = (double)t;                                       \
 104  104                  medium = 1;                                             \
 105  105          } else {                                                        \
 106  106                  if (ix >= 0x7f800000) { /* inf or nan */                \
 107  107                          s[sindex] = c[cindex] = t / t;                  \
 108  108                          goto label;                                     \
 109  109                  }                                                       \
 110  110                  z##N = y##N = (double)t;                                \
 111  111                  hx = HI(y##N);                                          \
 112  112                  n##N = ((hx >> 20) & 0x7ff) - 1046;                     \
 113  113                  HI(z##N) = (hx & 0xfffff) | 0x41600000;                 \
 114  114                  n##N = __vlibm_rem_pio2m(&z##N, &y##N, n##N, 1, 0);     \
 115  115                  if (hx < 0) {                                           \
 116  116                          y##N = -y##N;                                   \
 117  117                          n##N = -n##N;                                   \
 118  118                  }                                                       \
 119  119                  z##N = y##N * y##N;                                     \
 120  120                  f##N = (float)(y##N + y##N * z##N * (S0 + z##N *        \
 121  121                      (S1 + z##N * S2)));                                 \
 122  122                  g##N = (float)(one + z##N * (mhalf + z##N * (C0 +       \
 123  123                      z##N * (C1 + z##N * C2))));                         \
 124  124                  if (n##N & 2) {                                         \
 125  125                          f##N = -f##N;                                   \
 126  126                          g##N = -g##N;                                   \
 127  127                  }                                                       \
 128  128                  if (n##N & 1) {                                         \
 129  129                          s[sindex] = g##N;                               \
 130  130                          c[cindex] = -f##N;                              \
 131  131                  } else {                                                \
 132  132                          s[sindex] = f##N;                               \
 133  133                          c[cindex] = g##N;                               \
 134  134                  }                                                       \
 135  135                  goto label;                                             \
 136  136          }
 137  137  
 138  138  #define PROCESS(N)                                                      \
 139  139          if (medium) {                                                   \
 140  140                  z##N = y##N * invpio2 + c3two51;                        \
 141  141                  n##N = LO(z##N);                                        \
 142  142                  z##N -= c3two51;                                        \
 143  143                  y##N = (y##N - z##N * pio2_1) - z##N * pio2_t;          \
 144  144          }                                                               \
 145  145          z##N = y##N * y##N;                                             \
 146  146          f##N = (float)(y##N + y##N * z##N * (S0 + z##N * (S1 + z##N * S2)));\
 147  147          g##N = (float)(one + z##N * (mhalf + z##N * (C0 + z##N *        \
 148  148              (C1 + z##N * C2))));                                        \
 149  149          if (n##N & 2) {                                                 \
 150  150                  f##N = -f##N;                                           \
 151  151                  g##N = -g##N;                                           \
 152  152          }                                                               \
 153  153          if (n##N & 1) {                                                 \
 154  154                  *s = g##N;                                              \
 155  155                  *c = -f##N;                                             \
 156  156          } else {                                                        \
 157  157                  *s = f##N;                                              \
 158  158                  *c = g##N;                                              \
 159  159          }                                                               \
 160  160          s += strides;                                                   \

↓ open down ↓

160 lines elided

↑ open up ↑

 161  161          c += stridec
 162  162  
 163  163  void
 164  164  __vsincosf(int n, float *restrict x, int stridex,
 165  165      float *restrict s, int strides, float *restrict c, int stridec)
 166  166  {
 167  167          double          y0, y1, y2, y3;
 168  168          double          z0, z1, z2, z3;
 169  169          float           f0, f1, f2, f3, t;
 170  170          float           g0, g1, g2, g3;
 171      -        int             n0, n1, n2, n3, hx, ix, medium;
      171 +        int             n0 = 0, n1 = 0, n2 = 0, n3, hx, ix, medium;
 172  172  
 173  173          s -= strides;
 174  174          c -= stridec;
 175  175  
 176  176          for (;;) {
 177  177  begin:
 178  178                  s += strides;
 179  179                  c += stridec;
 180  180  
 181  181                  if (--n < 0)

 182  182                          break;
 183  183  
 184  184                  medium = 0;
 185  185                  PREPROCESS(0, 0, 0, begin);
 186  186  
 187  187                  if (--n < 0)
 188  188                          goto process1;
 189  189  
 190  190                  PREPROCESS(1, strides, stridec, process1);
 191  191  
 192  192                  if (--n < 0)
 193  193                          goto process2;
 194  194  
 195  195                  PREPROCESS(2, (strides << 1), (stridec << 1), process2);
 196  196  
 197  197                  if (--n < 0)
 198  198                          goto process3;
 199  199  
 200  200                  PREPROCESS(3, (strides << 1) + strides,
 201  201                      (stridec << 1) + stridec, process3);
 202  202  
 203  203                  if (medium) {
 204  204                          z0 = y0 * invpio2 + c3two51;
 205  205                          z1 = y1 * invpio2 + c3two51;
 206  206                          z2 = y2 * invpio2 + c3two51;
 207  207                          z3 = y3 * invpio2 + c3two51;
 208  208  
 209  209                          n0 = LO(z0);
 210  210                          n1 = LO(z1);
 211  211                          n2 = LO(z2);
 212  212                          n3 = LO(z3);
 213  213  
 214  214                          z0 -= c3two51;
 215  215                          z1 -= c3two51;
 216  216                          z2 -= c3two51;
 217  217                          z3 -= c3two51;
 218  218  
 219  219                          y0 = (y0 - z0 * pio2_1) - z0 * pio2_t;
 220  220                          y1 = (y1 - z1 * pio2_1) - z1 * pio2_t;
 221  221                          y2 = (y2 - z2 * pio2_1) - z2 * pio2_t;
 222  222                          y3 = (y3 - z3 * pio2_1) - z3 * pio2_t;
 223  223                  }
 224  224  
 225  225                  z0 = y0 * y0;
 226  226                  z1 = y1 * y1;
 227  227                  z2 = y2 * y2;
 228  228                  z3 = y3 * y3;
 229  229  
 230  230                  f0 = (float)(y0 + y0 * z0 * (S0 + z0 * (S1 + z0 * S2)));
 231  231                  f1 = (float)(y1 + y1 * z1 * (S0 + z1 * (S1 + z1 * S2)));
 232  232                  f2 = (float)(y2 + y2 * z2 * (S0 + z2 * (S1 + z2 * S2)));
 233  233                  f3 = (float)(y3 + y3 * z3 * (S0 + z3 * (S1 + z3 * S2)));
 234  234  
 235  235                  g0 = (float)(one + z0 * (mhalf + z0 * (C0 + z0 *
 236  236                      (C1 + z0 * C2))));
 237  237                  g1 = (float)(one + z1 * (mhalf + z1 * (C0 + z1 *
 238  238                      (C1 + z1 * C2))));
 239  239                  g2 = (float)(one + z2 * (mhalf + z2 * (C0 + z2 *
 240  240                      (C1 + z2 * C2))));
 241  241                  g3 = (float)(one + z3 * (mhalf + z3 * (C0 + z3 *
 242  242                      (C1 + z3 * C2))));
 243  243  
 244  244                  if (n0 & 2) {
 245  245                          f0 = -f0;
 246  246                          g0 = -g0;
 247  247                  }
 248  248                  if (n1 & 2) {
 249  249                          f1 = -f1;
 250  250                          g1 = -g1;
 251  251                  }
 252  252                  if (n2 & 2) {
 253  253                          f2 = -f2;
 254  254                          g2 = -g2;
 255  255                  }
 256  256                  if (n3 & 2) {
 257  257                          f3 = -f3;
 258  258                          g3 = -g3;
 259  259                  }
 260  260  
 261  261                  if (n0 & 1) {
 262  262                          *s = g0;
 263  263                          *c = -f0;
 264  264                  } else {
 265  265                          *s = f0;
 266  266                          *c = g0;
 267  267                  }
 268  268                  s += strides;
 269  269                  c += stridec;
 270  270  
 271  271                  if (n1 & 1) {
 272  272                          *s = g1;
 273  273                          *c = -f1;
 274  274                  } else {
 275  275                          *s = f1;
 276  276                          *c = g1;
 277  277                  }
 278  278                  s += strides;
 279  279                  c += stridec;
 280  280  
 281  281                  if (n2 & 1) {
 282  282                          *s = g2;
 283  283                          *c = -f2;
 284  284                  } else {
 285  285                          *s = f2;
 286  286                          *c = g2;
 287  287                  }
 288  288                  s += strides;
 289  289                  c += stridec;
 290  290  
 291  291                  if (n3 & 1) {
 292  292                          *s = g3;
 293  293                          *c = -f3;
 294  294                  } else {
 295  295                          *s = f3;
 296  296                          *c = g3;
 297  297                  }
 298  298                  continue;
 299  299  
 300  300  process1:
 301  301                  PROCESS(0);
 302  302                  continue;
 303  303  
 304  304  process2:
 305  305                  PROCESS(0);
 306  306                  PROCESS(1);
 307  307                  continue;
 308  308  
 309  309  process3:
 310  310                  PROCESS(0);
 311  311                  PROCESS(1);
 312  312                  PROCESS(2);
 313  313          }
 314  314  }

↓ open down ↓

133 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX