illumos-gate New usr/src/lib/libmvec/common/__vrem

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 /*
  31  * Given X, __vlibm_rem_pio2m finds Y and an integer n such that
  32  * Y = X - n*pi/2 and |Y| < pi/2.
  33  *
  34  * On entry, X is represented by x, an array of nx 24-bit integers
  35  * stored in double precision format, and e:
  36  *
  37  *   X = sum (x[i] * 2^(e - 24*i))
  38  *
  39  * nx must be 1, 2, or 3, and e must be >= -24.  For example, a
  40  * suitable representation for the double precision number z can
  41  * be computed as follows:
  42  *
  43  *      e  = ilogb(z)-23
  44  *      z  = scalbn(z,-e)
  45  *      for i = 0,1,2
  46  *              x[i] = floor(z)
  47  *              z    = (z-x[i])*2**24
  48  *
  49  * On exit, Y is approximated by y[0] if prec is 0 and by the un-
  50  * evaluated sum y[0] + y[1] if prec != 0.  The approximation is
  51  * accurate to 53 bits in the former case and to at least 72 bits
  52  * in the latter.
  53  *
  54  * __vlibm_rem_pio2m returns n mod 8.
  55  *
  56  * Notes:
  57  *
  58  * As n is the integer nearest X * 2/pi, we approximate the latter
  59  * product to a precision that is determined dynamically so as to
  60  * ensure that the final value Y is approximated accurately enough.
  61  * We don't bother to compute terms in the product that are multiples
  62  * of 8, so the cost of this multiplication is independent of the
  63  * magnitude of X.  The variable ip determines the offset into the
  64  * array ipio2 of the first term we need to use.  The variable eq0
  65  * is the corresponding exponent of the first partial product.
  66  *
  67  * The partial products are scaled, summed, and split into an array
  68  * of non-overlapping 24-bit terms (not necessarily having the same
  69  * signs).  Each partial product overlaps three elements of the
  70  * resulting array:
  71  *
  72  *      q[i]   xxxxxxxxxxxxxx
  73  *      q[i+1]       xxxxxxxxxxxxxx
  74  *      q[i+2]             xxxxxxxxxxxxxx
  75  *      ...                      ...
  76  *
  77  *
  78  *      r[i]     xxxxxx
  79  *      r[i+1]         xxxxxx
  80  *      r[i+2]               xxxxxx
  81  *      ...                        ...
  82  *
  83  * In order that the last element of the r array have some correct
  84  * bits, we compute an extra term in the q array, but we don't bother
  85  * to split this last term into 24-bit chunks; thus, the final term
  86  * of the r array could have more than 24 bits, but this doesn't
  87  * matter.
  88  *
  89  * After we subtract the nearest integer to the product, we multiply
  90  * the remaining part of r by pi/2 to obtain Y.  Before we compute
  91  * this last product, however, we make sure that the remaining part
  92  * of r has at least five nonzero terms, computing more if need be.
  93  * This ensures that even if the first nonzero term is only a single
  94  * bit and the last term is wrong in several trailing bits, we still
  95  * have enough accuracy to obtain 72 bits of Y.
  96  *
  97  * IMPORTANT: This code assumes that the rounding mode is round-to-
  98  * nearest in several key places.  First, after we compute X * 2/pi,
  99  * we round to the nearest integer by adding and subtracting a power
 100  * of two.  This step must be done in round-to-nearest mode to ensure
 101  * that the remainder is less than 1/2 in absolute value.  (Because
 102  * we only take two adjacent terms of r into account when we perform
 103  * this rounding, in very rare cases the remainder could be just
 104  * barely greater than 1/2, but this shouldn't matter in practice.)
 105  *
 106  * Second, we also split the partial products of X * 2/pi into 24-bit
 107  * pieces by adding and subtracting a power of two.  In this step,
 108  * round-to-nearest mode is important in order to guarantee that
 109  * the index of the first nonzero term in the remainder gives an
 110  * accurate indication of the number of significant terms.  For
 111  * example, suppose eq0 = -1, so that r[1] is a multiple of 1/2 and
 112  * |r[2]| < 1/2.  After we subtract the nearest integer, r[1] could
 113  * be -1/2, and r[2] could be very nearly 1/2, so that r[1] != 0,
 114  * yet the remainder is much smaller than the least significant bit
 115  * corresponding to r[1].  As long as we use round-to-nearest mode,
 116  * this can't happen; instead, the absolute value of each r[j] will
 117  * be less than 1/2 the least significant bit corresponding to r[j-1],
 118  * so that the entire remainder must be at least half as large as
 119  * the first nonzero term (or perhaps just barely smaller than this).
 120  */
 121 
 122 #include <sys/isa_defs.h>
 123 
 124 #ifdef _LITTLE_ENDIAN
 125 #define HIWORD  1
 126 #define LOWORD  0
 127 #else
 128 #define HIWORD  0
 129 #define LOWORD  1
 130 #endif
 131 
 132 /* 396 hex digits of 2/pi, with two leading zeroes to make life easier */
 133 static const double ipio2[] = {
 134         0, 0,
 135         0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
 136         0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
 137         0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
 138         0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
 139         0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
 140         0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
 141         0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
 142         0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
 143         0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
 144         0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
 145         0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
 146 };
 147 
 148 /* pi/2 in 24-bit pieces */
 149 static const double pio2[] = {
 150         1.57079625129699707031e+00,
 151         7.54978941586159635335e-08,
 152         5.39030252995776476554e-15,
 153         3.28200341580791294123e-22,
 154         1.27065575308067607349e-29,
 155 };
 156 
 157 /* miscellaneous constants */
 158 static const double
 159         zero    = 0.0,
 160         two24   = 16777216.0,
 161         round1  = 6755399441055744.0,   /* 3 * 2^51 */
 162         round24 = 113336795588871485128704.0, /* 3 * 2^75 */
 163         twon24  = 5.960464477539062500E-8;
 164 
 165 int
 166 __vlibm_rem_pio2m(double *x, double *y, int e, int nx, int prec)
 167 {
 168         union {
 169                 double  d;
 170                 int     i[2];
 171         } s;
 172         double  z, t, p, q[20], r[21], *pr;
 173         int     nq, ip, n, i, j, k, eq0, eqnqm1;
 174 
 175         /* determine ip and eq0; note that -48 <= eq0 <= 2 */
 176         ip = (e - 3) / 24;
 177         if (ip < 0)
 178                 ip = 0;
 179         eq0 = e - 24 * (ip + 1);
 180 
 181         /* compute q[0,...,5] = x * ipio2 and initialize nq and eqnqm1 */
 182         if (nx == 3) {
 183                 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1] + x[2] * ipio2[ip];
 184                 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2] + x[2] * ipio2[ip+1];
 185                 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3] + x[2] * ipio2[ip+2];
 186                 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4] + x[2] * ipio2[ip+3];
 187                 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5] + x[2] * ipio2[ip+4];
 188                 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6] + x[2] * ipio2[ip+5];
 189         } else if (nx == 2) {
 190                 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1];
 191                 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2];
 192                 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3];
 193                 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4];
 194                 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5];
 195                 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6];
 196         } else {
 197                 q[0] = x[0] * ipio2[ip+2];
 198                 q[1] = x[0] * ipio2[ip+3];
 199                 q[2] = x[0] * ipio2[ip+4];
 200                 q[3] = x[0] * ipio2[ip+5];
 201                 q[4] = x[0] * ipio2[ip+6];
 202                 q[5] = x[0] * ipio2[ip+7];
 203         }
 204         nq = 5;
 205         eqnqm1 = eq0 - 96;
 206 
 207 recompute:
 208         /* propagate carries and incorporate powers of two */
 209         s.i[HIWORD] = (0x3ff + eqnqm1) << 20;
 210         s.i[LOWORD] = 0;
 211         p = s.d;
 212         z = q[nq] * twon24;
 213         for (j = nq-1; j >= 1; j--) {
 214                 z += q[j];
 215                 t = (z + round24) - round24; /* must be rounded to nearest */
 216                 r[j+1] = (z - t) * p;
 217                 z = t * twon24;
 218                 p *= two24;
 219         }
 220         z += q[0];
 221         t = (z + round24) - round24; /* must be rounded to nearest */
 222         r[1] = (z - t) * p;
 223         r[0] = t * p;
 224 
 225         /* form n = [r] mod 8 and leave the fractional part of r */
 226         if (eq0 > 0) {
 227                 /* binary point lies within r[2] */
 228                 z = r[2] + r[3];
 229                 t = (z + round1) - round1; /* must be rounded to nearest */
 230                 r[2] -= t;
 231                 n = (int)(r[1] + t);
 232                 r[0] = r[1] = zero;
 233         } else if (eq0 > -24) {
 234                 /* binary point lies within or just to the right of r[1] */
 235                 z = r[1] + r[2];
 236                 t = (z + round1) - round1; /* must be rounded to nearest */
 237                 r[1] -= t;
 238                 z = r[0] + t;
 239                 /* cut off high part of z so conversion to int doesn't
 240                    overflow */
 241                 t = (z + round24) - round24;
 242                 n = (int)(z - t);
 243                 r[0] = zero;
 244         } else {
 245                 /* binary point lies within or just to the right of r[0] */
 246                 z = r[0] + r[1];
 247                 t = (z + round1) - round1; /* must be rounded to nearest */
 248                 r[0] -= t;
 249                 n = (int)t;
 250         }
 251 
 252         /* count the number of leading zeroes in r */
 253         for (j = 0; j <= nq; j++) {
 254                 if (r[j] != zero)
 255                         break;
 256         }
 257 
 258         /* if fewer than 5 terms remain, add more */
 259         if (nq - j < 4) {
 260                 k = 4 - (nq - j);
 261                 /*
 262                  * compute q[nq+1] to q[nq+k]
 263                  *
 264                  * For some reason, writing out the nx loop explicitly
 265                  * for each of the three possible values (as above) seems
 266                  * to run a little slower, so we'll leave this code as is.
 267                  */
 268                 for (i = nq + 1; i <= nq + k; i++) {
 269                         t = x[0] * ipio2[ip+2+i];
 270                         for (j = 1; j < nx; j++)
 271                                 t += x[j] * ipio2[ip+2+i-j];
 272                         q[i] = t;
 273                         eqnqm1 -= 24;
 274                 }
 275                 nq += k;
 276                 goto recompute;
 277         }
 278 
 279         /* set pr and nq so that pr[0,...,nq] is the part of r remaining */
 280         pr = &r[j];
 281         nq = nq - j;
 282 
 283         /* compute pio2 * pr[0,...,nq]; note that nq >= 4 here */
 284         q[0] = pio2[0] * pr[0];
 285         q[1] = pio2[0] * pr[1] + pio2[1] * pr[0];
 286         q[2] = pio2[0] * pr[2] + pio2[1] * pr[1] + pio2[2] * pr[0];
 287         q[3] = pio2[0] * pr[3] + pio2[1] * pr[2] + pio2[2] * pr[1]
 288             + pio2[3] * pr[0];
 289         for (i = 4; i <= nq; i++) {
 290                 q[i] = pio2[0] * pr[i] + pio2[1] * pr[i-1] + pio2[2] * pr[i-2]
 291                     + pio2[3] * pr[i-3] + pio2[4] * pr[i-4];
 292         }
 293 
 294         /* sum q in increasing order to obtain the first term of y */
 295         t = q[nq];
 296         for (i = nq - 1; i >= 0; i--)
 297                 t += q[i];
 298         y[0] = t;
 299         if (prec) {
 300                 /* subtract and sum again in decreasing order
 301                    to obtain the second term */
 302                 t = q[0] - t;
 303                 for (i = 1; i <= nq; i++)
 304                         t += q[i];
 305                 y[1] = t;
 306         }
 307 
 308         return (n & 7);
 309 }