1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #include <sys/isa_defs.h>
  31 #include "libm_synonyms.h"
  32 #include "libm_inlines.h"
  33 
  34 #ifdef _LITTLE_ENDIAN
  35 #define HI(x)   *(1+(int*)x)
  36 #define LO(x)   *(unsigned*)x
  37 #else
  38 #define HI(x)   *(int*)x
  39 #define LO(x)   *(1+(unsigned*)x)
  40 #endif
  41 
  42 #ifdef __RESTRICT
  43 #define restrict _Restrict
  44 #else
  45 #define restrict
  46 #endif
  47 
  48 /* double rsqrt(double x)
  49  *
  50  * Method :
  51  *      1. Special cases:
  52  *              for x = NaN                             => QNaN;
  53  *              for x = +Inf                            => 0;
  54  *              for x is negative, -Inf                 => QNaN + invalid;
  55  *              for x = +0                              => +Inf + divide-by-zero;
  56  *              for x = -0                              => -Inf + divide-by-zero.
  57  *      2. Computes reciprocal square root from:
  58  *              x = m * 2**n
  59  *      Where:
  60  *              m = [0.5, 2),
  61  *              n = ((exponent + 1) & ~1).
  62  *      Then:
  63  *              rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
  64  *      2. Computes 1/sqrt(m) from:
  65  *              1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
  66  *      Where:
  67  *              m = m0 + dm,
  68  *              m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
  69  *              m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
  70  *              m0 = 2.0              for m = [1.0+127/128, 2.0),         k = 128.
  71  *      Then:
  72  *              1/sqrt(m0) is looked up in a table,
  73  *              1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)).
  74  *              1/sqrt(1 + (1/m0)*dm) is computed using approximation:
  75  *                      1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3)
  76  *                                              * z + a2) * z + a1) * z + a0
  77  *                      where z = [-1/128, 1/128].
  78  *
  79  * Accuracy:
  80  *      The maximum relative error for the approximating
  81  *      polynomial is 2**(-56.26).
  82  *      Maximum error observed: less than 0.563 ulp after 1.500.000.000
  83  *      results.
  84  */
  85 
  86 #define sqrt __sqrt
  87 
  88 extern double sqrt (double);
  89 extern const double __vlibm_TBL_rsqrt[];
  90 
  91 static void
  92 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey);
  93 
  94 #pragma no_inline(__vrsqrt_n)
  95 
  96 #define RETURN(ret)                                             \
  97 {                                                               \
  98         *py = (ret);                                            \
  99         py += stridey;                                          \
 100         if (n_n == 0)                                           \
 101         {                                                       \
 102                 spx = px; spy = py;                             \
 103                 hx = HI(px);                                    \
 104                 continue;                                       \
 105         }                                                       \
 106         n--;                                                    \
 107         break;                                                  \
 108 }
 109 
 110 static const double
 111         DONE = 1.0,
 112         K1 = -5.00000000000005209867e-01,
 113         K2 =  3.75000000000004884257e-01,
 114         K3 = -3.12499999317136886551e-01,
 115         K4 =  2.73437499359815081532e-01,
 116         K5 = -2.46116125605037803130e-01,
 117         K6 =  2.25606914648617522896e-01;
 118 
 119 void
 120 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey)
 121 {
 122         double          *spx, *spy;
 123         int             ax, lx, hx, n_n;
 124         double          res;
 125 
 126         while (n > 1)
 127         {
 128                 n_n = 0;
 129                 spx = px;
 130                 spy = py;
 131                 hx = HI(px);
 132                 for (; n > 1 ; n--)
 133                 {
 134                         px += stridex;
 135                         if (hx >= 0x7ff00000)                /* X = NaN or Inf       */
 136                         {
 137                                 res = *(px - stridex);
 138                                 RETURN (DONE / res)
 139                         }
 140 
 141                         py += stridey;
 142 
 143                         if (hx < 0x00100000)         /* X = denormal, zero or negative       */
 144                         {
 145                                 py -= stridey;
 146                                 ax = hx & 0x7fffffff;
 147                                 lx = LO((px - stridex));
 148                                 res = *(px - stridex);
 149 
 150                                 if ((ax | lx) == 0)     /* |X| = zero   */
 151                                 {
 152                                         RETURN (DONE / res)
 153                                 }
 154                                 else if (hx >= 0)    /* X = denormal */
 155                                 {
 156                                         double          res_c0, dsqrt_exp0;
 157                                         int             ind0, sqrt_exp0;
 158                                         double          xx0, dexp_hi0, dexp_lo0;
 159                                         int             hx0, resh0, res_ch0;
 160 
 161                                         res = *(long long*)&res;
 162 
 163                                         hx0 = HI(&res);
 164                                         sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
 165                                         ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 166 
 167                                         resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 168                                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 169                                         HI(&res) = resh0;
 170                                         HI(&res_c0) = res_ch0;
 171                                         LO(&res_c0) = 0;
 172 
 173                                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 174                                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 175                                         xx0 = dexp_hi0 * dexp_hi0;
 176                                         xx0 = (res - res_c0) * xx0;
 177                                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 178 
 179                                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 180 
 181                                         HI(&dsqrt_exp0) = sqrt_exp0;
 182                                         LO(&dsqrt_exp0) = 0;
 183                                         res *= dsqrt_exp0;
 184 
 185                                         RETURN (res)
 186                                 }
 187                                 else    /* X = negative */
 188                                 {
 189                                         RETURN (sqrt(res))
 190                                 }
 191                         }
 192                         n_n++;
 193                         hx = HI(px);
 194                 }
 195                 if (n_n > 0)
 196                         __vrsqrt_n(n_n, spx, stridex, spy, stridey);
 197         }
 198         if (n > 0)
 199         {
 200                 hx = HI(px);
 201 
 202                 if (hx >= 0x7ff00000)                /* X = NaN or Inf       */
 203                 {
 204                         res = *px;
 205                         *py = DONE / res;
 206                 }
 207                 else if (hx < 0x00100000)    /* X = denormal, zero or negative       */
 208                 {
 209                         ax = hx & 0x7fffffff;
 210                         lx = LO(px);
 211                         res = *px;
 212 
 213                         if ((ax | lx) == 0)     /* |X| = zero   */
 214                         {
 215                                 *py = DONE / res;
 216                         }
 217                         else if (hx >= 0)    /* X = denormal */
 218                         {
 219                                 double          res_c0, dsqrt_exp0;
 220                                 int             ind0, sqrt_exp0;
 221                                 double          xx0, dexp_hi0, dexp_lo0;
 222                                 int             hx0, resh0, res_ch0;
 223 
 224                                 res = *(long long*)&res;
 225 
 226                                 hx0 = HI(&res);
 227                                 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20;
 228                                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 229 
 230                                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 231                                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 232                                 HI(&res) = resh0;
 233                                 HI(&res_c0) = res_ch0;
 234                                 LO(&res_c0) = 0;
 235 
 236                                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 237                                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 238                                 xx0 = dexp_hi0 * dexp_hi0;
 239                                 xx0 = (res - res_c0) * xx0;
 240                                 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 241 
 242                                 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 243 
 244                                 HI(&dsqrt_exp0) = sqrt_exp0;
 245                                 LO(&dsqrt_exp0) = 0;
 246                                 res *= dsqrt_exp0;
 247 
 248                                 *py = res;
 249                         }
 250                         else    /* X = negative */
 251                         {
 252                                 *py = sqrt(res);
 253                         }
 254                 }
 255                 else
 256                 {
 257                         double          res_c0, dsqrt_exp0;
 258                         int             ind0, sqrt_exp0;
 259                         double          xx0, dexp_hi0, dexp_lo0;
 260                         int             resh0, res_ch0;
 261 
 262                         sqrt_exp0 = (0x5fe - (hx >> 21)) << 20;
 263                         ind0 = (((hx >> 10) & 0x7f8) + 8) & -16;
 264 
 265                         resh0 = (hx & 0x001fffff) | 0x3fe00000;
 266                         res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 267                         HI(&res) = resh0;
 268                         LO(&res) = LO(px);
 269                         HI(&res_c0) = res_ch0;
 270                         LO(&res_c0) = 0;
 271 
 272                         dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 273                         dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 274                         xx0 = dexp_hi0 * dexp_hi0;
 275                         xx0 = (res - res_c0) * xx0;
 276                         res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 277 
 278                         res = dexp_hi0 * res + dexp_lo0 + dexp_hi0;
 279 
 280                         HI(&dsqrt_exp0) = sqrt_exp0;
 281                         LO(&dsqrt_exp0) = 0;
 282                         res *= dsqrt_exp0;
 283 
 284                         *py = res;
 285                 }
 286         }
 287 }
 288 
 289 static void
 290 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey)
 291 {
 292         double          res0, res_c0, dsqrt_exp0;
 293         double          res1, res_c1, dsqrt_exp1;
 294         double          res2, res_c2, dsqrt_exp2;
 295         int             ind0, sqrt_exp0;
 296         int             ind1, sqrt_exp1;
 297         int             ind2, sqrt_exp2;
 298         double          xx0, dexp_hi0, dexp_lo0;
 299         double          xx1, dexp_hi1, dexp_lo1;
 300         double          xx2, dexp_hi2, dexp_lo2;
 301         int             hx0, resh0, res_ch0;
 302         int             hx1, resh1, res_ch1;
 303         int             hx2, resh2, res_ch2;
 304 
 305         LO(&dsqrt_exp0) = 0;
 306         LO(&dsqrt_exp1) = 0;
 307         LO(&dsqrt_exp2) = 0;
 308         LO(&res_c0) = 0;
 309         LO(&res_c1) = 0;
 310         LO(&res_c2) = 0;
 311 
 312         for(; n > 2 ; n -= 3)
 313         {
 314                 hx0 = HI(px);
 315                 LO(&res0) = LO(px);
 316                 px += stridex;
 317 
 318                 hx1 = HI(px);
 319                 LO(&res1) = LO(px);
 320                 px += stridex;
 321 
 322                 hx2 = HI(px);
 323                 LO(&res2) = LO(px);
 324                 px += stridex;
 325 
 326                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
 327                 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20;
 328                 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20;
 329                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 330                 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16;
 331                 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16;
 332 
 333                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 334                 resh1 = (hx1 & 0x001fffff) | 0x3fe00000;
 335                 resh2 = (hx2 & 0x001fffff) | 0x3fe00000;
 336                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 337                 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000;
 338                 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000;
 339                 HI(&res0) = resh0;
 340                 HI(&res1) = resh1;
 341                 HI(&res2) = resh2;
 342                 HI(&res_c0) = res_ch0;
 343                 HI(&res_c1) = res_ch1;
 344                 HI(&res_c2) = res_ch2;
 345 
 346                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 347                 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0];
 348                 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0];
 349                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 350                 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1];
 351                 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1];
 352                 xx0 = dexp_hi0 * dexp_hi0;
 353                 xx1 = dexp_hi1 * dexp_hi1;
 354                 xx2 = dexp_hi2 * dexp_hi2;
 355                 xx0 = (res0 - res_c0) * xx0;
 356                 xx1 = (res1 - res_c1) * xx1;
 357                 xx2 = (res2 - res_c2) * xx2;
 358                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 359                 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1;
 360                 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2;
 361 
 362                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
 363                 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1;
 364                 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2;
 365 
 366                 HI(&dsqrt_exp0) = sqrt_exp0;
 367                 HI(&dsqrt_exp1) = sqrt_exp1;
 368                 HI(&dsqrt_exp2) = sqrt_exp2;
 369                 res0 *= dsqrt_exp0;
 370                 res1 *= dsqrt_exp1;
 371                 res2 *= dsqrt_exp2;
 372 
 373                 *py = res0;
 374                 py += stridey;
 375 
 376                 *py = res1;
 377                 py += stridey;
 378 
 379                 *py = res2;
 380                 py += stridey;
 381         }
 382 
 383         for(; n > 0 ; n--)
 384         {
 385                 hx0 = HI(px);
 386 
 387                 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20;
 388                 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16;
 389 
 390                 resh0 = (hx0 & 0x001fffff) | 0x3fe00000;
 391                 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000;
 392                 HI(&res0) = resh0;
 393                 LO(&res0) = LO(px);
 394                 HI(&res_c0) = res_ch0;
 395                 LO(&res_c0) = 0;
 396 
 397                 px += stridex;
 398 
 399                 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0];
 400                 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1];
 401                 xx0 = dexp_hi0 * dexp_hi0;
 402                 xx0 = (res0 - res_c0) * xx0;
 403                 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0;
 404 
 405                 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0;
 406 
 407                 HI(&dsqrt_exp0) = sqrt_exp0;
 408                 LO(&dsqrt_exp0) = 0;
 409                 res0 *= dsqrt_exp0;
 410 
 411                 *py = res0;
 412                 py += stridey;
 413         }
 414 }
 415