1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #include <sys/isa_defs.h>
  31 #include "libm_inlines.h"
  32 
  33 #ifdef _LITTLE_ENDIAN
  34 #define HI(x)   *(1+(int*)x)
  35 #define LO(x)   *(unsigned*)x
  36 #else
  37 #define HI(x)   *(int*)x
  38 #define LO(x)   *(1+(unsigned*)x)
  39 #endif
  40 
  41 #ifdef __RESTRICT
  42 #define restrict _Restrict
  43 #else
  44 #define restrict
  45 #endif
  46 
  47 /* double rhypot(double x, double y)
  48  *
  49  * Method :
  50  *      1. Special cases:
  51  *              x or  y = Inf                                   => 0
  52  *              x or  y = NaN                                   => QNaN
  53  *              x and y = 0                                     => Inf + divide-by-zero
  54  *      2. Computes rhypot(x,y):
  55  *              rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
  56  *      Where:
  57  *              m = 1/max(|x|,|y|)
  58  *              xnm = x * m
  59  *              ynm = y * m
  60  *
  61  *      Compute 1/(xnm * xnm + ynm * ynm) by simulating
  62  *      muti-precision arithmetic.
  63  *
  64  * Accuracy:
  65  *      Maximum error observed: less than 0.869 ulp after 1.000.000.000
  66  *      results.
  67  */
  68 
  69 extern double sqrt(double);
  70 extern double fabs(double);
  71 
  72 static const int __vlibm_TBL_rhypot[] = {
  73 /* i = [0,127]
  74  * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
  75  0x7fe00000,  0x7fdfc07f, 0x7fdf81f8,  0x7fdf4465,
  76  0x7fdf07c1,  0x7fdecc07, 0x7fde9131,  0x7fde573a,
  77  0x7fde1e1e,  0x7fdde5d6, 0x7fddae60,  0x7fdd77b6,
  78  0x7fdd41d4,  0x7fdd0cb5, 0x7fdcd856,  0x7fdca4b3,
  79  0x7fdc71c7,  0x7fdc3f8f, 0x7fdc0e07,  0x7fdbdd2b,
  80  0x7fdbacf9,  0x7fdb7d6c, 0x7fdb4e81,  0x7fdb2036,
  81  0x7fdaf286,  0x7fdac570, 0x7fda98ef,  0x7fda6d01,
  82  0x7fda41a4,  0x7fda16d3, 0x7fd9ec8e,  0x7fd9c2d1,
  83  0x7fd99999,  0x7fd970e4, 0x7fd948b0,  0x7fd920fb,
  84  0x7fd8f9c1,  0x7fd8d301, 0x7fd8acb9,  0x7fd886e5,
  85  0x7fd86186,  0x7fd83c97, 0x7fd81818,  0x7fd7f405,
  86  0x7fd7d05f,  0x7fd7ad22, 0x7fd78a4c,  0x7fd767dc,
  87  0x7fd745d1,  0x7fd72428, 0x7fd702e0,  0x7fd6e1f7,
  88  0x7fd6c16c,  0x7fd6a13c, 0x7fd68168,  0x7fd661ec,
  89  0x7fd642c8,  0x7fd623fa, 0x7fd60581,  0x7fd5e75b,
  90  0x7fd5c988,  0x7fd5ac05, 0x7fd58ed2,  0x7fd571ed,
  91  0x7fd55555,  0x7fd53909, 0x7fd51d07,  0x7fd50150,
  92  0x7fd4e5e0,  0x7fd4cab8, 0x7fd4afd6,  0x7fd49539,
  93  0x7fd47ae1,  0x7fd460cb, 0x7fd446f8,  0x7fd42d66,
  94  0x7fd41414,  0x7fd3fb01, 0x7fd3e22c,  0x7fd3c995,
  95  0x7fd3b13b,  0x7fd3991c, 0x7fd38138,  0x7fd3698d,
  96  0x7fd3521c,  0x7fd33ae4, 0x7fd323e3,  0x7fd30d19,
  97  0x7fd2f684,  0x7fd2e025, 0x7fd2c9fb,  0x7fd2b404,
  98  0x7fd29e41,  0x7fd288b0, 0x7fd27350,  0x7fd25e22,
  99  0x7fd24924,  0x7fd23456, 0x7fd21fb7,  0x7fd20b47,
 100  0x7fd1f704,  0x7fd1e2ef, 0x7fd1cf06,  0x7fd1bb4a,
 101  0x7fd1a7b9,  0x7fd19453, 0x7fd18118,  0x7fd16e06,
 102  0x7fd15b1e,  0x7fd1485f, 0x7fd135c8,  0x7fd12358,
 103  0x7fd11111,  0x7fd0fef0, 0x7fd0ecf5,  0x7fd0db20,
 104  0x7fd0c971,  0x7fd0b7e6, 0x7fd0a681,  0x7fd0953f,
 105  0x7fd08421,  0x7fd07326, 0x7fd0624d,  0x7fd05197,
 106  0x7fd04104,  0x7fd03091, 0x7fd02040,  0x7fd01010,
 107 };
 108 
 109 static const unsigned long long LCONST[] = {
 110 0x3ff0000000000000ULL,  /* DONE = 1.0           */
 111 0x4000000000000000ULL,  /* DTWO = 2.0           */
 112 0x4230000000000000ULL,  /* D2ON36 = 2**36       */
 113 0x7fd0000000000000ULL,  /* D2ON1022 = 2**1022   */
 114 0x3cb0000000000000ULL,  /* D2ONM52 = 2**-52     */
 115 };
 116 
 117 #define RET_SC(I)                                                                               \
 118         px += stridex;                                                                          \
 119         py += stridey;                                                                          \
 120         pz += stridez;                                                                          \
 121         if (--n <= 0)                                                                                \
 122                 break;                                                                          \
 123         goto start##I;
 124 
 125 #define RETURN(I, ret)                                                                          \
 126 {                                                                                               \
 127         pz[0] = (ret);                                                                          \
 128         RET_SC(I)                                                                               \
 129 }
 130 
 131 #define PREP(I)                                                                                 \
 132 hx##I = HI(px);                                                                         \
 133 hy##I = HI(py);                                                                         \
 134 hx##I &= 0x7fffffff;                                                                                \
 135 hy##I &= 0x7fffffff;                                                                                \
 136 pz##I = pz;                                                                                     \
 137 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000)   /* |X| or |Y| = Inf,NaN */              \
 138 {                                                                                               \
 139         lx = LO(px);                                                                    \
 140         ly = LO(py);                                                                    \
 141         x = *px;                                                                                \
 142         y = *py;                                                                                \
 143         if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0;         /* |X| = Inf */         \
 144         else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0;    /* |Y| = Inf */         \
 145         else res0 = fabs(x) + fabs(y);                                                          \
 146                                                                                                 \
 147         RETURN (I, res0)                                                                        \
 148 }                                                                                               \
 149 x##I = *px;                                                                                     \
 150 y##I = *py;                                                                                     \
 151 diff0 = hy##I - hx##I;                                                                          \
 152 j0 = diff0 >> 31;                                                                         \
 153 if (hx##I < 0x00100000 && hy##I < 0x00100000)     /* |X| and |Y| = subnormal or zero */           \
 154 {                                                                                               \
 155         lx = LO(px);                                                                    \
 156         ly = LO(py);                                                                    \
 157         x = x##I;                                                                               \
 158         y = y##I;                                                                               \
 159                                                                                                 \
 160         if ((hx##I | hy##I | lx | ly) == 0)     /* |X| and |Y| = 0 */                           \
 161                 RETURN (I, DONE / 0.0)                                                  \
 162                                                                                                 \
 163         x = fabs(x);                                                                            \
 164         y = fabs(y);                                                                            \
 165                                                                                                 \
 166         x = *(long long*)&x;                                                                        \
 167         y = *(long long*)&y;                                                                        \
 168                                                                                                 \
 169         x *= D2ONM52;                                                                           \
 170         y *= D2ONM52;                                                                           \
 171                                                                                                 \
 172         x_hi0 = (x + D2ON36) - D2ON36;                                                  \
 173         y_hi0 = (y + D2ON36) - D2ON36;                                                  \
 174         x_lo0 = x - x_hi0;                                                                      \
 175         y_lo0 = y - y_hi0;                                                                      \
 176         res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);                                              \
 177         res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0);                                  \
 178                                                                                                 \
 179         dres0 = res0_hi + res0_lo;                                                              \
 180                                                                                                 \
 181         iarr0 = HI(&dres0);                                                         \
 182         iexp0 = iarr0 & 0xfff00000;                                                         \
 183                                                                                                 \
 184         iarr0 = (iarr0 >> 11) & 0x1fc;                                                                \
 185         itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];                                 \
 186         itbl0 -= iexp0;                                                                         \
 187         HI(&dd0) = itbl0;                                           \
 188         LO(&dd0) = 0;                                                               \
 189                                                                                                 \
 190         dd0 = dd0 * (DTWO - dd0 * dres0);                                                       \
 191         dd0 = dd0 * (DTWO - dd0 * dres0);                                                       \
 192         dres0 = dd0 * (DTWO - dd0 * dres0);                                                     \
 193                                                                                                 \
 194         HI(&res0) = HI(&dres0) & 0xffffff00;                                        \
 195         LO(&res0) = 0;                                                              \
 196         res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;                               \
 197         res0 = sqrt (res0);                                                                     \
 198                                                                                                 \
 199         res0 = D2ON1022 * res0;                                                                 \
 200         RETURN (I, res0)                                                                        \
 201 }                                                                                               \
 202 j0 = hy##I - (diff0 & j0);                                                                  \
 203 j0 &= 0x7ff00000;                                                                           \
 204 HI(&scl##I) = 0x7ff00000 - j0;
 205 
 206 void
 207 __vrhypot(int n, double * restrict px, int stridex, double * restrict py,
 208         int stridey, double * restrict pz, int stridez)
 209 {
 210         int             i = 0;
 211         double          x, y;
 212         double          x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
 213         double          x0, y0, res0, dd0;
 214         double          res0_hi,res0_lo, dres0;
 215         double          x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
 216         double          x1 = 0.0L, y1 = 0.0L, res1, dd1;
 217         double          res1_hi,res1_lo, dres1;
 218         double          x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
 219         double          x2, y2, res2, dd2;
 220         double          res2_hi,res2_lo, dres2;
 221 
 222         int             hx0, hy0, j0, diff0;
 223         int             iarr0, iexp0, itbl0;
 224         int             hx1, hy1;
 225         int             iarr1, iexp1, itbl1;
 226         int             hx2, hy2;
 227         int             iarr2, iexp2, itbl2;
 228 
 229         int             lx, ly;
 230 
 231         double          DONE = ((double*)LCONST)[0];
 232         double          DTWO = ((double*)LCONST)[1];
 233         double          D2ON36 = ((double*)LCONST)[2];
 234         double          D2ON1022 = ((double*)LCONST)[3];
 235         double          D2ONM52 = ((double*)LCONST)[4];
 236 
 237         double          *pz0, *pz1 = 0, *pz2;
 238 
 239         do
 240         {
 241 start0:
 242                 PREP(0)
 243                 px += stridex;
 244                 py += stridey;
 245                 pz += stridez;
 246                 i = 1;
 247                 if (--n <= 0)
 248                         break;
 249 
 250 start1:
 251                 PREP(1)
 252                 px += stridex;
 253                 py += stridey;
 254                 pz += stridez;
 255                 i = 2;
 256                 if (--n <= 0)
 257                         break;
 258 
 259 start2:
 260                 PREP(2)
 261 
 262                 x0 *= scl0;
 263                 y0 *= scl0;
 264                 x1 *= scl1;
 265                 y1 *= scl1;
 266                 x2 *= scl2;
 267                 y2 *= scl2;
 268 
 269                 x_hi0 = (x0 + D2ON36) - D2ON36;
 270                 y_hi0 = (y0 + D2ON36) - D2ON36;
 271                 x_hi1 = (x1 + D2ON36) - D2ON36;
 272                 y_hi1 = (y1 + D2ON36) - D2ON36;
 273                 x_hi2 = (x2 + D2ON36) - D2ON36;
 274                 y_hi2 = (y2 + D2ON36) - D2ON36;
 275                 x_lo0 = x0 - x_hi0;
 276                 y_lo0 = y0 - y_hi0;
 277                 x_lo1 = x1 - x_hi1;
 278                 y_lo1 = y1 - y_hi1;
 279                 x_lo2 = x2 - x_hi2;
 280                 y_lo2 = y2 - y_hi2;
 281                 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
 282                 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
 283                 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
 284                 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
 285                 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
 286                 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
 287 
 288                 dres0 = res0_hi + res0_lo;
 289                 dres1 = res1_hi + res1_lo;
 290                 dres2 = res2_hi + res2_lo;
 291 
 292                 iarr0 = HI(&dres0);
 293                 iarr1 = HI(&dres1);
 294                 iarr2 = HI(&dres2);
 295                 iexp0 = iarr0 & 0xfff00000;
 296                 iexp1 = iarr1 & 0xfff00000;
 297                 iexp2 = iarr2 & 0xfff00000;
 298 
 299                 iarr0 = (iarr0 >> 11) & 0x1fc;
 300                 iarr1 = (iarr1 >> 11) & 0x1fc;
 301                 iarr2 = (iarr2 >> 11) & 0x1fc;
 302                 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
 303                 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
 304                 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
 305                 itbl0 -= iexp0;
 306                 itbl1 -= iexp1;
 307                 itbl2 -= iexp2;
 308                 HI(&dd0) = itbl0;
 309                 HI(&dd1) = itbl1;
 310                 HI(&dd2) = itbl2;
 311                 LO(&dd0) = 0;
 312                 LO(&dd1) = 0;
 313                 LO(&dd2) = 0;
 314 
 315                 dd0 = dd0 * (DTWO - dd0 * dres0);
 316                 dd1 = dd1 * (DTWO - dd1 * dres1);
 317                 dd2 = dd2 * (DTWO - dd2 * dres2);
 318                 dd0 = dd0 * (DTWO - dd0 * dres0);
 319                 dd1 = dd1 * (DTWO - dd1 * dres1);
 320                 dd2 = dd2 * (DTWO - dd2 * dres2);
 321                 dres0 = dd0 * (DTWO - dd0 * dres0);
 322                 dres1 = dd1 * (DTWO - dd1 * dres1);
 323                 dres2 = dd2 * (DTWO - dd2 * dres2);
 324 
 325                 HI(&res0) = HI(&dres0) & 0xffffff00;
 326                 HI(&res1) = HI(&dres1) & 0xffffff00;
 327                 HI(&res2) = HI(&dres2) & 0xffffff00;
 328                 LO(&res0) = 0;
 329                 LO(&res1) = 0;
 330                 LO(&res2) = 0;
 331                 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
 332                 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
 333                 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
 334                 res0 = sqrt (res0);
 335                 res1 = sqrt (res1);
 336                 res2 = sqrt (res2);
 337 
 338                 res0 = scl0 * res0;
 339                 res1 = scl1 * res1;
 340                 res2 = scl2 * res2;
 341 
 342                 *pz0 = res0;
 343                 *pz1 = res1;
 344                 *pz2 = res2;
 345 
 346                 px += stridex;
 347                 py += stridey;
 348                 pz += stridez;
 349                 i = 0;
 350 
 351         } while (--n > 0);
 352 
 353         if (i > 0)
 354         {
 355                 x0 *= scl0;
 356                 y0 *= scl0;
 357 
 358                 x_hi0 = (x0 + D2ON36) - D2ON36;
 359                 y_hi0 = (y0 + D2ON36) - D2ON36;
 360                 x_lo0 = x0 - x_hi0;
 361                 y_lo0 = y0 - y_hi0;
 362                 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
 363                 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
 364 
 365                 dres0 = res0_hi + res0_lo;
 366 
 367                 iarr0 = HI(&dres0);
 368                 iexp0 = iarr0 & 0xfff00000;
 369 
 370                 iarr0 = (iarr0 >> 11) & 0x1fc;
 371                 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
 372                 itbl0 -= iexp0;
 373                 HI(&dd0) = itbl0;
 374                 LO(&dd0) = 0;
 375 
 376                 dd0 = dd0 * (DTWO - dd0 * dres0);
 377                 dd0 = dd0 * (DTWO - dd0 * dres0);
 378                 dres0 = dd0 * (DTWO - dd0 * dres0);
 379 
 380                 HI(&res0) = HI(&dres0) & 0xffffff00;
 381                 LO(&res0) = 0;
 382                 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
 383                 res0 = sqrt (res0);
 384 
 385                 res0 = scl0 * res0;
 386 
 387                 *pz0 = res0;
 388 
 389                 if (i > 1)
 390                 {
 391                         x1 *= scl1;
 392                         y1 *= scl1;
 393 
 394                         x_hi1 = (x1 + D2ON36) - D2ON36;
 395                         y_hi1 = (y1 + D2ON36) - D2ON36;
 396                         x_lo1 = x1 - x_hi1;
 397                         y_lo1 = y1 - y_hi1;
 398                         res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
 399                         res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
 400 
 401                         dres1 = res1_hi + res1_lo;
 402 
 403                         iarr1 = HI(&dres1);
 404                         iexp1 = iarr1 & 0xfff00000;
 405 
 406                         iarr1 = (iarr1 >> 11) & 0x1fc;
 407                         itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
 408                         itbl1 -= iexp1;
 409                         HI(&dd1) = itbl1;
 410                         LO(&dd1) = 0;
 411 
 412                         dd1 = dd1 * (DTWO - dd1 * dres1);
 413                         dd1 = dd1 * (DTWO - dd1 * dres1);
 414                         dres1 = dd1 * (DTWO - dd1 * dres1);
 415 
 416                         HI(&res1) = HI(&dres1) & 0xffffff00;
 417                         LO(&res1) = 0;
 418                         res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
 419                         res1 = sqrt (res1);
 420 
 421                         res1 = scl1 * res1;
 422 
 423                         *pz1 = res1;
 424                 }
 425         }
 426 }