1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #include <sys/isa_defs.h>
  31 #include "libm_synonyms.h"
  32 #include "libm_inlines.h"
  33 
  34 #ifdef _LITTLE_ENDIAN
  35 #define HI(x)   *(1+(int*)x)
  36 #define LO(x)   *(unsigned*)x
  37 #else
  38 #define HI(x)   *(int*)x
  39 #define LO(x)   *(1+(unsigned*)x)
  40 #endif
  41 
  42 #ifdef __RESTRICT
  43 #define restrict _Restrict
  44 #else
  45 #define restrict
  46 #endif
  47 
  48 /* double rhypot(double x, double y)
  49  *
  50  * Method :
  51  *      1. Special cases:
  52  *              x or  y = Inf                                   => 0
  53  *              x or  y = NaN                                   => QNaN
  54  *              x and y = 0                                     => Inf + divide-by-zero
  55  *      2. Computes rhypot(x,y):
  56  *              rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm))
  57  *      Where:
  58  *              m = 1/max(|x|,|y|)
  59  *              xnm = x * m
  60  *              ynm = y * m
  61  *
  62  *      Compute 1/(xnm * xnm + ynm * ynm) by simulating
  63  *      muti-precision arithmetic.
  64  *
  65  * Accuracy:
  66  *      Maximum error observed: less than 0.869 ulp after 1.000.000.000
  67  *      results.
  68  */
  69 
  70 #define sqrt __sqrt
  71 
  72 extern double sqrt(double);
  73 
  74 extern double fabs(double);
  75 
  76 static const int __vlibm_TBL_rhypot[] = {
  77 /* i = [0,127]
  78  * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */
  79  0x7fe00000,  0x7fdfc07f, 0x7fdf81f8,  0x7fdf4465,
  80  0x7fdf07c1,  0x7fdecc07, 0x7fde9131,  0x7fde573a,
  81  0x7fde1e1e,  0x7fdde5d6, 0x7fddae60,  0x7fdd77b6,
  82  0x7fdd41d4,  0x7fdd0cb5, 0x7fdcd856,  0x7fdca4b3,
  83  0x7fdc71c7,  0x7fdc3f8f, 0x7fdc0e07,  0x7fdbdd2b,
  84  0x7fdbacf9,  0x7fdb7d6c, 0x7fdb4e81,  0x7fdb2036,
  85  0x7fdaf286,  0x7fdac570, 0x7fda98ef,  0x7fda6d01,
  86  0x7fda41a4,  0x7fda16d3, 0x7fd9ec8e,  0x7fd9c2d1,
  87  0x7fd99999,  0x7fd970e4, 0x7fd948b0,  0x7fd920fb,
  88  0x7fd8f9c1,  0x7fd8d301, 0x7fd8acb9,  0x7fd886e5,
  89  0x7fd86186,  0x7fd83c97, 0x7fd81818,  0x7fd7f405,
  90  0x7fd7d05f,  0x7fd7ad22, 0x7fd78a4c,  0x7fd767dc,
  91  0x7fd745d1,  0x7fd72428, 0x7fd702e0,  0x7fd6e1f7,
  92  0x7fd6c16c,  0x7fd6a13c, 0x7fd68168,  0x7fd661ec,
  93  0x7fd642c8,  0x7fd623fa, 0x7fd60581,  0x7fd5e75b,
  94  0x7fd5c988,  0x7fd5ac05, 0x7fd58ed2,  0x7fd571ed,
  95  0x7fd55555,  0x7fd53909, 0x7fd51d07,  0x7fd50150,
  96  0x7fd4e5e0,  0x7fd4cab8, 0x7fd4afd6,  0x7fd49539,
  97  0x7fd47ae1,  0x7fd460cb, 0x7fd446f8,  0x7fd42d66,
  98  0x7fd41414,  0x7fd3fb01, 0x7fd3e22c,  0x7fd3c995,
  99  0x7fd3b13b,  0x7fd3991c, 0x7fd38138,  0x7fd3698d,
 100  0x7fd3521c,  0x7fd33ae4, 0x7fd323e3,  0x7fd30d19,
 101  0x7fd2f684,  0x7fd2e025, 0x7fd2c9fb,  0x7fd2b404,
 102  0x7fd29e41,  0x7fd288b0, 0x7fd27350,  0x7fd25e22,
 103  0x7fd24924,  0x7fd23456, 0x7fd21fb7,  0x7fd20b47,
 104  0x7fd1f704,  0x7fd1e2ef, 0x7fd1cf06,  0x7fd1bb4a,
 105  0x7fd1a7b9,  0x7fd19453, 0x7fd18118,  0x7fd16e06,
 106  0x7fd15b1e,  0x7fd1485f, 0x7fd135c8,  0x7fd12358,
 107  0x7fd11111,  0x7fd0fef0, 0x7fd0ecf5,  0x7fd0db20,
 108  0x7fd0c971,  0x7fd0b7e6, 0x7fd0a681,  0x7fd0953f,
 109  0x7fd08421,  0x7fd07326, 0x7fd0624d,  0x7fd05197,
 110  0x7fd04104,  0x7fd03091, 0x7fd02040,  0x7fd01010,
 111 };
 112 
 113 static const unsigned long long LCONST[] = {
 114 0x3ff0000000000000ULL,  /* DONE = 1.0           */
 115 0x4000000000000000ULL,  /* DTWO = 2.0           */
 116 0x4230000000000000ULL,  /* D2ON36 = 2**36       */
 117 0x7fd0000000000000ULL,  /* D2ON1022 = 2**1022   */
 118 0x3cb0000000000000ULL,  /* D2ONM52 = 2**-52     */
 119 };
 120 
 121 #define RET_SC(I)                                                                               \
 122         px += stridex;                                                                          \
 123         py += stridey;                                                                          \
 124         pz += stridez;                                                                          \
 125         if (--n <= 0)                                                                                \
 126                 break;                                                                          \
 127         goto start##I;
 128 
 129 #define RETURN(I, ret)                                                                          \
 130 {                                                                                               \
 131         pz[0] = (ret);                                                                          \
 132         RET_SC(I)                                                                               \
 133 }
 134 
 135 #define PREP(I)                                                                                 \
 136 hx##I = HI(px);                                                                         \
 137 hy##I = HI(py);                                                                         \
 138 hx##I &= 0x7fffffff;                                                                                \
 139 hy##I &= 0x7fffffff;                                                                                \
 140 pz##I = pz;                                                                                     \
 141 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000)   /* |X| or |Y| = Inf,NaN */              \
 142 {                                                                                               \
 143         lx = LO(px);                                                                    \
 144         ly = LO(py);                                                                    \
 145         x = *px;                                                                                \
 146         y = *py;                                                                                \
 147         if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0;         /* |X| = Inf */         \
 148         else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0;    /* |Y| = Inf */         \
 149         else res0 = fabs(x) + fabs(y);                                                          \
 150                                                                                                 \
 151         RETURN (I, res0)                                                                        \
 152 }                                                                                               \
 153 x##I = *px;                                                                                     \
 154 y##I = *py;                                                                                     \
 155 diff0 = hy##I - hx##I;                                                                          \
 156 j0 = diff0 >> 31;                                                                         \
 157 if (hx##I < 0x00100000 && hy##I < 0x00100000)     /* |X| and |Y| = subnormal or zero */           \
 158 {                                                                                               \
 159         lx = LO(px);                                                                    \
 160         ly = LO(py);                                                                    \
 161         x = x##I;                                                                               \
 162         y = y##I;                                                                               \
 163                                                                                                 \
 164         if ((hx##I | hy##I | lx | ly) == 0)     /* |X| and |Y| = 0 */                           \
 165                 RETURN (I, DONE / 0.0)                                                  \
 166                                                                                                 \
 167         x = fabs(x);                                                                            \
 168         y = fabs(y);                                                                            \
 169                                                                                                 \
 170         x = *(long long*)&x;                                                                        \
 171         y = *(long long*)&y;                                                                        \
 172                                                                                                 \
 173         x *= D2ONM52;                                                                           \
 174         y *= D2ONM52;                                                                           \
 175                                                                                                 \
 176         x_hi0 = (x + D2ON36) - D2ON36;                                                  \
 177         y_hi0 = (y + D2ON36) - D2ON36;                                                  \
 178         x_lo0 = x - x_hi0;                                                                      \
 179         y_lo0 = y - y_hi0;                                                                      \
 180         res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);                                              \
 181         res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0);                                  \
 182                                                                                                 \
 183         dres0 = res0_hi + res0_lo;                                                              \
 184                                                                                                 \
 185         iarr0 = HI(&dres0);                                                         \
 186         iexp0 = iarr0 & 0xfff00000;                                                         \
 187                                                                                                 \
 188         iarr0 = (iarr0 >> 11) & 0x1fc;                                                                \
 189         itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];                                 \
 190         itbl0 -= iexp0;                                                                         \
 191         HI(&dd0) = itbl0;                                           \
 192         LO(&dd0) = 0;                                                               \
 193                                                                                                 \
 194         dd0 = dd0 * (DTWO - dd0 * dres0);                                                       \
 195         dd0 = dd0 * (DTWO - dd0 * dres0);                                                       \
 196         dres0 = dd0 * (DTWO - dd0 * dres0);                                                     \
 197                                                                                                 \
 198         HI(&res0) = HI(&dres0) & 0xffffff00;                                        \
 199         LO(&res0) = 0;                                                              \
 200         res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;                               \
 201         res0 = sqrt (res0);                                                                     \
 202                                                                                                 \
 203         res0 = D2ON1022 * res0;                                                                 \
 204         RETURN (I, res0)                                                                        \
 205 }                                                                                               \
 206 j0 = hy##I - (diff0 & j0);                                                                  \
 207 j0 &= 0x7ff00000;                                                                           \
 208 HI(&scl##I) = 0x7ff00000 - j0;
 209 
 210 void
 211 __vrhypot(int n, double * restrict px, int stridex, double * restrict py,
 212         int stridey, double * restrict pz, int stridez)
 213 {
 214         int             i = 0;
 215         double          x, y;
 216         double          x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0;
 217         double          x0, y0, res0, dd0;
 218         double          res0_hi,res0_lo, dres0;
 219         double          x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0;
 220         double          x1 = 0.0L, y1 = 0.0L, res1, dd1;
 221         double          res1_hi,res1_lo, dres1;
 222         double          x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0;
 223         double          x2, y2, res2, dd2;
 224         double          res2_hi,res2_lo, dres2;
 225 
 226         int             hx0, hy0, j0, diff0;
 227         int             iarr0, iexp0, itbl0;
 228         int             hx1, hy1;
 229         int             iarr1, iexp1, itbl1;
 230         int             hx2, hy2;
 231         int             iarr2, iexp2, itbl2;
 232 
 233         int             lx, ly;
 234 
 235         double          DONE = ((double*)LCONST)[0];
 236         double          DTWO = ((double*)LCONST)[1];
 237         double          D2ON36 = ((double*)LCONST)[2];
 238         double          D2ON1022 = ((double*)LCONST)[3];
 239         double          D2ONM52 = ((double*)LCONST)[4];
 240 
 241         double          *pz0, *pz1 = 0, *pz2;
 242 
 243         do
 244         {
 245 start0:
 246                 PREP(0)
 247                 px += stridex;
 248                 py += stridey;
 249                 pz += stridez;
 250                 i = 1;
 251                 if (--n <= 0)
 252                         break;
 253 
 254 start1:
 255                 PREP(1)
 256                 px += stridex;
 257                 py += stridey;
 258                 pz += stridez;
 259                 i = 2;
 260                 if (--n <= 0)
 261                         break;
 262 
 263 start2:
 264                 PREP(2)
 265 
 266                 x0 *= scl0;
 267                 y0 *= scl0;
 268                 x1 *= scl1;
 269                 y1 *= scl1;
 270                 x2 *= scl2;
 271                 y2 *= scl2;
 272 
 273                 x_hi0 = (x0 + D2ON36) - D2ON36;
 274                 y_hi0 = (y0 + D2ON36) - D2ON36;
 275                 x_hi1 = (x1 + D2ON36) - D2ON36;
 276                 y_hi1 = (y1 + D2ON36) - D2ON36;
 277                 x_hi2 = (x2 + D2ON36) - D2ON36;
 278                 y_hi2 = (y2 + D2ON36) - D2ON36;
 279                 x_lo0 = x0 - x_hi0;
 280                 y_lo0 = y0 - y_hi0;
 281                 x_lo1 = x1 - x_hi1;
 282                 y_lo1 = y1 - y_hi1;
 283                 x_lo2 = x2 - x_hi2;
 284                 y_lo2 = y2 - y_hi2;
 285                 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
 286                 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
 287                 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2);
 288                 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
 289                 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
 290                 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2);
 291 
 292                 dres0 = res0_hi + res0_lo;
 293                 dres1 = res1_hi + res1_lo;
 294                 dres2 = res2_hi + res2_lo;
 295 
 296                 iarr0 = HI(&dres0);
 297                 iarr1 = HI(&dres1);
 298                 iarr2 = HI(&dres2);
 299                 iexp0 = iarr0 & 0xfff00000;
 300                 iexp1 = iarr1 & 0xfff00000;
 301                 iexp2 = iarr2 & 0xfff00000;
 302 
 303                 iarr0 = (iarr0 >> 11) & 0x1fc;
 304                 iarr1 = (iarr1 >> 11) & 0x1fc;
 305                 iarr2 = (iarr2 >> 11) & 0x1fc;
 306                 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
 307                 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
 308                 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0];
 309                 itbl0 -= iexp0;
 310                 itbl1 -= iexp1;
 311                 itbl2 -= iexp2;
 312                 HI(&dd0) = itbl0;
 313                 HI(&dd1) = itbl1;
 314                 HI(&dd2) = itbl2;
 315                 LO(&dd0) = 0;
 316                 LO(&dd1) = 0;
 317                 LO(&dd2) = 0;
 318 
 319                 dd0 = dd0 * (DTWO - dd0 * dres0);
 320                 dd1 = dd1 * (DTWO - dd1 * dres1);
 321                 dd2 = dd2 * (DTWO - dd2 * dres2);
 322                 dd0 = dd0 * (DTWO - dd0 * dres0);
 323                 dd1 = dd1 * (DTWO - dd1 * dres1);
 324                 dd2 = dd2 * (DTWO - dd2 * dres2);
 325                 dres0 = dd0 * (DTWO - dd0 * dres0);
 326                 dres1 = dd1 * (DTWO - dd1 * dres1);
 327                 dres2 = dd2 * (DTWO - dd2 * dres2);
 328 
 329                 HI(&res0) = HI(&dres0) & 0xffffff00;
 330                 HI(&res1) = HI(&dres1) & 0xffffff00;
 331                 HI(&res2) = HI(&dres2) & 0xffffff00;
 332                 LO(&res0) = 0;
 333                 LO(&res1) = 0;
 334                 LO(&res2) = 0;
 335                 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
 336                 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
 337                 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2;
 338                 res0 = sqrt (res0);
 339                 res1 = sqrt (res1);
 340                 res2 = sqrt (res2);
 341 
 342                 res0 = scl0 * res0;
 343                 res1 = scl1 * res1;
 344                 res2 = scl2 * res2;
 345 
 346                 *pz0 = res0;
 347                 *pz1 = res1;
 348                 *pz2 = res2;
 349 
 350                 px += stridex;
 351                 py += stridey;
 352                 pz += stridez;
 353                 i = 0;
 354 
 355         } while (--n > 0);
 356 
 357         if (i > 0)
 358         {
 359                 x0 *= scl0;
 360                 y0 *= scl0;
 361 
 362                 x_hi0 = (x0 + D2ON36) - D2ON36;
 363                 y_hi0 = (y0 + D2ON36) - D2ON36;
 364                 x_lo0 = x0 - x_hi0;
 365                 y_lo0 = y0 - y_hi0;
 366                 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0);
 367                 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0);
 368 
 369                 dres0 = res0_hi + res0_lo;
 370 
 371                 iarr0 = HI(&dres0);
 372                 iexp0 = iarr0 & 0xfff00000;
 373 
 374                 iarr0 = (iarr0 >> 11) & 0x1fc;
 375                 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0];
 376                 itbl0 -= iexp0;
 377                 HI(&dd0) = itbl0;
 378                 LO(&dd0) = 0;
 379 
 380                 dd0 = dd0 * (DTWO - dd0 * dres0);
 381                 dd0 = dd0 * (DTWO - dd0 * dres0);
 382                 dres0 = dd0 * (DTWO - dd0 * dres0);
 383 
 384                 HI(&res0) = HI(&dres0) & 0xffffff00;
 385                 LO(&res0) = 0;
 386                 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0;
 387                 res0 = sqrt (res0);
 388 
 389                 res0 = scl0 * res0;
 390 
 391                 *pz0 = res0;
 392 
 393                 if (i > 1)
 394                 {
 395                         x1 *= scl1;
 396                         y1 *= scl1;
 397 
 398                         x_hi1 = (x1 + D2ON36) - D2ON36;
 399                         y_hi1 = (y1 + D2ON36) - D2ON36;
 400                         x_lo1 = x1 - x_hi1;
 401                         y_lo1 = y1 - y_hi1;
 402                         res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1);
 403                         res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1);
 404 
 405                         dres1 = res1_hi + res1_lo;
 406 
 407                         iarr1 = HI(&dres1);
 408                         iexp1 = iarr1 & 0xfff00000;
 409 
 410                         iarr1 = (iarr1 >> 11) & 0x1fc;
 411                         itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0];
 412                         itbl1 -= iexp1;
 413                         HI(&dd1) = itbl1;
 414                         LO(&dd1) = 0;
 415 
 416                         dd1 = dd1 * (DTWO - dd1 * dres1);
 417                         dd1 = dd1 * (DTWO - dd1 * dres1);
 418                         dres1 = dd1 * (DTWO - dd1 * dres1);
 419 
 420                         HI(&res1) = HI(&dres1) & 0xffffff00;
 421                         LO(&res1) = 0;
 422                         res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1;
 423                         res1 = sqrt (res1);
 424 
 425                         res1 = scl1 * res1;
 426 
 427                         *pz1 = res1;
 428                 }
 429         }
 430 }
 431