1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #include <sys/isa_defs.h>
  31 #include "libm_synonyms.h"
  32 #include "libm_inlines.h"
  33 
  34 #ifdef _LITTLE_ENDIAN
  35 #define HI(x)   *(1+(int*)x)
  36 #define LO(x)   *(unsigned*)x
  37 #else
  38 #define HI(x)   *(int*)x
  39 #define LO(x)   *(1+(unsigned*)x)
  40 #endif
  41 
  42 #ifdef __RESTRICT
  43 #define restrict _Restrict
  44 #else
  45 #define restrict
  46 #endif
  47 
  48 /* float rhypotf(float x, float y)
  49  *
  50  * Method :
  51  *      1. Special cases:
  52  *              for x or y = Inf                        => 0;
  53  *              for x or y = NaN                        => QNaN;
  54  *              for x and y = 0                         => +Inf + divide-by-zero;
  55  *      2. Computes d = x * x + y * y;
  56  *      3. Computes reciprocal square root from:
  57  *              d = m * 2**n
  58  *      Where:
  59  *              m = [0.5, 2),
  60  *              n = ((exponent + 1) & ~1).
  61  *      Then:
  62  *              rsqrtf(d) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
  63  *      4. Computes 1/sqrt(m) from:
  64  *              1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
  65  *      Where:
  66  *              m = m0 + dm,
  67  *              m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
  68  *              m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
  69  *      Then:
  70  *              1/sqrt(m0), 1/m0 are looked up in a table,
  71  *              1/sqrt(1 + (1/m0)*dm) is computed using approximation:
  72  *                      1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0
  73  *                      where z = [-1/64, 1/64].
  74  *
  75  * Accuracy:
  76  *      The maximum relative error for the approximating
  77  *      polynomial is 2**(-27.87).
  78  *      Maximum error observed: less than 0.535 ulp after 3.000.000.000
  79  *      results.
  80  */
  81 
  82 #pragma align 32 (__vlibm_TBL_rhypotf)
  83 
  84 static const double __vlibm_TBL_rhypotf[] = {
  85 /*
  86  i = [0,63]
  87  TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46)));
  88  TBL[2*i+1] = (double)(0.5/sqrtl(2) / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46))));
  89  TBL[128+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46)));
  90  TBL[128+2*i+1] = (double)(0.25 / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46))));
  91 */
  92  1.0000000000000000000e+00, 3.5355339059327378637e-01,
  93  9.8461538461538467004e-01, 3.5082320772281166965e-01,
  94  9.6969696969696972388e-01, 3.4815531191139570399e-01,
  95  9.5522388059701490715e-01, 3.4554737023254405992e-01,
  96  9.4117647058823528106e-01, 3.4299717028501769400e-01,
  97  9.2753623188405798228e-01, 3.4050261230349943009e-01,
  98  9.1428571428571425717e-01, 3.3806170189140660742e-01,
  99  9.0140845070422537244e-01, 3.3567254331867563133e-01,
 100  8.8888888888888883955e-01, 3.3333333333333331483e-01,
 101  8.7671232876712323900e-01, 3.3104235544094717802e-01,
 102  8.6486486486486491287e-01, 3.2879797461071458287e-01,
 103  8.5333333333333338810e-01, 3.2659863237109043599e-01,
 104  8.4210526315789469010e-01, 3.2444284226152508843e-01,
 105  8.3116883116883122362e-01, 3.2232918561015211356e-01,
 106  8.2051282051282048435e-01, 3.2025630761017426229e-01,
 107  8.1012658227848100001e-01, 3.1822291367029204023e-01,
 108  8.0000000000000004441e-01, 3.1622776601683794118e-01,
 109  7.9012345679012341293e-01, 3.1426968052735443360e-01,
 110  7.8048780487804880757e-01, 3.1234752377721214378e-01,
 111  7.7108433734939763049e-01, 3.1046021028253312224e-01,
 112  7.6190476190476186247e-01, 3.0860669992418382490e-01,
 113  7.5294117647058822484e-01, 3.0678599553894819740e-01,
 114  7.4418604651162789665e-01, 3.0499714066520933198e-01,
 115  7.3563218390804596680e-01, 3.0323921743156134756e-01,
 116  7.2727272727272729291e-01, 3.0151134457776362918e-01,
 117  7.1910112359550559802e-01, 2.9981267559834456904e-01,
 118  7.1111111111111113825e-01, 2.9814239699997197031e-01,
 119  7.0329670329670335160e-01, 2.9649972666444046610e-01,
 120  6.9565217391304345895e-01, 2.9488391230979427160e-01,
 121  6.8817204301075274309e-01, 2.9329423004270660513e-01,
 122  6.8085106382978721751e-01, 2.9172998299578911663e-01,
 123  6.7368421052631577428e-01, 2.9019050004400465115e-01,
 124  6.6666666666666662966e-01, 2.8867513459481286553e-01,
 125  6.5979381443298967813e-01, 2.8718326344709527165e-01,
 126  6.5306122448979586625e-01, 2.8571428571428569843e-01,
 127  6.4646464646464651960e-01, 2.8426762180748055275e-01,
 128  6.4000000000000001332e-01, 2.8284271247461900689e-01,
 129  6.3366336633663367106e-01, 2.8143901789211672737e-01,
 130  6.2745098039215685404e-01, 2.8005601680560193723e-01,
 131  6.2135922330097081989e-01, 2.7869320571664707442e-01,
 132  6.1538461538461541878e-01, 2.7735009811261457369e-01,
 133  6.0952380952380957879e-01, 2.7602622373694168934e-01,
 134  6.0377358490566035432e-01, 2.7472112789737807015e-01,
 135  5.9813084112149528249e-01, 2.7343437080986532361e-01,
 136  5.9259259259259255970e-01, 2.7216552697590867815e-01,
 137  5.8715596330275232617e-01, 2.7091418459143856712e-01,
 138  5.8181818181818178992e-01, 2.6967994498529684888e-01,
 139  5.7657657657657657158e-01, 2.6846242208560971987e-01,
 140  5.7142857142857139685e-01, 2.6726124191242439654e-01,
 141  5.6637168141592919568e-01, 2.6607604209509572168e-01,
 142  5.6140350877192979340e-01, 2.6490647141300877054e-01,
 143  5.5652173913043478937e-01, 2.6375218935831479250e-01,
 144  5.5172413793103447510e-01, 2.6261286571944508772e-01,
 145  5.4700854700854706358e-01, 2.6148818018424535570e-01,
 146  5.4237288135593220151e-01, 2.6037782196164771520e-01,
 147  5.3781512605042014474e-01, 2.5928148942086576278e-01,
 148  5.3333333333333332593e-01, 2.5819888974716115326e-01,
 149  5.2892561983471075848e-01, 2.5712973861329002645e-01,
 150  5.2459016393442625681e-01, 2.5607375986579195004e-01,
 151  5.2032520325203257539e-01, 2.5503068522533534068e-01,
 152  5.1612903225806450180e-01, 2.5400025400038100942e-01,
 153  5.1200000000000001066e-01, 2.5298221281347033074e-01,
 154  5.0793650793650790831e-01, 2.5197631533948483540e-01,
 155  5.0393700787401574104e-01, 2.5098232205526344041e-01,
 156  1.0000000000000000000e+00, 2.5000000000000000000e-01,
 157  9.8461538461538467004e-01, 2.4806946917841690703e-01,
 158  9.6969696969696972388e-01, 2.4618298195866547551e-01,
 159  9.5522388059701490715e-01, 2.4433888871261044695e-01,
 160  9.4117647058823528106e-01, 2.4253562503633296910e-01,
 161  9.2753623188405798228e-01, 2.4077170617153839660e-01,
 162  9.1428571428571425717e-01, 2.3904572186687872426e-01,
 163  9.0140845070422537244e-01, 2.3735633163877067897e-01,
 164  8.8888888888888883955e-01, 2.3570226039551583908e-01,
 165  8.7671232876712323900e-01, 2.3408229439226113655e-01,
 166  8.6486486486486491287e-01, 2.3249527748763856860e-01,
 167  8.5333333333333338810e-01, 2.3094010767585029797e-01,
 168  8.4210526315789469010e-01, 2.2941573387056177213e-01,
 169  8.3116883116883122362e-01, 2.2792115291927589338e-01,
 170  8.2051282051282048435e-01, 2.2645540682891915352e-01,
 171  8.1012658227848100001e-01, 2.2501758018520479077e-01,
 172  8.0000000000000004441e-01, 2.2360679774997896385e-01,
 173  7.9012345679012341293e-01, 2.2222222222222220989e-01,
 174  7.8048780487804880757e-01, 2.2086305214969309541e-01,
 175  7.7108433734939763049e-01, 2.1952851997938069295e-01,
 176  7.6190476190476186247e-01, 2.1821789023599238999e-01,
 177  7.5294117647058822484e-01, 2.1693045781865616384e-01,
 178  7.4418604651162789665e-01, 2.1566554640687682354e-01,
 179  7.3563218390804596680e-01, 2.1442250696755896233e-01,
 180  7.2727272727272729291e-01, 2.1320071635561044232e-01,
 181  7.1910112359550559802e-01, 2.1199957600127200541e-01,
 182  7.1111111111111113825e-01, 2.1081851067789195153e-01,
 183  7.0329670329670335160e-01, 2.0965696734438366011e-01,
 184  6.9565217391304345895e-01, 2.0851441405707477061e-01,
 185  6.8817204301075274309e-01, 2.0739033894608505104e-01,
 186  6.8085106382978721751e-01, 2.0628424925175867233e-01,
 187  6.7368421052631577428e-01, 2.0519567041703082322e-01,
 188  6.6666666666666662966e-01, 2.0412414523193150862e-01,
 189  6.5979381443298967813e-01, 2.0306923302672380549e-01,
 190  6.5306122448979586625e-01, 2.0203050891044216364e-01,
 191  6.4646464646464651960e-01, 2.0100756305184241945e-01,
 192  6.4000000000000001332e-01, 2.0000000000000001110e-01,
 193  6.3366336633663367106e-01, 1.9900743804199783060e-01,
 194  6.2745098039215685404e-01, 1.9802950859533485772e-01,
 195  6.2135922330097081989e-01, 1.9706585563285863860e-01,
 196  6.1538461538461541878e-01, 1.9611613513818404453e-01,
 197  6.0952380952380957879e-01, 1.9518001458970662965e-01,
 198  6.0377358490566035432e-01, 1.9425717247145282696e-01,
 199  5.9813084112149528249e-01, 1.9334729780913270658e-01,
 200  5.9259259259259255970e-01, 1.9245008972987526219e-01,
 201  5.8715596330275232617e-01, 1.9156525704423027490e-01,
 202  5.8181818181818178992e-01, 1.9069251784911847580e-01,
 203  5.7657657657657657158e-01, 1.8983159915049979682e-01,
 204  5.7142857142857139685e-01, 1.8898223650461362655e-01,
 205  5.6637168141592919568e-01, 1.8814417367671945613e-01,
 206  5.6140350877192979340e-01, 1.8731716231633879777e-01,
 207  5.5652173913043478937e-01, 1.8650096164806276300e-01,
 208  5.5172413793103447510e-01, 1.8569533817705186074e-01,
 209  5.4700854700854706358e-01, 1.8490006540840969729e-01,
 210  5.4237288135593220151e-01, 1.8411492357966466327e-01,
 211  5.3781512605042014474e-01, 1.8333969940564226464e-01,
 212  5.3333333333333332593e-01, 1.8257418583505535814e-01,
 213  5.2892561983471075848e-01, 1.8181818181818182323e-01,
 214  5.2459016393442625681e-01, 1.8107149208503706128e-01,
 215  5.2032520325203257539e-01, 1.8033392693348646030e-01,
 216  5.1612903225806450180e-01, 1.7960530202677491007e-01,
 217  5.1200000000000001066e-01, 1.7888543819998317663e-01,
 218  5.0793650793650790831e-01, 1.7817416127494958844e-01,
 219  5.0393700787401574104e-01, 1.7747130188322274291e-01,
 220 };
 221 
 222 #define fabsf   __fabsf
 223 
 224 extern float fabsf( float );
 225 
 226 static const double
 227         A0 = 9.99999997962321453275e-01,
 228         A1 =-4.99999998166077580600e-01,
 229         A2 = 3.75066768969515586277e-01,
 230         A3 =-3.12560092408808548438e-01;
 231 
 232 static void
 233 __vrhypotf_n( int n, float * restrict px, int stridex, float * restrict py,
 234         int stridey, float * restrict pz, int stridez );
 235 
 236 #pragma no_inline(__vrhypotf_n)
 237 
 238 #define RETURN(ret)                                             \
 239 {                                                               \
 240         *pz = (ret);                                            \
 241         pz += stridez;                                          \
 242         if ( n_n == 0 )                                         \
 243         {                                                       \
 244                 spx = px; spy = py; spz = pz;                   \
 245                 ay0 = *(int*)py;                                \
 246                 continue;                                       \
 247         }                                                       \
 248         n--;                                                    \
 249         break;                                                  \
 250 }
 251 
 252 
 253 void
 254 __vrhypotf( int n, float * restrict px, int stridex, float * restrict py,
 255         int stridey, float * restrict pz, int stridez )
 256 {
 257         float           *spx, *spy, *spz;
 258         int             ax0, ay0, n_n;
 259         float           res, x0, y0;
 260 
 261         while ( n > 1 )
 262         {
 263                 n_n = 0;
 264                 spx = px;
 265                 spy = py;
 266                 spz = pz;
 267                 ax0 = *(int*)px;
 268                 ay0 = *(int*)py;
 269                 for ( ; n > 1 ; n-- )
 270                 {
 271                         ax0 &= 0x7fffffff;
 272                         ay0 &= 0x7fffffff;
 273 
 274                         px += stridex;
 275 
 276                         if ( ax0 >= 0x7f800000 || ay0 >= 0x7f800000 )     /* X or Y = NaN or Inf  */
 277                         {
 278                                 x0 = *(px - stridex);
 279                                 y0 = *py;
 280                                 res = fabsf(x0) + fabsf(y0);
 281                                 if( ax0 == 0x7f800000 ) res = 0.0f;
 282                                 else if( ay0 == 0x7f800000 ) res = 0.0f;
 283                                 ax0 = *(int*)px;
 284                                 py += stridey;
 285                                 RETURN ( res )
 286                         }
 287                         ax0 = *(int*)px;
 288                         py += stridey;
 289                         if ( ay0 == 0 )         /* Y = 0        */
 290                         {
 291                                 int tx = *(int*)(px - stridex) & 0x7fffffff;
 292                                 if ( tx == 0 )  /* X = 0        */
 293                                 {
 294                                         RETURN ( 1.0f / 0.0f )
 295                                 }
 296                         }
 297                         pz += stridez;
 298                         n_n++;
 299                         ay0 = *(int*)py;
 300                 }
 301                 if ( n_n > 0 )
 302                         __vrhypotf_n( n_n, spx, stridex, spy, stridey, spz, stridez );
 303         }
 304         if ( n > 0 )
 305         {
 306                 ax0 = *(int*)px;
 307                 ay0 = *(int*)py;
 308                 x0 = *px;
 309                 y0 = *py;
 310 
 311                 ax0 &= 0x7fffffff;
 312                 ay0 &= 0x7fffffff;
 313 
 314                 if ( ax0 >= 0x7f800000 || ay0 >= 0x7f800000 )     /* X or Y = NaN or Inf  */
 315                 {
 316                         res = fabsf(x0) + fabsf(y0);
 317                         if( ax0 == 0x7f800000 ) res = 0.0f;
 318                         else if( ay0 == 0x7f800000 ) res = 0.0f;
 319                         *pz = res;
 320                 }
 321                 else if ( ax0 == 0 && ay0 == 0 )        /* X and Y = 0  */
 322                 {
 323                         *pz = 1.0f / 0.0f;
 324                 }
 325                 else
 326                 {
 327                         double          xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0;
 328                         int             ibase0, si0, hyp0h;
 329 
 330                         hyp0 = x0 * (double)x0 + y0 * (double)y0;
 331 
 332                         ibase0 = HI(&hyp0);
 333 
 334                         HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1));
 335 
 336                         hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000;
 337                         HI(&hyp0) = hyp0h;
 338                         HI(&h_hi0) = hyp0h & 0x7fffc000;
 339 
 340                         ibase0 >>= 10;
 341                         si0 = ibase0 & 0x7f0;
 342                         xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0];
 343 
 344                         xx0 = (hyp0 - h_hi0) * xx0;
 345                         res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1];
 346                         res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 347                         res0 *= dbase0;
 348                         *pz = res0;
 349                 }
 350         }
 351 }
 352 
 353 static void
 354 __vrhypotf_n( int n, float * restrict px, int stridex, float * restrict py,
 355         int stridey, float * restrict pz, int stridez )
 356 {
 357         double          xx0, res0, hyp0, h_hi0 = 0, dbase0 = 0;
 358         double          xx1, res1, hyp1, h_hi1 = 0, dbase1 = 0;
 359         double          xx2, res2, hyp2, h_hi2 = 0, dbase2 = 0;
 360         float           x0, y0;
 361         float           x1, y1;
 362         float           x2, y2;
 363         int             ibase0, si0, hyp0h;
 364         int             ibase1, si1, hyp1h;
 365         int             ibase2, si2, hyp2h;
 366 
 367         for ( ; n > 2 ; n -= 3 )
 368         {
 369                 x0 = *px;
 370                 px += stridex;
 371                 x1 = *px;
 372                 px += stridex;
 373                 x2 = *px;
 374                 px += stridex;
 375 
 376                 y0 = *py;
 377                 py += stridey;
 378                 y1 = *py;
 379                 py += stridey;
 380                 y2 = *py;
 381                 py += stridey;
 382 
 383                 hyp0 = x0 * (double)x0 + y0 * (double)y0;
 384                 hyp1 = x1 * (double)x1 + y1 * (double)y1;
 385                 hyp2 = x2 * (double)x2 + y2 * (double)y2;
 386 
 387                 ibase0 = HI(&hyp0);
 388                 ibase1 = HI(&hyp1);
 389                 ibase2 = HI(&hyp2);
 390 
 391                 HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1));
 392                 HI(&dbase1) = (0x60000000 - ((ibase1 & 0x7fe00000) >> 1));
 393                 HI(&dbase2) = (0x60000000 - ((ibase2 & 0x7fe00000) >> 1));
 394 
 395                 hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000;
 396                 hyp1h = (ibase1 & 0x000fffff) | 0x3ff00000;
 397                 hyp2h = (ibase2 & 0x000fffff) | 0x3ff00000;
 398                 HI(&hyp0) = hyp0h;
 399                 HI(&hyp1) = hyp1h;
 400                 HI(&hyp2) = hyp2h;
 401                 HI(&h_hi0) = hyp0h & 0x7fffc000;
 402                 HI(&h_hi1) = hyp1h & 0x7fffc000;
 403                 HI(&h_hi2) = hyp2h & 0x7fffc000;
 404 
 405                 ibase0 >>= 10;
 406                 ibase1 >>= 10;
 407                 ibase2 >>= 10;
 408                 si0 = ibase0 & 0x7f0;
 409                 si1 = ibase1 & 0x7f0;
 410                 si2 = ibase2 & 0x7f0;
 411                 xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0];
 412                 xx1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[0];
 413                 xx2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[0];
 414 
 415                 xx0 = (hyp0 - h_hi0) * xx0;
 416                 xx1 = (hyp1 - h_hi1) * xx1;
 417                 xx2 = (hyp2 - h_hi2) * xx2;
 418                 res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1];
 419                 res1 = ((double*)((char*)__vlibm_TBL_rhypotf + si1))[1];
 420                 res2 = ((double*)((char*)__vlibm_TBL_rhypotf + si2))[1];
 421                 res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 422                 res1 *= (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0);
 423                 res2 *= (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0);
 424                 res0 *= dbase0;
 425                 res1 *= dbase1;
 426                 res2 *= dbase2;
 427                 *pz = res0;
 428                 pz += stridez;
 429                 *pz = res1;
 430                 pz += stridez;
 431                 *pz = res2;
 432                 pz += stridez;
 433         }
 434 
 435         for ( ; n > 0 ; n-- )
 436         {
 437                 x0 = *px;
 438                 px += stridex;
 439 
 440                 y0 = *py;
 441                 py += stridey;
 442 
 443                 hyp0 = x0 * (double)x0 + y0 * (double)y0;
 444 
 445                 ibase0 = HI(&hyp0);
 446 
 447                 HI(&dbase0) = (0x60000000 - ((ibase0 & 0x7fe00000) >> 1));
 448 
 449                 hyp0h = (ibase0 & 0x000fffff) | 0x3ff00000;
 450                 HI(&hyp0) = hyp0h;
 451                 HI(&h_hi0) = hyp0h & 0x7fffc000;
 452 
 453                 ibase0 >>= 10;
 454                 si0 = ibase0 & 0x7f0;
 455                 xx0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[0];
 456 
 457                 xx0 = (hyp0 - h_hi0) * xx0;
 458                 res0 = ((double*)((char*)__vlibm_TBL_rhypotf + si0))[1];
 459                 res0 *= (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 460                 res0 *= dbase0;
 461                 *pz = res0;
 462                 pz += stridez;
 463         }
 464 }
 465