1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  24  */
  25 /*
  26  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  27  * Use is subject to license terms.
  28  */
  29 
  30 #include "libm_synonyms.h"
  31 #include "libm_inlines.h"
  32 
  33 #ifdef __RESTRICT
  34 #define restrict _Restrict
  35 #else
  36 #define restrict
  37 #endif
  38 
  39 /* float rsqrtf(float x)
  40  *
  41  * Method :
  42  *      1. Special cases:
  43  *              for x = NaN                             => QNaN;
  44  *              for x = +Inf                            => 0;
  45  *              for x is negative, -Inf                 => QNaN + invalid;
  46  *              for x = +0                              => +Inf + divide-by-zero;
  47  *              for x = -0                              => -Inf + divide-by-zero.
  48  *      2. Computes reciprocal square root from:
  49  *              x = m * 2**n
  50  *      Where:
  51  *              m = [0.5, 2),
  52  *              n = ((exponent + 1) & ~1).
  53  *      Then:
  54  *              rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m))
  55  *      2. Computes 1/sqrt(m) from:
  56  *              1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm))
  57  *      Where:
  58  *              m = m0 + dm,
  59  *              m0 = 0.5 * (1 + k/64) for m = [0.5,         0.5+127/256), k = [0, 63];
  60  *              m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127];
  61  *      Then:
  62  *              1/sqrt(m0), 1/m0 are looked up in a table,
  63  *              1/sqrt(1 + (1/m0)*dm) is computed using approximation:
  64  *                      1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0
  65  *                      where z = [-1/64, 1/64].
  66  *
  67  * Accuracy:
  68  *      The maximum relative error for the approximating
  69  *      polynomial is 2**(-27.87).
  70  *      Maximum error observed: less than 0.534 ulp for the
  71  *      whole float type range.
  72  */
  73 
  74 #define sqrtf __sqrtf
  75 
  76 extern float sqrtf(float);
  77 
  78 static const double __TBL_rsqrtf[] = {
  79 /*
  80 i = [0,63]
  81  TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
  82  TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
  83 i = [64,127]
  84  TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
  85  TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
  86 */
  87  1.1920928955078125000e-07, 1.4142135623730951455e+00,
  88  1.1737530048076923728e-07, 1.4032928308912466786e+00,
  89  1.1559688683712121533e-07, 1.3926212476455828160e+00,
  90  1.1387156016791044559e-07, 1.3821894809301762397e+00,
  91  1.1219697840073529256e-07, 1.3719886811400707760e+00,
  92  1.1057093523550724772e-07, 1.3620104492139977204e+00,
  93  1.0899135044642856803e-07, 1.3522468075656264297e+00,
  94  1.0745626100352112918e-07, 1.3426901732747025253e+00,
  95  1.0596381293402777190e-07, 1.3333333333333332593e+00,
  96  1.0451225385273972023e-07, 1.3241694217637887121e+00,
  97  1.0309992609797297870e-07, 1.3151918984428583315e+00,
  98  1.0172526041666667320e-07, 1.3063945294843617440e+00,
  99  1.0038677014802631022e-07, 1.2977713690461003537e+00,
 100  9.9083045860389616921e-08, 1.2893167424406084542e+00,
 101  9.7812750400641022247e-08, 1.2810252304406970492e+00,
 102  9.6574614319620251657e-08, 1.2728916546811681609e+00,
 103  9.5367431640625005294e-08, 1.2649110640673517647e+00,
 104  9.4190055941358019463e-08, 1.2570787221094177344e+00,
 105  9.3041396722560978838e-08, 1.2493900951088485751e+00,
 106  9.1920416039156631290e-08, 1.2418408411301324890e+00,
 107  9.0826125372023804482e-08, 1.2344267996967352996e+00,
 108  8.9757582720588234048e-08, 1.2271439821557927896e+00,
 109  8.8713889898255812722e-08, 1.2199885626608373279e+00,
 110  8.7694190014367814875e-08, 1.2129568697262453902e+00,
 111  8.6697665127840911497e-08, 1.2060453783110545167e+00,
 112  8.5723534058988761666e-08, 1.1992507023933782762e+00,
 113  8.4771050347222225457e-08, 1.1925695879998878812e+00,
 114  8.3839500343406599951e-08, 1.1859989066577618644e+00,
 115  8.2928201426630432481e-08, 1.1795356492391770864e+00,
 116  8.2036500336021511923e-08, 1.1731769201708264205e+00,
 117  8.1163771609042551220e-08, 1.1669199319831564665e+00,
 118  8.0309416118421050820e-08, 1.1607620001760186046e+00,
 119  7.9472859700520828922e-08, 1.1547005383792514621e+00,
 120  7.8653551868556699530e-08, 1.1487330537883810866e+00,
 121  7.7850964604591830522e-08, 1.1428571428571427937e+00,
 122  7.7064591224747481298e-08, 1.1370704872299222110e+00,
 123  7.6293945312500001588e-08, 1.1313708498984760276e+00,
 124  7.5538559715346535571e-08, 1.1257560715684669095e+00,
 125  7.4797985600490195040e-08, 1.1202240672224077489e+00,
 126  7.4071791565533974158e-08, 1.1147728228665882977e+00,
 127  7.3359562800480773303e-08, 1.1094003924504582947e+00,
 128  7.2660900297619054173e-08, 1.1041048949477667573e+00,
 129  7.1975420106132072725e-08, 1.0988845115895122806e+00,
 130  7.1302752628504667579e-08, 1.0937374832394612945e+00,
 131  7.0642541956018514597e-08, 1.0886621079036347126e+00,
 132  6.9994445240825691959e-08, 1.0836567383657542685e+00,
 133  6.9358132102272723904e-08, 1.0787197799411873955e+00,
 134  6.8733284065315314719e-08, 1.0738496883424388795e+00,
 135  6.8119594029017853361e-08, 1.0690449676496975862e+00,
 136  6.7516765763274335346e-08, 1.0643041683803828867e+00,
 137  6.6924513432017540145e-08, 1.0596258856520350822e+00,
 138  6.6342561141304348632e-08, 1.0550087574332591700e+00,
 139  6.5770642510775861156e-08, 1.0504514628777803509e+00,
 140  6.5208500267094023655e-08, 1.0459527207369814228e+00,
 141  6.4655885858050847233e-08, 1.0415112878465908608e+00,
 142  6.4112559086134451001e-08, 1.0371259576834630511e+00,
 143  6.3578287760416665784e-08, 1.0327955589886446131e+00,
 144  6.3052847365702481089e-08, 1.0285189544531601058e+00,
 145  6.2536020747950822927e-08, 1.0242950394631678002e+00,
 146  6.2027597815040656970e-08, 1.0201227409013413627e+00,
 147  6.1527375252016127325e-08, 1.0160010160015240377e+00,
 148  6.1035156250000001271e-08, 1.0119288512538813229e+00,
 149  6.0550750248015869655e-08, 1.0079052613579393416e+00,
 150  6.0073972687007873182e-08, 1.0039292882210537616e+00,
 151  1.1920928955078125000e-07, 1.0000000000000000000e+00,
 152  1.1737530048076923728e-07, 9.9227787671366762812e-01,
 153  1.1559688683712121533e-07, 9.8473192783466190203e-01,
 154  1.1387156016791044559e-07, 9.7735555485044178781e-01,
 155  1.1219697840073529256e-07, 9.7014250014533187638e-01,
 156  1.1057093523550724772e-07, 9.6308682468615358641e-01,
 157  1.0899135044642856803e-07, 9.5618288746751489704e-01,
 158  1.0745626100352112918e-07, 9.4942532655508271588e-01,
 159  1.0596381293402777190e-07, 9.4280904158206335630e-01,
 160  1.0451225385273972023e-07, 9.3632917756904454620e-01,
 161  1.0309992609797297870e-07, 9.2998110995055427441e-01,
 162  1.0172526041666667320e-07, 9.2376043070340119190e-01,
 163  1.0038677014802631022e-07, 9.1766293548224708854e-01,
 164  9.9083045860389616921e-08, 9.1168461167710357351e-01,
 165  9.7812750400641022247e-08, 9.0582162731567661407e-01,
 166  9.6574614319620251657e-08, 9.0007032074081916306e-01,
 167  9.5367431640625005294e-08, 8.9442719099991585541e-01,
 168  9.4190055941358019463e-08, 8.8888888888888883955e-01,
 169  9.3041396722560978838e-08, 8.8345220859877238162e-01,
 170  9.1920416039156631290e-08, 8.7811407991752277180e-01,
 171  9.0826125372023804482e-08, 8.7287156094396955996e-01,
 172  8.9757582720588234048e-08, 8.6772183127462465535e-01,
 173  8.8713889898255812722e-08, 8.6266218562750729415e-01,
 174  8.7694190014367814875e-08, 8.5769002787023584933e-01,
 175  8.6697665127840911497e-08, 8.5280286542244176928e-01,
 176  8.5723534058988761666e-08, 8.4799830400508802164e-01,
 177  8.4771050347222225457e-08, 8.4327404271156780613e-01,
 178  8.3839500343406599951e-08, 8.3862786937753464045e-01,
 179  8.2928201426630432481e-08, 8.3405765622829908246e-01,
 180  8.2036500336021511923e-08, 8.2956135578434020417e-01,
 181  8.1163771609042551220e-08, 8.2513699700703468931e-01,
 182  8.0309416118421050820e-08, 8.2078268166812329287e-01,
 183  7.9472859700520828922e-08, 8.1649658092772603446e-01,
 184  7.8653551868556699530e-08, 8.1227693210689522196e-01,
 185  7.7850964604591830522e-08, 8.0812203564176865456e-01,
 186  7.7064591224747481298e-08, 8.0403025220736967782e-01,
 187  7.6293945312500001588e-08, 8.0000000000000004441e-01,
 188  7.5538559715346535571e-08, 7.9602975216799132241e-01,
 189  7.4797985600490195040e-08, 7.9211803438133943089e-01,
 190  7.4071791565533974158e-08, 7.8826342253143455441e-01,
 191  7.3359562800480773303e-08, 7.8446454055273617811e-01,
 192  7.2660900297619054173e-08, 7.8072005835882651859e-01,
 193  7.1975420106132072725e-08, 7.7702868988581130782e-01,
 194  7.1302752628504667579e-08, 7.7338919123653082632e-01,
 195  7.0642541956018514597e-08, 7.6980035891950104876e-01,
 196  6.9994445240825691959e-08, 7.6626102817692109959e-01,
 197  6.9358132102272723904e-08, 7.6277007139647390321e-01,
 198  6.8733284065315314719e-08, 7.5932639660199918730e-01,
 199  6.8119594029017853361e-08, 7.5592894601845450619e-01,
 200  6.7516765763274335346e-08, 7.5257669470687782454e-01,
 201  6.6924513432017540145e-08, 7.4926864926535519107e-01,
 202  6.6342561141304348632e-08, 7.4600384659225105199e-01,
 203  6.5770642510775861156e-08, 7.4278135270820744296e-01,
 204  6.5208500267094023655e-08, 7.3960026163363878915e-01,
 205  6.4655885858050847233e-08, 7.3645969431865865307e-01,
 206  6.4112559086134451001e-08, 7.3335879762256905856e-01,
 207  6.3578287760416665784e-08, 7.3029674334022143256e-01,
 208  6.3052847365702481089e-08, 7.2727272727272729291e-01,
 209  6.2536020747950822927e-08, 7.2428596834014824513e-01,
 210  6.2027597815040656970e-08, 7.2133570773394584119e-01,
 211  6.1527375252016127325e-08, 7.1842120810709964029e-01,
 212  6.1035156250000001271e-08, 7.1554175279993270653e-01,
 213  6.0550750248015869655e-08, 7.1269664509979835376e-01,
 214  6.0073972687007873182e-08, 7.0988520753289097165e-01,
 215 };
 216 
 217 static const unsigned long long LCONST[] = {
 218 0x3feffffffee7f18fULL,  /* A0 = 9.99999997962321453275e-01      */
 219 0xbfdffffffe07e52fULL,  /* A1 =-4.99999998166077580600e-01      */
 220 0x3fd801180ca296d9ULL,  /* A2 = 3.75066768969515586277e-01      */
 221 0xbfd400fc0bbb8e78ULL,  /* A3 =-3.12560092408808548438e-01      */
 222 };
 223 
 224 static void
 225 __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey);
 226 
 227 #pragma no_inline(__vrsqrtf_n)
 228 
 229 #define RETURN(ret)                                             \
 230 {                                                               \
 231         *py = (ret);                                            \
 232         py += stridey;                                          \
 233         if (n_n == 0)                                           \
 234         {                                                       \
 235                 spx = px; spy = py;                             \
 236                 ax0 = *(int*)px;                                \
 237                 continue;                                       \
 238         }                                                       \
 239         n--;                                                    \
 240         break;                                                  \
 241 }
 242 
 243 void
 244 __vrsqrtf(int n, float * restrict px, int stridex, float * restrict py, int stridey)
 245 {
 246         float           *spx, *spy;
 247         int             ax0, n_n;
 248         float           res;
 249         float           FONE = 1.0f, FTWO = 2.0f;
 250 
 251         while (n > 1)
 252         {
 253                 n_n = 0;
 254                 spx = px;
 255                 spy = py;
 256                 ax0 = *(int*)px;
 257                 for (; n > 1 ; n--)
 258                 {
 259                         px += stridex;
 260                         if (ax0 >= 0x7f800000)       /* X = NaN or Inf       */
 261                         {
 262                                 res = *(px - stridex);
 263                                 RETURN (FONE / res)
 264                         }
 265 
 266                         py += stridey;
 267 
 268                         if (ax0 < 0x00800000)                /* X = denormal, zero or negative       */
 269                         {
 270                                 py -= stridey;
 271                                 res = *(px - stridex);
 272 
 273                                 if ((ax0 & 0x7fffffff) == 0)        /* |X| = zero   */
 274                                 {
 275                                         RETURN (FONE / res)
 276                                 }
 277                                 else if (ax0 >= 0)   /* X = denormal */
 278                                 {
 279                                         double          A0 = ((double*)LCONST)[0];      /*  9.99999997962321453275e-01  */
 280                                         double          A1 = ((double*)LCONST)[1];      /* -4.99999998166077580600e-01  */
 281                                         double          A2 = ((double*)LCONST)[2];      /*  3.75066768969515586277e-01  */
 282                                         double          A3 = ((double*)LCONST)[3];      /* -3.12560092408808548438e-01  */
 283 
 284                                         double          res0, xx0, tbl_div0, tbl_sqrt0;
 285                                         float           fres0;
 286                                         int             iax0, si0, iexp0;
 287 
 288                                         res = *(int*)&res;
 289                                         res *= FTWO;
 290                                         ax0 = *(int*)&res;
 291                                         iexp0 = ax0 >> 24;
 292                                         iexp0 = 0x3f + 0x4b - iexp0;
 293                                         iexp0 = iexp0 << 23;
 294 
 295                                         si0 = (ax0 >> 13) & 0x7f0;
 296 
 297                                         tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 298                                         tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 299                                         iax0 = ax0 & 0x7ffe0000;
 300                                         iax0 = ax0 - iax0;
 301                                         xx0 = iax0 * tbl_div0;
 302                                         res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 303 
 304                                         fres0 = res0;
 305                                         iexp0 += *(int*)&fres0;
 306                                         RETURN(*(float*)&iexp0)
 307                                 }
 308                                 else    /* X = negative */
 309                                 {
 310                                         RETURN (sqrtf(res))
 311                                 }
 312                         }
 313                         n_n++;
 314                         ax0 = *(int*)px;
 315                 }
 316                 if (n_n > 0)
 317                         __vrsqrtf_n(n_n, spx, stridex, spy, stridey);
 318         }
 319 
 320         if (n > 0)
 321         {
 322                 ax0 = *(int*)px;
 323 
 324                 if (ax0 >= 0x7f800000)       /* X = NaN or Inf       */
 325                 {
 326                         res = *px;
 327                         *py = FONE / res;
 328                 }
 329                 else if (ax0 < 0x00800000)   /* X = denormal, zero or negative       */
 330                 {
 331                         res = *px;
 332 
 333                         if ((ax0 & 0x7fffffff) == 0)        /* |X| = zero   */
 334                         {
 335                                 *py = FONE / res;
 336                         }
 337                         else if (ax0 >= 0)   /* X = denormal */
 338                         {
 339                                 double          A0 = ((double*)LCONST)[0];      /*  9.99999997962321453275e-01  */
 340                                 double          A1 = ((double*)LCONST)[1];      /* -4.99999998166077580600e-01  */
 341                                 double          A2 = ((double*)LCONST)[2];      /*  3.75066768969515586277e-01  */
 342                                 double          A3 = ((double*)LCONST)[3];      /* -3.12560092408808548438e-01  */
 343                                 double          res0, xx0, tbl_div0, tbl_sqrt0;
 344                                 float           fres0;
 345                                 int             iax0, si0, iexp0;
 346 
 347                                 res = *(int*)&res;
 348                                 res *= FTWO;
 349                                 ax0 = *(int*)&res;
 350                                 iexp0 = ax0 >> 24;
 351                                 iexp0 = 0x3f + 0x4b - iexp0;
 352                                 iexp0 = iexp0 << 23;
 353 
 354                                 si0 = (ax0 >> 13) & 0x7f0;
 355 
 356                                 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 357                                 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 358                                 iax0 = ax0 & 0x7ffe0000;
 359                                 iax0 = ax0 - iax0;
 360                                 xx0 = iax0 * tbl_div0;
 361                                 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 362 
 363                                 fres0 = res0;
 364                                 iexp0 += *(int*)&fres0;
 365 
 366                                 *(int*)py = iexp0;
 367                         }
 368                         else    /* X = negative */
 369                         {
 370                                 *py = sqrtf(res);
 371                         }
 372                 }
 373                 else
 374                 {
 375                         double          A0 = ((double*)LCONST)[0];      /*  9.99999997962321453275e-01  */
 376                         double          A1 = ((double*)LCONST)[1];      /* -4.99999998166077580600e-01  */
 377                         double          A2 = ((double*)LCONST)[2];      /*  3.75066768969515586277e-01  */
 378                         double          A3 = ((double*)LCONST)[3];      /* -3.12560092408808548438e-01  */
 379                         double          res0, xx0, tbl_div0, tbl_sqrt0;
 380                         float           fres0;
 381                         int             iax0, si0, iexp0;
 382 
 383                         iexp0 = ax0 >> 24;
 384                         iexp0 = 0x3f - iexp0;
 385                         iexp0 = iexp0 << 23;
 386 
 387                         si0 = (ax0 >> 13) & 0x7f0;
 388 
 389                         tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 390                         tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 391                         iax0 = ax0 & 0x7ffe0000;
 392                         iax0 = ax0 - iax0;
 393                         xx0 = iax0 * tbl_div0;
 394                         res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 395 
 396                         fres0 = res0;
 397                         iexp0 += *(int*)&fres0;
 398 
 399                         *(int*)py = iexp0;
 400                 }
 401         }
 402 }
 403 
 404 void
 405 __vrsqrtf_n(int n, float * restrict px, int stridex, float * restrict py, int stridey)
 406 {
 407         double          A0 = ((double*)LCONST)[0];      /*  9.99999997962321453275e-01  */
 408         double          A1 = ((double*)LCONST)[1];      /* -4.99999998166077580600e-01  */
 409         double          A2 = ((double*)LCONST)[2];      /*  3.75066768969515586277e-01  */
 410         double          A3 = ((double*)LCONST)[3];      /* -3.12560092408808548438e-01  */
 411         double          res0, xx0, tbl_div0, tbl_sqrt0;
 412         float           fres0;
 413         int             iax0, ax0, si0, iexp0;
 414 
 415 #if defined(ARCH_v7) || defined(ARCH_v8)
 416         double          res1, xx1, tbl_div1, tbl_sqrt1;
 417         double          res2, xx2, tbl_div2, tbl_sqrt2;
 418         float           fres1, fres2;
 419         int             iax1, ax1, si1, iexp1;
 420         int             iax2, ax2, si2, iexp2;
 421 
 422         for(; n > 2 ; n -= 3)
 423         {
 424                 ax0 = *(int*)px;
 425                 px += stridex;
 426 
 427                 ax1 = *(int*)px;
 428                 px += stridex;
 429 
 430                 ax2 = *(int*)px;
 431                 px += stridex;
 432 
 433                 iexp0 = ax0 >> 24;
 434                 iexp1 = ax1 >> 24;
 435                 iexp2 = ax2 >> 24;
 436                 iexp0 = 0x3f - iexp0;
 437                 iexp1 = 0x3f - iexp1;
 438                 iexp2 = 0x3f - iexp2;
 439 
 440                 iexp0 = iexp0 << 23;
 441                 iexp1 = iexp1 << 23;
 442                 iexp2 = iexp2 << 23;
 443 
 444                 si0 = (ax0 >> 13) & 0x7f0;
 445                 si1 = (ax1 >> 13) & 0x7f0;
 446                 si2 = (ax2 >> 13) & 0x7f0;
 447 
 448                 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 449                 tbl_div1 = ((double*)((char*)__TBL_rsqrtf + si1))[0];
 450                 tbl_div2 = ((double*)((char*)__TBL_rsqrtf + si2))[0];
 451                 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 452                 tbl_sqrt1 = ((double*)((char*)__TBL_rsqrtf + si1))[1];
 453                 tbl_sqrt2 = ((double*)((char*)__TBL_rsqrtf + si2))[1];
 454                 iax0 = ax0 & 0x7ffe0000;
 455                 iax1 = ax1 & 0x7ffe0000;
 456                 iax2 = ax2 & 0x7ffe0000;
 457                 iax0 = ax0 - iax0;
 458                 iax1 = ax1 - iax1;
 459                 iax2 = ax2 - iax2;
 460                 xx0 = iax0 * tbl_div0;
 461                 xx1 = iax1 * tbl_div1;
 462                 xx2 = iax2 * tbl_div2;
 463                 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 464                 res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0);
 465                 res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0);
 466 
 467                 fres0 = res0;
 468                 fres1 = res1;
 469                 fres2 = res2;
 470 
 471                 iexp0 += *(int*)&fres0;
 472                 iexp1 += *(int*)&fres1;
 473                 iexp2 += *(int*)&fres2;
 474                 *(int*)py = iexp0;
 475                 py += stridey;
 476                 *(int*)py = iexp1;
 477                 py += stridey;
 478                 *(int*)py = iexp2;
 479                 py += stridey;
 480         }
 481 #endif
 482         for(; n > 0 ; n--)
 483         {
 484                 ax0 = *(int*)px;
 485                 px += stridex;
 486 
 487                 iexp0 = ax0 >> 24;
 488                 iexp0 = 0x3f - iexp0;
 489                 iexp0 = iexp0 << 23;
 490 
 491                 si0 = (ax0 >> 13) & 0x7f0;
 492 
 493                 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 494                 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 495                 iax0 = ax0 & 0x7ffe0000;
 496                 iax0 = ax0 - iax0;
 497                 xx0 = iax0 * tbl_div0;
 498                 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 499 
 500                 fres0 = res0;
 501                 iexp0 += *(int*)&fres0;
 502                 *(int*)py = iexp0;
 503                 py += stridey;
 504         }
 505 }
 506