1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include "libm_synonyms.h" 31 #include "libm_inlines.h" 32 33 #ifdef __RESTRICT 34 #define restrict _Restrict 35 #else 36 #define restrict 37 #endif 38 39 /* float rsqrtf(float x) 40 * 41 * Method : 42 * 1. Special cases: 43 * for x = NaN => QNaN; 44 * for x = +Inf => 0; 45 * for x is negative, -Inf => QNaN + invalid; 46 * for x = +0 => +Inf + divide-by-zero; 47 * for x = -0 => -Inf + divide-by-zero. 48 * 2. Computes reciprocal square root from: 49 * x = m * 2**n 50 * Where: 51 * m = [0.5, 2), 52 * n = ((exponent + 1) & ~1). 53 * Then: 54 * rsqrtf(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 55 * 2. Computes 1/sqrt(m) from: 56 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 57 * Where: 58 * m = m0 + dm, 59 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 60 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 61 * Then: 62 * 1/sqrt(m0), 1/m0 are looked up in a table, 63 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 64 * 1/sqrt(1 + z) = ((a3 * z + a2) * z + a1) * z + a0 65 * where z = [-1/64, 1/64]. 66 * 67 * Accuracy: 68 * The maximum relative error for the approximating 69 * polynomial is 2**(-27.87). 70 * Maximum error observed: less than 0.534 ulp for the 71 * whole float type range. 72 */ 73 74 #define sqrtf __sqrtf 75 76 extern float sqrtf( float ); 77 78 static const double __TBL_rsqrtf[] = { 79 /* 80 i = [0,63] 81 TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24; 82 TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 83 i = [64,127] 84 TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23; 85 TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 86 */ 87 1.1920928955078125000e-07, 1.4142135623730951455e+00, 88 1.1737530048076923728e-07, 1.4032928308912466786e+00, 89 1.1559688683712121533e-07, 1.3926212476455828160e+00, 90 1.1387156016791044559e-07, 1.3821894809301762397e+00, 91 1.1219697840073529256e-07, 1.3719886811400707760e+00, 92 1.1057093523550724772e-07, 1.3620104492139977204e+00, 93 1.0899135044642856803e-07, 1.3522468075656264297e+00, 94 1.0745626100352112918e-07, 1.3426901732747025253e+00, 95 1.0596381293402777190e-07, 1.3333333333333332593e+00, 96 1.0451225385273972023e-07, 1.3241694217637887121e+00, 97 1.0309992609797297870e-07, 1.3151918984428583315e+00, 98 1.0172526041666667320e-07, 1.3063945294843617440e+00, 99 1.0038677014802631022e-07, 1.2977713690461003537e+00, 100 9.9083045860389616921e-08, 1.2893167424406084542e+00, 101 9.7812750400641022247e-08, 1.2810252304406970492e+00, 102 9.6574614319620251657e-08, 1.2728916546811681609e+00, 103 9.5367431640625005294e-08, 1.2649110640673517647e+00, 104 9.4190055941358019463e-08, 1.2570787221094177344e+00, 105 9.3041396722560978838e-08, 1.2493900951088485751e+00, 106 9.1920416039156631290e-08, 1.2418408411301324890e+00, 107 9.0826125372023804482e-08, 1.2344267996967352996e+00, 108 8.9757582720588234048e-08, 1.2271439821557927896e+00, 109 8.8713889898255812722e-08, 1.2199885626608373279e+00, 110 8.7694190014367814875e-08, 1.2129568697262453902e+00, 111 8.6697665127840911497e-08, 1.2060453783110545167e+00, 112 8.5723534058988761666e-08, 1.1992507023933782762e+00, 113 8.4771050347222225457e-08, 1.1925695879998878812e+00, 114 8.3839500343406599951e-08, 1.1859989066577618644e+00, 115 8.2928201426630432481e-08, 1.1795356492391770864e+00, 116 8.2036500336021511923e-08, 1.1731769201708264205e+00, 117 8.1163771609042551220e-08, 1.1669199319831564665e+00, 118 8.0309416118421050820e-08, 1.1607620001760186046e+00, 119 7.9472859700520828922e-08, 1.1547005383792514621e+00, 120 7.8653551868556699530e-08, 1.1487330537883810866e+00, 121 7.7850964604591830522e-08, 1.1428571428571427937e+00, 122 7.7064591224747481298e-08, 1.1370704872299222110e+00, 123 7.6293945312500001588e-08, 1.1313708498984760276e+00, 124 7.5538559715346535571e-08, 1.1257560715684669095e+00, 125 7.4797985600490195040e-08, 1.1202240672224077489e+00, 126 7.4071791565533974158e-08, 1.1147728228665882977e+00, 127 7.3359562800480773303e-08, 1.1094003924504582947e+00, 128 7.2660900297619054173e-08, 1.1041048949477667573e+00, 129 7.1975420106132072725e-08, 1.0988845115895122806e+00, 130 7.1302752628504667579e-08, 1.0937374832394612945e+00, 131 7.0642541956018514597e-08, 1.0886621079036347126e+00, 132 6.9994445240825691959e-08, 1.0836567383657542685e+00, 133 6.9358132102272723904e-08, 1.0787197799411873955e+00, 134 6.8733284065315314719e-08, 1.0738496883424388795e+00, 135 6.8119594029017853361e-08, 1.0690449676496975862e+00, 136 6.7516765763274335346e-08, 1.0643041683803828867e+00, 137 6.6924513432017540145e-08, 1.0596258856520350822e+00, 138 6.6342561141304348632e-08, 1.0550087574332591700e+00, 139 6.5770642510775861156e-08, 1.0504514628777803509e+00, 140 6.5208500267094023655e-08, 1.0459527207369814228e+00, 141 6.4655885858050847233e-08, 1.0415112878465908608e+00, 142 6.4112559086134451001e-08, 1.0371259576834630511e+00, 143 6.3578287760416665784e-08, 1.0327955589886446131e+00, 144 6.3052847365702481089e-08, 1.0285189544531601058e+00, 145 6.2536020747950822927e-08, 1.0242950394631678002e+00, 146 6.2027597815040656970e-08, 1.0201227409013413627e+00, 147 6.1527375252016127325e-08, 1.0160010160015240377e+00, 148 6.1035156250000001271e-08, 1.0119288512538813229e+00, 149 6.0550750248015869655e-08, 1.0079052613579393416e+00, 150 6.0073972687007873182e-08, 1.0039292882210537616e+00, 151 1.1920928955078125000e-07, 1.0000000000000000000e+00, 152 1.1737530048076923728e-07, 9.9227787671366762812e-01, 153 1.1559688683712121533e-07, 9.8473192783466190203e-01, 154 1.1387156016791044559e-07, 9.7735555485044178781e-01, 155 1.1219697840073529256e-07, 9.7014250014533187638e-01, 156 1.1057093523550724772e-07, 9.6308682468615358641e-01, 157 1.0899135044642856803e-07, 9.5618288746751489704e-01, 158 1.0745626100352112918e-07, 9.4942532655508271588e-01, 159 1.0596381293402777190e-07, 9.4280904158206335630e-01, 160 1.0451225385273972023e-07, 9.3632917756904454620e-01, 161 1.0309992609797297870e-07, 9.2998110995055427441e-01, 162 1.0172526041666667320e-07, 9.2376043070340119190e-01, 163 1.0038677014802631022e-07, 9.1766293548224708854e-01, 164 9.9083045860389616921e-08, 9.1168461167710357351e-01, 165 9.7812750400641022247e-08, 9.0582162731567661407e-01, 166 9.6574614319620251657e-08, 9.0007032074081916306e-01, 167 9.5367431640625005294e-08, 8.9442719099991585541e-01, 168 9.4190055941358019463e-08, 8.8888888888888883955e-01, 169 9.3041396722560978838e-08, 8.8345220859877238162e-01, 170 9.1920416039156631290e-08, 8.7811407991752277180e-01, 171 9.0826125372023804482e-08, 8.7287156094396955996e-01, 172 8.9757582720588234048e-08, 8.6772183127462465535e-01, 173 8.8713889898255812722e-08, 8.6266218562750729415e-01, 174 8.7694190014367814875e-08, 8.5769002787023584933e-01, 175 8.6697665127840911497e-08, 8.5280286542244176928e-01, 176 8.5723534058988761666e-08, 8.4799830400508802164e-01, 177 8.4771050347222225457e-08, 8.4327404271156780613e-01, 178 8.3839500343406599951e-08, 8.3862786937753464045e-01, 179 8.2928201426630432481e-08, 8.3405765622829908246e-01, 180 8.2036500336021511923e-08, 8.2956135578434020417e-01, 181 8.1163771609042551220e-08, 8.2513699700703468931e-01, 182 8.0309416118421050820e-08, 8.2078268166812329287e-01, 183 7.9472859700520828922e-08, 8.1649658092772603446e-01, 184 7.8653551868556699530e-08, 8.1227693210689522196e-01, 185 7.7850964604591830522e-08, 8.0812203564176865456e-01, 186 7.7064591224747481298e-08, 8.0403025220736967782e-01, 187 7.6293945312500001588e-08, 8.0000000000000004441e-01, 188 7.5538559715346535571e-08, 7.9602975216799132241e-01, 189 7.4797985600490195040e-08, 7.9211803438133943089e-01, 190 7.4071791565533974158e-08, 7.8826342253143455441e-01, 191 7.3359562800480773303e-08, 7.8446454055273617811e-01, 192 7.2660900297619054173e-08, 7.8072005835882651859e-01, 193 7.1975420106132072725e-08, 7.7702868988581130782e-01, 194 7.1302752628504667579e-08, 7.7338919123653082632e-01, 195 7.0642541956018514597e-08, 7.6980035891950104876e-01, 196 6.9994445240825691959e-08, 7.6626102817692109959e-01, 197 6.9358132102272723904e-08, 7.6277007139647390321e-01, 198 6.8733284065315314719e-08, 7.5932639660199918730e-01, 199 6.8119594029017853361e-08, 7.5592894601845450619e-01, 200 6.7516765763274335346e-08, 7.5257669470687782454e-01, 201 6.6924513432017540145e-08, 7.4926864926535519107e-01, 202 6.6342561141304348632e-08, 7.4600384659225105199e-01, 203 6.5770642510775861156e-08, 7.4278135270820744296e-01, 204 6.5208500267094023655e-08, 7.3960026163363878915e-01, 205 6.4655885858050847233e-08, 7.3645969431865865307e-01, 206 6.4112559086134451001e-08, 7.3335879762256905856e-01, 207 6.3578287760416665784e-08, 7.3029674334022143256e-01, 208 6.3052847365702481089e-08, 7.2727272727272729291e-01, 209 6.2536020747950822927e-08, 7.2428596834014824513e-01, 210 6.2027597815040656970e-08, 7.2133570773394584119e-01, 211 6.1527375252016127325e-08, 7.1842120810709964029e-01, 212 6.1035156250000001271e-08, 7.1554175279993270653e-01, 213 6.0550750248015869655e-08, 7.1269664509979835376e-01, 214 6.0073972687007873182e-08, 7.0988520753289097165e-01, 215 }; 216 217 static const unsigned long long LCONST[] = { 218 0x3feffffffee7f18fULL, /* A0 = 9.99999997962321453275e-01 */ 219 0xbfdffffffe07e52fULL, /* A1 =-4.99999998166077580600e-01 */ 220 0x3fd801180ca296d9ULL, /* A2 = 3.75066768969515586277e-01 */ 221 0xbfd400fc0bbb8e78ULL, /* A3 =-3.12560092408808548438e-01 */ 222 }; 223 224 static void 225 __vrsqrtf_n( int n, float * restrict px, int stridex, float * restrict py, int stridey ); 226 227 #pragma no_inline(__vrsqrtf_n) 228 229 #define RETURN(ret) \ 230 { \ 231 *py = (ret); \ 232 py += stridey; \ 233 if ( n_n == 0 ) \ 234 { \ 235 spx = px; spy = py; \ 236 ax0 = *(int*)px; \ 237 continue; \ 238 } \ 239 n--; \ 240 break; \ 241 } 242 243 void 244 __vrsqrtf( int n, float * restrict px, int stridex, float * restrict py, int stridey ) 245 { 246 float *spx, *spy; 247 int ax0, n_n; 248 float res; 249 float FONE = 1.0f, FTWO = 2.0f; 250 251 while ( n > 1 ) 252 { 253 n_n = 0; 254 spx = px; 255 spy = py; 256 ax0 = *(int*)px; 257 for ( ; n > 1 ; n-- ) 258 { 259 px += stridex; 260 if ( ax0 >= 0x7f800000 ) /* X = NaN or Inf */ 261 { 262 res = *(px - stridex); 263 RETURN ( FONE / res ) 264 } 265 266 py += stridey; 267 268 if ( ax0 < 0x00800000 ) /* X = denormal, zero or negative */ 269 { 270 py -= stridey; 271 res = *(px - stridex); 272 273 if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */ 274 { 275 RETURN ( FONE / res ) 276 } 277 else if ( ax0 >= 0 ) /* X = denormal */ 278 { 279 double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 280 double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 281 double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 282 double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 283 284 double res0, xx0, tbl_div0, tbl_sqrt0; 285 float fres0; 286 int iax0, si0, iexp0; 287 288 res = *(int*)&res; 289 res *= FTWO; 290 ax0 = *(int*)&res; 291 iexp0 = ax0 >> 24; 292 iexp0 = 0x3f + 0x4b - iexp0; 293 iexp0 = iexp0 << 23; 294 295 si0 = (ax0 >> 13) & 0x7f0; 296 297 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 298 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 299 iax0 = ax0 & 0x7ffe0000; 300 iax0 = ax0 - iax0; 301 xx0 = iax0 * tbl_div0; 302 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 303 304 fres0 = res0; 305 iexp0 += *(int*)&fres0; 306 RETURN(*(float*)&iexp0) 307 } 308 else /* X = negative */ 309 { 310 RETURN ( sqrtf(res) ) 311 } 312 } 313 n_n++; 314 ax0 = *(int*)px; 315 } 316 if ( n_n > 0 ) 317 __vrsqrtf_n( n_n, spx, stridex, spy, stridey ); 318 } 319 320 if ( n > 0 ) 321 { 322 ax0 = *(int*)px; 323 324 if ( ax0 >= 0x7f800000 ) /* X = NaN or Inf */ 325 { 326 res = *px; 327 *py = FONE / res; 328 } 329 else if ( ax0 < 0x00800000 ) /* X = denormal, zero or negative */ 330 { 331 res = *px; 332 333 if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */ 334 { 335 *py = FONE / res; 336 } 337 else if ( ax0 >= 0 ) /* X = denormal */ 338 { 339 double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 340 double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 341 double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 342 double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 343 double res0, xx0, tbl_div0, tbl_sqrt0; 344 float fres0; 345 int iax0, si0, iexp0; 346 347 res = *(int*)&res; 348 res *= FTWO; 349 ax0 = *(int*)&res; 350 iexp0 = ax0 >> 24; 351 iexp0 = 0x3f + 0x4b - iexp0; 352 iexp0 = iexp0 << 23; 353 354 si0 = (ax0 >> 13) & 0x7f0; 355 356 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 357 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 358 iax0 = ax0 & 0x7ffe0000; 359 iax0 = ax0 - iax0; 360 xx0 = iax0 * tbl_div0; 361 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 362 363 fres0 = res0; 364 iexp0 += *(int*)&fres0; 365 366 *(int*)py = iexp0; 367 } 368 else /* X = negative */ 369 { 370 *py = sqrtf(res); 371 } 372 } 373 else 374 { 375 double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 376 double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 377 double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 378 double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 379 double res0, xx0, tbl_div0, tbl_sqrt0; 380 float fres0; 381 int iax0, si0, iexp0; 382 383 iexp0 = ax0 >> 24; 384 iexp0 = 0x3f - iexp0; 385 iexp0 = iexp0 << 23; 386 387 si0 = (ax0 >> 13) & 0x7f0; 388 389 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 390 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 391 iax0 = ax0 & 0x7ffe0000; 392 iax0 = ax0 - iax0; 393 xx0 = iax0 * tbl_div0; 394 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 395 396 fres0 = res0; 397 iexp0 += *(int*)&fres0; 398 399 *(int*)py = iexp0; 400 } 401 } 402 } 403 404 void 405 __vrsqrtf_n( int n, float * restrict px, int stridex, float * restrict py, int stridey ) 406 { 407 double A0 = ((double*)LCONST)[0]; /* 9.99999997962321453275e-01 */ 408 double A1 = ((double*)LCONST)[1]; /* -4.99999998166077580600e-01 */ 409 double A2 = ((double*)LCONST)[2]; /* 3.75066768969515586277e-01 */ 410 double A3 = ((double*)LCONST)[3]; /* -3.12560092408808548438e-01 */ 411 double res0, xx0, tbl_div0, tbl_sqrt0; 412 float fres0; 413 int iax0, ax0, si0, iexp0; 414 415 #if defined(ARCH_v7) || defined(ARCH_v8) 416 double res1, xx1, tbl_div1, tbl_sqrt1; 417 double res2, xx2, tbl_div2, tbl_sqrt2; 418 float fres1, fres2; 419 int iax1, ax1, si1, iexp1; 420 int iax2, ax2, si2, iexp2; 421 422 for( ; n > 2 ; n -= 3 ) 423 { 424 ax0 = *(int*)px; 425 px += stridex; 426 427 ax1 = *(int*)px; 428 px += stridex; 429 430 ax2 = *(int*)px; 431 px += stridex; 432 433 iexp0 = ax0 >> 24; 434 iexp1 = ax1 >> 24; 435 iexp2 = ax2 >> 24; 436 iexp0 = 0x3f - iexp0; 437 iexp1 = 0x3f - iexp1; 438 iexp2 = 0x3f - iexp2; 439 440 iexp0 = iexp0 << 23; 441 iexp1 = iexp1 << 23; 442 iexp2 = iexp2 << 23; 443 444 si0 = (ax0 >> 13) & 0x7f0; 445 si1 = (ax1 >> 13) & 0x7f0; 446 si2 = (ax2 >> 13) & 0x7f0; 447 448 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 449 tbl_div1 = ((double*)((char*)__TBL_rsqrtf + si1))[0]; 450 tbl_div2 = ((double*)((char*)__TBL_rsqrtf + si2))[0]; 451 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 452 tbl_sqrt1 = ((double*)((char*)__TBL_rsqrtf + si1))[1]; 453 tbl_sqrt2 = ((double*)((char*)__TBL_rsqrtf + si2))[1]; 454 iax0 = ax0 & 0x7ffe0000; 455 iax1 = ax1 & 0x7ffe0000; 456 iax2 = ax2 & 0x7ffe0000; 457 iax0 = ax0 - iax0; 458 iax1 = ax1 - iax1; 459 iax2 = ax2 - iax2; 460 xx0 = iax0 * tbl_div0; 461 xx1 = iax1 * tbl_div1; 462 xx2 = iax2 * tbl_div2; 463 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 464 res1 = tbl_sqrt1 * (((A3 * xx1 + A2) * xx1 + A1) * xx1 + A0); 465 res2 = tbl_sqrt2 * (((A3 * xx2 + A2) * xx2 + A1) * xx2 + A0); 466 467 fres0 = res0; 468 fres1 = res1; 469 fres2 = res2; 470 471 iexp0 += *(int*)&fres0; 472 iexp1 += *(int*)&fres1; 473 iexp2 += *(int*)&fres2; 474 *(int*)py = iexp0; 475 py += stridey; 476 *(int*)py = iexp1; 477 py += stridey; 478 *(int*)py = iexp2; 479 py += stridey; 480 } 481 #endif 482 for( ; n > 0 ; n-- ) 483 { 484 ax0 = *(int*)px; 485 px += stridex; 486 487 iexp0 = ax0 >> 24; 488 iexp0 = 0x3f - iexp0; 489 iexp0 = iexp0 << 23; 490 491 si0 = (ax0 >> 13) & 0x7f0; 492 493 tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 494 tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 495 iax0 = ax0 & 0x7ffe0000; 496 iax0 = ax0 - iax0; 497 xx0 = iax0 * tbl_div0; 498 res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 499 500 fres0 = res0; 501 iexp0 += *(int*)&fres0; 502 *(int*)py = iexp0; 503 py += stridey; 504 } 505 } 506