1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vrsqrtf.S"
  30 
  31 #include "libm.h"
  32 
  33         RO_DATA
  34         .align  64
  35 
  36 ! i = [0,63]
  37 ! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24;
  38 ! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
  39 ! i = [64,127]
  40 ! TBL[2*i  ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23;
  41 ! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46)));
  42 
  43 .CONST_TBL:
  44         .word   0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd,
  45         .word   0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03,
  46         .word   0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2,
  47         .word   0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671,
  48         .word   0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911,
  49         .word   0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342,
  50         .word   0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a,
  51         .word   0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9,
  52         .word   0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555,
  53         .word   0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54,
  54         .word   0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70,
  55         .word   0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032,
  56         .word   0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74,
  57         .word   0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92,
  58         .word   0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f,
  59         .word   0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3,
  60         .word   0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f,
  61         .word   0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199,
  62         .word   0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577,
  63         .word   0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58,
  64         .word   0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03,
  65         .word   0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37,
  66         .word   0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e,
  67         .word   0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92,
  68         .word   0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826,
  69         .word   0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0,
  70         .word   0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91,
  71         .word   0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50,
  72         .word   0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e,
  73         .word   0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428,
  74         .word   0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4,
  75         .word   0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5,
  76         .word   0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c,
  77         .word   0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55,
  78         .word   0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492,
  79         .word   0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a,
  80         .word   0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a,
  81         .word   0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d,
  82         .word   0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9,
  83         .word   0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3,
  84         .word   0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896,
  85         .word   0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f,
  86         .word   0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9,
  87         .word   0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee,
  88         .word   0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4,
  89         .word   0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62,
  90         .word   0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db,
  91         .word   0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253,
  92         .word   0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a,
  93         .word   0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26,
  94         .word   0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad,
  95         .word   0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c,
  96         .word   0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc,
  97         .word   0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412,
  98         .word   0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488,
  99         .word   0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499,
 100         .word   0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db,
 101         .word   0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438,
 102         .word   0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a,
 103         .word   0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa,
 104         .word   0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d,
 105         .word   0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 0x4ea03a72,
 106         .word   0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a,
 107         .word   0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9,
 108         .word   0x3e800000, 0x00000000, 0x3ff00000, 0x00000000,
 109         .word   0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9,
 110         .word   0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b,
 111         .word   0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc,
 112         .word   0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c,
 113         .word   0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957,
 114         .word   0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2,
 115         .word   0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc,
 116         .word   0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66,
 117         .word   0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350,
 118         .word   0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549,
 119         .word   0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d,
 120         .word   0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937,
 121         .word   0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86,
 122         .word   0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213,
 123         .word   0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358,
 124         .word   0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9,
 125         .word   0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c,
 126         .word   0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2,
 127         .word   0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b,
 128         .word   0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39,
 129         .word   0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118,
 130         .word   0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347,
 131         .word   0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11,
 132         .word   0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550,
 133         .word   0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e,
 134         .word   0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169,
 135         .word   0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394,
 136         .word   0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a,
 137         .word   0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c,
 138         .word   0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7,
 139         .word   0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899,
 140         .word   0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e,
 141         .word   0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee,
 142         .word   0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458,
 143         .word   0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588,
 144         .word   0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a,
 145         .word   0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54,
 146         .word   0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44,
 147         .word   0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31,
 148         .word   0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c,
 149         .word   0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96,
 150         .word   0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009,
 151         .word   0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3,
 152         .word   0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426,
 153         .word   0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6,
 154         .word   0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d,
 155         .word   0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2,
 156         .word   0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7,
 157         .word   0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d,
 158         .word   0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1,
 159         .word   0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5,
 160         .word   0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88,
 161         .word   0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72,
 162         .word   0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729,
 163         .word   0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea,
 164         .word   0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098,
 165         .word   0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746,
 166         .word   0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5,
 167         .word   0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f,
 168         .word   0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467,
 169         .word   0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1,
 170         .word   0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d,
 171         .word   0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6,
 172 
  173         .word   0x3fefffff, 0xfee7f18f  ! K0 =  9.99999997962321453275e-01 (TBL+2048, constant term)
  174         .word   0xbfdfffff, 0xfe07e52f  ! K1 = -4.99999998166077580600e-01 (TBL+2048+8)
  175         .word   0x3fd80118, 0x0ca296d9  ! K2 =  3.75066768969515586277e-01 (TBL+2048+16)
  176         .word   0xbfd400fc, 0x0bbb8e78  ! K3 = -3.12560092408808548438e-01 (TBL+2048+24)
  177         .word   0x7ffe0000, 0x7ffe0000  ! DC0 (TBL+2048+32): 0x7ffe0000 mask in both 32-bit halves, used by fand
  178         .word   0x3f800000, 0x40000000  ! FONE = 1.0f, FTWO = 2.0f (TBL+2048+40; one ldd into FONE fills both)
 179 
  180 #define stridex         %l4     /* x stride in bytes: %i2 << 2 */
  181 #define stridex2        %l1     /* 2 * stridex; %l1 is also used as scratch, so it is recomputed */
  182 #define stridey         %l3     /* y stride in bytes: %i4 << 2 */
  183 #define stridey2        %i2     /* stridey + stridey */
  184 #define TBL             %l2     /* address of .CONST_TBL, set by PIC_SET */
  185 #define counter         %i5     /* loop counter, loaded from tmp_counter at .begin */
  186 
  187 #define K3              %f38    /* polynomial coefficient, ldd from TBL+2048+24 */
  188 #define K2              %f36    /* polynomial coefficient, ldd from TBL+2048+16 */
  189 #define K1              %f34    /* polynomial coefficient, ldd from TBL+2048+8 */
  190 #define K0              %f32    /* polynomial constant term, ldd from TBL+2048 */
  191 #define DC0             %f4     /* 0x7ffe0000 mask pair, ldd from TBL+2048+32 */
  192 #define FONE            %f2     /* 1.0f: first word of the ldd from TBL+2048+40 */
  193 #define FTWO            %f3     /* 2.0f: second word of that same ldd */
  194 
  195 #define _0x00800000     %o2     /* threshold for the "ax < 0x00800000" special-case test */
  196 #define _0x7f800000     %o4     /* threshold for the "ax >= 0x7f800000" special-case test */
  197 
  198 #define tmp0            STACK_BIAS-0x30 /* %fp-relative slot: lexp0 word for stage (4_x) */
  199 #define tmp1            STACK_BIAS-0x28 /* %fp-relative slot: lexp0 word for stage (0_x) */
  200 #define tmp2            STACK_BIAS-0x20 /* %fp-relative slot: lexp0 word for stage (2_x) */
  201 #define tmp3            STACK_BIAS-0x18 /* NOTE(review): no use visible in this part of the file - confirm */
  202 #define tmp_counter     STACK_BIAS-0x10 /* saved %i0; zeroed once counter has been loaded */
  203 #define tmp_px          STACK_BIAS-0x08 /* saved %i1, reloaded into %l7 at .begin */
  204 
  205 ! sizeof temp storage - must be a multiple of 16 for V9
  206 #define tmps            0x30    /* six 8-byte slots above; subtracted from %sp by the save */
 207 
 208 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 209 !      !!!!!   algorithm   !!!!!
 210 !  ((float*)&ddx0)[0] = *px;
 211 !  ax0 = *(int*)px;
 212 !
 213 !  ((float*)&ddx0)[1] = *(px + stridex);
 214 !  ax1 = *(int*)(px + stridex);
 215 !
 216 !  px += stridex2;
 217 !
 218 !  if ( ax0 >= 0x7f800000 )
 219 !  {
 220 !    RETURN ( FONE / ((float*)&dres0)[0] );
 221 !  }
 222 !  if ( ax0 < 0x00800000 )
 223 !  {
 224 !    float res = ((float*)&dres0)[0];
 225 !
 226 !    if ( (ax0 & 0x7fffffff) == 0 )  /* |X| = zero  */
 227 !    {
 228 !      RETURN ( FONE / res )
 229 !    }
 230 !    else if ( ax0 >= 0 )  /* X = denormal  */
 231 !    {
 232 !      double    res0, xx0, tbl_div0, tbl_sqrt0;
 233 !      float    fres0;
 234 !      int    iax0, si0, iexp0;
 235 !
 236 !      res = *(int*)&res;
 237 !      res *= FTWO;
 238 !      ax0 = *(int*)&res;
 239 !      iexp0 = ax0 >> 24;
 240 !      iexp0 = 0x3f + 0x4b - iexp0;
 241 !      iexp0 = iexp0 << 23;
 242 !
 243 !      si0 = (ax0 >> 13) & 0x7f0;
 244 !
 245 !      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 246 !      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 247 !      iax0 = ax0 & 0x7ffe0000;
 248 !      iax0 = ax0 - iax0;
 249 !      xx0 = iax0 * tbl_div0;
 250 !      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 251 !
 252 !      fres0 = res0;
 253 !      iexp0 += *(int*)&fres0;
 254 !      RETURN(*(float*)&iexp0)
 255 !    }
 256 !    else  /* X = negative  */
 257 !    {
 258 !      RETURN ( sqrtf(res) )
 259 !    }
 260 !  }
 261 !  if ( ax1 >= 0x7f800000 )
 262 !  {
 263 !    RETURN ( FONE / ((float*)&dres0)[1] )
 264 !  }
 265 !  if ( ax1 < 0x00800000 )
 266 !  {
 267 !    float res = ((float*)&dres0)[1];
  268 !    if ( (ax1 & 0x7fffffff) == 0 )  /* |X| = zero  */
  269 !    {
  270 !      RETURN ( FONE / res )
  271 !    }
  272 !    else if ( ax1 >= 0 )  /* X = denormal  */
 273 !    {
 274 !      double    res0, xx0, tbl_div0, tbl_sqrt0;
 275 !      float    fres0;
 276 !      int    iax1, si0, iexp0;
 277 !
 278 !      res = *(int*)&res;
 279 !      res *= FTWO;
 280 !      ax1 = *(int*)&res;
 281 !      iexp0 = ax1 >> 24;
 282 !      iexp0 = 0x3f + 0x4b - iexp0;
 283 !      iexp0 = iexp0 << 23;
 284 !
 285 !      si0 = (ax1 >> 13) & 0x7f0;
 286 !
 287 !      tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0];
 288 !      tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1];
 289 !      iax1 = ax1 & 0x7ffe0000;
 290 !      iax1 = ax1 - iax1;
 291 !      xx0 = iax1 * tbl_div0;
 292 !      res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0);
 293 !
 294 !      fres0 = res0;
 295 !      iexp0 += *(int*)&fres0;
 296 !      RETURN(*(float*)&iexp0)
 297 !    }
 298 !    else  /* X = negative  */
 299 !    {
 300 !      RETURN ( sqrtf(res) )
 301 !    }
 302 !  }
 303 !
 304 !  iexp0 = ax0 >> 24;
 305 !  iexp1 = ax1 >> 24;
 306 !  iexp0 = 0x3f - iexp0;
 307 !  iexp1 = 0x3f - iexp1;
 308 !  iexp1 &= 0x1ff;
 309 !  lexp0 = iexp0 << 55;
 310 !  lexp1 = iexp1 << 23;
 311 !
 312 !  lexp0 |= lexp1;
 313 !
 314 !  fdx0 = *((double*)&lexp0);
 315 !
 316 !  si0 = ax0 >> 13;
 317 !  si1 = ax1 >> 13;
 318 !  si0 &= 0x7f0;
 319 !  si1 &= 0x7f0;
 320 !
 321 !  addr0 = (char*)TBL + si0;
 322 !  addr1 = (char*)TBL + si1;
 323 !  tbl_div0 = ((double*)((char*)TBL + si0))[0];
 324 !  tbl_div1 = ((double*)((char*)TBL + si1))[0];
 325 !  tbl_sqrt0 = ((double*)addr0)[1];
 326 !  tbl_sqrt1 = ((double*)addr1)[1];
 327 !  dfx0 = vis_fand(ddx0,DC0);
 328 !  dfx0 = vis_fpsub32(ddx0,dfx0);
 329 !  dtmp0 = (double)(((int*)&dfx0)[0]);
 330 !  dtmp1 = (double)(((int*)&dfx0)[1]);
 331 !  xx0 = dtmp0 * tbl_div0;
 332 !  xx1 = dtmp1 * tbl_div1;
 333 !  res0 = K3 * xx0;
 334 !  res1 = K3 * xx1;
 335 !  res0 += K2;
 336 !  res1 += K2;
 337 !  res0 *= xx0;
 338 !  res1 *= xx1;
 339 !  res0 += K1;
 340 !  res1 += K1;
 341 !  res0 *= xx0;
 342 !  res1 *= xx1;
 343 !  res0 += K0;
 344 !  res1 += K0;
 345 !  res0 = tbl_sqrt0 * res0;
 346 !  res1 = tbl_sqrt1 * res1;
 347 !  ((float*)&dres0)[0] = (float)res0;
 348 !  ((float*)&dres0)[1] = (float)res1;
 349 !  dres0 = vis_fpadd32(dres0,fdx0);
 350 !  *py = ((float*)&dres0)[0];
 351 !  *(py + stridey) = ((float*)&dres0)[1];
 352 !  py += stridey2;
 353 !
 354 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 355 
 356         ENTRY(__vrsqrtf)
 357         save    %sp,-SA(MINFRAME)-tmps,%sp
 358         PIC_SETUP(l7)
 359         PIC_SET(l7,.CONST_TBL,l2)
 360 
 361         st      %i0,[%fp+tmp_counter]
 362         stx     %i1,[%fp+tmp_px]
 363 
 364         ldd     [TBL+2048],K0
 365         sll     %i2,2,stridex
 366 
 367         ldd     [TBL+2048+8],K1
 368         sll     %i4,2,stridey
 369         mov     %i3,%i2
 370 
 371         ldd     [TBL+2048+16],K2
 372         sethi   %hi(0x7f800000),_0x7f800000
 373         sll     stridex,1,stridex2
 374 
 375         ldd     [TBL+2048+24],K3
 376         sethi   %hi(0x00800000),_0x00800000
 377 
 378         ldd     [TBL+2048+32],DC0
 379         add     %g0,0x3f,%l0
 380 
 381         ldd     [TBL+2048+40],FONE
 382 !       ld      [TBL+2048+44],FTWO
 383 .begin:
 384         ld      [%fp+tmp_counter],counter
 385         ldx     [%fp+tmp_px],%l7
 386         st      %g0,[%fp+tmp_counter]
 387 .begin1:
 388         cmp     counter,0
 389         ble,pn  %icc,.exit
 390 
 391         lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;
 392 
 393         lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
 394         sethi   %hi(0x7ffffc00),%o0
 395 
 396         lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
 397         add     %l7,stridex2,%i1        ! px += stridex2
 398         add     %o0,0x3ff,%o0
 399 
 400         lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
 401         fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
 402 
 403         sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
 404         add     %i1,stridex2,%o5        ! px += stridex2
 405 
 406         cmp     %g1,_0x7f800000         ! (4_1) ax0 ? 0x7f800000
 407         bge,pn  %icc,.spec0             ! (4_1) if ( ax0 >= 0x7f800000 )
 408         nop
 409 
 410         cmp     %g1,_0x00800000         ! (4_1) ax0 ? 0x00800000
 411         bl,pn   %icc,.spec1             ! (4_1) if ( ax0 < 0x00800000 )
 412         sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
 413 .cont_spec:
 414         and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
 415 
 416         ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 417         sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
 418         and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
 419         fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 420 
 421         ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 422         sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
 423         sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;
 424 
 425         and     %l7,511,%l1             ! (5_0) iexp1 = 0x1ff;
 426         add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
 427 
 428         sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
 429         sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
 430         fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
 431 
 432         sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
 433         fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
 434 
 435         or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;
 436 
 437         stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);
 438 
 439         fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
 440 
 441         lda     [%i1]0x82,%f18          ! (0_0) ((float*)&ddx0)[0] = *px;
 442         fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;
 443 
 444         lda     [stridex+%i1]0x82,%f19  ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
 445 
 446         lda     [%i1]0x82,%g1           ! (0_0) ax0 = *(int*)px;
 447 
 448         lda     [stridex+%i1]0x82,%i4   ! (1_0) ax1 = *(int*)(px + stridex);
 449         cmp     %g5,_0x7f800000         ! (5_1) ax1 ? 0x7f800000
 450         bge,pn  %icc,.update0           ! (5_1) if ( ax1 >= 0x7f800000 )
 451         fmuld   K3,%f40,%f52            ! (4_1) res0 = K3 * xx0;
 452 .cont0:
 453         fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
 454         cmp     %g5,_0x00800000         ! (5_1) ax1 ? 0x00800000
 455         bl,pn   %icc,.update1           ! (5_1) if ( ax1 < 0x00800000 )
 456         fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
 457 .cont1:
 458         sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
 459         cmp     %g1,_0x7f800000         ! (0_0) ax0 ? 0x7f800000
 460 
 461         sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
 462         and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;
 463 
 464         ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 465         sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
 466         and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
 467         fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 468 
 469         ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 470         sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
 471         sub     %l0,%i1,%i1             ! (1_0) iexp1 = 0x3f - iexp1;
 472         faddd   %f52,K2,%f62            ! (4_1) res0 += K2;
 473 
 474         sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
 475         bge,pn  %icc,.update2           ! (0_0) if ( ax0 >= 0x7f800000 )
 476         faddd   %f50,K2,%f60            ! (5_1) res1 += K2;
 477 .cont2:
 478         cmp     %g1,_0x00800000         ! (0_0) ax0 ? 0x00800000
 479         and     %i1,511,%i0             ! (1_0) iexp1 = 0x1ff;
 480         fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
 481 
 482         sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
 483         bl,pn   %icc,.update3           ! (0_0) if ( ax0 < 0x00800000 )
 484         fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
 485 .cont3:
 486         fmuld   %f62,%f40,%f30          ! (4_1) res0 *= xx0;
 487         sllx    %g5,55,%g5              ! (0_0) lexp0 = iexp0 << 55;
 488 
 489         fmuld   %f60,%f46,%f48          ! (5_1) res1 *= xx1;
 490         or      %g5,%i0,%g5             ! (0_0) lexp0 |= lexp1;
 491         stx     %g5,[%fp+tmp1]          ! (0_0) fdx0 = *((double*)lexp0);
 492 
 493         fmuld   %f56,%f54,%f26          ! (0_0) xx0 = dtmp0 * tbl_div0;
 494         sll     stridex,1,stridex2      ! stridex2 = stridex * 2;
 495 
 496         lda     [%o5]0x82,%f24          ! (2_0) ((float*)&ddx0)[0] = *px;
 497         add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
 498         fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;
 499 
 500         lda     [stridex+%o5]0x82,%f25  ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
 501         add     %l5,TBL,%l5             ! (4_1) addr0 = (char*)TBL + si0;
 502         faddd   %f30,K1,%f62            ! (4_1) res0 += K1;
 503 
 504         lda     [%o5]0x82,%g1           ! (2_0) ax0 = *(int*)px;
 505         add     %o5,stridex2,%l7        ! px += stridex2
 506         faddd   %f48,K1,%f42            ! (5_1) res1 += K1;
 507 
 508         lda     [stridex+%o5]0x82,%o5   ! (3_0) ax1 = *(int*)(px + stridex);
 509         cmp     %i4,_0x7f800000         ! (1_0) ax1 ? 0x7f800000
 510         bge,pn  %icc,.update4           ! (1_0) if ( ax1 >= 0x7f800000 )
 511         fmuld   K3,%f26,%f52            ! (0_0) res0 = K3 * xx0;
 512 .cont4:
 513         fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
 514         cmp     %i4,_0x00800000         ! (1_0) ax1 ? 0x00800000
 515         bl,pn   %icc,.update5           ! (1_0) if ( ax1 < 0x00800000 )
 516         fand    %f24,DC0,%f54           ! (2_0) dfx0 = vis_fand(ddx0,DC0);
 517 .cont5:
 518         fmuld   %f62,%f40,%f48          ! (4_1) res0 *= xx0;
 519         sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
 520         cmp     %g1,_0x7f800000         ! (2_0) ax0 ? 0x7f800000
 521 
 522         fmuld   %f42,%f46,%f58          ! (5_1) res1 *= xx1;
 523         sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
 524         and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;
 525 
 526         ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 527         sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
 528         and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
 529         fpsub32 %f24,%f54,%f12          ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 530 
 531         ldd     [%o1+TBL],%f46          ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 532         sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
 533         sub     %l0,%o3,%o3             ! (3_0) iexp1 = 0x3f - iexp1;
 534         faddd   %f52,K2,%f40            ! (0_0) res0 += K2;
 535 
 536         ldd     [%l5+8],%f42            ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
 537         sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
 538         and     %o3,511,%i3             ! (3_0) iexp1 &= 0x1ff;
 539         faddd   %f50,K2,%f60            ! (1_0) res0 += K2;
 540 
 541         ldd     [%l6+8],%f28            ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
 542         sllx    %g5,55,%g5              ! (2_0) lexp0 = iexp0 << 55;
 543         add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
 544         fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
 545 
 546         sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
 547         fitod   %f13,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
 548 
 549         fmuld   %f40,%f26,%f40          ! (0_0) res0 *= xx0;
 550         or      %g5,%i3,%g5             ! (2_0) lexp0 |= lexp1;
 551         faddd   %f48,K0,%f62            ! (4_1) res0 += K0;
 552 
 553         fmuld   %f60,%f44,%f48          ! (1_0) res1 *= xx1;
 554         add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
 555         stx     %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);
 556         faddd   %f58,K0,%f60            ! (5_1) res1 += K0;
 557 
 558         fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
 559         bge,pn  %icc,.update6           ! (2_0) if ( ax0 >= 0x7f800000 )
 560         lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;
 561 .cont6:
 562         cmp     %g1,_0x00800000         ! (2_0) ax0 ? 0x00800000
 563         bl,pn   %icc,.update7           ! (2_0) if ( ax0 < 0x00800000 )
 564         nop
 565 .cont7:
 566         fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
 567 
 568         lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
 569         cmp     %o5,_0x7f800000         ! (3_0) ax1 ? 0x7f800000
 570         fmuld   %f42,%f62,%f58          ! (4_1) res0 = tbl_sqrt0 * res0;
 571         faddd   %f40,K1,%f46            ! (0_0) res0 += K1;
 572 
 573         lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
 574         add     %l7,stridex2,%i1        ! px += stridex2
 575         fmuld   %f28,%f60,%f56          ! (5_1) res1 = tbl_sqrt1 * res1;
 576         faddd   %f48,K1,%f62            ! (1_0) res1 += K1;
 577 
 578         lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
 579         add     %o0,TBL,%o0             ! (0_0) addr0 = (char*)TBL + si0;
 580         bge,pn  %icc,.update8           ! (3_0) if ( ax1 >= 0x7f800000 )
 581         fmuld   K3,%f30,%f52            ! (2_0) res0 = K3 * xx0;
 582 .cont8:
 583         fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
 584         cmp     %o5,_0x00800000         ! (3_0) ax1 ? 0x00800000
 585         bl,pn   %icc,.update9           ! (3_0) if ( ax1 < 0x00800000 )
 586         fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
 587 .cont9:
 588         fmuld   %f46,%f26,%f48          ! (0_0) res0 *= xx0;
 589         sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
 590         add     %i1,stridex2,%o5        ! px += stridex2
 591         fdtos   %f58,%f6                ! (4_1) ((float*)&dres0)[0] = (float)res0;
 592 
 593         fmuld   %f62,%f44,%f40          ! (1_0) res1 *= xx1;
 594         sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
 595         and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
 596         fdtos   %f56,%f7                ! (5_1) ((float*)&dres0)[1] = (float)res1;
 597 
 598         ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 599         sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
 600         and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
 601         fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 602 
 603         ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 604         sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
 605         sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;
 606         faddd   %f52,K2,%f58            ! (2_0) res0 += K2;
 607 
 608         ldd     [%o0+8],%f42            ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
 609         and     %l7,511,%l1             ! (5_0) iexp1 = 0x1ff;
 610         add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
 611         faddd   %f50,K2,%f60            ! (3_0) res1 += K2;
 612 
 613         ldd     [%o7+8],%f28            ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
 614         sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
 615         sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
 616         fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
 617 
 618         ldd     [%fp+tmp0],%f52         ! (4_1) fdx0 = *((double*)lexp0);
 619         sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
 620         fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
 621 
 622         fmuld   %f58,%f30,%f62          ! (2_0) res0 *= xx0;
 623         or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;
 624         faddd   %f48,K0,%f22            ! (0_0) res0 += K0;
 625 
 626         fmuld   %f60,%f24,%f58          ! (3_0) res1 *= xx1;
 627         stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);
 628         faddd   %f40,K0,%f26            ! (1_0) res1 += K0;
 629 
 630         fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
 631         fpadd32 %f6,%f52,%f10           ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
 632 
 633         or      %g0,%i2,%l7
 634         add     stridey,stridey,stridey2
 635 
 636         cmp     counter,6
 637         bl,pn   %icc,.tail
 638         nop
 639 
 640         ba      .main_loop
 641         sub     counter,6,counter       ! counter
 642 
 643         .align  16
 644 .main_loop:
     ! Steady-state software-pipelined loop.  Each pass loads six new inputs
     ! (tags 0_0 .. 5_0), stores six results started earlier and subtracts 6
     ! from counter; it repeats while counter stays non-negative (bpos).
     ! Every input is read twice through ASI 0x82 (ASI_PRIMARY_NOFAULT in the
     ! SPARC V9 manual): once into an FP register for the arithmetic and once
     ! into an integer register (ax) for range checks and table indexing.
     ! Range checks: ax >= _0x7f800000 and ax < _0x00800000 branch to the
     ! matching .updateNN handler, which rejoins at .contNN.
     ! Arithmetic visible per element:
     !   xx  = dtmp * tbl_div
     !   res = tbl_sqrt * (((K3*xx + K2)*xx + K1)*xx + K0)
     ! then fdtos, and fpadd32 with the exponent word kept in tmp0/tmp1/tmp2.
     ! The instruction after each branch is its delay slot; none of the
     ! branches in this loop is annulled, so the slot always executes.
 645         lda     [%i1]0x82,%f18          ! (0_0) ((float*)&ddx0)[0] = *px;
 646         cmp     %g1,_0x7f800000         ! (4_1) ax0 ? 0x7f800000
 647         bge,pn  %icc,.update10          ! (4_1) if ( ax0 >= 0x7f800000 )
 648         fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;
 649 .cont10:
 650         lda     [stridex+%i1]0x82,%f19  ! (1_0) ((float*)&ddx0)[1] = *(px + stridex);
 651         cmp     %g1,_0x00800000         ! (4_1) ax0 ? 0x00800000
 652         fmuld   %f42,%f22,%f44          ! (0_1) res0 = tbl_sqrt0 * res0;
 653         faddd   %f62,K1,%f42            ! (2_1) res0 += K1;
 654 
 655         lda     [%i1]0x82,%g1           ! (0_0) ax0 = *(int*)px;
 656         fmuld   %f28,%f26,%f60          ! (1_1) res1 = tbl_sqrt1 * res1;
 657         bl,pn   %icc,.update11          ! (4_1) if ( ax0 < 0x00800000 )
 658         faddd   %f58,K1,%f62            ! (3_1) res1 += K1;
 659 .cont11:
 660         lda     [stridex+%i1]0x82,%i4   ! (1_0) ax1 = *(int*)(px + stridex);
 661         cmp     %g5,_0x7f800000         ! (5_1) ax1 ? 0x7f800000
 662         bge,pn  %icc,.update12          ! (5_1) if ( ax1 >= 0x7f800000 )
 663         fmuld   K3,%f40,%f52            ! (4_1) res0 = K3 * xx0;
 664 .cont12:
 665         fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
 666         cmp     %g5,_0x00800000         ! (5_1) ax1 ? 0x00800000
 667         bl,pn   %icc,.update13          ! (5_1) if ( ax1 < 0x00800000 )
 668         fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
 669 .cont13:
 670         fmuld   %f42,%f30,%f48          ! (2_1) res0 *= xx0;
 671         sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
 672         cmp     %g1,_0x7f800000         ! (0_0) ax0 ? 0x7f800000
 673         fdtos   %f44,%f8                ! (0_1) ((float*)&dres0)[0] = (float)res0;
 674 
 675         fmuld   %f62,%f24,%f58          ! (3_1) res1 *= xx1;
 676         sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
 677         and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;
 678         fdtos   %f60,%f9                ! (1_1) ((float*)&dres0)[1] = (float)res1;
 679 
 680         ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 681         sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
 682         and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
 683         fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 684 
 685         ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 686         sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
 687         sub     %l0,%i1,%i1             ! (1_0) iexp1 = 0x3f - iexp1;
 688         faddd   %f52,K2,%f62            ! (4_1) res0 += K2;
 689 
 690         ldd     [%i0+8],%f42            ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
 691         sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
 692         bge,pn  %icc,.update14          ! (0_0) if ( ax0 >= 0x7f800000 )
 693         faddd   %f50,K2,%f60            ! (5_1) res1 += K2;
 694 .cont14:
 695         ldd     [%o1+8],%f28            ! (3_1) tbl_sqrt1 = ((double*)addr1)[1];
 696         cmp     %g1,_0x00800000         ! (0_0) ax0 ? 0x00800000
 697         and     %i1,511,%i0             ! (1_0) iexp1 &= 0x1ff;
 698         fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
 699 
 700         ldd     [%fp+tmp1],%f52         ! (0_1) fdx0 = *((double*)lexp0);
 701         sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
 702         bl,pn   %icc,.update15          ! (0_0) if ( ax0 < 0x00800000 )
 703         fitod   %f31,%f50               ! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
 704 .cont15:
 705         fmuld   %f62,%f40,%f30          ! (4_1) res0 *= xx0;
 706         sllx    %g5,55,%g5              ! (0_0) lexp0 = iexp0 << 55;
 707         st      %f10,[%l7]              ! (4_2) *py = ((float*)&dres0)[0];
 708         faddd   %f48,K0,%f62            ! (2_1) res0 += K0;
 709 
 710         fmuld   %f60,%f46,%f48          ! (5_1) res1 *= xx1;
 711         or      %g5,%i0,%g5             ! (0_0) lexp0 |= lexp1;
 712         stx     %g5,[%fp+tmp1]          ! (0_0) fdx0 = *((double*)lexp0);
 713         faddd   %f58,K0,%f60            ! (3_1) res1 += K0;
 714 
 715         fmuld   %f56,%f54,%f26          ! (0_0) xx0 = dtmp0 * tbl_div0;
 716         sll     stridex,1,stridex2      ! stridex2 = stridex * 2;
 717         st      %f11,[stridey+%l7]      ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
 718         fpadd32 %f8,%f52,%f10           ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
 719 
 720         lda     [%o5]0x82,%f24          ! (2_0) ((float*)&ddx0)[0] = *px;
 721         add     %l7,stridey2,%i1        ! py += stridey2
 722         add     %o7,TBL,%o7             ! (1_0) addr1 = (char*)TBL + si1;
 723         fmuld   %f50,%f44,%f44          ! (1_0) xx1 = dtmp1 * tbl_div1;
 724 
 725         lda     [stridex+%o5]0x82,%f25  ! (3_0) ((float*)&ddx0)[1] = *(px + stridex);
 726         add     %l5,TBL,%l5             ! (4_1) addr0 = (char*)TBL + si0;
 727         fmuld   %f42,%f62,%f58          ! (2_1) res0 = tbl_sqrt0 * res0;
 728         faddd   %f30,K1,%f62            ! (4_1) res0 += K1;
 729 
 730         lda     [%o5]0x82,%g1           ! (2_0) ax0 = *(int*)px;
 731         add     %o5,stridex2,%l7        ! px += stridex2
 732         fmuld   %f28,%f60,%f56          ! (3_1) res1 = tbl_sqrt1 * res1;
 733         faddd   %f48,K1,%f42            ! (5_1) res1 += K1;
 734 
 735         lda     [stridex+%o5]0x82,%o5   ! (3_0) ax1 = *(int*)(px + stridex);
 736         cmp     %i4,_0x7f800000         ! (1_0) ax1 ? 0x7f800000
 737         bge,pn  %icc,.update16          ! (1_0) if ( ax1 >= 0x7f800000 )
 738         fmuld   K3,%f26,%f52            ! (0_0) res0 = K3 * xx0;
 739 .cont16:
 740         fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
 741         cmp     %i4,_0x00800000         ! (1_0) ax1 ? 0x00800000
 742         bl,pn   %icc,.update17          ! (1_0) if ( ax1 < 0x00800000 )
 743         fand    %f24,DC0,%f54           ! (2_0) dfx0 = vis_fand(ddx0,DC0);
 744 .cont17:
 745         fmuld   %f62,%f40,%f48          ! (4_1) res0 *= xx0;
 746         sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
 747         cmp     %g1,_0x7f800000         ! (2_0) ax0 ? 0x7f800000
 748         fdtos   %f58,%f20               ! (2_1) ((float*)&dres0)[0] = (float)res0;
 749 
 750         fmuld   %f42,%f46,%f58          ! (5_1) res1 *= xx1;
 751         sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
 752         and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;
 753         fdtos   %f56,%f21               ! (3_1) ((float*)&dres0)[1] = (float)res1;
 754 
 755         ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 756         sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
 757         and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
 758         fpsub32 %f24,%f54,%f12          ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 759 
 760         ldd     [%o1+TBL],%f46          ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 761         sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
 762         sub     %l0,%o3,%o3             ! (3_0) iexp1 = 0x3f - iexp1;
 763         faddd   %f52,K2,%f40            ! (0_0) res0 += K2;
 764 
 765         ldd     [%l5+8],%f42            ! (4_1) tbl_sqrt0 = ((double*)addr0)[1];
 766         sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
 767         and     %o3,511,%i3             ! (3_0) iexp1 &= 0x1ff;
 768         faddd   %f50,K2,%f60            ! (1_0) res1 += K2;
 769 
 770         ldd     [%l6+8],%f28            ! (5_1) tbl_sqrt1 = ((double*)addr1)[1];
 771         sllx    %g5,55,%g5              ! (2_0) lexp0 = iexp0 << 55;
 772         add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
 773         fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
 774 
 775         ldd     [%fp+tmp2],%f52         ! (2_1) fdx0 = *((double*)lexp0);
 776         sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
 777         add     %i1,stridey2,%o3        ! py += stridey2
 778         fitod   %f13,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
 779 
 780         fmuld   %f40,%f26,%f40          ! (0_0) res0 *= xx0;
 781         or      %g5,%i3,%g5             ! (2_0) lexp0 |= lexp1;
 782         st      %f10,[%i1]              ! (0_1) *py = ((float*)&dres0)[0];
 783         faddd   %f48,K0,%f62            ! (4_1) res0 += K0;
 784 
 785         fmuld   %f60,%f44,%f48          ! (1_0) res1 *= xx1;
 786         add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
 787         stx     %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);
 788         faddd   %f58,K0,%f60            ! (5_1) res1 += K0;
 789 
 790         fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
     ! NOTE(review): in the next two branch groups the instruction that
     ! follows the delay slot (fpadd32, then fmuld) is skipped when the
     ! branch is taken, because the handler rejoins at .cont18/.cont19.
     ! The handlers are outside this view; presumably they repeat it.
 791         bge,pn  %icc,.update18          ! (2_0) if ( ax0 >= 0x7f800000 )
 792         st      %f11,[stridey+%i1]      ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
 793         fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
 794 .cont18:
 795         cmp     %g1,_0x00800000         ! (2_0) ax0 ? 0x00800000
 796         bl,pn   %icc,.update19          ! (2_0) if ( ax0 < 0x00800000 )
 797         lda     [%l7]0x82,%f14          ! (4_0) ((float*)&ddx0)[0] = *px;
 798         fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
 799 .cont19:
 800         lda     [stridex+%l7]0x82,%f15  ! (5_0) ((float*)&ddx0)[1] = *(px + stridex);
 801         cmp     %o5,_0x7f800000         ! (3_0) ax1 ? 0x7f800000
 802         fmuld   %f42,%f62,%f58          ! (4_1) res0 = tbl_sqrt0 * res0;
 803         faddd   %f40,K1,%f46            ! (0_0) res0 += K1;
 804 
 805         lda     [%l7]0x82,%g1           ! (4_0) ax0 = *(int*)px;
 806         add     %l7,stridex2,%i1        ! px += stridex2
 807         fmuld   %f28,%f60,%f56          ! (5_1) res1 = tbl_sqrt1 * res1;
 808         faddd   %f48,K1,%f62            ! (1_0) res1 += K1;
 809 
 810         lda     [stridex+%l7]0x82,%g5   ! (5_0) ax1 = *(int*)(px + stridex);
 811         add     %o0,TBL,%o0             ! (0_0) addr0 = (char*)TBL + si0;
 812         bge,pn  %icc,.update20          ! (3_0) if ( ax1 >= 0x7f800000 )
 813         fmuld   K3,%f30,%f52            ! (2_0) res0 = K3 * xx0;
 814 .cont20:
 815         fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
 816         cmp     %o5,_0x00800000         ! (3_0) ax1 ? 0x00800000
 817         bl,pn   %icc,.update21          ! (3_0) if ( ax1 < 0x00800000 )
 818         fand    %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
 819 .cont21:
 820         fmuld   %f46,%f26,%f48          ! (0_0) res0 *= xx0;
 821         sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
 822         add     %i1,stridex2,%o5        ! px += stridex2
 823         fdtos   %f58,%f6                ! (4_1) ((float*)&dres0)[0] = (float)res0;
 824 
 825         fmuld   %f62,%f44,%f40          ! (1_0) res1 *= xx1;
 826         sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
 827         and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
 828         fdtos   %f56,%f7                ! (5_1) ((float*)&dres0)[1] = (float)res1;
 829 
 830         ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
 831         sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
 832         and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
 833         fpsub32 %f14,%f16,%f16          ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
 834 
 835         ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
 836         sra     %g1,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
 837         sub     %l0,%l7,%l7             ! (5_0) iexp1 = 0x3f - iexp1;
 838         faddd   %f52,K2,%f58            ! (2_0) res0 += K2;
 839 
 840         ldd     [%o0+8],%f42            ! (0_0) tbl_sqrt0 = ((double*)addr0)[1];
 841         and     %l7,511,%l1             ! (5_0) iexp1 &= 0x1ff;
 842         add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
 843         faddd   %f50,K2,%f60            ! (3_0) res1 += K2;
 844 
 845         ldd     [%o7+8],%f28            ! (1_0) tbl_sqrt1 = ((double*)addr1)[1];
 846         sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
 847         sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
 848         fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
 849 
 850         ldd     [%fp+tmp0],%f52         ! (4_1) fdx0 = *((double*)lexp0);
 851         sllx    %o0,55,%o0              ! (4_0) lexp0 = iexp0 << 55;
 852         add     %o3,stridey2,%l7        ! py += stridey2
 853         fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
 854 
 855         fmuld   %f58,%f30,%f62          ! (2_0) res0 *= xx0;
 856         or      %o0,%l1,%o0             ! (4_0) lexp0 |= lexp1;
 857         st      %f0,[%o3]               ! (2_1) *py = ((float*)&dres0)[0];
 858         faddd   %f48,K0,%f22            ! (0_0) res0 += K0;
 859 
 860         fmuld   %f60,%f24,%f58          ! (3_0) res1 *= xx1;
 861         subcc   counter,6,counter       ! counter -= 6;
 862         stx     %o0,[%fp+tmp0]          ! (4_0) fdx0 = *((double*)lexp0);
 863         faddd   %f40,K0,%f26            ! (1_0) res1 += K0;
 864 
 865         fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
 866         st      %f1,[stridey+%o3]       ! (3_1) *(py + stridey) = ((float*)&dres0)[1];
 867         bpos,pt %icc,.main_loop         ! loop while counter >= 0 after the subcc above
 868         fpadd32 %f6,%f52,%f10           ! (4_1) dres0 = vis_fpadd32(dres0,fdx0);
 869 
 870         add     counter,6,counter       ! loop exit: undo the last subtraction of 6
 871 .tail:
     ! Pipeline drain.  counter holds the number of results still owed
     ! (it is below 6 on every visible path into this label).  Before each
     ! store the code does subcc counter,1 and leaves through an annulled
     ! bneg: the delay slot, which sets %i2 to the next output address,
     ! runs only when the branch to .begin is taken.  At most five results
     ! are written: [%l7], [%l7+stridey], [%i1], [%i1+stridey], [%o3].
     ! NOTE(review): .begin is outside this view; presumably it takes %i2
     ! as the current py.
 872         sll     stridex,1,stridex2      ! stridex2 = stridex * 2
 873         subcc   counter,1,counter
 874         bneg,a  .begin                  ! nothing owed
 875         mov     %l7,%i2                 ! (taken only) py = %l7
 876 
 877         fmuld   %f42,%f22,%f44          ! (0_1) res0 = tbl_sqrt0 * res0;
 878         faddd   %f62,K1,%f42            ! (2_1) res0 += K1;
 879 
 880         fmuld   %f28,%f26,%f60          ! (1_1) res1 = tbl_sqrt1 * res1;
 881 
 882         fmuld   %f42,%f30,%f48          ! (2_1) res0 *= xx0;
 883         fdtos   %f44,%f8                ! (0_1) ((float*)&dres0)[0] = (float)res0;
 884 
 885         fdtos   %f60,%f9                ! (1_1) ((float*)&dres0)[1] = (float)res1;
 886 
 887         ldd     [%i0+8],%f42            ! (2_1) tbl_sqrt0 = ((double*)addr0)[1];
 888 
 889         ldd     [%fp+tmp1],%f52         ! (0_1) fdx0 = *((double*)lexp0);
 890 
 891         st      %f10,[%l7]              ! (4_2) *py = ((float*)&dres0)[0];
 892         subcc   counter,1,counter
 893         bneg,a  .begin                  ! one result was owed
 894         add     %l7,stridey,%i2         ! (taken only) py = %l7 + stridey
 895 
 896         faddd   %f48,K0,%f62            ! (2_1) res0 += K0;
 897         st      %f11,[stridey+%l7]      ! (5_2) *(py + stridey) = ((float*)&dres0)[1];
 898         subcc   counter,1,counter
 899         bneg,a  .begin                  ! two results were owed
 900         add     %l7,stridey2,%i2        ! (taken only) py = %l7 + stridey2
 901         fpadd32 %f8,%f52,%f10           ! (0_1) dres0 = vis_fpadd32(dres0,fdx0);
 902 
 903         add     %l7,stridey2,%i1        ! py += stridey2
 904 
 905         fmuld   %f42,%f62,%f58          ! (2_1) res0 = tbl_sqrt0 * res0;
 906 
 907         fdtos   %f58,%f20               ! (2_1) ((float*)&dres0)[0] = (float)res0;
 908 
 909         ldd     [%fp+tmp2],%f52         ! (2_1) fdx0 = *((double*)lexp0);
 910         add     %i1,stridey2,%o3        ! py += stridey2
 911 
 912         st      %f10,[%i1]              ! (0_1) *py = ((float*)&dres0)[0];
 913         subcc   counter,1,counter
 914         bneg,a  .begin                  ! three results were owed
 915         add     %i1,stridey,%i2         ! (taken only) py = %i1 + stridey
 916 
 917         st      %f11,[stridey+%i1]      ! (1_1) *(py + stridey) = ((float*)&dres0)[1];
 918         subcc   counter,1,counter
 919         bneg,a  .begin                  ! four results were owed
 920         mov     %o3,%i2                 ! (taken only) py = %o3
 921         fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
 922 
 923         st      %f0,[%o3]               ! (2_1) *py = ((float*)&dres0)[0];
 924         ba      .begin                  ! five results written
 925         add     %o3,stridey,%i2         ! delay slot: py = %o3 + stridey
 926 
 927         .align  16
 928 .spec0:
     ! Single-element special case: stores FONE / x0 at py (%i2), then
     ! advances px (%l7) by stridex and py by stridey, decrements counter
     ! and resumes at .begin1.
     ! NOTE(review): the branch that reaches this label is outside this
     ! view; presumably it is taken for Inf/NaN inputs - confirm.
 929         fdivs   FONE,%f14,%f14          ! x0 = FONE / x0;
 930         add     %l7,stridex,%l7         ! px += stridex
 931         st      %f14,[%i2]              ! *py = x0;
 932         sub     counter,1,counter       ! one element consumed
 933         ba      .begin1
 934         add     %i2,stridey,%i2         ! py += stridey
 935 
 936         .align  16
 937 .spec1:
     ! Single-element special case working on ax0 in %g1 and x0 in %f14.
     ! Three outcomes:
     !   (ax0 & %o0) == 0 : result is FONE / x0
     !   ax0 < 0          : result is sqrtf(x0)
     !   otherwise        : x0 is rescaled and control goes to .cont_spec
     ! The first two store the result and resume at .begin1 via label 1.
     ! Both bz,a and bl,a are annulled, so their delay slots (fdivs and
     ! fsqrts) execute only when the branch is taken.
     ! NOTE(review): %o0 is set before entry, outside this view;
     ! presumably the 0x7fffffff mask built by the .updateNN handlers.
 938         andcc   %g1,%o0,%g0             ! test ax0 against the mask in %o0
 939         bz,a    1f
 940         fdivs   FONE,%f14,%f14          ! x0 = FONE / x0;
 941 
 942         cmp     %g1,0
 943         bl,a    1f
 944         fsqrts  %f14,%f14               ! x0 = sqrtf(x0);
 945 
 946         fitod   %f14,%f0                ! raw bits of x0, read as an integer, to double
 947         fdtos   %f0,%f14                ! ... and back to single
 948         fmuls   %f14,FTWO,%f14          ! scale by FTWO
 949         st      %f14,[%fp+tmp3]         ! bounce through tmp3 to reach an integer register
 950         ld      [%fp+tmp3],%g1
 951         sethi   %hi(0x4b000000),%o0
 952         sra     %g1,13,%l5              ! (4_0) si0 = ax0 >> 13;
 953         fands   %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
 954         ba      .cont_spec
 955         sub     %g1,%o0,%g1             ! delay slot: ax0 -= 0x4b000000
 956 1:
 957         add     %l7,stridex,%l7         ! px += stridex
 958         sub     counter,1,counter
 959         st      %f14,[%i2]              ! *py = x0;
 960         ba      .begin1
 961         add     %i2,stridey,%i2         ! py += stridey
 962 
 963         .align  16
 964 .update0:
     ! Deferral handler that rejoins at .cont0.  When counter > 1 it saves
     ! %i1 - stridex in tmp_px and counter - 1 in tmp_counter, then sets
     ! counter to 1.  Otherwise it returns without changing anything.
     ! NOTE(review): the branch here and the reader of tmp_px/tmp_counter
     ! are outside this view; presumably .begin restarts from the saved
     ! element.
 965         cmp     counter,1
 966         ble     .cont0                  ! counter <= 1: nothing to record
 967         nop
 968 
 969         sub     %i1,stridex,%o1
 970         stx     %o1,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
 971 
 972         sub     counter,1,counter
 973         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 1
 974 
 975         ba      .cont0
 976         mov     1,counter               ! delay slot: counter = 1
 977 
 978         .align  16
 979 .update1:
     ! Handler that rejoins at .cont1 and examines ax1 in %g5.
     !   counter <= 1                      : return untouched
     !   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : label 1 - save %i1 - stridex in
     !       tmp_px and counter - 1 in tmp_counter, set counter to 1
     !   otherwise : rescale the value in %f15 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (5_0)/(5_1) steps for it up to res1 = K3 * xx1
     ! NOTE(review): the branch here is outside this view; presumably the
     ! pre-loop counterpart of the ax1 < _0x00800000 test.  FTWO is defined
     ! elsewhere; presumably 2.0f.
 980         sethi   %hi(0x7ffffc00),%o0
 981         cmp     counter,1
 982         ble     .cont1
 983 
 984         add     %o0,0x3ff,%o0           ! delay slot: %o0 = 0x7fffffff
 985 
 986         andcc   %g5,%o0,%g0             ! all bits below the sign clear?
 987         bz,a    1f
 988         nop
 989 
 990         cmp     %g5,0
 991         bl,a    1f                      ! sign bit set
 992         nop
 993 
 994         fitod   %f15,%f0
 995         fdtos   %f0,%f15
 996         fmuls   %f15,FTWO,%f15
 997         st      %f15,[%fp+tmp3]
 998         ld      [%fp+tmp3],%g5          ! ax1 = bits of the rescaled value
 999         sethi   %hi(0x4b000000),%o0
1000         sub     %g5,%o0,%g5             ! ax1 -= 0x4b000000
1001 
1002         fands   %f15,DC0,%f17           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1003 
1004         sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
1005 
1006         sra     %g5,24,%l7              ! (5_0) iexp1 = ax1 >> 24;
1007         and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
1008 
1009         fpsub32s        %f15,%f17,%f17  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1010 
1011         ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1012         sub     %l0,%l7,%l1             ! (5_0) iexp1 = 0x3f - iexp1;
1013 
1014         sll     %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
1015         add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
1016         st      %l1,[%fp+tmp0+4]        ! (4_0) fdx0 = *((double*)lexp0);
1017         fitod   %f17,%f44               ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1018 
1019         fmuld   %f44,%f46,%f46          ! (5_1) xx1 = dtmp1 * tbl_div1;
1020 
1021         ba      .cont1
1022         fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
1023 1:
1024         sub     %i1,stridex,%o1
1025         stx     %o1,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1026 
1027         sub     counter,1,counter
1028         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 1
1029 
1030         ba      .cont1
1031         mov     1,counter               ! delay slot: counter = 1
1032 
1033         .align  16
1034 .update2:
     ! Deferral handler that rejoins at .cont2.  When counter > 2 it saves
     ! %o5 - 2*stridex in tmp_px and counter - 2 in tmp_counter, then sets
     ! counter to 2.  The first sub sits in the ble delay slot, so %o1 is
     ! written on both paths.
     ! NOTE(review): the branch here and the reader of tmp_px/tmp_counter
     ! are outside this view.
1035         cmp     counter,2
1036         ble     .cont2                  ! counter <= 2: nothing to record
1037         sub     %o5,stridex,%o1         ! delay slot
1038 
1039         sub     %o1,stridex,%o1
1040         stx     %o1,[%fp+tmp_px]        ! tmp_px = %o5 - 2*stridex
1041 
1042         sub     counter,2,counter
1043         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 2
1044 
1045         ba      .cont2
1046         mov     2,counter               ! delay slot: counter = 2
1047 
1048         .align  16
1049 .update3:
     ! Handler that rejoins at .cont3 and examines ax0 in %g1.
     !   counter <= 2                      : return untouched
     !   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : label 1 - save %o5 - 2*stridex
     !       in tmp_px and counter - 2 in tmp_counter, set counter to 2
     !   otherwise : rescale the value in %f18 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (0_0) steps for it up to dtmp0
     ! The annulled delay slots of bz,a / bl,a compute %o5 - stridex only on
     ! the way to label 1, which then subtracts stridex once more.
     ! NOTE(review): the branch here is outside this view.  FTWO is
     ! defined elsewhere; presumably 2.0f.
1050         sethi   %hi(0x7ffffc00),%o1
1051         cmp     counter,2
1052         ble     .cont3
1053 
1054         add     %o1,0x3ff,%o1           ! delay slot: %o1 = 0x7fffffff
1055 
1056         andcc   %g1,%o1,%g0             ! all bits below the sign clear?
1057         bz,a    1f
1058         sub     %o5,stridex,%o1
1059 
1060         cmp     %g1,0
1061         bl,a    1f                      ! sign bit set
1062         sub     %o5,stridex,%o1
1063 
1064         fitod   %f18,%f0
1065         fdtos   %f0,%f18
1066         fmuls   %f18,FTWO,%f18
1067         st      %f18,[%fp+tmp3]
1068         ld      [%fp+tmp3],%g1          ! ax0 = bits of the rescaled value
1069         sethi   %hi(0x4b000000),%o1
1070         sub     %g1,%o1,%g1             ! ax0 -= 0x4b000000
1071 
1072         fand    %f18,DC0,%f56           ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1073         sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
1074 
1075         and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;
1076 
1077         ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1078         fpsub32 %f18,%f56,%f30          ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1079 
1080         sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
1081         sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
1082         ba      .cont3
1083         fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1084 1:
1085         sub     %o1,stridex,%o1
1086         stx     %o1,[%fp+tmp_px]        ! tmp_px = %o5 - 2*stridex
1087 
1088         sub     counter,2,counter
1089         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 2
1090 
1091         ba      .cont3
1092         mov     2,counter               ! delay slot: counter = 2
1093 
1094         .align  16
1095 .update4:
     ! Deferral handler that rejoins at .cont4.  When counter > 3 it saves
     ! %l7 - stridex2 - stridex in tmp_px and counter - 3 in tmp_counter,
     ! then sets counter to 3.  The first sub sits in the ble delay slot,
     ! so %o1 is written on both paths.
     ! NOTE(review): the branch here and the reader of tmp_px/tmp_counter
     ! are outside this view.
1096         cmp     counter,3
1097         ble     .cont4                  ! counter <= 3: nothing to record
1098         sub     %l7,stridex2,%o1        ! delay slot
1099 
1100         sub     %o1,stridex,%o1
1101         stx     %o1,[%fp+tmp_px]        ! tmp_px = %l7 - stridex2 - stridex
1102 
1103         sub     counter,3,counter
1104         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 3
1105 
1106         ba      .cont4
1107         mov     3,counter               ! delay slot: counter = 3
1108 
1109         .align  16
1110 .update5:
     ! Handler that rejoins at .cont5 and examines ax1 in %i4.
     !   counter <= 3                      : return untouched
     !   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : label 1 - save
     !       %l7 - stridex2 - stridex in tmp_px and counter - 3 in
     !       tmp_counter, set counter to 3
     !   otherwise : rescale the value in %f19 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (1_0) steps for it up to res1 = K3 * xx1
     ! NOTE(review): the branch here is outside this view.  FTWO is
     ! defined elsewhere; presumably 2.0f.
1111         sethi   %hi(0x7ffffc00),%o1
1112         cmp     counter,3
1113         ble     .cont5
1114 
1115         add     %o1,0x3ff,%o1           ! delay slot: %o1 = 0x7fffffff
1116 
1117         andcc   %i4,%o1,%g0             ! all bits below the sign clear?
1118         bz,a    1f
1119         sub     %l7,stridex2,%o1
1120 
1121         cmp     %i4,0
1122         bl,a    1f                      ! sign bit set
1123         sub     %l7,stridex2,%o1
1124 
1125         fitod   %f19,%f0
1126         fdtos   %f0,%f19
1127         fmuls   %f19,FTWO,%f19
1128         st      %f19,[%fp+tmp3]
1129         ld      [%fp+tmp3],%i4          ! ax1 = bits of the rescaled value
1130         sethi   %hi(0x4b000000),%o1
1131         sub     %i4,%o1,%i4             ! ax1 -= 0x4b000000
1132 
1133         fands   %f19,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1134 
1135         sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
1136 
1137         sra     %i4,24,%i1              ! (1_0) iexp1 = ax1 >> 24;
1138         and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
1139         fpsub32s        %f19,%f0,%f31   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1140 
1141         ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1142         sub     %l0,%i1,%i0             ! (1_0) iexp1 = 0x3f - iexp1;
1143 
1144         sll     %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
1145         fitod   %f31,%f50               ! (1_0) dtmp1 = (double)(((int*)dfx0)[1]);
1146 
1147         st      %i0,[%fp+tmp1+4]        ! (0_0) fdx0 = *((double*)lexp0);
1148 
1149         add     %o7,TBL,%o7             ! (1_0) addr1 = (char*)TBL + si1;
1150         fmuld   %f50,%f44,%f44          ! (1_0) xx1 = dtmp1 * tbl_div1;
1151 
1152         ba      .cont5
1153         fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
1154 1:
1155         sub     %o1,stridex,%o1
1156         stx     %o1,[%fp+tmp_px]        ! tmp_px = %l7 - stridex2 - stridex
1157 
1158         sub     counter,3,counter
1159         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 3
1160 
1161         ba      .cont5
1162         mov     3,counter               ! delay slot: counter = 3
1163 
1164         .align  16
1165 .update6:
     ! Deferral handler that rejoins at .cont6.  When counter > 4 it saves
     ! %l7 - 2*stridex in tmp_px and counter - 4 in tmp_counter, then sets
     ! counter to 4.  The first sub sits in the ble delay slot, so %o3 is
     ! written on both paths.
     ! NOTE(review): the branch here and the reader of tmp_px/tmp_counter
     ! are outside this view.
1166         cmp     counter,4
1167         ble     .cont6                  ! counter <= 4: nothing to record
1168         sub     %l7,stridex,%o3         ! delay slot
1169 
1170         sub     %o3,stridex,%o3
1171         stx     %o3,[%fp+tmp_px]        ! tmp_px = %l7 - 2*stridex
1172 
1173         sub     counter,4,counter
1174         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 4
1175 
1176         ba      .cont6
1177         mov     4,counter               ! delay slot: counter = 4
1178 
1179         .align  16
1180 .update7:
     ! Handler that rejoins at .cont7 and examines ax0 in %g1.
     !   counter <= 4                      : return untouched
     !   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : label 1 - save %l7 - 2*stridex
     !       in tmp_px and counter - 4 in tmp_counter, set counter to 4
     !   otherwise : rescale the value in %f24 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (2_0) steps for it up to xx0 = dtmp0 * tbl_div0
     ! Only the first 32-bit word of tmp2 is rewritten here (st, not stx).
     ! NOTE(review): the branch here is outside this view.  FTWO is
     ! defined elsewhere; presumably 2.0f.
1181         sethi   %hi(0x7ffffc00),%o3
1182         cmp     counter,4
1183         ble     .cont7
1184 
1185         add     %o3,0x3ff,%o3           ! delay slot: %o3 = 0x7fffffff
1186 
1187         andcc   %g1,%o3,%g0             ! all bits below the sign clear?
1188         bz,a    1f
1189         sub     %l7,stridex,%o3
1190 
1191         cmp     %g1,0
1192         bl,a    1f                      ! sign bit set
1193         sub     %l7,stridex,%o3
1194 
1195         fitod   %f24,%f0
1196         fdtos   %f0,%f24
1197         fmuls   %f24,FTWO,%f24
1198         st      %f24,[%fp+tmp3]
1199         ld      [%fp+tmp3],%g1          ! ax0 = bits of the rescaled value
1200         sethi   %hi(0x4b000000),%o3
1201         sub     %g1,%o3,%g1             ! ax0 -= 0x4b000000
1202 
1203         fands   %f24,DC0,%f0            ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1204         sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
1205 
1206         and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;
1207 
1208         ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1209         fpsub32s        %f24,%f0,%f12   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1210 
1211         sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
1212 
1213         sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
1214 
1215         sll     %g5,23,%g5              ! (2_0) first word of lexp0: iexp0 << 23 (bit 55 of the 64-bit slot)
1216         add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
1217         fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1218 
1219         st      %g5,[%fp+tmp2]          ! (2_0) ((int*)&fdx0)[0] = first word of lexp0
1220         ba      .cont7
1221         fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
1222 1:
1223         sub     %o3,stridex,%o3
1224         stx     %o3,[%fp+tmp_px]        ! tmp_px = %l7 - 2*stridex
1225 
1226         sub     counter,4,counter
1227         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 4
1228 
1229         ba      .cont7
1230         mov     4,counter               ! delay slot: counter = 4
1231 
1232         .align  16
1233 .update8:
     ! Deferral handler that rejoins at .cont8.  When counter > 5 it saves
     ! %l7 - stridex in tmp_px and counter - 5 in tmp_counter, then sets
     ! counter to 5.  Otherwise it returns without changing anything.
     ! NOTE(review): the branch here and the reader of tmp_px/tmp_counter
     ! are outside this view.
1234         cmp     counter,5
1235         ble     .cont8                  ! counter <= 5: nothing to record
1236         nop
1237 
1238         sub     %l7,stridex,%o3
1239         stx     %o3,[%fp+tmp_px]        ! tmp_px = %l7 - stridex
1240 
1241         sub     counter,5,counter
1242         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 5
1243 
1244         ba      .cont8
1245         mov     5,counter               ! delay slot: counter = 5
1246 
1247         .align  16
1248 .update9:
     ! Handler that rejoins at .cont9 and examines ax1 in %o5.
     !   counter <= 5                      : return (%i3 is still overwritten
     !       by the delay slot)
     !   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : label 1 - save %l7 - stridex in
     !       tmp_px and counter - 5 in tmp_counter, set counter to 5
     !   otherwise : reload the element from [%l7 - stridex] into %f0,
     !       rescale it (integer-to-float conversion of its raw bits, times
     !       FTWO, bit pattern minus 0x4b000000) and redo the (3_0) steps
     !       for it up to res1 = K3 * xx1
     ! The bz is not annulled, so the ld in its delay slot always runs.
     ! NOTE(review): the branch here is outside this view.  FTWO is
     ! defined elsewhere; presumably 2.0f.
1249         sethi   %hi(0x7ffffc00),%o3
1250         cmp     counter,5
1251         ble     .cont9
1252         sub     %l7,stridex,%i3         ! delay slot: %i3 = %l7 - stridex
1253 
1254         add     %o3,0x3ff,%o3           ! %o3 = 0x7fffffff
1255 
1256         andcc   %o5,%o3,%g0             ! all bits below the sign clear?
1257         bz      1f
1258         ld      [%i3],%f0               ! delay slot: reload the element as float
1259 
1260         cmp     %o5,0
1261         bl,a    1f                      ! sign bit set
1262         nop
1263 
1264         fitod   %f0,%f0
1265         fdtos   %f0,%f0
1266         fmuls   %f0,FTWO,%f0
1267         st      %f0,[%fp+tmp3]
1268         ld      [%fp+tmp3],%o5          ! ax1 = bits of the rescaled value
1269         sethi   %hi(0x4b000000),%o3
1270         sub     %o5,%o3,%o5             ! ax1 -= 0x4b000000
1271 
1272         fands   %f0,DC0,%f8             ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1273 
1274         sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
1275 
1276         sra     %o5,24,%o3              ! (3_0) iexp1 = ax1 >> 24;
1277         and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
1278         fpsub32s        %f0,%f8,%f0     ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1279 
1280         ldd     [%o1+TBL],%f8           ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1281         sub     %l0,%o3,%i3             ! (3_0) iexp1 = 0x3f - iexp1;
1282 
1283         sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
1284         fitod   %f0,%f50                ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1285 
1286         add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
1287         st      %i3,[%fp+tmp2+4]        ! (2_0) fdx0 = *((double*)lexp0);
1288 
1289         fmuld   %f50,%f8,%f24           ! (3_0) xx1 = dtmp1 * tbl_div1;
1290 
1291         ba      .cont9
1292         fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
1293 1:
1294         stx     %i3,[%fp+tmp_px]        ! tmp_px = %l7 - stridex
1295 
1296         sub     counter,5,counter
1297         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 5
1298 
1299         ba      .cont9
1300         mov     5,counter               ! delay slot: counter = 5
1301 
1302         .align  16
1303 .update10:
     ! Deferral handler for the main-loop test (4_1) ax0 >= _0x7f800000;
     ! rejoins at .cont10.  When counter > 0 it saves %i1 - 2*stridex in
     ! tmp_px and the unchanged counter in tmp_counter, then sets counter
     ! to 0.  With counter at 0 the loop-closing subcc goes negative, so the
     ! loop exits after the current pass.  The first sub sits in the ble
     ! delay slot, so %o3 is written on both paths.
     ! NOTE(review): the reader of tmp_px/tmp_counter is outside this view.
1304         cmp     counter,0
1305         ble     .cont10                 ! counter <= 0: nothing to record
1306         sub     %i1,stridex,%o3         ! delay slot
1307 
1308         sub     %o3,stridex,%o3
1309         stx     %o3,[%fp+tmp_px]        ! tmp_px = %i1 - 2*stridex
1310 
1311         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter
1312 
1313         ba      .cont10
1314         mov     0,counter               ! delay slot: counter = 0
1315 
1316         .align  16
1317 .update11:
     ! Handler for the main-loop test (4_1) ax0 < _0x00800000; rejoins at
     ! .cont11.  The loop has already reloaded %g1 with the next element
     ! before branching here, so ax0 is re-read from [%i1 - 2*stridex]
     ! into %i3.
     !   counter <= 0                      : return (%i4 and %o3 are still
     !       overwritten; .cont11 reloads %i4)
     !   (ax0 & 0x7fffffff) == 0 or ax0 < 0 : label 1 - save the element
     !       address in tmp_px and counter in tmp_counter, set counter to 0
     !   otherwise : rescale the value in %f14 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (4_0) steps for it up to xx0 = dtmp0 * tbl_div0
     ! The bz is not annulled; its delay slot is the cmp that follows.
     ! Only the first 32-bit word of tmp0 is rewritten here (st, not stx).
     ! NOTE(review): FTWO is defined elsewhere; presumably 2.0f.
1318         sethi   %hi(0x7ffffc00),%i4
1319         cmp     counter,0
1320         ble     .cont11
1321         sub     %i1,stridex,%o3         ! delay slot
1322 
1323         sub     %o3,stridex,%o3         ! %o3 = %i1 - 2*stridex
1324         add     %i4,0x3ff,%i4           ! %i4 = 0x7fffffff
1325         ld      [%o3],%i3               ! re-read ax0 from memory
1326 
1327         andcc   %i3,%i4,%g0             ! all bits below the sign clear?
1328         bz      1f
1329 
1330         cmp     %i3,0                   ! delay slot of the bz above
1331         bl,a    1f                      ! sign bit set
1332         nop
1333 
1334         fitod   %f14,%f0
1335         fdtos   %f0,%f14
1336         fmuls   %f14,FTWO,%f14
1337         st      %f14,[%fp+tmp3]
1338         ld      [%fp+tmp3],%i3          ! ax0 = bits of the rescaled value
1339         sethi   %hi(0x4b000000),%o3
1340         sub     %i3,%o3,%i3             ! ax0 -= 0x4b000000
1341 
1342         fands   %f14,DC0,%f16           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1343         sra     %i3,13,%l5              ! (4_0) si0 = ax0 >> 13;
1344 
1345         and     %l5,2032,%l5            ! (4_0) si0 &= 0x7f0;
1346 
1347         ldd     [%l5+TBL],%f54          ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1348         fpsub32s        %f14,%f16,%f16  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1349 
1350         sra     %i3,24,%i3              ! (4_0) iexp0 = ax0 >> 24;
1351 
1352         sub     %l0,%i3,%o0             ! (4_0) iexp0 = 0x3f - iexp0;
1353         fitod   %f16,%f56               ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);
1354 
1355         sllx    %o0,23,%o0              ! (4_0) first word of lexp0: iexp0 << 23 (bit 55 of the 64-bit slot)
1356 
1357         st      %o0,[%fp+tmp0]          ! (4_0) ((int*)&fdx0)[0] = first word of lexp0
1358 
1359         ba      .cont11
1360         fmuld   %f56,%f54,%f40          ! (4_0) xx0 = dtmp0 * tbl_div0;
1361 1:
1362         stx     %o3,[%fp+tmp_px]        ! tmp_px = %i1 - 2*stridex
1363 
1364         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter
1365 
1366         ba      .cont11
1367         mov     0,counter               ! delay slot: counter = 0
1368 
1369         .align  16
1370 .update12:
     ! Deferral handler for the main-loop test (5_1) ax1 >= _0x7f800000;
     ! rejoins at .cont12.  When counter > 1 it steps %i1 back by stridex,
     ! saves that address in tmp_px and counter - 1 in tmp_counter, then
     ! sets counter to 1.  %i1 itself is modified; the loop writes %i1
     ! again (sra %i4,24,%i1) before its next read of it.
     ! NOTE(review): the reader of tmp_px/tmp_counter is outside this view.
1371         cmp     counter,1
1372         ble     .cont12                 ! counter <= 1: nothing to record
1373         nop
1374 
1375         sub     %i1,stridex,%i1
1376         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1377 
1378         sub     counter,1,counter
1379         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 1
1380 
1381         ba      .cont12
1382         mov     1,counter               ! delay slot: counter = 1
1383 
1384         .align  16
1385 .update13:
     ! Handler for the main-loop test (5_1) ax1 < _0x00800000 (ax1 in %g5);
     ! rejoins at .cont13.
     !   counter <= 1                      : return (%o3 is still overwritten)
     !   (ax1 & 0x7fffffff) == 0 or ax1 < 0 : label 1 - step %i1 back by
     !       stridex, save it in tmp_px and counter - 1 in tmp_counter,
     !       set counter to 1
     !   otherwise : rescale the value in %f15 (integer-to-float conversion
     !       of its raw bits, times FTWO, bit pattern minus 0x4b000000) and
     !       redo the (5_0)/(5_1) steps for it up to res1 = K3 * xx1
     ! The bz is not annulled; its delay slot is the cmp that follows.
     ! NOTE(review): FTWO is defined elsewhere; presumably 2.0f.
1386         sethi   %hi(0x7ffffc00),%o3
1387         cmp     counter,1
1388         ble     .cont13
1389 
1390         add     %o3,0x3ff,%o3           ! delay slot: %o3 = 0x7fffffff
1391 
1392         andcc   %g5,%o3,%g0             ! all bits below the sign clear?
1393         bz      1f
1394 
1395         cmp     %g5,0                   ! delay slot of the bz above
1396         bl,a    1f                      ! sign bit set
1397         nop
1398 
1399         fitod   %f15,%f0
1400         fdtos   %f0,%f15
1401         fmuls   %f15,FTWO,%f15
1402         st      %f15,[%fp+tmp3]
1403         ld      [%fp+tmp3],%g5          ! ax1 = bits of the rescaled value
1404         sethi   %hi(0x4b000000),%o3
1405         sub     %g5,%o3,%g5             ! ax1 -= 0x4b000000
1406 
1407         fands   %f15,DC0,%f17           ! (4_0) dfx0 = vis_fand(ddx0,DC0);
1408 
1409         sra     %g5,13,%l6              ! (5_0) si1 = ax1 >> 13;
1410         sra     %g5,24,%o3              ! (5_0) iexp1 = ax1 >> 24;
1411         and     %l6,2032,%l6            ! (5_0) si1 &= 0x7f0;
1412         fpsub32s        %f15,%f17,%f17  ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1413 
1414         ldd     [%l6+TBL],%f46          ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1415         sub     %l0,%o3,%l1             ! (5_0) iexp1 = 0x3f - iexp1;
1416 
1417         add     %l6,TBL,%l6             ! (5_0) addr1 = (char*)TBL + si1;
1418 
1419         sllx    %l1,23,%l1              ! (5_0) lexp1 = iexp1 << 23;
1420         st      %l1,[%fp+tmp0+4]        ! (4_0) fdx0 = *((double*)lexp0);
1421 
1422         fitod   %f17,%f0                ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);
1423 
1424         fmuld   %f0,%f46,%f46           ! (5_1) xx1 = dtmp1 * tbl_div1;
1425         ba      .cont13
1426         fmuld   K3,%f46,%f50            ! (5_1) res1 = K3 * xx1;
1427 1:
1428         sub     %i1,stridex,%i1
1429         stx     %i1,[%fp+tmp_px]        ! tmp_px = %i1 - stridex
1430 
1431         sub     counter,1,counter
1432         st      counter,[%fp+tmp_counter]       ! tmp_counter = counter - 1
1433 
1434         ba      .cont13
1435         mov     1,counter               ! delay slot: counter = 1
1436 
1437         .align  16
     ! .update14: out-of-line fix-up stub; every path returns to .cont14.
     ! %o3 = %o5 - stridex is computed in the first delay slot on all paths.
     !   counter <= 2 : resume with counter unchanged.
     !   counter >  2 : %o5 - 2*stridex and counter - 2 are saved in the
     !                  frame slots tmp_px / tmp_counter; counter is cut to 2.
     ! NOTE(review): the consumer of tmp_px / tmp_counter is outside this
     ! chunk.
1438 .update14:
1439         cmp     counter,2
1440         ble     .cont14                 ! counter <= 2: resume
1441         sub     %o5,stridex,%o3         ! delay slot (always runs): %o3 = %o5 - stridex
1442 
1443         sub     %o3,stridex,%o3         ! %o3 = %o5 - 2*stridex
1444         stx     %o3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1445 
1446         sub     counter,2,counter       ! count left over after the first 2
1447         st      counter,[%fp+tmp_counter]
1448 
1449         ba      .cont14
1450         mov     2,counter               ! delay slot: continue with counter = 2
1451 
1452         .align  16
     ! .update15: out-of-line fix-up stub; every path returns to .cont15.
     ! It examines the 32-bit word in %g1 (FP copy of the element in %f18).
     ! NOTE(review): that %g1 and %f18 hold the same word on entry is
     ! assumed from the reload below -- confirm at the branch site, which
     ! is outside this chunk.
     !   counter <= 2             : return (%i3 and %o3 are modified).
     !   %g1 is +-0 or negative   : save %o5 - 2*stridex and counter - 2 in
     !                              tmp_px / tmp_counter, set counter = 2.
     !   otherwise                : rescale the value in place (presumably
     !                              a positive subnormal input) and redo
     !                              the set-up tagged (0_0).
1453 .update15:
1454         sethi   %hi(0x7ffffc00),%i3     ! %i3 = 0x7ffffc00
1455         cmp     counter,2
1456         ble     .cont15                 ! counter <= 2: resume
1457         sub     %o5,stridex,%o3         ! delay slot (always runs): %o3 = %o5 - stridex
1458 
1459         add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff
1460 
1461         andcc   %g1,%i3,%g0             ! Z set when every bit below the sign bit is 0
1462         bz      1f                      ! zero magnitude: save-state path
1463         sub     %o3,stridex,%o3         ! delay slot (always runs): %o3 = %o5 - 2*stridex
1464 
1465         cmp     %g1,0
1466         bl,a    1f                      ! sign bit set: save-state path
1467         nop                             ! annulled slot: runs only if the branch is taken
1468 
1469         fitod   %f18,%f0                ! treat the 32 bits of %f18 as an integer -> double
1470         fdtos   %f0,%f18                ! ... and convert that to single
1471         fmuls   %f18,FTWO,%f18          ! scale by FTWO (constant defined outside this chunk)
1472         st      %f18,[%fp+tmp3]         ! copy the new bit pattern to %g1
1473         ld      [%fp+tmp3],%g1          ! through frame slot tmp3
1474         sethi   %hi(0x4b000000),%o3     ! 0x4b000000 == 0x96 << 23 (%o3 is free on this path)
1475         sub     %g1,%o3,%g1             ! lower the exponent field of the integer copy by 0x96
1476 
1477         fands   %f18,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1478         sra     %g1,13,%o0              ! (0_0) si0 = ax0 >> 13;
1479         and     %o0,2032,%o0            ! (0_0) si0 &= 0x7f0;
1480 
1481         ldd     [%o0+TBL],%f54          ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1482         fpsub32s        %f18,%f0,%f30   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1483 
1484         sra     %g1,24,%i3              ! (0_0) iexp0 = ax0 >> 24;
1485 
1486         sub     %l0,%i3,%g5             ! (0_0) iexp0 = 0x3f - iexp0;
1487 
1488         ba      .cont15                 ! rescaled element: resume, counter untouched
1489         fitod   %f30,%f56               ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1490 1:                                      ! save-state path; %o3 = %o5 - 2*stridex here
1491         stx     %o3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1492 
1493         sub     counter,2,counter       ! count left over after the first 2
1494         st      counter,[%fp+tmp_counter]
1495 
1496         ba      .cont15
1497         mov     2,counter               ! delay slot: continue with counter = 2
1498 
1499         .align  16
     ! .update16: out-of-line fix-up stub; every path returns to .cont16.
     ! %o3 = %l7 - stridex2 is computed in the first delay slot on all paths.
     !   counter <= 3 : resume with counter unchanged.
     !   counter >  3 : %l7 - stridex2 - stridex and counter - 3 are saved in
     !                  the frame slots tmp_px / tmp_counter; counter is cut
     !                  to 3.
     ! NOTE(review): stridex2 is defined outside this chunk (presumably
     ! 2*stridex), as is the consumer of tmp_px / tmp_counter.
1500 .update16:
1501         cmp     counter,3
1502         ble     .cont16                 ! counter <= 3: resume
1503         sub     %l7,stridex2,%o3        ! delay slot (always runs): %o3 = %l7 - stridex2
1504 
1505         sub     %o3,stridex,%o3         ! %o3 = %l7 - stridex2 - stridex
1506         stx     %o3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1507 
1508         sub     counter,3,counter       ! count left over after the first 3
1509         st      counter,[%fp+tmp_counter]
1510 
1511         ba      .cont16
1512         mov     3,counter               ! delay slot: continue with counter = 3
1513 
1514         .align  16
     ! .update17: out-of-line fix-up stub; every path returns to .cont17.
     ! It examines the 32-bit word in %i4 (FP copy of the element in %f19).
     ! NOTE(review): that %i4 and %f19 hold the same word on entry is
     ! assumed from the reload below -- confirm at the branch site, which
     ! is outside this chunk.
     !   counter <= 3             : return (%i3 and %o3 are modified).
     !   %i4 is +-0 or negative   : save %l7 - stridex2 - stridex and
     !                              counter - 3 in tmp_px / tmp_counter,
     !                              set counter = 3.
     !   otherwise                : rescale the value in place (presumably
     !                              a positive subnormal input) and redo
     !                              the set-up tagged (0_0)/(1_0).
1515 .update17:
1516         sethi   %hi(0x7ffffc00),%i3     ! %i3 = 0x7ffffc00
1517         cmp     counter,3
1518         ble     .cont17                 ! counter <= 3: resume
1519         sub     %l7,stridex2,%o3        ! delay slot (always runs): %o3 = %l7 - stridex2
1520 
1521         add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff
1522 
1523         andcc   %i4,%i3,%g0             ! Z set when every bit below the sign bit is 0
1524         bz      1f                      ! zero magnitude: save-state path
1525         sub     %o3,stridex,%o3         ! delay slot (always runs): %o3 -= stridex
1526 
1527         cmp     %i4,0
1528         bl,a    1f                      ! sign bit set: save-state path
1529         nop                             ! annulled slot: runs only if the branch is taken
1530 
1531         fitod   %f19,%f0                ! treat the 32 bits of %f19 as an integer -> double
1532         fdtos   %f0,%f19                ! ... and convert that to single
1533         fmuls   %f19,FTWO,%f19          ! scale by FTWO (constant defined outside this chunk)
1534         st      %f19,[%fp+tmp3]         ! copy the new bit pattern to %i4
1535         ld      [%fp+tmp3],%i4          ! through frame slot tmp3
1536         sethi   %hi(0x4b000000),%o3     ! 0x4b000000 == 0x96 << 23 (%o3 is free on this path)
1537         sub     %i4,%o3,%i4             ! lower the exponent field of the integer copy by 0x96
1538 
1539         fands   %f19,DC0,%f0            ! (0_0) dfx0 = vis_fand(ddx0,DC0);
1540 
1541         sra     %i4,13,%g5              ! (1_0) si1 = ax1 >> 13;
1542 
1543         sra     %i4,24,%i0              ! (1_0) iexp1 = ax1 >> 24;
1544         and     %g5,2032,%o7            ! (1_0) si1 &= 0x7f0;
1545         fpsub32s        %f19,%f0,%f31   ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1546 
1547         ldd     [%o7+TBL],%f44          ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1548         sub     %l0,%i0,%i0             ! (1_0) iexp1 = 0x3f - iexp1;
1549 
1550         sllx    %i0,23,%i0              ! (1_0) lexp1 = iexp1 << 23;
1551         fitod   %f31,%f50               ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);
1552 
1553         st      %i0,[%fp+tmp1+4]        ! (0_0) fdx0 = *((double*)lexp0);
1554 
1555         add     %o7,TBL,%o7             ! (1_0) addr0 = (char*)TBL + si0;
1556         fmuld   %f50,%f44,%f44          ! (1_0) xx0 = dtmp0 * tbl_div0;
1557 
1558         ba      .cont17                 ! rescaled element: resume, counter untouched
1559         fmuld   K3,%f44,%f50            ! (1_0) res1 = K3 * xx1;
1560 1:                                      ! save-state path; %o3 = %l7 - stridex2 - stridex here
1561         stx     %o3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1562 
1563         sub     counter,3,counter       ! count left over after the first 3
1564         st      counter,[%fp+tmp_counter]
1565 
1566         ba      .cont17
1567         mov     3,counter               ! delay slot: continue with counter = 3
1568 
1569         .align  16
     ! .update18: out-of-line fix-up stub; every path returns to .cont18.
     ! The fpadd32 tagged (2_1) sits in the first delay slot, so it is
     ! executed on all paths through the stub.
     !   counter <= 4 : resume with counter unchanged.
     !   counter >  4 : %l7 - stridex2 and counter - 4 are saved in the
     !                  frame slots tmp_px / tmp_counter; counter is cut to 4.
     ! NOTE(review): stridex2 and the consumer of tmp_px / tmp_counter are
     ! defined outside this chunk.
1570 .update18:
1571         cmp     counter,4
1572         ble     .cont18                 ! counter <= 4: resume
1573         fpadd32 %f20,%f52,%f0           ! (2_1) dres0 = vis_fpadd32(dres0,fdx0);
1574 
1575         sub     %l7,stridex2,%i3        ! %i3 = %l7 - stridex2
1576         stx     %i3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1577 
1578         sub     counter,4,counter       ! count left over after the first 4
1579         st      counter,[%fp+tmp_counter]
1580 
1581         ba      .cont18
1582         mov     4,counter               ! delay slot: continue with counter = 4
1583 
1584         .align  16
     ! .update19: out-of-line fix-up stub; every path returns to .cont19,
     ! and every path executes "fmuld %f50,%f46,%f24" (tagged (3_0)) in the
     ! delay slot of its final branch.
     ! It examines the 32-bit word in %g1 (FP copy of the element in %f24).
     ! NOTE(review): that %g1 and %f24 hold the same word on entry is
     ! assumed from the reload below -- confirm at the branch site, which
     ! is outside this chunk.
     !   counter <= 4             : return (%i3 is modified).
     !   %g1 is +-0 or negative   : save %l7 - stridex2 and counter - 4 in
     !                              tmp_px / tmp_counter, set counter = 4.
     !   otherwise                : rescale the value in place (presumably
     !                              a positive subnormal input) and redo
     !                              the set-up tagged (2_0).
1585 .update19:
1586         sethi   %hi(0x7ffffc00),%i3     ! %i3 = 0x7ffffc00
1587         cmp     counter,4
1588         ble,a   .cont19                 ! counter <= 4: resume (annulled branch)
1589         fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
1590 
1591         add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff
1592 
1593         andcc   %g1,%i3,%g0             ! Z set when every bit below the sign bit is 0
1594         bz      1f                      ! zero magnitude: save-state path
1595         nop                             ! branch delay slot
1596 
1597         cmp     %g1,0
1598         bl,a    1f                      ! sign bit set: save-state path
1599         nop                             ! annulled slot: runs only if the branch is taken
1600 
1601         fitod   %f24,%f24               ! treat the 32 bits of %f24 as an integer -> double
1602         fdtos   %f24,%f24               ! ... and convert that to single
1603         fmuls   %f24,FTWO,%f24          ! scale by FTWO (constant defined outside this chunk)
1604         st      %f24,[%fp+tmp3]         ! copy the new bit pattern to %g1
1605         ld      [%fp+tmp3],%g1          ! through frame slot tmp3
1606         sethi   %hi(0x4b000000),%i3     ! 0x4b000000 == 0x96 << 23
1607         sub     %g1,%i3,%g1             ! lower the exponent field of the integer copy by 0x96
1608 
1609         fands   %f24,DC0,%f8            ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1610         sra     %g1,13,%i0              ! (2_0) si0 = ax0 >> 13;
1611 
1612         and     %i0,2032,%i0            ! (2_0) si0 &= 0x7f0;
1613 
1614         ldd     [%i0+TBL],%f30          ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
1615         fpsub32s        %f24,%f8,%f12   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1616 
1617         sra     %g1,24,%i3              ! (2_0) iexp0 = ax0 >> 24;
1618 
1619         sub     %l0,%i3,%g5             ! (2_0) iexp0 = 0x3f - iexp0;
1620 
1621         sllx    %g5,23,%g5              ! (2_0) lexp0 = iexp0 << 23;
1622         add     %i0,TBL,%i0             ! (2_0) addr0 = (char*)TBL + si0;
1623         fitod   %f12,%f56               ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);
1624 
1625         st      %g5,[%fp+tmp2]          ! (2_0) fdx0 = *((double*)lexp0);
1626         fmuld   %f56,%f30,%f30          ! (2_0) xx0 = dtmp0 * tbl_div0;
1627 
1628         ba      .cont19                 ! rescaled element: resume, counter untouched
1629         fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
1630 1:                                      ! save-state path (zero or negative word)
1631         sub     %l7,stridex2,%i3        ! %i3 = %l7 - stridex2
1632         stx     %i3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1633 
1634         sub     counter,4,counter       ! count left over after the first 4
1635         st      counter,[%fp+tmp_counter]
1636 
1637         mov     4,counter               ! continue with counter = 4
1638         ba      .cont19
1639         fmuld   %f50,%f46,%f24          ! (3_0) xx1 = dtmp1 * tbl_div1;
1640 
1641         .align  16
     ! .update20: out-of-line fix-up stub; every path returns to .cont20.
     !   counter <= 5 : nothing is changed.
     !   counter >  5 : %l7 - stridex and counter - 5 are saved in the frame
     !                  slots tmp_px / tmp_counter; counter is cut to 5.
     ! NOTE(review): the consumer of tmp_px / tmp_counter is outside this
     ! chunk.
1642 .update20:
1643         cmp     counter,5
1644         ble     .cont20                 ! counter <= 5: resume unchanged
1645         nop                             ! branch delay slot
1646 
1647         sub     %l7,stridex,%i3         ! %i3 = %l7 - stridex
1648         stx     %i3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1649 
1650         sub     counter,5,counter       ! count left over after the first 5
1651         st      counter,[%fp+tmp_counter]
1652 
1653         ba      .cont20
1654         mov     5,counter               ! delay slot: continue with counter = 5
1655 
1656         .align  16
     ! .update21: out-of-line fix-up stub; every path returns to .cont21.
     ! It examines the 32-bit word in %o5 and, when counter > 5, reloads a
     ! single-precision word from address %l7 - stridex into %f8.
     ! NOTE(review): that %o5 holds the bits of that same word on entry is
     ! assumed -- confirm at the branch site, which is outside this chunk.
     !   counter <= 5             : return (%i3 is modified).
     !   %o5 is +-0 or negative   : save %l7 - stridex and counter - 5 in
     !                              tmp_px / tmp_counter, set counter = 5.
     !   otherwise                : rescale the value in place (presumably
     !                              a positive subnormal input) and redo
     !                              the set-up tagged (2_0)/(3_0).
1657 .update21:
1658         sethi   %hi(0x7ffffc00),%i3     ! %i3 = 0x7ffffc00
1659         cmp     counter,5
1660         ble,a   .cont21                 ! counter <= 5: resume (annulled branch)
1661         nop                             ! annulled slot: runs only if the branch is taken
1662 
1663         sub     %l7,stridex,%i4         ! %i4 = %l7 - stridex
1664         add     %i3,0x3ff,%i3           ! %i3 = 0x7fffffff
1665 
1666         andcc   %o5,%i3,%g0             ! Z set when every bit below the sign bit is 0
1667         bz      1f                      ! zero magnitude: save-state path
1668         ld      [%i4],%f8               ! delay slot (always runs): %f8 = word at %l7 - stridex
1669 
1670         cmp     %o5,0
1671         bl,a    1f                      ! sign bit set: save-state path
1672         nop                             ! annulled slot: runs only if the branch is taken
1673 
1674         fitod   %f8,%f8                 ! treat the 32 bits of %f8 as an integer -> double
1675         fdtos   %f8,%f8                 ! ... and convert that to single
1676         fmuls   %f8,FTWO,%f8            ! scale by FTWO (constant defined outside this chunk)
1677         st      %f8,[%fp+tmp3]          ! copy the new bit pattern to %o5
1678         ld      [%fp+tmp3],%o5          ! through frame slot tmp3
1679         sethi   %hi(0x4b000000),%i3     ! 0x4b000000 == 0x96 << 23
1680         sub     %o5,%i3,%o5             ! lower the exponent field of the integer copy by 0x96
1681 
1682         fands   %f8,DC0,%f24            ! (2_0) dfx0 = vis_fand(ddx0,DC0);
1683 
1684         sra     %o5,13,%o1              ! (3_0) si1 = ax1 >> 13;
1685 
1686         sra     %o5,24,%i3              ! (3_0) iexp1 = ax1 >> 24;
1687         and     %o1,2032,%o1            ! (3_0) si1 &= 0x7f0;
1688         fpsub32s        %f8,%f24,%f24   ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);
1689 
1690         ldd     [%o1+TBL],%f8           ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
1691         sub     %l0,%i3,%i3             ! (3_0) iexp1 = 0x3f - iexp1;
1692 
1693         sllx    %i3,23,%i3              ! (3_0) lexp1 = iexp1 << 23;
1694         fitod   %f24,%f50               ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);
1695 
1696         add     %o1,TBL,%o1             ! (3_0) addr1 = (char*)TBL + si1;
1697         st      %i3,[%fp+tmp2+4]        ! (2_0) fdx0 = *((double*)lexp0);
1698 
1699         fmuld   %f50,%f8,%f24           ! (3_0) xx1 = dtmp1 * tbl_div1;
1700 
1701         ba      .cont21                 ! rescaled element: resume, counter untouched
1702         fmuld   K3,%f24,%f50            ! (3_0) res1 = K3 * xx1;
1703 1:                                      ! save-state path (zero or negative word)
1704         sub     %l7,stridex,%i3         ! %i3 = %l7 - stridex
1705         stx     %i3,[%fp+tmp_px]        ! 64-bit save into frame slot tmp_px
1706 
1707         sub     counter,5,counter       ! count left over after the first 5
1708         st      counter,[%fp+tmp_counter]
1709 
1710         ba      .cont21
1711         mov     5,counter               ! delay slot: continue with counter = 5
1712 
1713         .align  16
     ! .exit: common return point of __vrsqrtf.  No value is set up here;
     ! the routine simply returns to its caller.
1714 .exit:
1715         ret                             ! return to the caller
1716         restore                         ! delay slot: switch back to the caller's register window
1717 
1718         SET_SIZE(__vrsqrtf)             /* macro defined outside this chunk; closes the __vrsqrtf symbol */
1719