1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  23  */
  24 /*
  25  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  26  * Use is subject to license terms.
  27  */
  28 
  29         .file   "__vcos.S"
  30 
  31 #include "libm.h"
  32 
   33         RO_DATA
   34         .align  64
   35 constants:                              ! 8-byte entries, in the order of the offset macros p4 ... thresh
   36         .word   0x3ec718e3,0xa6972785   ! p4
   37         .word   0x3ef9fd39,0x94293940   ! q4
   38         .word   0xbf2a019f,0x75ee4be1   ! p3
   39         .word   0xbf56c16b,0xba552569   ! q3
   40         .word   0x3f811111,0x1108c703   ! p2
   41         .word   0x3fa55555,0x554f5b35   ! q2
   42         .word   0xbfc55555,0x555554d0   ! p1
   43         .word   0xbfdfffff,0xffffff85   ! q1
   44         .word   0x3ff00000,0x00000000   ! one (1.0)
   45         .word   0xbfc55555,0x5551fc28   ! pp1
   46         .word   0x3f811107,0x62eacc9d   ! pp2
   47         .word   0xbfdfffff,0xffff6328   ! qq1
   48         .word   0x3fa55551,0x5f7acf0c   ! qq2
   49         .word   0x3fe45f30,0x6dc9c883   ! invpio2 (2/pi)
   50         .word   0x43380000,0x00000000   ! round (1.5 * 2^52)
   51         .word   0x3ff921fb,0x54400000   ! pio2_1
   52         .word   0x3dd0b461,0x1a600000   ! pio2_2
   53         .word   0x3ba3198a,0x2e000000   ! pio2_3
   54         .word   0x397b839a,0x252049c1   ! pio2_3t
   55         .word   0x80000000,0x00004000   ! f30val: loaded as the pair %f30 (0x80000000) / %f31 (0x4000)
   56         .word   0xffff8000,0x00000000   ! N.B.: low-order words used
   57         .word   0x3fc90000,0x80000000   ! for sign bit hacking; see
   58         .word   0x3fc40000,0x00000000   ! references to "thresh" below
   59 ! byte offsets of the entries in the constants table above (8 bytes per entry)
   60 #define p4              0x0
   61 #define q4              0x08
   62 #define p3              0x10
   63 #define q3              0x18
   64 #define p2              0x20
   65 #define q2              0x28
   66 #define p1              0x30
   67 #define q1              0x38
   68 #define one             0x40
   69 #define pp1             0x48
   70 #define pp2             0x50
   71 #define qq1             0x58
   72 #define qq2             0x60
   73 #define invpio2         0x68
   74 #define round           0x70
   75 #define pio2_1          0x78
   76 #define pio2_2          0x80
   77 #define pio2_3          0x88
   78 #define pio2_3t         0x90
   79 #define f30val          0x98
   80 #define mask            0xa0
   81 #define thresh          0xa8
  82 
   83 ! local storage indices: offsets from %fp of the temporaries in this frame.
   84 ! xsave/ysave, nsave and sxsave/sysave receive the incoming x/y, n and strides.
   85 #define xsave           STACK_BIAS-0x8
   86 #define ysave           STACK_BIAS-0x10
   87 #define nsave           STACK_BIAS-0x14
   88 #define sxsave          STACK_BIAS-0x18
   89 #define sysave          STACK_BIAS-0x1c
   90 #define biguns          STACK_BIAS-0x20
   91 #define n2              STACK_BIAS-0x24
   92 #define n1              STACK_BIAS-0x28
   93 #define n0              STACK_BIAS-0x2c
   94 #define x2_1            STACK_BIAS-0x40
   95 #define x1_1            STACK_BIAS-0x50
   96 #define x0_1            STACK_BIAS-0x60
   97 #define y2_0            STACK_BIAS-0x70
   98 #define y1_0            STACK_BIAS-0x80
   99 #define y0_0            STACK_BIAS-0x90
  100 ! sizeof temp storage - must be a multiple of 16 for V9
  101 #define tmps            0x90
 102 
  103 !--------------------------------------------------------------------
  104 ! define pipes for easier reading: P0, P1 and P2 name the registers used for x0, x1 and x2
  105 
  106 #define P0_f0           %f0
  107 #define P0_f1           %f1
  108 #define P0_f2           %f2
  109 #define P0_f3           %f3
  110 #define P0_f4           %f4
  111 #define P0_f5           %f5
  112 #define P0_f6           %f6
  113 #define P0_f7           %f7
  114 #define P0_f8           %f8
  115 #define P0_f9           %f9
  116 
  117 #define P1_f10          %f10
  118 #define P1_f11          %f11
  119 #define P1_f12          %f12
  120 #define P1_f13          %f13
  121 #define P1_f14          %f14
  122 #define P1_f15          %f15
  123 #define P1_f16          %f16
  124 #define P1_f17          %f17
  125 #define P1_f18          %f18
  126 #define P1_f19          %f19
  127 
  128 #define P2_f20          %f20
  129 #define P2_f21          %f21
  130 #define P2_f22          %f22
  131 #define P2_f23          %f23
  132 #define P2_f24          %f24
  133 #define P2_f25          %f25
  134 #define P2_f26          %f26
  135 #define P2_f27          %f27
  136 #define P2_f28          %f28
  137 #define P2_f29          %f29
  138 
  139 ! define __vlibm_TBL_sincos_hi & lo for easy reading
  140 
  141 #define SC_HI           %l3
  142 #define SC_LO           %l4
  143 
  144 ! define constants for easy reading (registers loaded from table entries q1..q4)
  145 
  146 #define C_q1 %f46
  147 #define C_q2 %f48
  148 #define C_q3 %f50
  149 #define C_q4 %f52
  150 
  151 ! one (1.0, loaded from table entry "one"); C_ONE_LO is the odd half of the C_ONE pair
  152 #define C_ONE           %f54
  153 #define C_ONE_LO        %f55
  154 
  155 ! masks (MSK_BIT31/MSK_BIT13 are the two halves of f30val: 0x80000000 and 0x4000)
  156 #define MSK_SIGN        %i5     
  157 #define MSK_BIT31       %f30    
  158 #define MSK_BIT13       %f31    
  159 #define MSK_BITSHI17    %f44    
  160 
  161 
  162 ! constants for pp and qq
  163 #define C_pp1 %f56
  164 #define C_pp2 %f58
  165 #define C_qq1 %f60
  166 #define C_qq2 %f62
  167 
  168 ! sign mask (same register as MSK_SIGN)
  169 #define C_signM         %i5
  170 
  171 #define LIM_l5          %l5
  172 #define LIM_l6          %l6
  173 ! when in pri range, using value as transition from poly to table.
  174 ! for Medium range,change use of %l6 and use to keep track of biguns.
  175 #define LIM_l7          %l7
 176 
 177 !--------------------------------------------------------------------
 178 
 179   
 180         ENTRY(__vcos)
 181         save    %sp,-SA(MINFRAME)-tmps,%sp
 182         PIC_SETUP(g5)
 183         PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
 184         PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
 185         PIC_SET(g5,constants,o0)
 186         mov     %o0,%g1
 187         wr      %g0,0x82,%asi           ! set %asi for non-faulting loads
 188 
 189 ! ========== primary range ==========
 190 
 191 ! register use
 192 
 193 ! i0  n
 194 ! i1  x
 195 ! i2  stridex
 196 ! i3  y
 197 ! i4  stridey
 198 ! i5  0x80000000
 199 
 200 ! l0  hx0
 201 ! l1  hx1
 202 ! l2  hx2
 203 ! l3  __vlibm_TBL_sincos_hi
 204 ! l4  __vlibm_TBL_sincos_lo
 205 ! l5  0x3fc40000
 206 ! l6  0x3e400000
 207 ! l7  0x3fe921fb
 208 
 209 ! the following are 64-bit registers in both V8+ and V9
 210 
 211 ! g1  scratch
 212 ! g5  
 213 
 214 ! o0  py0
 215 ! o1  py1
 216 ! o2  py2
 217 ! o3  oy0
 218 ! o4  oy1
 219 ! o5  oy2
 220 ! o7  scratch
 221 
 222 ! f0  x0
 223 ! f2  
 224 ! f4  
 225 ! f6  
 226 ! f8  scratch for table base
 227 ! f9  signbit0
 228 ! f10 x1
 229 ! f12 
 230 ! f14 
 231 ! f16 
 232 ! f18 scratch for table base
 233 ! f19 signbit1
 234 ! f20 x2
 235 ! f22 
 236 ! f24 
 237 ! f26 
 238 ! f28 scratch for table base
 239 ! f29 signbit2
 240 ! f30 0x80000000
 241 ! f31 0x4000
 242 ! f32 
 243 ! f34 
 244 ! f36 
 245 ! f38 
 246 ! f40 
 247 ! f42 
 248 ! f44 0xffff800000000000
  249 ! f46 q1
  250 ! f48 q2
  251 ! f50 q3
  252 ! f52 q4
 253 ! f54 one
 254 ! f56 pp1
 255 ! f58 pp2
 256 ! f60 qq1
 257 ! f62 qq2
 258 
 259 #ifdef __sparcv9
 260         stx     %i1,[%fp+xsave]         ! save arguments
 261         stx     %i3,[%fp+ysave]
 262 #else
 263         st      %i1,[%fp+xsave]         ! save arguments
 264         st      %i3,[%fp+ysave]
 265 #endif
 266 
 267         st      %i0,[%fp+nsave]
 268         st      %i2,[%fp+sxsave]
 269         st      %i4,[%fp+sysave]
 270         sethi   %hi(0x80000000),MSK_SIGN        ! load/set up constants
 271         sethi   %hi(0x3fc40000),LIM_l5
 272         sethi   %hi(0x3e400000),LIM_l6
 273         sethi   %hi(0x3fe921fb),LIM_l7
 274         or      LIM_l7,%lo(0x3fe921fb),LIM_l7
 275         ldd     [%g1+f30val],MSK_BIT31
 276         ldd     [%g1+mask],MSK_BITSHI17
 277         ldd     [%g1+q1],C_q1
 278         ldd     [%g1+q2],C_q2
 279         ldd     [%g1+q3],C_q3
 280         ldd     [%g1+q4],C_q4
 281         ldd     [%g1+one],C_ONE
 282         ldd     [%g1+pp1],C_pp1
 283         ldd     [%g1+pp2],C_pp2
 284         ldd     [%g1+qq1],C_qq1
 285         ldd     [%g1+qq2],C_qq2
 286         sll     %i2,3,%i2               ! scale strides
 287         sll     %i4,3,%i4
 288         add     %fp,x0_1,%o3            ! precondition loop
 289         add     %fp,x0_1,%o4
 290         add     %fp,x0_1,%o5
 291         ld      [%i1],%l0               ! hx = *x
 292         ld      [%i1],P0_f0
 293         ld      [%i1+4],P0_f1
 294         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 295         add     %i1,%i2,%i1             ! x += stridex
 296         
 297         ba,pt   %icc,.loop0
 298 !delay slot
 299         nop
 300 
 301         .align 32
 302 .loop0:
 303         lda     [%i1]%asi,%l1           ! preload next argument
 304         sub     %l0,LIM_l6,%g1
 305         sub     LIM_l7,%l0,%o7
 306         fands   P0_f0,MSK_BIT31,P0_f9           ! save signbit
 307 
 308         lda     [%i1]%asi,P1_f10
 309         orcc    %o7,%g1,%g0
 310         mov     %i3,%o0                 ! py0 = y
 311         bl,pn   %icc,.range0            ! if hx < 0x3e400000 or > 0x3fe921fb
 312 
 313 ! delay slot
 314         lda     [%i1+4]%asi,P1_f11
 315         addcc   %i0,-1,%i0
 316         add     %i3,%i4,%i3             ! y += stridey
 317         ble,pn  %icc,.endloop1
 318 
 319 ! delay slot
 320         andn    %l1,MSK_SIGN,%l1
 321         add     %i1,%i2,%i1             ! x += stridex
 322         fabsd   P0_f0,P0_f0
 323         fmuld   C_ONE,C_ONE,C_ONE               ! one*one; a nop for alignment only
 324 
 325 .loop1:
 326         lda     [%i1]%asi,%l2           ! preload next argument
 327         sub     %l1,LIM_l6,%g1
 328         sub     LIM_l7,%l1,%o7
 329         fands   P1_f10,MSK_BIT31,P1_f19         ! save signbit
 330 
 331         lda     [%i1]%asi,P2_f20
 332         orcc    %o7,%g1,%g0
 333         mov     %i3,%o1                 ! py1 = y
 334         bl,pn   %icc,.range1            ! if hx < 0x3e400000 or > 0x3fe921fb
 335 
 336 ! delay slot
 337         lda     [%i1+4]%asi,P2_f21
 338         addcc   %i0,-1,%i0
 339         add     %i3,%i4,%i3             ! y += stridey
 340         ble,pn  %icc,.endloop2
 341 
 342 ! delay slot
 343         andn    %l2,MSK_SIGN,%l2
 344         add     %i1,%i2,%i1             ! x += stridex
 345         fabsd   P1_f10,P1_f10
 346         fmuld   C_ONE,C_ONE,C_ONE               ! one*one; a nop for alignment only
 347 
 348 .loop2:
 349         st      P0_f6,[%o3]
 350         sub     %l2,LIM_l6,%g1
 351         sub     LIM_l7,%l2,%o7
 352         fands   P2_f20,MSK_BIT31,P2_f29         ! save signbit
 353 
 354         st      P0_f7,[%o3+4]
 355         orcc    %g1,%o7,%g0
 356         mov     %i3,%o2                 ! py2 = y
 357         bl,pn   %icc,.range2            ! if hx < 0x3e400000 or > 0x3fe921fb
 358 
 359 ! delay slot
 360         add     %i3,%i4,%i3             ! y += stridey
 361         cmp     %l0,LIM_l5
 362         fabsd   P2_f20,P2_f20
 363         bl,pn   %icc,.case4
 364 
 365 ! delay slot
 366         st      P1_f16,[%o4]
 367         cmp     %l1,LIM_l5
 368         fpadd32s P0_f0,MSK_BIT13,P0_f8
 369         bl,pn   %icc,.case2
 370 
 371 ! delay slot
 372         st      P1_f17,[%o4+4]
 373         cmp     %l2,LIM_l5
 374         fpadd32s P1_f10,MSK_BIT13,P1_f18
 375         bl,pn   %icc,.case1
 376 
 377 ! delay slot
 378         st      P2_f26,[%o5]
 379         mov     %o0,%o3
 380         sethi   %hi(0x3fc3c000),%o7
 381         fpadd32s P2_f20,MSK_BIT13,P2_f28
 382 
 383         st      P2_f27,[%o5+4]
 384         fand    P0_f8,MSK_BITSHI17,P0_f2
 385         mov     %o1,%o4
 386 
 387         fand    P1_f18,MSK_BITSHI17,P1_f12
 388         mov     %o2,%o5
 389         sub     %l0,%o7,%l0
 390 
 391         fand    P2_f28,MSK_BITSHI17,P2_f22
 392         sub     %l1,%o7,%l1
 393         sub     %l2,%o7,%l2
 394 
 395         fsubd   P0_f0,P0_f2,P0_f0
 396         srl     %l0,10,%l0
 397         add     SC_HI,8,%g1;add SC_LO,8,%o7
 398 
 399         fsubd   P1_f10,P1_f12,P1_f10
 400         srl     %l1,10,%l1
 401 
 402         fsubd   P2_f20,P2_f22,P2_f20
 403         srl     %l2,10,%l2
 404 
 405         fmuld   P0_f0,P0_f0,P0_f2
 406         andn    %l0,0x1f,%l0
 407 
 408         fmuld   P1_f10,P1_f10,P1_f12
 409         andn    %l1,0x1f,%l1
 410 
 411         fmuld   P2_f20,P2_f20,P2_f22
 412         andn    %l2,0x1f,%l2
 413 
 414         fmuld   P0_f2,C_pp2,P0_f6
 415         ldd     [%g1+%l0],%f32
 416 
 417         fmuld   P1_f12,C_pp2,P1_f16
 418         ldd     [%g1+%l1],%f36
 419 
 420         fmuld   P2_f22,C_pp2,P2_f26
 421         ldd     [%g1+%l2],%f40
 422 
 423         faddd   P0_f6,C_pp1,P0_f6
 424         fmuld   P0_f2,C_qq2,P0_f4
 425         ldd     [SC_HI+%l0],%f34
 426 
 427         faddd   P1_f16,C_pp1,P1_f16
 428         fmuld   P1_f12,C_qq2,P1_f14
 429         ldd     [SC_HI+%l1],%f38
 430 
 431         faddd   P2_f26,C_pp1,P2_f26
 432         fmuld   P2_f22,C_qq2,P2_f24
 433         ldd     [SC_HI+%l2],%f42
 434 
 435         fmuld   P0_f2,P0_f6,P0_f6
 436         faddd   P0_f4,C_qq1,P0_f4
 437 
 438         fmuld   P1_f12,P1_f16,P1_f16
 439         faddd   P1_f14,C_qq1,P1_f14
 440 
 441         fmuld   P2_f22,P2_f26,P2_f26
 442         faddd   P2_f24,C_qq1,P2_f24
 443 
 444         faddd   P0_f6,C_ONE,P0_f6
 445         fmuld   P0_f2,P0_f4,P0_f4
 446 
 447         faddd   P1_f16,C_ONE,P1_f16
 448         fmuld   P1_f12,P1_f14,P1_f14
 449 
 450         faddd   P2_f26,C_ONE,P2_f26
 451         fmuld   P2_f22,P2_f24,P2_f24
 452 
 453         fmuld   P0_f0,P0_f6,P0_f6
 454         ldd     [%o7+%l0],P0_f2
 455 
 456         fmuld   P1_f10,P1_f16,P1_f16
 457         ldd     [%o7+%l1],P1_f12
 458 
 459         fmuld   P2_f20,P2_f26,P2_f26
 460         ldd     [%o7+%l2],P2_f22
 461 
 462         fmuld   P0_f4,%f32,P0_f4
 463         lda     [%i1]%asi,%l0           ! preload next argument
 464 
 465         fmuld   P1_f14,%f36,P1_f14
 466         lda     [%i1]%asi,P0_f0
 467 
 468         fmuld   P2_f24,%f40,P2_f24
 469         lda     [%i1+4]%asi,P0_f1
 470 
 471         fmuld   P0_f6,%f34,P0_f6
 472         add     %i1,%i2,%i1             ! x += stridex
 473 
 474         fmuld   P1_f16,%f38,P1_f16
 475 
 476         fmuld   P2_f26,%f42,P2_f26
 477 
 478         fsubd   P0_f6,P0_f4,P0_f6
 479 
 480         fsubd   P1_f16,P1_f14,P1_f16
 481 
 482         fsubd   P2_f26,P2_f24,P2_f26
 483 
 484         fsubd   P0_f2,P0_f6,P0_f6
 485 
 486         fsubd   P1_f12,P1_f16,P1_f16
 487 
 488         fsubd   P2_f22,P2_f26,P2_f26
 489 
 490         faddd   P0_f6,%f32,P0_f6
 491 
 492         faddd   P1_f16,%f36,P1_f16
 493 
 494         faddd   P2_f26,%f40,P2_f26
 495         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 496 
 497         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
 498         addcc   %i0,-1,%i0
 499 
 500         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
 501         bg,pt   %icc,.loop0
 502 
 503 ! delay slot
 504         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
 505 
 506         ba,pt   %icc,.endloop0
 507 ! delay slot
 508         nop
 509 
 510         .align  32
 511 .case1:
 512         st      P2_f27,[%o5+4]
 513         sethi   %hi(0x3fc3c000),%o7
 514         fand    P0_f8,MSK_BITSHI17,P0_f2
 515 
 516         sub     %l0,%o7,%l0
 517         sub     %l1,%o7,%l1
 518         add     SC_HI,8,%g1;add SC_LO,8,%o7
 519         fand    P1_f18,MSK_BITSHI17,P1_f12
 520         fmuld   P2_f20,P2_f20,P2_f22
 521 
 522         fsubd   P0_f0,P0_f2,P0_f0
 523         srl     %l0,10,%l0
 524         mov     %o0,%o3
 525 
 526         fsubd   P1_f10,P1_f12,P1_f10
 527         srl     %l1,10,%l1
 528         mov     %o1,%o4
 529 
 530         fmuld   P2_f22,C_q4,P2_f24
 531         mov     %o2,%o5
 532 
 533         fmuld   P0_f0,P0_f0,P0_f2
 534         andn    %l0,0x1f,%l0
 535 
 536         fmuld   P1_f10,P1_f10,P1_f12
 537         andn    %l1,0x1f,%l1
 538 
 539         faddd   P2_f24,C_q3,P2_f24
 540 
 541         fmuld   P0_f2,C_pp2,P0_f6
 542         ldd     [%g1+%l0],%f32
 543 
 544         fmuld   P1_f12,C_pp2,P1_f16
 545         ldd     [%g1+%l1],%f36
 546 
 547         fmuld   P2_f22,P2_f24,P2_f24
 548 
 549         faddd   P0_f6,C_pp1,P0_f6
 550         fmuld   P0_f2,C_qq2,P0_f4
 551         ldd     [SC_HI+%l0],%f34
 552 
 553         faddd   P1_f16,C_pp1,P1_f16
 554         fmuld   P1_f12,C_qq2,P1_f14
 555         ldd     [SC_HI+%l1],%f38
 556 
 557         faddd   P2_f24,C_q2,P2_f24
 558 
 559         fmuld   P0_f2,P0_f6,P0_f6
 560         faddd   P0_f4,C_qq1,P0_f4
 561 
 562         fmuld   P1_f12,P1_f16,P1_f16
 563         faddd   P1_f14,C_qq1,P1_f14
 564 
 565         fmuld   P2_f22,P2_f24,P2_f24
 566 
 567         faddd   P0_f6,C_ONE,P0_f6
 568         fmuld   P0_f2,P0_f4,P0_f4
 569 
 570         faddd   P1_f16,C_ONE,P1_f16
 571         fmuld   P1_f12,P1_f14,P1_f14
 572 
 573         faddd   P2_f24,C_q1,P2_f24
 574 
 575         fmuld   P0_f0,P0_f6,P0_f6
 576         ldd     [%o7+%l0],P0_f2
 577 
 578         fmuld   P1_f10,P1_f16,P1_f16
 579         ldd     [%o7+%l1],P1_f12
 580 
 581         fmuld   P0_f4,%f32,P0_f4
 582         lda     [%i1]%asi,%l0           ! preload next argument
 583 
 584         fmuld   P1_f14,%f36,P1_f14
 585         lda     [%i1]%asi,P0_f0
 586 
 587         fmuld   P0_f6,%f34,P0_f6
 588         lda     [%i1+4]%asi,P0_f1
 589 
 590         fmuld   P1_f16,%f38,P1_f16
 591         add     %i1,%i2,%i1             ! x += stridex
 592 
 593         fmuld   P2_f22,P2_f24,P2_f24
 594 
 595         fsubd   P0_f6,P0_f4,P0_f6
 596 
 597         fsubd   P1_f16,P1_f14,P1_f16
 598 
 599         !!(vsin)fmuld   P2_f20,P2_f24,P2_f24
 600 
 601         fsubd   P0_f2,P0_f6,P0_f6
 602 
 603         fsubd   P1_f12,P1_f16,P1_f16
 604 
 605         faddd   C_ONE,P2_f24,P2_f26 !!(vsin)faddd       P2_f20,P2_f24,P2_f26
 606 
 607         faddd   P0_f6,%f32,P0_f6
 608 
 609         faddd   P1_f16,%f36,P1_f16
 610         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 611 
 612         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
 613         addcc   %i0,-1,%i0
 614 
 615         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
 616         bg,pt   %icc,.loop0
 617 
 618 ! delay slot
 619         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
 620 
 621         ba,pt   %icc,.endloop0
 622 ! delay slot
 623         nop
 624 
 625         .align  32
 626 .case2:
 627         st      P2_f26,[%o5]
 628         cmp     %l2,LIM_l5
 629         fpadd32s P2_f20,MSK_BIT13,P2_f28
 630         bl,pn   %icc,.case3
 631 
 632 ! delay slot
 633         st      P2_f27,[%o5+4]
 634         sethi   %hi(0x3fc3c000),%o7
 635         fand    P0_f8,MSK_BITSHI17,P0_f2
 636 
 637         sub     %l0,%o7,%l0
 638         sub     %l2,%o7,%l2
 639         add     SC_HI,8,%g1;add SC_LO,8,%o7
 640         fand    P2_f28,MSK_BITSHI17,P2_f22
 641         fmuld   P1_f10,P1_f10,P1_f12
 642 
 643         fsubd   P0_f0,P0_f2,P0_f0
 644         srl     %l0,10,%l0
 645         mov     %o0,%o3
 646 
 647         fsubd   P2_f20,P2_f22,P2_f20
 648         srl     %l2,10,%l2
 649         mov     %o2,%o5
 650 
 651         fmuld   P1_f12,C_q4,P1_f14
 652         mov     %o1,%o4
 653 
 654         fmuld   P0_f0,P0_f0,P0_f2
 655         andn    %l0,0x1f,%l0
 656 
 657         fmuld   P2_f20,P2_f20,P2_f22
 658         andn    %l2,0x1f,%l2
 659 
 660         faddd   P1_f14,C_q3,P1_f14
 661 
 662         fmuld   P0_f2,C_pp2,P0_f6
 663         ldd     [%g1+%l0],%f32
 664 
 665         fmuld   P2_f22,C_pp2,P2_f26
 666         ldd     [%g1+%l2],%f40
 667 
 668         fmuld   P1_f12,P1_f14,P1_f14
 669 
 670         faddd   P0_f6,C_pp1,P0_f6
 671         fmuld   P0_f2,C_qq2,P0_f4
 672         ldd     [SC_HI+%l0],%f34
 673 
 674         faddd   P2_f26,C_pp1,P2_f26
 675         fmuld   P2_f22,C_qq2,P2_f24
 676         ldd     [SC_HI+%l2],%f42
 677 
 678         faddd   P1_f14,C_q2,P1_f14
 679 
 680         fmuld   P0_f2,P0_f6,P0_f6
 681         faddd   P0_f4,C_qq1,P0_f4
 682 
 683         fmuld   P2_f22,P2_f26,P2_f26
 684         faddd   P2_f24,C_qq1,P2_f24
 685 
 686         fmuld   P1_f12,P1_f14,P1_f14
 687 
 688         faddd   P0_f6,C_ONE,P0_f6
 689         fmuld   P0_f2,P0_f4,P0_f4
 690 
 691         faddd   P2_f26,C_ONE,P2_f26
 692         fmuld   P2_f22,P2_f24,P2_f24
 693 
 694         faddd   P1_f14,C_q1,P1_f14
 695 
 696         fmuld   P0_f0,P0_f6,P0_f6
 697         ldd     [%o7+%l0],P0_f2
 698 
 699         fmuld   P2_f20,P2_f26,P2_f26
 700         ldd     [%o7+%l2],P2_f22
 701 
 702         fmuld   P0_f4,%f32,P0_f4
 703         lda     [%i1]%asi,%l0           ! preload next argument
 704 
 705         fmuld   P2_f24,%f40,P2_f24
 706         lda     [%i1]%asi,P0_f0
 707 
 708         fmuld   P0_f6,%f34,P0_f6
 709         lda     [%i1+4]%asi,P0_f1
 710 
 711         fmuld   P2_f26,%f42,P2_f26
 712         add     %i1,%i2,%i1             ! x += stridex
 713 
 714         fmuld   P1_f12,P1_f14,P1_f14
 715 
 716         fsubd   P0_f6,P0_f4,P0_f6
 717 
 718         fsubd   P2_f26,P2_f24,P2_f26
 719 
 720         !!(vsin)fmuld   P1_f10,P1_f14,P1_f14
 721 
 722         fsubd   P0_f2,P0_f6,P0_f6
 723 
 724         fsubd   P2_f22,P2_f26,P2_f26
 725 
 726         faddd   C_ONE,P1_f14,P1_f16 !!(vsin)faddd       P1_f10,P1_f14,P1_f16
 727 
 728         faddd   P0_f6,%f32,P0_f6
 729 
 730         faddd   P2_f26,%f40,P2_f26
 731         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 732 
 733         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
 734         addcc   %i0,-1,%i0
 735 
 736         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
 737         bg,pt   %icc,.loop0
 738 
 739 ! delay slot
 740         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
 741 
 742         ba,pt   %icc,.endloop0
 743 ! delay slot
 744         nop
 745 
 746         .align  32
 747 .case3:
 748         sethi   %hi(0x3fc3c000),%o7
 749         fand    P0_f8,MSK_BITSHI17,P0_f2
 750         fmuld   P1_f10,P1_f10,P1_f12
 751 
 752         sub     %l0,%o7,%l0
 753         add     SC_HI,8,%g1;add SC_LO,8,%o7
 754         fmuld   P2_f20,P2_f20,P2_f22
 755 
 756         fsubd   P0_f0,P0_f2,P0_f0
 757         srl     %l0,10,%l0
 758         mov     %o0,%o3
 759 
 760         fmuld   P1_f12,C_q4,P1_f14
 761         mov     %o1,%o4
 762 
 763         fmuld   P2_f22,C_q4,P2_f24
 764         mov     %o2,%o5
 765 
 766         fmuld   P0_f0,P0_f0,P0_f2
 767         andn    %l0,0x1f,%l0
 768 
 769         faddd   P1_f14,C_q3,P1_f14
 770 
 771         faddd   P2_f24,C_q3,P2_f24
 772 
 773         fmuld   P0_f2,C_pp2,P0_f6
 774         ldd     [%g1+%l0],%f32
 775 
 776         fmuld   P1_f12,P1_f14,P1_f14
 777 
 778         fmuld   P2_f22,P2_f24,P2_f24
 779 
 780         faddd   P0_f6,C_pp1,P0_f6
 781         fmuld   P0_f2,C_qq2,P0_f4
 782         ldd     [SC_HI+%l0],%f34
 783 
 784         faddd   P1_f14,C_q2,P1_f14
 785 
 786         faddd   P2_f24,C_q2,P2_f24
 787 
 788         fmuld   P0_f2,P0_f6,P0_f6
 789         faddd   P0_f4,C_qq1,P0_f4
 790 
 791         fmuld   P1_f12,P1_f14,P1_f14
 792 
 793         fmuld   P2_f22,P2_f24,P2_f24
 794 
 795         faddd   P0_f6,C_ONE,P0_f6
 796         fmuld   P0_f2,P0_f4,P0_f4
 797 
 798         faddd   P1_f14,C_q1,P1_f14
 799 
 800         faddd   P2_f24,C_q1,P2_f24
 801 
 802         fmuld   P0_f0,P0_f6,P0_f6
 803         ldd     [%o7+%l0],P0_f2
 804 
 805         fmuld   P0_f4,%f32,P0_f4
 806         lda     [%i1]%asi,%l0           ! preload next argument
 807 
 808         fmuld   P1_f12,P1_f14,P1_f14
 809         lda     [%i1]%asi,P0_f0
 810 
 811         fmuld   P0_f6,%f34,P0_f6
 812         lda     [%i1+4]%asi,P0_f1
 813 
 814         fmuld   P2_f22,P2_f24,P2_f24
 815         add     %i1,%i2,%i1             ! x += stridex
 816 
 817         !!(vsin)fmuld   P1_f10,P1_f14,P1_f14
 818 
 819         fsubd   P0_f6,P0_f4,P0_f6
 820 
 821         !!(vsin)fmuld   P2_f20,P2_f24,P2_f24
 822 
 823         faddd   C_ONE,P1_f14,P1_f16 !!(vsin)faddd       P1_f10,P1_f14,P1_f16
 824 
 825         fsubd   P0_f2,P0_f6,P0_f6
 826 
 827         faddd   C_ONE,P2_f24,P2_f26 !!(vsin)faddd       P2_f20,P2_f24,P2_f26
 828 
 829         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
 830         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 831 
 832         faddd   P0_f6,%f32,P0_f6
 833         addcc   %i0,-1,%i0
 834 
 835         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
 836         bg,pt   %icc,.loop0
 837 
 838 ! delay slot
 839         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
 840 
 841         ba,pt   %icc,.endloop0
 842 ! delay slot
 843         nop
 844 
 845         .align  32
 846 .case4:
 847         st      P1_f17,[%o4+4]
 848         cmp     %l1,LIM_l5
 849         fpadd32s P1_f10,MSK_BIT13,P1_f18
 850         bl,pn   %icc,.case6
 851 
 852 ! delay slot
 853         st      P2_f26,[%o5]
 854         cmp     %l2,LIM_l5
 855         fpadd32s P2_f20,MSK_BIT13,P2_f28
 856         bl,pn   %icc,.case5
 857 
 858 ! delay slot
 859         st      P2_f27,[%o5+4]
 860         sethi   %hi(0x3fc3c000),%o7
 861         fand    P1_f18,MSK_BITSHI17,P1_f12
 862 
 863         sub     %l1,%o7,%l1
 864         sub     %l2,%o7,%l2
 865         add     SC_HI,8,%g1;add SC_LO,8,%o7
 866         fand    P2_f28,MSK_BITSHI17,P2_f22
 867         fmuld   P0_f0,P0_f0,P0_f2
 868 
 869         fsubd   P1_f10,P1_f12,P1_f10
 870         srl     %l1,10,%l1
 871         mov     %o1,%o4
 872 
 873         fsubd   P2_f20,P2_f22,P2_f20
 874         srl     %l2,10,%l2
 875         mov     %o2,%o5
 876 
 877         fmovd   P0_f0,P0_f6             !ID for processing
 878         fmuld   P0_f2,C_q4,P0_f4
 879         mov     %o0,%o3
 880 
 881         fmuld   P1_f10,P1_f10,P1_f12
 882         andn    %l1,0x1f,%l1
 883 
 884         fmuld   P2_f20,P2_f20,P2_f22
 885         andn    %l2,0x1f,%l2
 886 
 887         faddd   P0_f4,C_q3,P0_f4
 888 
 889         fmuld   P1_f12,C_pp2,P1_f16
 890         ldd     [%g1+%l1],%f36
 891 
 892         fmuld   P2_f22,C_pp2,P2_f26
 893         ldd     [%g1+%l2],%f40
 894 
 895         fmuld   P0_f2,P0_f4,P0_f4
 896 
 897         faddd   P1_f16,C_pp1,P1_f16
 898         fmuld   P1_f12,C_qq2,P1_f14
 899         ldd     [SC_HI+%l1],%f38
 900 
 901         faddd   P2_f26,C_pp1,P2_f26
 902         fmuld   P2_f22,C_qq2,P2_f24
 903         ldd     [SC_HI+%l2],%f42
 904 
 905         faddd   P0_f4,C_q2,P0_f4
 906 
 907         fmuld   P1_f12,P1_f16,P1_f16
 908         faddd   P1_f14,C_qq1,P1_f14
 909 
 910         fmuld   P2_f22,P2_f26,P2_f26
 911         faddd   P2_f24,C_qq1,P2_f24
 912 
 913         fmuld   P0_f2,P0_f4,P0_f4
 914 
 915         faddd   P1_f16,C_ONE,P1_f16
 916         fmuld   P1_f12,P1_f14,P1_f14
 917 
 918         faddd   P2_f26,C_ONE,P2_f26
 919         fmuld   P2_f22,P2_f24,P2_f24
 920 
 921         faddd   P0_f4,C_q1,P0_f4
 922 
 923         fmuld   P1_f10,P1_f16,P1_f16
 924         ldd     [%o7+%l1],P1_f12
 925 
 926         fmuld   P2_f20,P2_f26,P2_f26
 927         ldd     [%o7+%l2],P2_f22
 928 
 929         fmuld   P1_f14,%f36,P1_f14
 930         lda     [%i1]%asi,%l0           ! preload next argument
 931 
 932         fmuld   P2_f24,%f40,P2_f24
 933         lda     [%i1]%asi,P0_f0
 934 
 935         fmuld   P1_f16,%f38,P1_f16
 936         lda     [%i1+4]%asi,P0_f1
 937 
 938         fmuld   P2_f26,%f42,P2_f26
 939         add     %i1,%i2,%i1             ! x += stridex
 940 
 941         fmuld   P0_f2,P0_f4,P0_f4
 942 
 943         fsubd   P1_f16,P1_f14,P1_f16
 944 
 945         fsubd   P2_f26,P2_f24,P2_f26
 946 
 947         !!(vsin)fmuld   P0_f6,P0_f4,P0_f4
 948 
 949         fsubd   P1_f12,P1_f16,P1_f16
 950 
 951         fsubd   P2_f22,P2_f26,P2_f26
 952 
 953         faddd   C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6     ! faddd then spaces for processing
 954 
 955         faddd   P1_f16,%f36,P1_f16
 956 
 957         faddd   P2_f26,%f40,P2_f26
 958         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
 959 
 960         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
 961         addcc   %i0,-1,%i0
 962 
 963         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
 964         bg,pt   %icc,.loop0
 965 
 966 ! delay slot
 967         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
 968 
 969         ba,pt   %icc,.endloop0
 970 ! delay slot
 971         nop
 972 
 973         .align  32
 974 .case5:
 975         sethi   %hi(0x3fc3c000),%o7
 976         fand    P1_f18,MSK_BITSHI17,P1_f12
 977         fmuld   P0_f0,P0_f0,P0_f2
 978 
 979         sub     %l1,%o7,%l1
 980         add     SC_HI,8,%g1;add SC_LO,8,%o7
 981         fmuld   P2_f20,P2_f20,P2_f22
 982 
 983         fsubd   P1_f10,P1_f12,P1_f10
 984         srl     %l1,10,%l1
 985         mov     %o1,%o4
 986 
 987         fmovd   P0_f0,P0_f6             !ID for processing
 988         fmuld   P0_f2,C_q4,P0_f4
 989         mov     %o0,%o3
 990 
 991         fmuld   P2_f22,C_q4,P2_f24
 992         mov     %o2,%o5
 993 
 994         fmuld   P1_f10,P1_f10,P1_f12
 995         andn    %l1,0x1f,%l1
 996 
 997         faddd   P0_f4,C_q3,P0_f4
 998 
 999         faddd   P2_f24,C_q3,P2_f24
1000 
1001         fmuld   P1_f12,C_pp2,P1_f16
1002         ldd     [%g1+%l1],%f36
1003 
1004         fmuld   P0_f2,P0_f4,P0_f4
1005 
1006         fmuld   P2_f22,P2_f24,P2_f24
1007 
1008         faddd   P1_f16,C_pp1,P1_f16
1009         fmuld   P1_f12,C_qq2,P1_f14
1010         ldd     [SC_HI+%l1],%f38
1011 
1012         faddd   P0_f4,C_q2,P0_f4
1013 
1014         faddd   P2_f24,C_q2,P2_f24
1015 
1016         fmuld   P1_f12,P1_f16,P1_f16
1017         faddd   P1_f14,C_qq1,P1_f14
1018 
1019         fmuld   P0_f2,P0_f4,P0_f4
1020 
1021         fmuld   P2_f22,P2_f24,P2_f24
1022 
1023         faddd   P1_f16,C_ONE,P1_f16
1024         fmuld   P1_f12,P1_f14,P1_f14
1025 
1026         faddd   P0_f4,C_q1,P0_f4
1027 
1028         faddd   P2_f24,C_q1,P2_f24
1029 
1030         fmuld   P1_f10,P1_f16,P1_f16
1031         ldd     [%o7+%l1],P1_f12
1032 
1033         fmuld   P1_f14,%f36,P1_f14
1034         lda     [%i1]%asi,%l0           ! preload next argument
1035 
1036         fmuld   P0_f2,P0_f4,P0_f4
1037         lda     [%i1]%asi,P0_f0
1038 
1039         fmuld   P1_f16,%f38,P1_f16
1040         lda     [%i1+4]%asi,P0_f1
1041 
1042         fmuld   P2_f22,P2_f24,P2_f24
1043         add     %i1,%i2,%i1             ! x += stridex
1044 
1045         !!(vsin)fmuld   P0_f6,P0_f4,P0_f4
1046 
1047         fsubd   P1_f16,P1_f14,P1_f16
1048 
1049         !!(vsin)fmuld   P2_f20,P2_f24,P2_f24
1050 
1051         faddd   C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6     ! faddd then spaces for processing
1052 
1053         fsubd   P1_f12,P1_f16,P1_f16
1054 
1055         faddd   C_ONE,P2_f24,P2_f26 !!(vsin)faddd       P2_f20,P2_f24,P2_f26
1056 
1057         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
1058         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
1059 
1060         faddd   P1_f16,%f36,P1_f16
1061         addcc   %i0,-1,%i0
1062 
1063         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
1064         bg,pt   %icc,.loop0
1065 
1066 ! delay slot
1067         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
1068 
1069         ba,pt   %icc,.endloop0
1070 ! delay slot
1071         nop
1072 
1073         .align  32
1074 .case6:
! .case6 -- primary-range work for one triple in which elements 0 and 1
! take the polynomial-only path and element 2 takes the table-driven path.
! If hx2 (%l2) is below LIM_l5, element 2 does not need the table either
! and control transfers to .case7.
!
!   elements 0,1:  z = x*x
!                  result = C_ONE + z*(C_q1 + z*(C_q2 + z*(C_q3 + z*C_q4)))
!   element 2:     b = (x2 + MSK_BIT13 in the high word) & MSK_BITSHI17
!                  r = x2 - b,  z = r*r
!                  s = r*(C_ONE + z*(C_pp1 + z*C_pp2))
!                  c = z*(C_qq1 + z*C_qq2)
!                  result = T8 + (L8 - (s*T0 - c*T8))
!                  with T0 = [SC_HI+%l2], T8 = [SC_HI+8+%l2],
!                  L8 = [SC_LO+8+%l2]
!
! The P*_, C_*, SC_*, MSK_* and LIM_* names are macros defined earlier in
! the file (not visible in this section).  Results are left in P0_f6,
! P1_f16 and P2_f26 and the output pointers are copied to %o3, %o4, %o5;
! .endloop0 and .MEDIUM store those registers through those pointers.
! The next argument is preloaded into %l0 and P0_f0/P0_f1 before looping.
1075         st      P2_f27,[%o5+4]          ! low word of a pending P2 result; NOTE(review): high word presumably stored before this label
1076         cmp     %l2,LIM_l5
1077         fpadd32s P2_f20,MSK_BIT13,P2_f28
1078         bl,pn   %icc,.case7             ! hx2 < LIM_l5: all three use the polynomial
1079 
1080 ! delay slot
1081         sethi   %hi(0x3fc3c000),%o7
1082         fand    P2_f28,MSK_BITSHI17,P2_f22      ! b = masked, rounded x2
1083         fmuld   P0_f0,P0_f0,P0_f2
1084 
1085         sub     %l2,%o7,%l2             ! hx2 - 0x3fc3c000
1086         add     SC_HI,8,%g1;add SC_LO,8,%o7
1087         fmuld   P1_f10,P1_f10,P1_f12
1088 
1089         fsubd   P2_f20,P2_f22,P2_f20    ! r = x2 - b
1090         srl     %l2,10,%l2
1091         mov     %o2,%o5
1092 
1093         fmovd   P0_f0,P0_f6             !ID for processing
1094         fmuld   P0_f2,C_q4,P0_f4
1095         mov     %o0,%o3
1096 
1097         fmuld   P1_f12,C_q4,P1_f14
1098         mov     %o1,%o4
1099 
1100         fmuld   P2_f20,P2_f20,P2_f22    ! z = r*r
1101         andn    %l2,0x1f,%l2            ! table byte offset, multiple of 32
1102 
1103         faddd   P0_f4,C_q3,P0_f4
1104 
1105         faddd   P1_f14,C_q3,P1_f14
1106 
1107         fmuld   P2_f22,C_pp2,P2_f26
1108         ldd     [%g1+%l2],%f40          ! T8 = [SC_HI+8+offset]
1109 
1110         fmuld   P0_f2,P0_f4,P0_f4
1111 
1112         fmuld   P1_f12,P1_f14,P1_f14
1113 
1114         faddd   P2_f26,C_pp1,P2_f26
1115         fmuld   P2_f22,C_qq2,P2_f24
1116         ldd     [SC_HI+%l2],%f42        ! T0 = [SC_HI+offset]
1117 
1118         faddd   P0_f4,C_q2,P0_f4
1119 
1120         faddd   P1_f14,C_q2,P1_f14
1121 
1122         fmuld   P2_f22,P2_f26,P2_f26
1123         faddd   P2_f24,C_qq1,P2_f24
1124 
1125         fmuld   P0_f2,P0_f4,P0_f4
1126 
1127         fmuld   P1_f12,P1_f14,P1_f14
1128 
1129         faddd   P2_f26,C_ONE,P2_f26
1130         fmuld   P2_f22,P2_f24,P2_f24    ! c = z*(qq1 + z*qq2)
1131 
1132         faddd   P0_f4,C_q1,P0_f4
1133 
1134         faddd   P1_f14,C_q1,P1_f14
1135 
1136         fmuld   P2_f20,P2_f26,P2_f26    ! s = r*(1 + z*(pp1 + z*pp2))
1137         ldd     [%o7+%l2],P2_f22        ! L8 = [SC_LO+8+offset]
1138 
1139         fmuld   P2_f24,%f40,P2_f24      ! c*T8
1140         lda     [%i1]%asi,%l0           ! preload next argument
1141 
1142         fmuld   P0_f2,P0_f4,P0_f4
1143         lda     [%i1]%asi,P0_f0
1144 
1145         fmuld   P2_f26,%f42,P2_f26      ! s*T0
1146         lda     [%i1+4]%asi,P0_f1
1147 
1148         fmuld   P1_f12,P1_f14,P1_f14
1149         add     %i1,%i2,%i1             ! x += stridex
1150 
1151         !!(vsin)fmuld   P0_f6,P0_f4,P0_f4
1152 
1153         fsubd   P2_f26,P2_f24,P2_f26    ! s*T0 - c*T8
1154 
1155         !!(vsin)fmuld   P1_f10,P1_f14,P1_f14
1156 
1157         faddd   C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6     ! faddd then spaces for processing
1158 
1159         fsubd   P2_f22,P2_f26,P2_f26    ! L8 - (s*T0 - c*T8)
1160 
1161         faddd   C_ONE,P1_f14,P1_f16 !!(vsin)faddd       P1_f10,P1_f14,P1_f16
1162 
1163         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
1164         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
1165 
1166         faddd   P2_f26,%f40,P2_f26      ! element 2 result
1167         addcc   %i0,-1,%i0
1168 
1169         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
1170         bg,pt   %icc,.loop0             ! more elements remain
1171 
1172 ! delay slot
1173         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
1174 
1175         ba,pt   %icc,.endloop0          ! count exhausted: flush pending results
1176 ! delay slot
1177         nop
1178 
1179         .align  32
1180 .case7:
! .case7 -- primary-range work for one triple in which all three elements
! take the polynomial-only path.  For each element, with z = x*x:
!       result = C_ONE + z*(C_q1 + z*(C_q2 + z*(C_q3 + z*C_q4)))
! The three evaluations are interleaved instruction by instruction.
! Results are left in P0_f6, P1_f16 and P2_f26 and the output pointers are
! copied to %o3, %o4, %o5 (.endloop0 and .MEDIUM store them later).  The
! next argument is preloaded into %l0 and P0_f0/P0_f1; control returns to
! .loop0 while elements remain, otherwise goes to .endloop0.
! The lines marked !!(vsin) are the sine-variant instructions that this
! cosine file leaves disabled.
1181         fmuld   P0_f0,P0_f0,P0_f2
1182         fmovd   P0_f0,P0_f6             !ID for processing
1183         mov     %o0,%o3
1184 
1185         fmuld   P1_f10,P1_f10,P1_f12
1186         mov     %o1,%o4
1187 
1188         fmuld   P2_f20,P2_f20,P2_f22
1189         mov     %o2,%o5
1190 
1191         fmuld   P0_f2,C_q4,P0_f4
1192         lda     [%i1]%asi,%l0           ! preload next argument
1193 
1194         fmuld   P1_f12,C_q4,P1_f14
1195         lda     [%i1]%asi,P0_f0
1196 
1197         fmuld   P2_f22,C_q4,P2_f24
1198         lda     [%i1+4]%asi,P0_f1
1199 
1200         faddd   P0_f4,C_q3,P0_f4
1201         add     %i1,%i2,%i1             ! x += stridex
1202 
1203         faddd   P1_f14,C_q3,P1_f14
1204 
1205         faddd   P2_f24,C_q3,P2_f24
1206 
1207         fmuld   P0_f2,P0_f4,P0_f4
1208 
1209         fmuld   P1_f12,P1_f14,P1_f14
1210 
1211         fmuld   P2_f22,P2_f24,P2_f24
1212 
1213         faddd   P0_f4,C_q2,P0_f4
1214 
1215         faddd   P1_f14,C_q2,P1_f14
1216 
1217         faddd   P2_f24,C_q2,P2_f24
1218 
1219         fmuld   P0_f2,P0_f4,P0_f4
1220 
1221         fmuld   P1_f12,P1_f14,P1_f14
1222 
1223         fmuld   P2_f22,P2_f24,P2_f24
1224 
1225         faddd   P0_f4,C_q1,P0_f4
1226 
1227         faddd   P1_f14,C_q1,P1_f14
1228 
1229         faddd   P2_f24,C_q1,P2_f24
1230 
1231         fmuld   P0_f2,P0_f4,P0_f4
1232 
1233         fmuld   P1_f12,P1_f14,P1_f14
1234 
1235         fmuld   P2_f22,P2_f24,P2_f24
1236 
1237         !!(vsin)fmuld   P0_f6,P0_f4,P0_f4
1238 
1239         !!(vsin)fmuld   P1_f10,P1_f14,P1_f14
1240 
1241         !!(vsin)fmuld   P2_f20,P2_f24,P2_f24
1242 
1243         faddd   C_ONE,P0_f4,P0_f6 !!(vsin)faddd   P0_f6,P0_f4,P0_f6     ! faddd then spaces for processing
1244 
1245         faddd   C_ONE,P1_f14,P1_f16 !!(vsin)faddd       P1_f10,P1_f14,P1_f16
1246 
1247         faddd   C_ONE,P2_f24,P2_f26 !!(vsin)faddd       P2_f20,P2_f24,P2_f26
1248         andn    %l0,MSK_SIGN,%l0                ! hx &= ~0x80000000
1249 
1250         nop     !!(vsin)        fors    P0_f6,P0_f9,P0_f6
1251         addcc   %i0,-1,%i0
1252 
1253         nop     !!(vsin)        fors    P1_f16,P1_f19,P1_f16
1254         bg,pt   %icc,.loop0             ! more elements remain
1255 
1256 ! delay slot
1257         nop     !!(vsin)        fors    P2_f26,P2_f29,P2_f26
1258 
1259         ba,pt   %icc,.endloop0          ! count exhausted: flush pending results
1260 ! delay slot
1261         nop
1262 
1263 
1264         .align  32
1265 .endloop2:
! .endloop2 -- the element count ran out with elements 0 and 1 of the
! current triple still to be computed (.range2 branches here when the
! count is exhausted).  This block computes element 1 without software
! pipelining, stores it through %o1, then falls through into .endloop1
! which does the same for element 0.
!
!   hx1 (%l1) >= LIM_l5:  table path, same formula as the pipelined loop:
!       r = |x1| - b,  z = r*r
!       result = T8 + (L8 - (r*(1 + z*(pp1 + z*pp2))*T0 - z*(qq1 + z*qq2)*T8))
!       with T0 = [SC_HI+%l1], T8 = [SC_HI+8+%l1], L8 = [SC_LO+8+%l1]
!   hx1 <  LIM_l5 (label 1):  result = C_ONE + z*(q1 + z*(q2 + z*(q3 + z*q4)))
!
! P2_f20 is used as the result register here.
1266         cmp     %l1,LIM_l5
1267         bl,pn   %icc,1f                 ! small argument: polynomial only
1268 ! delay slot
1269         fabsd   P1_f10,P1_f10
1270         sethi   %hi(0x3fc3c000),%o7
1271         fpadd32s P1_f10,MSK_BIT13,P1_f18
1272         fand    P1_f18,MSK_BITSHI17,P1_f12      ! b = masked, rounded |x1|
1273         sub     %l1,%o7,%l1
1274         add     SC_HI,8,%g1;add SC_LO,8,%o7
1275         fsubd   P1_f10,P1_f12,P1_f10    ! r = |x1| - b
1276         srl     %l1,10,%l1
1277         fmuld   P1_f10,P1_f10,P1_f12    ! z = r*r
1278         andn    %l1,0x1f,%l1            ! table byte offset, multiple of 32
1279         fmuld   P1_f12,C_pp2,P2_f20
1280         ldd     [%g1+%l1],%f36          ! T8
1281         faddd   P2_f20,C_pp1,P2_f20
1282         fmuld   P1_f12,C_qq2,P1_f14
1283         ldd     [SC_HI+%l1],%f38        ! T0
1284         fmuld   P1_f12,P2_f20,P2_f20
1285         faddd   P1_f14,C_qq1,P1_f14
1286         faddd   P2_f20,C_ONE,P2_f20
1287         fmuld   P1_f12,P1_f14,P1_f14
1288         fmuld   P1_f10,P2_f20,P2_f20
1289         ldd     [%o7+%l1],P1_f12        ! L8
1290         fmuld   P1_f14,%f36,P1_f14
1291         fmuld   P2_f20,%f38,P2_f20
1292         fsubd   P2_f20,P1_f14,P2_f20
1293         fsubd   P1_f12,P2_f20,P2_f20
1294         ba,pt   %icc,2f
1295 ! delay slot
1296         faddd   P2_f20,%f36,P2_f20
1297 1:
1298         fmuld   P1_f10,P1_f10,P1_f12
1299         fmuld   P1_f12,C_q4,P1_f14
1300         faddd   P1_f14,C_q3,P1_f14
1301         fmuld   P1_f12,P1_f14,P1_f14
1302         faddd   P1_f14,C_q2,P1_f14
1303         fmuld   P1_f12,P1_f14,P1_f14
1304         faddd   P1_f14,C_q1,P1_f14
1305         fmuld   P1_f12,P1_f14,P1_f14
1306         !!(vsin)fmuld   P1_f10,P1_f14,P1_f14
1307         faddd   C_ONE,P1_f14,P2_f20 !!(vsin)faddd       P1_f10,P1_f14,P2_f20
1308 2:
1309         nop     !!(vsin)        fors    P2_f20,P1_f19,P2_f20
1310         st      P2_f20,[%o1]            ! store element 1 result as two words
1311         st      P2_f21,[%o1+4]
1312 
1313 .endloop1:
! .endloop1 -- the element count ran out with element 0 of the current
! triple still to be computed (.range1 branches here when the count is
! exhausted; .endloop2 falls through into here).  Computes element 0
! without software pipelining, stores it through %o0, then falls through
! into .endloop0.
!
!   hx0 (%l0) >= LIM_l5:  table path:
!       r = |x0| - b,  z = r*r
!       result = T8 + (L8 - (r*(1 + z*(pp1 + z*pp2))*T0 - z*(qq1 + z*qq2)*T8))
!       with T0 = [SC_HI+%l0], T8 = [SC_HI+8+%l0], L8 = [SC_LO+8+%l0]
!   hx0 <  LIM_l5 (label 1):  result = C_ONE + z*(q1 + z*(q2 + z*(q3 + z*q4)))
!
! P2_f20 is used as the result register here.
1314         cmp     %l0,LIM_l5
1315         bl,pn   %icc,1f                 ! small argument: polynomial only
1316 ! delay slot
1317         fabsd   P0_f0,P0_f0
1318         sethi   %hi(0x3fc3c000),%o7
1319         fpadd32s P0_f0,MSK_BIT13,P0_f8
1320         fand    P0_f8,MSK_BITSHI17,P0_f2        ! b = masked, rounded |x0|
1321         sub     %l0,%o7,%l0
1322         add     SC_HI,8,%g1;add SC_LO,8,%o7
1323         fsubd   P0_f0,P0_f2,P0_f0       ! r = |x0| - b
1324         srl     %l0,10,%l0
1325         fmuld   P0_f0,P0_f0,P0_f2       ! z = r*r
1326         andn    %l0,0x1f,%l0            ! table byte offset, multiple of 32
1327         fmuld   P0_f2,C_pp2,P2_f20
1328         ldd     [%g1+%l0],%f32          ! T8
1329         faddd   P2_f20,C_pp1,P2_f20
1330         fmuld   P0_f2,C_qq2,P0_f4
1331         ldd     [SC_HI+%l0],%f34        ! T0
1332         fmuld   P0_f2,P2_f20,P2_f20
1333         faddd   P0_f4,C_qq1,P0_f4
1334         faddd   P2_f20,C_ONE,P2_f20
1335         fmuld   P0_f2,P0_f4,P0_f4
1336         fmuld   P0_f0,P2_f20,P2_f20
1337         ldd     [%o7+%l0],P0_f2         ! L8
1338         fmuld   P0_f4,%f32,P0_f4
1339         fmuld   P2_f20,%f34,P2_f20
1340         fsubd   P2_f20,P0_f4,P2_f20
1341         fsubd   P0_f2,P2_f20,P2_f20
1342         ba,pt   %icc,2f
1343 ! delay slot
1344         faddd   P2_f20,%f32,P2_f20
1345 1:
1346         fmuld   P0_f0,P0_f0,P0_f2
1347         fmuld   P0_f2,C_q4,P0_f4
1348         faddd   P0_f4,C_q3,P0_f4
1349         fmuld   P0_f2,P0_f4,P0_f4
1350         faddd   P0_f4,C_q2,P0_f4
1351         fmuld   P0_f2,P0_f4,P0_f4
1352         faddd   P0_f4,C_q1,P0_f4
1353         fmuld   P0_f2,P0_f4,P0_f4
1354         !!(vsin)fmuld   P0_f0,P0_f4,P0_f4
1355         faddd   C_ONE,P0_f4,P2_f20 !!(vsin)faddd        P0_f0,P0_f4,P2_f20
1356 2:
1357         nop     !!(vsin)        fors    P2_f20,P0_f9,P2_f20
1358         st      P2_f20,[%o0]            ! store element 0 result as two words
1359         st      P2_f21,[%o0+4]
1360 
1361 .endloop0:
! .endloop0 -- common exit of the primary-range loop.  Stores the three
! results still held in registers (P0_f6, P1_f16, P2_f26) through the
! saved output pointers %o3, %o4, %o5, one 32-bit word at a time, then
! returns and restores the register window.
! NOTE(review): the stores are unconditional, so %o3..%o5 must always hold
! writable addresses here; the code that sets them up for the first pass
! is before this section -- confirm.
1362         st      P0_f6,[%o3]
1363         st      P0_f7,[%o3+4]
1364         st      P1_f16,[%o4]
1365         st      P1_f17,[%o4+4]
1366         st      P2_f26,[%o5]
1367         st      P2_f27,[%o5+4]
1368 
1369 ! return.  finished off with only primary range arguments
1370 
1371         ret
1372         restore
1373 
1374 
1375         .align  32
1376 .range0:
! .range0 -- element 0 is outside the range handled by the primary loop.
!   hx0 (%l0) >  LIM_l6:  switch to the medium-range code at .MEDIUM with
!                         LIM_l6 = 1 recording that element 0 triggered it.
!   otherwise:            store C_ONE to *py0 as the result, then shift
!                         the already-loaded element 1 into the element 0
!                         slot and re-enter .loop0 (or go to .endloop0 if
!                         the count is exhausted).
! NOTE(review): the non-branching case is presumably the tiny-argument
! case; the test that sends control here is before this section.
1377         cmp     %l0,LIM_l6
1378         bg,a,pt %icc,.MEDIUM            ! branch to Medium range on big arg.
1379 ! delay slot, annulled if branch not taken
1380         mov     0x1,LIM_l6              ! set biguns flag or
1381         fdtoi   P0_f0,P0_f2; fmovd      C_ONE,P0_f0 ; st        P0_f0,[%o0]             ! *y = C_ONE; fdtoi of x is done first, presumably to raise inexact if x nonzero
1382         st      P0_f1,[%o0+4]
1383         !nop            ! (vsin) fdtoi  P0_f0,P0_f2
1384         addcc   %i0,-1,%i0
1385         ble,pn  %icc,.endloop0
1386 ! delay slot, harmless if branch taken
1387         add     %i3,%i4,%i3             ! y += stridey
1388         andn    %l1,MSK_SIGN,%l0                ! hx &= ~0x80000000
1389         fmovd   P1_f10,P0_f0            ! element 1 becomes element 0
1390         ba,pt   %icc,.loop0
1391 ! delay slot
1392         add     %i1,%i2,%i1             ! x += stridex
1393 
1394 
1395         .align  32
1396 .range1:
! .range1 -- element 1 is outside the range handled by the primary loop.
!   hx1 (%l1) >  LIM_l6:  switch to the medium-range code at .MEDIUM with
!                         LIM_l6 = 2 recording that element 1 triggered it.
!   otherwise:            store C_ONE to *py1 as the result, then shift
!                         the already-loaded element 2 into the element 1
!                         slot and re-enter .loop1 (or go to .endloop1 if
!                         the count is exhausted, which finishes element 0).
! NOTE(review): the non-branching case is presumably the tiny-argument
! case; the test that sends control here is before this section.
1397         cmp     %l1,LIM_l6
1398         bg,a,pt %icc,.MEDIUM            ! branch to Medium range on big arg.
1399 ! delay slot, annulled if branch not taken
1400         mov     0x2,LIM_l6              ! set biguns flag or
1401         fdtoi   P1_f10,P1_f12; fmovd    C_ONE,P1_f10 ; st       P1_f10,[%o1]            ! *y = C_ONE; fdtoi of x is done first, presumably to raise inexact if x nonzero
1402         st      P1_f11,[%o1+4]
1403         !nop            ! (vsin) fdtoi  P1_f10,P1_f12
1404         addcc   %i0,-1,%i0
1405         ble,pn  %icc,.endloop1
1406 ! delay slot, harmless if branch taken
1407         add     %i3,%i4,%i3             ! y += stridey
1408         andn    %l2,MSK_SIGN,%l1                ! hx &= ~0x80000000
1409         fmovd   P2_f20,P1_f10           ! element 2 becomes element 1
1410         ba,pt   %icc,.loop1
1411 ! delay slot
1412         add     %i1,%i2,%i1             ! x += stridex
1413 
1414 
1415         .align  32
1416 .range2:
! .range2 -- element 2 is outside the range handled by the primary loop.
!   hx2 (%l2) >  LIM_l6:  switch to the medium-range code at .MEDIUM with
!                         LIM_l6 = 3 recording that element 2 triggered it.
!   otherwise:            store C_ONE to *py2 as the result, load a fresh
!                         element 2 from x and re-enter .loop2 (or go to
!                         .endloop2 if the count is exhausted, which
!                         finishes elements 1 and 0).
! Unlike .range0 and .range1 this block does not advance y itself, and it
! reloads the argument with plain ld rather than lda through %asi.
! NOTE(review): the non-branching case is presumably the tiny-argument
! case; the test that sends control here is before this section.
1417         cmp     %l2,LIM_l6
1418         bg,a,pt %icc,.MEDIUM            ! branch to Medium range on big arg.
1419 ! delay slot, annulled if branch not taken
1420         mov     0x3,LIM_l6              ! set biguns flag or
1421         fdtoi   P2_f20,P2_f22; fmovd    C_ONE,P2_f20 ; st       P2_f20,[%o2]            ! *y = C_ONE; fdtoi of x is done first, presumably to raise inexact if x nonzero
1422         st      P2_f21,[%o2+4]
1423         nop             ! (vsin) fdtoi  P2_f20,P2_f22
1424 1:
1425         addcc   %i0,-1,%i0
1426         ble,pn  %icc,.endloop2
1427 ! delay slot
1428         nop
1429         ld      [%i1],%l2               ! next hx2
1430         ld      [%i1],P2_f20            ! next x2, high word
1431         ld      [%i1+4],P2_f21          ! next x2, low word
1432         andn    %l2,MSK_SIGN,%l2                ! hx &= ~0x80000000
1433         ba,pt   %icc,.loop2
1434 ! delay slot
1435         add     %i1,%i2,%i1             ! x += stridex
1436 
1437 
1438         .align  32
1439 .MEDIUM:
! .MEDIUM -- entry to the medium-range code, reached from .range0, .range1
! or .range2 with LIM_l6 = 1, 2 or 3 naming the element of the current
! triple that was too large for the primary loop.  This block
!   1. reloads the constants base into %l5 and sets %l7 = 0x413921fb,
!   2. stores the results still pending from the primary loop
!      (%f6, %f16, %f26 through %o3, %o4, %o5),
!   3. loads the reduction constants into %f40..%f52 and seeds the stack
!      slots x?_1+8 (with %f54) and y?_0+8 (with zero),
!   4. enters the medium loop at .LOOP0, .LOOP1 or .LOOP2 according to
!      LIM_l6, first performing the floating-point steps that the skipped
!      loop heads would have done.
1440 
1441 ! ========== medium range ==========
1442 
1443 ! register use
1444 
1445 ! i0  n
1446 ! i1  x
1447 ! i2  stridex
1448 ! i3  y
1449 ! i4  stridey
1450 ! i5  0x80000000
1451 
1452 ! l0  hx0
1453 ! l1  hx1
1454 ! l2  hx2
1455 ! l3  __vlibm_TBL_sincos_hi
1456 ! l4  __vlibm_TBL_sincos_lo
1457 ! l5  constants
1458 ! l6  biguns stored here : still called LIM_l6
1459 ! l7  0x413921fb
1460 
1461 ! the following are 64-bit registers in both V8+ and V9
1462 
1463 ! g1  scratch
1464 ! g5  
1465 
1466 ! o0  py0
1467 ! o1  py1
1468 ! o2  py2
1469 ! o3  n0
1470 ! o4  n1
1471 ! o5  n2
1472 ! o7  scratch
1473 
1474 ! f0  x0
1475 ! f2  n0,y0
1476 ! f4  
1477 ! f6  
1478 ! f8  scratch for table base
1479 ! f9  signbit0
1480 ! f10 x1
1481 ! f12 n1,y1
1482 ! f14 
1483 ! f16 
1484 ! f18 scratch for table base
1485 ! f19 signbit1
1486 ! f20 x2
1487 ! f22 n2,y2
1488 ! f24 
1489 ! f26 
1490 ! f28 scratch for table base
1491 ! f29 signbit2
1492 ! f30 0x80000000
1493 ! f31 0x4000
1494 ! f32 
1495 ! f34 
1496 ! f36 
1497 ! f38 
1498 ! f40 invpio2
1499 ! f42 round
1500 ! f44 0xffff800000000000
1501 ! f46 pio2_1
1502 ! f48 pio2_2
1503 ! f50 pio2_3
1504 ! f52 pio2_3t
1505 ! f54 one
1506 ! f56 pp1
1507 ! f58 pp2
1508 ! f60 qq1
1509 ! f62 qq2
1510 
1511         
1512         PIC_SET(g5,constants,l5)
1513 
1514         ! %o3,%o4,%o5 need to be stored 
1515         st      P0_f6,[%o3]
1516         sethi   %hi(0x413921fb),%l7
1517         st      P0_f7,[%o3+4]
1518         or      %l7,%lo(0x413921fb),%l7
1519         st      P1_f16,[%o4]
1520         st      P1_f17,[%o4+4]
1521         st      P2_f26,[%o5]
1522         st      P2_f27,[%o5+4]
1523         ldd     [%l5+invpio2],%f40
1524         ldd     [%l5+round],%f42
1525         ldd     [%l5+pio2_1],%f46
1526         ldd     [%l5+pio2_2],%f48
1527         ldd     [%l5+pio2_3],%f50
1528         ldd     [%l5+pio2_3t],%f52
1529         std     %f54,[%fp+x0_1+8]       ! set up stack data
1530         std     %f54,[%fp+x1_1+8]
1531         std     %f54,[%fp+x2_1+8]
1532         stx     %g0,[%fp+y0_0+8]        ! zero in the slot after each y?_0
1533         stx     %g0,[%fp+y1_0+8]
1534         stx     %g0,[%fp+y2_0+8]
1535 
1536 !       branched here in the middle of the array.  Need to adjust 
1537 !       for the members of the triple that were selected in the primary
1538 !       loop.
1539 
1540 !       no adjustment since all three selected here
1541         subcc   LIM_l6,0x1,%g0          ! continue in LOOP0?
1542         bz,a    %icc,.LOOP0
1543         mov     0x0,LIM_l6              ! delay slot set biguns=0
1544 
1545 !       adjust 1st triple since 2nd and 3rd done here
1546         subcc   LIM_l6,0x2,%g0          ! continue in LOOP1?
1547         fmuld   %f0,%f40,%f2            ! adj LOOP0
1548         bz,a    %icc,.LOOP1
1549         mov     0x0,LIM_l6              ! delay slot set biguns=0
1550 
1551 !       adjust 1st and 2nd triple since 3rd done here
1552         subcc   LIM_l6,0x3,%g0          ! continue in LOOP2?
1553         !done fmuld     %f0,%f40,%f2            ! adj LOOP0
1554         sub     %i3,%i4,%i3             ! adjust to not double increment
1555         fmuld   %f10,%f40,%f12          ! adj LOOP1
1556         faddd   %f2,%f42,%f2            ! adj LOOP1
1557         bz,a    %icc,.LOOP2
1558         mov     0x0,LIM_l6              ! delay slot set biguns=0
1559 
1560         ba      .LOOP0                  ! LIM_l6 was none of 1, 2, 3
1561         nop
1562 
1563 ! -- 16 byte aligned
1564 
1565         .align  32
1566 .LOOP0:
! .LOOP0 -- head of the medium-range loop for element 0 of a triple.
! On entry %l0 holds hx0 and %f0 holds x0.  This stage
!   - preloads element 1 (hx1 into %l1, x1 into %f10/%f11),
!   - records py0 = y and advances y,
!   - leaves for .BIG0 if hx0 > %l7 (0x413921fb),
!   - decrements n and leaves for .ENDLOOP1 if no further element remains,
!   - starts the reduction of x0: %f2 = x0 * invpio2 (%f40),
! and then falls through into .LOOP1.
1567         lda     [%i1]%asi,%l1           ! preload next argument
1568         mov     %i3,%o0                 ! py0 = y
1569 
1570         lda     [%i1]%asi,%f10
1571         cmp     %l0,%l7
1572         add     %i3,%i4,%i3             ! y += stridey
1573         bg,pn   %icc,.BIG0              ! if hx > 0x413921fb
1574 
1575 ! delay slot
1576         lda     [%i1+4]%asi,%f11
1577         addcc   %i0,-1,%i0
1578         add     %i1,%i2,%i1             ! x += stridex
1579         ble,pn  %icc,.ENDLOOP1
1580 
1581 ! delay slot
1582         andn    %l1,%i5,%l1             ! hx1 &= ~0x80000000
1583         nop
1584         fmuld   %f0,%f40,%f2
1585         fabsd   %f54,%f54               ! a nop for alignment only
1586 
1587 .LOOP1:
! .LOOP1 -- medium-range loop stage for element 1 of a triple.
! On entry %l1 holds hx1, %f10 holds x1 and %f2 holds x0 * invpio2.
! This stage
!   - preloads element 2 (hx2 into %l2, x2 into %f20/%f21),
!   - records py1 = y and advances y,
!   - leaves for .BIG1 if hx1 > %l7 (0x413921fb),
!   - decrements n and leaves for .ENDLOOP2 if no further element remains,
!   - continues the reductions: %f12 = x1 * invpio2 and %f2 += round (%f42),
! and then falls through into .LOOP2.
1588         lda     [%i1]%asi,%l2           ! preload next argument
1589         mov     %i3,%o1                 ! py1 = y
1590 
1591         lda     [%i1]%asi,%f20
1592         cmp     %l1,%l7
1593         add     %i3,%i4,%i3             ! y += stridey
1594         bg,pn   %icc,.BIG1              ! if hx > 0x413921fb
1595 
1596 ! delay slot
1597         lda     [%i1+4]%asi,%f21
1598         addcc   %i0,-1,%i0
1599         add     %i1,%i2,%i1             ! x += stridex
1600         ble,pn  %icc,.ENDLOOP2
1601 
1602 ! delay slot
1603         andn    %l2,%i5,%l2             ! hx2 &= ~0x80000000
1604         nop
1605         fmuld   %f10,%f40,%f12
1606         faddd   %f2,%f42,%f2
1607 
1608 .LOOP2:
! .LOOP2 -- main body of the medium-range loop; processes three elements
! (x0 in %f0, x1 in %f10, x2 in %f20) in lock step and falls through into
! .FIXSIGN.  Leaves for .BIG2 if hx2 (%l2) > %l7.
!
! Per element (shown for element 0):
!   1. n = x*invpio2 rounded to an integer by adding and then subtracting
!      the round constant (%f42); the low word of the intermediate sum is
!      saved at [%fp+n0] as the integer quadrant count.
!   2. The reduced argument is formed by subtracting n*pio2_1, n*pio2_2,
!      n*pio2_3 and n*pio2_3t in turn, with the rounding error of each
!      subtraction folded into the next term.  It ends as a head (%f0)
!      and a tail (%f2).
!   3. The sign word of the head is saved in %f9, the head is replaced by
!      its absolute value and spilled to [%fp+x0_1], and the tail has its
!      sign flipped to match.
!   4. %o3 = ((n+1) & 1) << 3 selects between two adjacent 8-byte entries,
!      both in the thresh table here and in the sincos tables below.
!      NOTE(review): presumably this picks the sine or the cosine form for
!      the quadrant, the +1 being the cosine phase shift -- confirm.
!   5. fcmpgt32 compares the selected threshold word with the head; if
!      the upper-word comparison bit (value 2) is set, the element goes to
!      the small-argument code (.CASE4, .CASE2 or .CASE1, tested in that
!      order for elements 0, 1, 2).
!   6. Otherwise the table path: b = head rounded by adding %f31 to its
!      high word and masking with %f44, r = (head - b) + tail, z = r*r,
!          result = T0 + (L0 + (r*(1 + z*(pp1 + z*pp2))*T8
!                               + z*(qq1 + z*qq2)*T0))
!      with T0 = [%l3+idx], T8 = [%l3+8+idx], L0 = [%l4+idx] and
!      idx = (((hx - 0x3fc3c000) >> 10) & ~0x1f) + %o3.
! Results are left in %f6, %f16, %f26 for .FIXSIGN.
1609         st      %f3,[%fp+n0]            ! low word of x0*invpio2 + round
1610         mov     %i3,%o2                 ! py2 = y
1611 
1612         cmp     %l2,%l7
1613         add     %i3,%i4,%i3             ! y += stridey
1614         fmuld   %f20,%f40,%f22
1615         bg,pn   %icc,.BIG2              ! if hx > 0x413921fb
1616 
1617 ! delay slot
1618         add     %l5,thresh+4,%o7
1619         faddd   %f12,%f42,%f12
1620         st      %f13,[%fp+n1]
1621 
1622 ! -
1623 
1624         add     %l5,thresh,%g1
1625         faddd   %f22,%f42,%f22
1626         st      %f23,[%fp+n2]
1627 
1628         fsubd   %f2,%f42,%f2            ! n
1629 
1630         fsubd   %f12,%f42,%f12          ! n
1631 
1632         fsubd   %f22,%f42,%f22          ! n
1633 
1634         fmuld   %f2,%f46,%f4            ! n * pio2_1
1635 
1636         fmuld   %f12,%f46,%f14
1637 
1638         fmuld   %f22,%f46,%f24
1639 
1640         fsubd   %f0,%f4,%f4
1641         fmuld   %f2,%f48,%f6            ! n * pio2_2
1642 
1643         fsubd   %f10,%f14,%f14
1644         fmuld   %f12,%f48,%f16
1645 
1646         fsubd   %f20,%f24,%f24
1647         fmuld   %f22,%f48,%f26
1648 
1649         fsubd   %f4,%f6,%f0
1650         ld      [%fp+n0],%o3 ; add      %o3,1,%o3
1651 
1652         fsubd   %f14,%f16,%f10
1653         ld      [%fp+n1],%o4 ; add      %o4,1,%o4
1654 
1655         fsubd   %f24,%f26,%f20
1656         ld      [%fp+n2],%o5 ; add      %o5,1,%o5
1657 
1658         fsubd   %f4,%f0,%f32
1659         and     %o3,1,%o3
1660 
1661         fsubd   %f14,%f10,%f34
1662         and     %o4,1,%o4
1663 
1664         fsubd   %f24,%f20,%f36
1665         and     %o5,1,%o5
1666 
1667         fsubd   %f32,%f6,%f32           ! rounding error of the previous subtraction
1668         fmuld   %f2,%f50,%f8            ! n * pio2_3
1669         sll     %o3,3,%o3               ! ((n+1) & 1) * 8
1670 
1671         fsubd   %f34,%f16,%f34
1672         fmuld   %f12,%f50,%f18
1673         sll     %o4,3,%o4
1674 
1675         fsubd   %f36,%f26,%f36
1676         fmuld   %f22,%f50,%f28
1677         sll     %o5,3,%o5
1678 
1679         fsubd   %f8,%f32,%f8
1680         ld      [%g1+%o3],%f6           ! threshold word from thresh + offset
1681 
1682         fsubd   %f18,%f34,%f18
1683         ld      [%g1+%o4],%f16
1684 
1685         fsubd   %f28,%f36,%f28
1686         ld      [%g1+%o5],%f26
1687 
1688         fsubd   %f0,%f8,%f4
1689 
1690         fsubd   %f10,%f18,%f14
1691 
1692         fsubd   %f20,%f28,%f24
1693 
1694         fsubd   %f0,%f4,%f32
1695 
1696         fsubd   %f10,%f14,%f34
1697 
1698         fsubd   %f20,%f24,%f36
1699 
1700         fsubd   %f32,%f8,%f32
1701         fmuld   %f2,%f52,%f2            ! n * pio2_3t
1702 
1703         fsubd   %f34,%f18,%f34
1704         fmuld   %f12,%f52,%f12
1705 
1706         fsubd   %f36,%f28,%f36
1707         fmuld   %f22,%f52,%f22
1708 
1709         fsubd   %f2,%f32,%f2
1710         ld      [%o7+%o3],%f8           ! word at thresh + 4 + offset, used as a sign mask below
1711 
1712         fsubd   %f12,%f34,%f12
1713         ld      [%o7+%o4],%f18
1714 
1715         fsubd   %f22,%f36,%f22
1716         ld      [%o7+%o5],%f28
1717 
1718         fsubd   %f4,%f2,%f0             ! x
1719 
1720         fsubd   %f14,%f12,%f10          ! x
1721 
1722         fsubd   %f24,%f22,%f20          ! x
1723 
1724         fsubd   %f4,%f0,%f4
1725 
1726         fsubd   %f14,%f10,%f14
1727 
1728         fsubd   %f24,%f20,%f24
1729 
1730         fands   %f0,%f30,%f9            ! save signbit
1731 
1732         fands   %f10,%f30,%f19          ! save signbit
1733 
1734         fands   %f20,%f30,%f29          ! save signbit
1735 
1736         fabsd   %f0,%f0
1737         std     %f0,[%fp+x0_1]          ! spill |x| for integer access to its high word
1738 
1739         fabsd   %f10,%f10
1740         std     %f10,[%fp+x1_1]
1741 
1742         fabsd   %f20,%f20
1743         std     %f20,[%fp+x2_1]
1744 
1745         fsubd   %f4,%f2,%f2             ! y
1746 
1747         fsubd   %f14,%f12,%f12          ! y
1748 
1749         fsubd   %f24,%f22,%f22          ! y
1750 
1751         fcmpgt32 %f6,%f0,%l0            ! threshold vs |x|, per 32-bit word
1752 
1753         fcmpgt32 %f16,%f10,%l1
1754 
1755         fcmpgt32 %f26,%f20,%l2
1756 
1757 ! -- 16 byte aligned
1758         fxors   %f2,%f9,%f2             ! give the tail the sign that matches |x|
1759 
1760         fxors   %f12,%f19,%f12
1761 
1762         fxors   %f22,%f29,%f22
1763 
1764         fands   %f9,%f8,%f9             ! if (n & 1) clear sign bit
1765         andcc   %l0,2,%g0
1766         bne,pn  %icc,.CASE4             ! element 0 is below its threshold
1767 
1768 ! delay slot
1769         fands   %f19,%f18,%f19          ! if (n & 1) clear sign bit
1770         andcc   %l1,2,%g0
1771         bne,pn  %icc,.CASE2             ! element 1 is below its threshold
1772 
1773 ! delay slot
1774         fands   %f29,%f28,%f29          ! if (n & 1) clear sign bit
1775         andcc   %l2,2,%g0
1776         bne,pn  %icc,.CASE1             ! element 2 is below its threshold
1777 
1778 ! delay slot
1779         fpadd32s %f0,%f31,%f8
1780         sethi   %hi(0x3fc3c000),%o7
1781         ld      [%fp+x0_1],%l0          ! high word of |x0|
1782 
1783         fpadd32s %f10,%f31,%f18
1784         add     %l3,8,%g1
1785         ld      [%fp+x1_1],%l1
1786 
1787         fpadd32s %f20,%f31,%f28
1788         ld      [%fp+x2_1],%l2
1789 
1790         fand    %f8,%f44,%f4            ! b; the mask also clears the low word (%f9)
1791         sub     %l0,%o7,%l0
1792 
1793         fand    %f18,%f44,%f14
1794         sub     %l1,%o7,%l1
1795 
1796         fand    %f28,%f44,%f24
1797         sub     %l2,%o7,%l2
1798 
1799         fsubd   %f0,%f4,%f0             ! |x| - b
1800         srl     %l0,10,%l0
1801 
1802         fsubd   %f10,%f14,%f10
1803         srl     %l1,10,%l1
1804 
1805         fsubd   %f20,%f24,%f20
1806         srl     %l2,10,%l2
1807 
1808         faddd   %f0,%f2,%f0             ! r = (|x| - b) + tail
1809         andn    %l0,0x1f,%l0
1810 
1811         faddd   %f10,%f12,%f10
1812         andn    %l1,0x1f,%l1
1813 
1814         faddd   %f20,%f22,%f20
1815         andn    %l2,0x1f,%l2
1816 
1817         fmuld   %f0,%f0,%f2             ! z = r*r
1818         add     %l0,%o3,%l0             ! idx = table offset + entry select
1819 
1820         fmuld   %f10,%f10,%f12
1821         add     %l1,%o4,%l1
1822 
1823         fmuld   %f20,%f20,%f22
1824         add     %l2,%o5,%l2
1825 
1826         fmuld   %f2,%f58,%f6
1827         ldd     [%l3+%l0],%f32          ! T0
1828 
1829         fmuld   %f12,%f58,%f16
1830         ldd     [%l3+%l1],%f34
1831 
1832         fmuld   %f22,%f58,%f26
1833         ldd     [%l3+%l2],%f36
1834 
1835         faddd   %f6,%f56,%f6
1836         fmuld   %f2,%f62,%f4
1837 
1838         faddd   %f16,%f56,%f16
1839         fmuld   %f12,%f62,%f14
1840 
1841         faddd   %f26,%f56,%f26
1842         fmuld   %f22,%f62,%f24
1843 
1844         fmuld   %f2,%f6,%f6
1845         faddd   %f4,%f60,%f4
1846 
1847         fmuld   %f12,%f16,%f16
1848         faddd   %f14,%f60,%f14
1849 
1850         fmuld   %f22,%f26,%f26
1851         faddd   %f24,%f60,%f24
1852 
1853         faddd   %f6,%f54,%f6
1854         fmuld   %f2,%f4,%f4             ! z*(qq1 + z*qq2)
1855 
1856         faddd   %f16,%f54,%f16
1857         fmuld   %f12,%f14,%f14
1858 
1859         faddd   %f26,%f54,%f26
1860         fmuld   %f22,%f24,%f24
1861 
1862         fmuld   %f0,%f6,%f6             ! r*(1 + z*(pp1 + z*pp2))
1863         ldd     [%g1+%l0],%f2           ! T8
1864 
1865         fmuld   %f10,%f16,%f16
1866         ldd     [%g1+%l1],%f12
1867 
1868         fmuld   %f20,%f26,%f26
1869         ldd     [%g1+%l2],%f22
1870 
1871         fmuld   %f4,%f32,%f4
1872         ldd     [%l4+%l0],%f0           ! L0
1873 
1874         fmuld   %f14,%f34,%f14
1875         ldd     [%l4+%l1],%f10
1876 
1877         fmuld   %f24,%f36,%f24
1878         ldd     [%l4+%l2],%f20
1879 
1880         fmuld   %f6,%f2,%f6
1881 
1882         fmuld   %f16,%f12,%f16
1883 
1884         fmuld   %f26,%f22,%f26
1885 
1886         faddd   %f6,%f4,%f6
1887 
1888         faddd   %f16,%f14,%f16
1889 
1890         faddd   %f26,%f24,%f26
1891 
1892         faddd   %f6,%f0,%f6
1893 
1894         faddd   %f16,%f10,%f16
1895 
1896         faddd   %f26,%f20,%f26
1897 
1898         faddd   %f6,%f32,%f6            ! element 0 magnitude
1899 
1900         faddd   %f16,%f34,%f16          ! element 1 magnitude
1901 
1902         faddd   %f26,%f36,%f26          ! element 2 magnitude
1903 
1904 .FIXSIGN:
! .FIXSIGN -- common tail of the medium-range loop body (reached by
! fall-through from .LOOP2 and by branch from .CASE1 and .CASE2).
! On entry %f6, %f16, %f26 hold the three result magnitudes and %f9, %f19,
! %f29 the partially prepared sign words.  For each element:
!   - reload n from the stack and form ((n+1) & 2) << 2, i.e. 0 or 8,
!   - load a 32-bit word from thresh-4 plus that offset and XOR it into
!     the sign word,
!   - OR the sign word into the high word of the result and store the
!     result through py0/py1/py2 as two 32-bit words.
! NOTE(review): the words at thresh-4 and thresh+4 are presumably 0 and
! the sign bit, giving the quadrant-dependent sign -- confirm against the
! constants table.
! Meanwhile the next argument is preloaded (hx into %l0, x into %f0/%f1).
! Loops to .LOOP0 while elements remain, otherwise goes to .ENDLOOP0.
1905         ld      [%fp+n0],%o3 ; add      %o3,1,%o3
1906         add     %l5,thresh-4,%g1
1907 
1908         ld      [%fp+n1],%o4 ; add      %o4,1,%o4
1909 
1910         ld      [%fp+n2],%o5 ; add      %o5,1,%o5
1911         and     %o3,2,%o3
1912 
1913         sll     %o3,2,%o3               ! ((n+1) & 2) * 4
1914         and     %o4,2,%o4
1915         lda     [%i1]%asi,%l0           ! preload next argument
1916 
1917         sll     %o4,2,%o4
1918         and     %o5,2,%o5
1919         ld      [%g1+%o3],%f8
1920 
1921         sll     %o5,2,%o5
1922         ld      [%g1+%o4],%f18
1923 
1924         ld      [%g1+%o5],%f28
1925         fxors   %f9,%f8,%f9
1926 
1927         lda     [%i1]%asi,%f0
1928         fxors   %f29,%f28,%f29
1929 
1930         lda     [%i1+4]%asi,%f1
1931         fxors   %f19,%f18,%f19
1932 
1933         fors    %f6,%f9,%f6             ! tack on sign
1934         add     %i1,%i2,%i1             ! x += stridex
1935         st      %f6,[%o0]
1936 
1937         fors    %f26,%f29,%f26          ! tack on sign
1938         st      %f7,[%o0+4]
1939 
1940         fors    %f16,%f19,%f16          ! tack on sign
1941         st      %f26,[%o2]
1942 
1943         st      %f27,[%o2+4]
1944         addcc   %i0,-1,%i0
1945 
1946         st      %f16,[%o1]
1947         andn    %l0,%i5,%l0             ! hx &= ~0x80000000
1948         bg,pt   %icc,.LOOP0
1949 
1950 ! delay slot
1951         st      %f17,[%o1+4]
1952 
1953         ba,pt   %icc,.ENDLOOP0
1954 ! delay slot
1955         nop
1956 
1957         .align  32
1958 .CASE1:
! .CASE1 -- medium-range variant in which elements 0 and 1 take the table
! path (same formula as in .LOOP2) and element 2 is below its threshold
! and is evaluated by a polynomial alone.  Entered from .LOOP2 with the
! delay-slot instruction fpadd32s %f0,%f31,%f8 already executed.
!
! Element 2, with z = x2*x2 and coefficients c0..c3 read from the
! constants table at %l5 + %o5 + {0x00, 0x10, 0x20, 0x30}:
!       p      = z*(c3 + z*(c2 + z*(c1 + z*c0)))
!       result = X + (X*p + Y)
! where X = [%fp+%o5+x2_1] and Y = [%fp+%o5+y2_0].  With %o5 = 0 these are
! the spilled |x2| and its tail; with %o5 = 8 they are the values seeded
! in .MEDIUM (%f54 and zero).
! NOTE(review): the constants table interleaves two coefficient sets at a
! 16-byte stride, so %o5 presumably selects the sine-form or cosine-form
! polynomial -- confirm.
! Rejoins the common tail at .FIXSIGN with results in %f6, %f16, %f26.
1959         fpadd32s %f10,%f31,%f18
1960         sethi   %hi(0x3fc3c000),%o7
1961         ld      [%fp+x0_1],%l0
1962 
1963         fand    %f8,%f44,%f4
1964         add     %l3,8,%g1
1965         ld      [%fp+x1_1],%l1
1966 
1967         fand    %f18,%f44,%f14
1968         sub     %l0,%o7,%l0
1969 
1970         fsubd   %f0,%f4,%f0
1971         srl     %l0,10,%l0
1972         sub     %l1,%o7,%l1
1973 
1974         fsubd   %f10,%f14,%f10
1975         srl     %l1,10,%l1
1976 
1977         fmuld   %f20,%f20,%f20          ! z2 = x2*x2 (overwrites x2; |x2| is on the stack)
1978         ldd     [%l5+%o5],%f36          ! c0
1979         add     %l5,%o5,%l2             ! base of the selected coefficient set
1980 
1981         faddd   %f0,%f2,%f0
1982         andn    %l0,0x1f,%l0
1983 
1984         faddd   %f10,%f12,%f10
1985         andn    %l1,0x1f,%l1
1986 
1987         fmuld   %f20,%f36,%f24
1988         ldd     [%l2+0x10],%f26         ! c1
1989         add     %fp,%o5,%o5             ! %o5 now addresses the stack operands
1990 
1991         fmuld   %f0,%f0,%f2
1992         add     %l0,%o3,%l0
1993 
1994         fmuld   %f10,%f10,%f12
1995         add     %l1,%o4,%l1
1996 
1997         faddd   %f24,%f26,%f24
1998         ldd     [%l2+0x20],%f36         ! c2
1999 
2000         fmuld   %f2,%f58,%f6
2001         ldd     [%l3+%l0],%f32
2002 
2003         fmuld   %f12,%f58,%f16
2004         ldd     [%l3+%l1],%f34
2005 
2006         fmuld   %f20,%f24,%f24
2007         ldd     [%l2+0x30],%f26         ! c3
2008 
2009         faddd   %f6,%f56,%f6
2010         fmuld   %f2,%f62,%f4
2011 
2012         faddd   %f16,%f56,%f16
2013         fmuld   %f12,%f62,%f14
2014 
2015         faddd   %f24,%f36,%f24
2016         ldd     [%o5+x2_1],%f36         ! X
2017 
2018         fmuld   %f2,%f6,%f6
2019         faddd   %f4,%f60,%f4
2020 
2021         fmuld   %f12,%f16,%f16
2022         faddd   %f14,%f60,%f14
2023 
2024         fmuld   %f20,%f24,%f24
2025 
2026         faddd   %f6,%f54,%f6
2027         fmuld   %f2,%f4,%f4
2028         ldd     [%g1+%l0],%f2
2029 
2030         faddd   %f16,%f54,%f16
2031         fmuld   %f12,%f14,%f14
2032         ldd     [%g1+%l1],%f12
2033 
2034         faddd   %f24,%f26,%f24
2035 
2036         fmuld   %f0,%f6,%f6
2037         ldd     [%l4+%l0],%f0
2038 
2039         fmuld   %f10,%f16,%f16
2040         ldd     [%l4+%l1],%f10
2041 
2042         fmuld   %f4,%f32,%f4
2043         std     %f22,[%fp+y2_0]         ! spill tail so it can be reloaded via %o5
2044 
2045         fmuld   %f14,%f34,%f14
2046 
2047         fmuld   %f6,%f2,%f6
2048 
2049         fmuld   %f16,%f12,%f16
2050 
2051         fmuld   %f20,%f24,%f24          ! p
2052 
2053         faddd   %f6,%f4,%f6
2054 
2055         faddd   %f16,%f14,%f16
2056 
2057         fmuld   %f36,%f24,%f24          ! X*p
2058         ldd     [%o5+y2_0],%f22         ! Y
2059 
2060         faddd   %f6,%f0,%f6
2061 
2062         faddd   %f16,%f10,%f16
2063 
2064         faddd   %f24,%f22,%f24          ! X*p + Y
2065 
2066         faddd   %f6,%f32,%f6
2067 
2068         faddd   %f16,%f34,%f16
2069         ba,pt   %icc,.FIXSIGN
2070 
2071 ! delay slot
2072         faddd   %f36,%f24,%f26          ! element 2 magnitude
2073 
2074         .align  32
2075 .CASE2:
! .CASE2 -- medium-range variant in which element 1 is below its threshold
! and is evaluated by a polynomial alone.  Entered from .LOOP2 with the
! delay-slot instruction fands %f29,%f28,%f29 already executed.  If
! element 2 is also below its threshold (bit value 2 set in %l2) control
! moves on to .CASE3; otherwise elements 0 and 2 take the table path
! (same formula as in .LOOP2).
!
! Element 1, with z = x1*x1 and coefficients c0..c3 read from the
! constants table at %l5 + %o4 + {0x00, 0x10, 0x20, 0x30}:
!       p      = z*(c3 + z*(c2 + z*(c1 + z*c0)))
!       result = X + (X*p + Y)
! where X = [%fp+%o4+x1_1] and Y = [%fp+%o4+y1_0].  With %o4 = 0 these are
! the spilled |x1| and its tail; with %o4 = 8 they are the values seeded
! in .MEDIUM (%f54 and zero).
! NOTE(review): %o4 presumably selects the sine-form or cosine-form
! coefficient set -- confirm.
! Rejoins the common tail at .FIXSIGN with results in %f6, %f16, %f26.
2076         fpadd32s %f0,%f31,%f8
2077         ld      [%fp+x0_1],%l0
2078         andcc   %l2,2,%g0
2079         bne,pn  %icc,.CASE3             ! element 2 is below its threshold too
2080 
2081 ! delay slot
2082         sethi   %hi(0x3fc3c000),%o7
2083         fpadd32s %f20,%f31,%f28
2084         ld      [%fp+x2_1],%l2
2085 
2086         fand    %f8,%f44,%f4
2087         sub     %l0,%o7,%l0
2088         add     %l3,8,%g1
2089 
2090         fand    %f28,%f44,%f24
2091         sub     %l2,%o7,%l2
2092 
2093         fsubd   %f0,%f4,%f0
2094         srl     %l0,10,%l0
2095 
2096         fsubd   %f20,%f24,%f20
2097         srl     %l2,10,%l2
2098 
2099         fmuld   %f10,%f10,%f10          ! z1 = x1*x1 (overwrites x1; |x1| is on the stack)
2100         ldd     [%l5+%o4],%f34          ! c0
2101         add     %l5,%o4,%l1             ! base of the selected coefficient set
2102 
2103         faddd   %f0,%f2,%f0
2104         andn    %l0,0x1f,%l0
2105 
2106         faddd   %f20,%f22,%f20
2107         andn    %l2,0x1f,%l2
2108 
2109         fmuld   %f10,%f34,%f14
2110         ldd     [%l1+0x10],%f16         ! c1
2111         add     %fp,%o4,%o4             ! %o4 now addresses the stack operands
2112 
2113         fmuld   %f0,%f0,%f2
2114         add     %l0,%o3,%l0
2115 
2116         fmuld   %f20,%f20,%f22
2117         add     %l2,%o5,%l2
2118 
2119         faddd   %f14,%f16,%f14
2120         ldd     [%l1+0x20],%f34         ! c2
2121 
2122         fmuld   %f2,%f58,%f6
2123         ldd     [%l3+%l0],%f32
2124 
2125         fmuld   %f22,%f58,%f26
2126         ldd     [%l3+%l2],%f36
2127 
2128         fmuld   %f10,%f14,%f14
2129         ldd     [%l1+0x30],%f16         ! c3
2130 
2131         faddd   %f6,%f56,%f6
2132         fmuld   %f2,%f62,%f4
2133 
2134         faddd   %f26,%f56,%f26
2135         fmuld   %f22,%f62,%f24
2136 
2137         faddd   %f14,%f34,%f14
2138         ldd     [%o4+x1_1],%f34         ! X
2139 
2140         fmuld   %f2,%f6,%f6
2141         faddd   %f4,%f60,%f4
2142 
2143         fmuld   %f22,%f26,%f26
2144         faddd   %f24,%f60,%f24
2145 
2146         fmuld   %f10,%f14,%f14
2147 
2148         faddd   %f6,%f54,%f6
2149         fmuld   %f2,%f4,%f4
2150         ldd     [%g1+%l0],%f2
2151 
2152         faddd   %f26,%f54,%f26
2153         fmuld   %f22,%f24,%f24
2154         ldd     [%g1+%l2],%f22
2155 
2156         faddd   %f14,%f16,%f14
2157 
2158         fmuld   %f0,%f6,%f6
2159         ldd     [%l4+%l0],%f0
2160 
2161         fmuld   %f20,%f26,%f26
2162         ldd     [%l4+%l2],%f20
2163 
2164         fmuld   %f4,%f32,%f4
2165         std     %f12,[%fp+y1_0]         ! spill tail so it can be reloaded via %o4
2166 
2167         fmuld   %f24,%f36,%f24
2168 
2169         fmuld   %f6,%f2,%f6
2170 
2171         fmuld   %f26,%f22,%f26
2172 
2173         fmuld   %f10,%f14,%f14          ! p
2174 
2175         faddd   %f6,%f4,%f6
2176 
2177         faddd   %f26,%f24,%f26
2178 
2179         fmuld   %f34,%f14,%f14          ! X*p
2180         ldd     [%o4+y1_0],%f12         ! Y
2181 
2182         faddd   %f6,%f0,%f6
2183 
2184         faddd   %f26,%f20,%f26
2185 
2186         faddd   %f14,%f12,%f14          ! X*p + Y
2187 
2188         faddd   %f6,%f32,%f6
2189 
2190         faddd   %f26,%f36,%f26
2191         ba,pt   %icc,.FIXSIGN
2192 
2193 ! delay slot
2194         faddd   %f34,%f14,%f16          ! element 1 magnitude
2195 
2196         .align  32
2197 .CASE3:
             ! Lane 0 (%f0, correction in %f2) takes the table-based evaluation;
             ! lanes 1 (%f10) and 2 (%f20) take the polynomial-only evaluation.
             ! %f8, %o7, %l0 and the offsets %o3/%o4/%o5 are set up before this
             ! label (not visible here). The three instruction streams are
             ! interleaved by hand; results leave in %f6, %f16 and %f26 and
             ! control goes to .FIXSIGN.
2198         fand    %f8,%f44,%f4            ! lane 0: mask %f8 with %f44
2199         add     %l3,8,%g1               ! %g1 = second double of each %l3 entry
2200         sub     %l0,%o7,%l0             ! lane 0: start of table offset computation
2201 
2202         fmuld   %f10,%f10,%f10          ! lane 1: z = x*x
2203         ldd     [%l5+%o4],%f34
2204         add     %l5,%o4,%l1             ! lane 1: coefficient base
2205 
2206         fsubd   %f0,%f4,%f0             ! lane 0: subtract the masked value
2207         srl     %l0,10,%l0
2208 
2209         fmuld   %f20,%f20,%f20          ! lane 2: z = x*x
2210         ldd     [%l5+%o5],%f36
2211         add     %l5,%o5,%l2             ! lane 2: coefficient base
2212 
2213         fmuld   %f10,%f34,%f14
2214         ldd     [%l1+0x10],%f16
2215         add     %fp,%o4,%o4             ! lane 1: %o4 now addresses stack slots
2216 
2217         faddd   %f0,%f2,%f0             ! lane 0: add the correction term
2218         andn    %l0,0x1f,%l0            ! lane 0: clear low 5 bits of the offset
2219 
2220         fmuld   %f20,%f36,%f24
2221         ldd     [%l2+0x10],%f26
2222         add     %fp,%o5,%o5             ! lane 2: %o5 now addresses stack slots
2223 
2224         faddd   %f14,%f16,%f14
2225         ldd     [%l1+0x20],%f34
2226 
2227         fmuld   %f0,%f0,%f2             ! lane 0: z = x*x
2228         add     %l0,%o3,%l0             ! lane 0: final table byte offset
2229 
2230         faddd   %f24,%f26,%f24
2231         ldd     [%l2+0x20],%f36
2232 
2233         fmuld   %f10,%f14,%f14
2234         ldd     [%l1+0x30],%f16
2235 
2236         fmuld   %f2,%f58,%f6
2237         ldd     [%l3+%l0],%f32          ! lane 0: first table value
2238 
2239         fmuld   %f20,%f24,%f24
2240         ldd     [%l2+0x30],%f26
2241 
2242         faddd   %f14,%f34,%f14
2243         ldd     [%o4+x1_1],%f34
2244 
2245         faddd   %f6,%f56,%f6
2246         fmuld   %f2,%f62,%f4
2247 
2248         faddd   %f24,%f36,%f24
2249         ldd     [%o5+x2_1],%f36
2250 
2251         fmuld   %f10,%f14,%f14
2252         std     %f12,[%fp+y1_0]         ! spill lane 1 correction; reloaded via %o4 below
2253 
2254         fmuld   %f2,%f6,%f6
2255         faddd   %f4,%f60,%f4
2256 
2257         fmuld   %f20,%f24,%f24
2258         std     %f22,[%fp+y2_0]         ! spill lane 2 correction; reloaded via %o5 below
2259 
2260         faddd   %f14,%f16,%f14
2261 
2262         faddd   %f6,%f54,%f6
2263         fmuld   %f2,%f4,%f4
2264         ldd     [%g1+%l0],%f2           ! lane 0: second table value
2265 
2266         faddd   %f24,%f26,%f24
2267 
2268         fmuld   %f10,%f14,%f14
2269 
2270         fmuld   %f0,%f6,%f6
2271         ldd     [%l4+%l0],%f0           ! lane 0: value from the %l4 table
2272 
2273         fmuld   %f4,%f32,%f4
2274 
2275         fmuld   %f20,%f24,%f24
2276 
2277         fmuld   %f6,%f2,%f6
2278 
2279         fmuld   %f34,%f14,%f14
2280         ldd     [%o4+y1_0],%f12
2281 
2282         fmuld   %f36,%f24,%f24
2283         ldd     [%o5+y2_0],%f22
2284 
2285         faddd   %f6,%f4,%f6
2286 
2287         faddd   %f14,%f12,%f14
2288 
2289         faddd   %f24,%f22,%f24
2290 
2291         faddd   %f6,%f0,%f6
2292 
2293         faddd   %f34,%f14,%f16          ! lane 1 result
2294 
2295         faddd   %f36,%f24,%f26          ! lane 2 result
2296         ba,pt   %icc,.FIXSIGN
2297 
2298 ! delay slot
2299         faddd   %f6,%f32,%f6            ! lane 0 result
2300 
2301         .align  32
2302 .CASE4:
             ! Dispatch on bit 1 of %l1 and of %l2 (both set before this label):
             !   %l1 & 2 set               -> .CASE6 (which tests %l2 & 2 again)
             !   %l1 & 2 clear, %l2 & 2 set -> .CASE5
             !   both clear                -> fall through
             ! Fall-through path: lane 0 (%f0) takes the polynomial-only
             ! evaluation, lanes 1 (%f10) and 2 (%f20) take the table-based
             ! evaluation. Results leave in %f6, %f16, %f26 for .FIXSIGN.
2303         fands   %f29,%f28,%f29          ! if (n & 1) clear sign bit
2304         sethi   %hi(0x3fc3c000),%o7
2305         andcc   %l1,2,%g0
2306         bne,pn  %icc,.CASE6
2307 
2308 ! delay slot
             ! Executed on both paths; the condition codes set here are the
             ! ones tested by the bne to .CASE5 below and by the bne at .CASE6.
2309         andcc   %l2,2,%g0
2310         fpadd32s %f10,%f31,%f18
2311         ld      [%fp+x1_1],%l1
2312         bne,pn  %icc,.CASE5
2313 
2314 ! delay slot
2315         add     %l3,8,%g1
2316         ld      [%fp+x2_1],%l2
2317         fpadd32s %f20,%f31,%f28
2318 
2319         fand    %f18,%f44,%f14          ! lane 1: mask with %f44
2320         sub     %l1,%o7,%l1             ! lane 1: start of table offset computation
2321 
2322         fand    %f28,%f44,%f24          ! lane 2: mask with %f44
2323         sub     %l2,%o7,%l2             ! lane 2: start of table offset computation
2324 
2325         fsubd   %f10,%f14,%f10
2326         srl     %l1,10,%l1
2327 
2328         fsubd   %f20,%f24,%f20
2329         srl     %l2,10,%l2
2330 
2331         fmuld   %f0,%f0,%f0             ! lane 0: z = x*x
2332         ldd     [%l5+%o3],%f32
2333         add     %l5,%o3,%l0             ! lane 0: coefficient base
2334 
2335         faddd   %f10,%f12,%f10          ! lane 1: add the correction term
2336         andn    %l1,0x1f,%l1
2337 
2338         faddd   %f20,%f22,%f20          ! lane 2: add the correction term
2339         andn    %l2,0x1f,%l2
2340 
2341         fmuld   %f0,%f32,%f4
2342         ldd     [%l0+0x10],%f6
2343         add     %fp,%o3,%o3             ! lane 0: %o3 now addresses stack slots
2344 
2345         fmuld   %f10,%f10,%f12          ! lane 1: z = x*x
2346         add     %l1,%o4,%l1             ! lane 1: final table byte offset
2347 
2348         fmuld   %f20,%f20,%f22          ! lane 2: z = x*x
2349         add     %l2,%o5,%l2             ! lane 2: final table byte offset
2350 
2351         faddd   %f4,%f6,%f4
2352         ldd     [%l0+0x20],%f32
2353 
2354         fmuld   %f12,%f58,%f16
2355         ldd     [%l3+%l1],%f34
2356 
2357         fmuld   %f22,%f58,%f26
2358         ldd     [%l3+%l2],%f36
2359 
2360         fmuld   %f0,%f4,%f4
2361         ldd     [%l0+0x30],%f6
2362 
2363         faddd   %f16,%f56,%f16
2364         fmuld   %f12,%f62,%f14
2365 
2366         faddd   %f26,%f56,%f26
2367         fmuld   %f22,%f62,%f24
2368 
2369         faddd   %f4,%f32,%f4
2370         ldd     [%o3+x0_1],%f32
2371 
2372         fmuld   %f12,%f16,%f16
2373         faddd   %f14,%f60,%f14
2374 
2375         fmuld   %f22,%f26,%f26
2376         faddd   %f24,%f60,%f24
2377 
2378         fmuld   %f0,%f4,%f4
2379 
2380         faddd   %f16,%f54,%f16
2381         fmuld   %f12,%f14,%f14
2382         ldd     [%g1+%l1],%f12
2383 
2384         faddd   %f26,%f54,%f26
2385         fmuld   %f22,%f24,%f24
2386         ldd     [%g1+%l2],%f22
2387 
2388         faddd   %f4,%f6,%f4
2389 
2390         fmuld   %f10,%f16,%f16
2391         ldd     [%l4+%l1],%f10
2392 
2393         fmuld   %f20,%f26,%f26
2394         ldd     [%l4+%l2],%f20
2395 
2396         fmuld   %f14,%f34,%f14
2397         std     %f2,[%fp+y0_0]          ! spill lane 0 correction; reloaded via %o3 below
2398 
2399         fmuld   %f24,%f36,%f24
2400 
2401         fmuld   %f0,%f4,%f4
2402 
2403         fmuld   %f16,%f12,%f16
2404 
2405         fmuld   %f26,%f22,%f26
2406 
2407         fmuld   %f32,%f4,%f4
2408         ldd     [%o3+y0_0],%f2
2409 
2410         faddd   %f16,%f14,%f16
2411 
2412         faddd   %f26,%f24,%f26
2413 
2414         faddd   %f4,%f2,%f4
2415 
2416         faddd   %f16,%f10,%f16
2417 
2418         faddd   %f26,%f20,%f26
2419 
2420         faddd   %f32,%f4,%f6            ! lane 0 result
2421 
2422         faddd   %f16,%f34,%f16          ! lane 1 result
2423         ba,pt   %icc,.FIXSIGN
2424 
2425 ! delay slot
2426         faddd   %f26,%f36,%f26          ! lane 2 result
2427 
2428         .align  32
2429 .CASE5:
             ! Reached from .CASE4 when %l1 & 2 is clear and %l2 & 2 is set.
             ! Lane 1 (%f10, correction in %f12) takes the table-based
             ! evaluation; lanes 0 (%f0) and 2 (%f20) take the polynomial-only
             ! evaluation. %f18, %l1, %o7 and %g1 were prepared in .CASE4.
             ! Results leave in %f6, %f16, %f26 for .FIXSIGN.
2430         fand    %f18,%f44,%f14          ! lane 1: mask with %f44
2431         sub     %l1,%o7,%l1             ! lane 1: start of table offset computation
2432 
2433         fmuld   %f0,%f0,%f0             ! lane 0: z = x*x
2434         ldd     [%l5+%o3],%f32
2435         add     %l5,%o3,%l0             ! lane 0: coefficient base
2436 
2437         fsubd   %f10,%f14,%f10
2438         srl     %l1,10,%l1
2439 
2440         fmuld   %f20,%f20,%f20          ! lane 2: z = x*x
2441         ldd     [%l5+%o5],%f36
2442         add     %l5,%o5,%l2             ! lane 2: coefficient base
2443 
2444         fmuld   %f0,%f32,%f4
2445         ldd     [%l0+0x10],%f6
2446         add     %fp,%o3,%o3             ! lane 0: %o3 now addresses stack slots
2447 
2448         faddd   %f10,%f12,%f10          ! lane 1: add the correction term
2449         andn    %l1,0x1f,%l1
2450 
2451         fmuld   %f20,%f36,%f24
2452         ldd     [%l2+0x10],%f26
2453         add     %fp,%o5,%o5             ! lane 2: %o5 now addresses stack slots
2454 
2455         faddd   %f4,%f6,%f4
2456         ldd     [%l0+0x20],%f32
2457 
2458         fmuld   %f10,%f10,%f12          ! lane 1: z = x*x
2459         add     %l1,%o4,%l1             ! lane 1: final table byte offset
2460 
2461         faddd   %f24,%f26,%f24
2462         ldd     [%l2+0x20],%f36
2463 
2464         fmuld   %f0,%f4,%f4
2465         ldd     [%l0+0x30],%f6
2466 
2467         fmuld   %f12,%f58,%f16
2468         ldd     [%l3+%l1],%f34
2469 
2470         fmuld   %f20,%f24,%f24
2471         ldd     [%l2+0x30],%f26
2472 
2473         faddd   %f4,%f32,%f4
2474         ldd     [%o3+x0_1],%f32
2475 
2476         faddd   %f16,%f56,%f16
2477         fmuld   %f12,%f62,%f14
2478 
2479         faddd   %f24,%f36,%f24
2480         ldd     [%o5+x2_1],%f36
2481 
2482         fmuld   %f0,%f4,%f4
2483         std     %f2,[%fp+y0_0]          ! spill lane 0 correction; reloaded via %o3 below
2484 
2485         fmuld   %f12,%f16,%f16
2486         faddd   %f14,%f60,%f14
2487 
2488         fmuld   %f20,%f24,%f24
2489         std     %f22,[%fp+y2_0]         ! spill lane 2 correction; reloaded via %o5 below
2490 
2491         faddd   %f4,%f6,%f4
2492 
2493         faddd   %f16,%f54,%f16
2494         fmuld   %f12,%f14,%f14
2495         ldd     [%g1+%l1],%f12
2496 
2497         faddd   %f24,%f26,%f24
2498 
2499         fmuld   %f0,%f4,%f4
2500 
2501         fmuld   %f10,%f16,%f16
2502         ldd     [%l4+%l1],%f10
2503 
2504         fmuld   %f14,%f34,%f14
2505 
2506         fmuld   %f20,%f24,%f24
2507 
2508         fmuld   %f16,%f12,%f16
2509 
2510         fmuld   %f32,%f4,%f4
2511         ldd     [%o3+y0_0],%f2
2512 
2513         fmuld   %f36,%f24,%f24
2514         ldd     [%o5+y2_0],%f22
2515 
2516         faddd   %f16,%f14,%f16
2517 
2518         faddd   %f4,%f2,%f4
2519 
2520         faddd   %f24,%f22,%f24
2521 
2522         faddd   %f16,%f10,%f16
2523 
2524         faddd   %f32,%f4,%f6            ! lane 0 result
2525 
2526         faddd   %f36,%f24,%f26          ! lane 2 result
2527         ba,pt   %icc,.FIXSIGN
2528 
2529 ! delay slot
2530         faddd   %f16,%f34,%f16          ! lane 1 result
2531 
2532         .align  32
2533 .CASE6:
             ! Reached from .CASE4 when %l1 & 2 is set. The condition codes
             ! still hold the result of "andcc %l2,2,%g0" from the delay slot
             ! of that branch: if %l2 & 2 is also set, go to .CASE7.
             ! Otherwise lane 2 (%f20, correction in %f22) takes the
             ! table-based evaluation and lanes 0 (%f0) and 1 (%f10) take the
             ! polynomial-only evaluation. Results leave in %f6, %f16, %f26.
2534         ld      [%fp+x2_1],%l2
2535         add     %l3,8,%g1
2536         bne,pn  %icc,.CASE7
2537 ! delay slot
2538         fpadd32s %f20,%f31,%f28
2539 
2540         fand    %f28,%f44,%f24          ! lane 2: mask with %f44
2541         ldd     [%l5+%o3],%f32
2542         add     %l5,%o3,%l0             ! lane 0: coefficient base
2543 
2544         fmuld   %f0,%f0,%f0             ! lane 0: z = x*x
2545         sub     %l2,%o7,%l2             ! lane 2: start of table offset computation
2546 
2547         fsubd   %f20,%f24,%f20
2548         srl     %l2,10,%l2
2549 
2550         fmuld   %f10,%f10,%f10          ! lane 1: z = x*x
2551         ldd     [%l5+%o4],%f34
2552         add     %l5,%o4,%l1             ! lane 1: coefficient base
2553 
2554         fmuld   %f0,%f32,%f4
2555         ldd     [%l0+0x10],%f6
2556         add     %fp,%o3,%o3             ! lane 0: %o3 now addresses stack slots
2557 
2558         faddd   %f20,%f22,%f20          ! lane 2: add the correction term
2559         andn    %l2,0x1f,%l2
2560 
2561         fmuld   %f10,%f34,%f14
2562         ldd     [%l1+0x10],%f16
2563         add     %fp,%o4,%o4             ! lane 1: %o4 now addresses stack slots
2564 
2565         faddd   %f4,%f6,%f4
2566         ldd     [%l0+0x20],%f32
2567 
2568         fmuld   %f20,%f20,%f22          ! lane 2: z = x*x
2569         add     %l2,%o5,%l2             ! lane 2: final table byte offset
2570 
2571         faddd   %f14,%f16,%f14
2572         ldd     [%l1+0x20],%f34
2573 
2574         fmuld   %f0,%f4,%f4
2575         ldd     [%l0+0x30],%f6
2576 
2577         fmuld   %f22,%f58,%f26
2578         ldd     [%l3+%l2],%f36
2579 
2580         fmuld   %f10,%f14,%f14
2581         ldd     [%l1+0x30],%f16
2582 
2583         faddd   %f4,%f32,%f4
2584         ldd     [%o3+x0_1],%f32
2585 
2586         faddd   %f26,%f56,%f26
2587         fmuld   %f22,%f62,%f24
2588 
2589         faddd   %f14,%f34,%f14
2590         ldd     [%o4+x1_1],%f34
2591 
2592         fmuld   %f0,%f4,%f4
2593         std     %f2,[%fp+y0_0]          ! spill lane 0 correction; reloaded via %o3 below
2594 
2595         fmuld   %f22,%f26,%f26
2596         faddd   %f24,%f60,%f24
2597 
2598         fmuld   %f10,%f14,%f14
2599         std     %f12,[%fp+y1_0]         ! spill lane 1 correction; reloaded via %o4 below
2600 
2601         faddd   %f4,%f6,%f4
2602 
2603         faddd   %f26,%f54,%f26
2604         fmuld   %f22,%f24,%f24
2605         ldd     [%g1+%l2],%f22
2606 
2607         faddd   %f14,%f16,%f14
2608 
2609         fmuld   %f0,%f4,%f4
2610 
2611         fmuld   %f20,%f26,%f26
2612         ldd     [%l4+%l2],%f20
2613 
2614         fmuld   %f24,%f36,%f24
2615 
2616         fmuld   %f10,%f14,%f14
2617 
2618         fmuld   %f26,%f22,%f26
2619 
2620         fmuld   %f32,%f4,%f4
2621         ldd     [%o3+y0_0],%f2
2622 
2623         fmuld   %f34,%f14,%f14
2624         ldd     [%o4+y1_0],%f12
2625 
2626         faddd   %f26,%f24,%f26
2627 
2628         faddd   %f4,%f2,%f4
2629 
2630         faddd   %f14,%f12,%f14
2631 
2632         faddd   %f26,%f20,%f26
2633 
2634         faddd   %f32,%f4,%f6            ! lane 0 result
2635 
2636         faddd   %f34,%f14,%f16          ! lane 1 result
2637         ba,pt   %icc,.FIXSIGN
2638 
2639 ! delay slot
2640         faddd   %f26,%f36,%f26          ! lane 2 result
2641 
2642         .align  32
2643 .CASE7:
             ! Reached from .CASE6 when %l2 & 2 is set as well. All three
             ! lanes (%f0, %f10, %f20) take the polynomial-only evaluation:
             ! for each lane, z = x*x is run through four coefficients read at
             ! offsets 0x00/0x10/0x20/0x30 from %l5 plus the lane's offset
             ! (%o3/%o4/%o5), multiplied by z once more, then combined with a
             ! multiplier and an addend fetched from the stack via %fp plus
             ! that same offset. Results leave in %f6, %f16, %f26.
2644         fmuld   %f0,%f0,%f0             ! lane 0: z = x*x
2645         ldd     [%l5+%o3],%f32
2646         add     %l5,%o3,%l0             ! lane 0: coefficient base
2647 
2648         fmuld   %f10,%f10,%f10          ! lane 1: z = x*x
2649         ldd     [%l5+%o4],%f34
2650         add     %l5,%o4,%l1             ! lane 1: coefficient base
2651 
2652         fmuld   %f20,%f20,%f20          ! lane 2: z = x*x
2653         ldd     [%l5+%o5],%f36
2654         add     %l5,%o5,%l2             ! lane 2: coefficient base
2655 
2656         fmuld   %f0,%f32,%f4
2657         ldd     [%l0+0x10],%f6
2658         add     %fp,%o3,%o3             ! %o3/%o4/%o5 now address stack slots
2659 
2660         fmuld   %f10,%f34,%f14
2661         ldd     [%l1+0x10],%f16
2662         add     %fp,%o4,%o4
2663 
2664         fmuld   %f20,%f36,%f24
2665         ldd     [%l2+0x10],%f26
2666         add     %fp,%o5,%o5
2667 
2668         faddd   %f4,%f6,%f4
2669         ldd     [%l0+0x20],%f32
2670 
2671         faddd   %f14,%f16,%f14
2672         ldd     [%l1+0x20],%f34
2673 
2674         faddd   %f24,%f26,%f24
2675         ldd     [%l2+0x20],%f36
2676 
2677         fmuld   %f0,%f4,%f4
2678         ldd     [%l0+0x30],%f6
2679 
2680         fmuld   %f10,%f14,%f14
2681         ldd     [%l1+0x30],%f16
2682 
2683         fmuld   %f20,%f24,%f24
2684         ldd     [%l2+0x30],%f26
2685 
2686         faddd   %f4,%f32,%f4
2687         ldd     [%o3+x0_1],%f32
2688 
2689         faddd   %f14,%f34,%f14
2690         ldd     [%o4+x1_1],%f34
2691 
2692         faddd   %f24,%f36,%f24
2693         ldd     [%o5+x2_1],%f36
2694 
2695         fmuld   %f0,%f4,%f4
2696         std     %f2,[%fp+y0_0]          ! spill the three correction terms;
2697 
2698         fmuld   %f10,%f14,%f14
2699         std     %f12,[%fp+y1_0]         ! they are reloaded through the
2700 
2701         fmuld   %f20,%f24,%f24
2702         std     %f22,[%fp+y2_0]         ! offset pointers %o3/%o4/%o5 below
2703 
2704         faddd   %f4,%f6,%f4
2705 
2706         faddd   %f14,%f16,%f14
2707 
2708         faddd   %f24,%f26,%f24
2709 
2710         fmuld   %f0,%f4,%f4
2711 
2712         fmuld   %f10,%f14,%f14
2713 
2714         fmuld   %f20,%f24,%f24
2715 
2716         fmuld   %f32,%f4,%f4
2717         ldd     [%o3+y0_0],%f2
2718 
2719         fmuld   %f34,%f14,%f14
2720         ldd     [%o4+y1_0],%f12
2721 
2722         fmuld   %f36,%f24,%f24
2723         ldd     [%o5+y2_0],%f22
2724 
2725         faddd   %f4,%f2,%f4
2726 
2727         faddd   %f14,%f12,%f14
2728 
2729         faddd   %f24,%f22,%f24
2730 
2731         faddd   %f32,%f4,%f6            ! lane 0 result
2732 
2733         faddd   %f34,%f14,%f16          ! lane 1 result
2734         ba,pt   %icc,.FIXSIGN
2735 
2736 ! delay slot
2737         faddd   %f36,%f24,%f26          ! lane 2 result
2738 
2739 
2740         .align  32
2741 .ENDLOOP2:
             ! Reached from .SKIP2/.BIG2 when the count in %i0 runs out.
             ! Finishes the value held in %f10 on its own (no interleaving),
             ! stores the result through %o1, then falls through into
             ! .ENDLOOP1.
             ! Steps: form n from x*%f40 by adding and subtracting %f42, then
             ! subtract n*%f46, n*%f48, n*%f50 and n*%f52 in turn while
             ! carrying the rounding error, giving a reduced x and a
             ! correction y. The code then picks the table-based path or the
             ! polynomial-only path (label 1) and applies the sign (label 2).
2742         fmuld   %f10,%f40,%f12
2743         add     %l5,thresh,%g1
2744         faddd   %f12,%f42,%f12
2745         st      %f13,[%fp+n1]           ! low word of the sum is kept as integer n
2746         fsubd   %f12,%f42,%f12          ! n
2747         fmuld   %f12,%f46,%f14
2748         fsubd   %f10,%f14,%f14
2749         fmuld   %f12,%f48,%f16
2750         fsubd   %f14,%f16,%f10
2751         ld      [%fp+n1],%o4 ; add      %o4,1,%o4
2752         fsubd   %f14,%f10,%f34
2753         and     %o4,1,%o4
2754         fsubd   %f34,%f16,%f34
2755         fmuld   %f12,%f50,%f18
2756         sll     %o4,3,%o4               ! %o4 = ((n + 1) & 1) * 8
2757         fsubd   %f18,%f34,%f18
2758         ld      [%g1+%o4],%f16          ! threshold word selected by %o4
2759         fsubd   %f10,%f18,%f14
2760         fsubd   %f10,%f14,%f34
2761         add     %l5,thresh+4,%o7
2762         fsubd   %f34,%f18,%f34
2763         fmuld   %f12,%f52,%f12
2764         fsubd   %f12,%f34,%f12
2765         ld      [%o7+%o4],%f18          ! word following the threshold; used as sign mask below
2766         fsubd   %f14,%f12,%f10          ! x
2767         fsubd   %f14,%f10,%f14
2768         fands   %f10,%f30,%f19          ! save signbit
2769         fabsd   %f10,%f10
2770         std     %f10,[%fp+x1_1]
2771         fsubd   %f14,%f12,%f12          ! y
2772         fcmpgt32 %f16,%f10,%l1
2773         fxors   %f12,%f19,%f12
2774         fands   %f19,%f18,%f19          ! if (n & 1) clear sign bit
2775         andcc   %l1,2,%g0
2776         bne,pn  %icc,1f
2777 ! delay slot
2778         nop
             ! Table-based path (bit 1 of the fcmpgt32 result clear).
2779         fpadd32s %f10,%f31,%f18
2780         ld      [%fp+x1_1],%l1
2781         fand    %f18,%f44,%f14
2782         sethi   %hi(0x3fc3c000),%o7
2783         add     %l3,8,%g1
2784         fsubd   %f10,%f14,%f10
2785         sub     %l1,%o7,%l1
2786         srl     %l1,10,%l1
2787         faddd   %f10,%f12,%f10
2788         andn    %l1,0x1f,%l1
2789         fmuld   %f10,%f10,%f12
2790         add     %l1,%o4,%l1             ! table byte offset
2791         fmuld   %f12,%f58,%f16
2792         ldd     [%l3+%l1],%f34
2793         faddd   %f16,%f56,%f16
2794         fmuld   %f12,%f62,%f14
2795         fmuld   %f12,%f16,%f16
2796         faddd   %f14,%f60,%f14
2797         faddd   %f16,%f54,%f16
2798         fmuld   %f12,%f14,%f14
2799         ldd     [%g1+%l1],%f12
2800         fmuld   %f10,%f16,%f16
2801         ldd     [%l4+%l1],%f10
2802         fmuld   %f14,%f34,%f14
2803         fmuld   %f16,%f12,%f16
2804         faddd   %f16,%f14,%f16
2805         faddd   %f16,%f10,%f16
2806         ba,pt   %icc,2f
2807         faddd   %f16,%f34,%f16          ! delay slot: result in %f16
2808 1:
             ! Polynomial-only path (bit 1 of the fcmpgt32 result set).
2809         fmuld   %f10,%f10,%f10
2810         ldd     [%l5+%o4],%f34
2811         add     %l5,%o4,%l1
2812         fmuld   %f10,%f34,%f14
2813         ldd     [%l1+0x10],%f16
2814         add     %fp,%o4,%o4
2815         faddd   %f14,%f16,%f14
2816         ldd     [%l1+0x20],%f34
2817         fmuld   %f10,%f14,%f14
2818         ldd     [%l1+0x30],%f16
2819         faddd   %f14,%f34,%f14
2820         ldd     [%o4+x1_1],%f34
2821         fmuld   %f10,%f14,%f14
2822         std     %f12,[%fp+y1_0]
2823         faddd   %f14,%f16,%f14
2824         fmuld   %f10,%f14,%f14
2825         fmuld   %f34,%f14,%f14
2826         ldd     [%o4+y1_0],%f12
2827         faddd   %f14,%f12,%f14
2828         faddd   %f34,%f14,%f16          ! result in %f16
2829 2:
             ! Sign fix-up: a word chosen by (n + 1) & 2 is XORed into the
             ! saved sign in %f19, which is then ORed into the result.
2830         add     %l5,thresh-4,%g1
2831         ld      [%fp+n1],%o4 ; add      %o4,1,%o4
2832         and     %o4,2,%o4
2833         sll     %o4,2,%o4               ! %o4 = 0 or 8
2834         ld      [%g1+%o4],%f18
2835         fxors   %f19,%f18,%f19
2836         fors    %f16,%f19,%f16          ! tack on sign
2837         st      %f16,[%o1]
2838         st      %f17,[%o1+4]
2839 
2840 .ENDLOOP1:
             ! Reached from .SKIP1/.BIG1 when the count in %i0 runs out, or by
             ! falling through from .ENDLOOP2. Same sequence as .ENDLOOP2 but
             ! for the value in %f0: form n, reduce, choose the table-based or
             ! polynomial-only path, apply the sign, and store through %o0.
             ! Falls through into .ENDLOOP0.
2841         fmuld   %f0,%f40,%f2
2842         add     %l5,thresh,%g1
2843         faddd   %f2,%f42,%f2
2844         st      %f3,[%fp+n0]            ! low word of the sum is kept as integer n
2845         fsubd   %f2,%f42,%f2            ! n
2846         fmuld   %f2,%f46,%f4
2847         fsubd   %f0,%f4,%f4
2848         fmuld   %f2,%f48,%f6
2849         fsubd   %f4,%f6,%f0
2850         ld      [%fp+n0],%o3 ; add      %o3,1,%o3
2851         fsubd   %f4,%f0,%f32
2852         and     %o3,1,%o3
2853         fsubd   %f32,%f6,%f32
2854         fmuld   %f2,%f50,%f8
2855         sll     %o3,3,%o3               ! %o3 = ((n + 1) & 1) * 8
2856         fsubd   %f8,%f32,%f8
2857         ld      [%g1+%o3],%f6           ! threshold word selected by %o3
2858         fsubd   %f0,%f8,%f4
2859         fsubd   %f0,%f4,%f32
2860         add     %l5,thresh+4,%o7
2861         fsubd   %f32,%f8,%f32
2862         fmuld   %f2,%f52,%f2
2863         fsubd   %f2,%f32,%f2
2864         ld      [%o7+%o3],%f8           ! word following the threshold; used as sign mask below
2865         fsubd   %f4,%f2,%f0             ! x
2866         fsubd   %f4,%f0,%f4
2867         fands   %f0,%f30,%f9            ! save signbit
2868         fabsd   %f0,%f0
2869         std     %f0,[%fp+x0_1]
2870         fsubd   %f4,%f2,%f2             ! y
2871         fcmpgt32 %f6,%f0,%l0
2872         fxors   %f2,%f9,%f2
2873         fands   %f9,%f8,%f9             ! if (n & 1) clear sign bit
2874         andcc   %l0,2,%g0
2875         bne,pn  %icc,1f
2876 ! delay slot
2877         nop
             ! Table-based path (bit 1 of the fcmpgt32 result clear).
2878         fpadd32s %f0,%f31,%f8
2879         ld      [%fp+x0_1],%l0
2880         fand    %f8,%f44,%f4
2881         sethi   %hi(0x3fc3c000),%o7
2882         add     %l3,8,%g1
2883         fsubd   %f0,%f4,%f0
2884         sub     %l0,%o7,%l0
2885         srl     %l0,10,%l0
2886         faddd   %f0,%f2,%f0
2887         andn    %l0,0x1f,%l0
2888         fmuld   %f0,%f0,%f2
2889         add     %l0,%o3,%l0             ! table byte offset
2890         fmuld   %f2,%f58,%f6
2891         ldd     [%l3+%l0],%f32
2892         faddd   %f6,%f56,%f6
2893         fmuld   %f2,%f62,%f4
2894         fmuld   %f2,%f6,%f6
2895         faddd   %f4,%f60,%f4
2896         faddd   %f6,%f54,%f6
2897         fmuld   %f2,%f4,%f4
2898         ldd     [%g1+%l0],%f2
2899         fmuld   %f0,%f6,%f6
2900         ldd     [%l4+%l0],%f0
2901         fmuld   %f4,%f32,%f4
2902         fmuld   %f6,%f2,%f6
2903         faddd   %f6,%f4,%f6
2904         faddd   %f6,%f0,%f6
2905         ba,pt   %icc,2f
2906         faddd   %f6,%f32,%f6            ! delay slot: result in %f6
2907 1:
             ! Polynomial-only path (bit 1 of the fcmpgt32 result set).
2908         fmuld   %f0,%f0,%f0
2909         ldd     [%l5+%o3],%f32
2910         add     %l5,%o3,%l0
2911         fmuld   %f0,%f32,%f4
2912         ldd     [%l0+0x10],%f6
2913         add     %fp,%o3,%o3
2914         faddd   %f4,%f6,%f4
2915         ldd     [%l0+0x20],%f32
2916         fmuld   %f0,%f4,%f4
2917         ldd     [%l0+0x30],%f6
2918         faddd   %f4,%f32,%f4
2919         ldd     [%o3+x0_1],%f32
2920         fmuld   %f0,%f4,%f4
2921         std     %f2,[%fp+y0_0]
2922         faddd   %f4,%f6,%f4
2923         fmuld   %f0,%f4,%f4
2924         fmuld   %f32,%f4,%f4
2925         ldd     [%o3+y0_0],%f2
2926         faddd   %f4,%f2,%f4
2927         faddd   %f32,%f4,%f6            ! result in %f6
2928 2:
             ! Sign fix-up: a word chosen by (n + 1) & 2 is XORed into the
             ! saved sign in %f9, which is then ORed into the result.
2929         add     %l5,thresh-4,%g1
2930         ld      [%fp+n0],%o3 ; add      %o3,1,%o3
2931         and     %o3,2,%o3
2932         sll     %o3,2,%o3               ! %o3 = 0 or 8
2933         ld      [%g1+%o3],%f8
2934         fxors   %f9,%f8,%f9
2935         fors    %f6,%f9,%f6             ! tack on sign
2936         st      %f6,[%o0]
2937         st      %f7,[%o0+4]
2938 
2939 .ENDLOOP0:
             ! Common exit. If LIM_l6 is zero, return at once. Otherwise at
             ! least one argument was flagged by .BIG0/.BIG1/.BIG2, so the
             ! saved call arguments are reloaded from the frame and handed to
             ! __vlibm_vcos_big.
2940 
2941 ! check for huge arguments remaining
2942 
2943         tst     LIM_l6
2944         be,pt   %icc,.exit
2945 ! delay slot
2946         nop
2947 
2948 ! ========== huge range (use C code) ==========
2949 
             ! Outgoing arguments: %o0 = nsave, %o1 = xsave, %o2 = sxsave,
             ! %o3 = ysave, %o4 = sysave, %o5 = %l7. The two pointers are
             ! loaded as 64-bit values when built for __sparcv9.
2950 #ifdef __sparcv9
2951         ldx     [%fp+xsave],%o1
2952         ldx     [%fp+ysave],%o3
2953 #else
2954         ld      [%fp+xsave],%o1
2955         ld      [%fp+ysave],%o3
2956 #endif
2957         ld      [%fp+nsave],%o0
2958         ld      [%fp+sxsave],%o2
2959         ld      [%fp+sysave],%o4
2960         sra     %o2,0,%o2               ! sign-extend for V9
2961         sra     %o4,0,%o4
2962         call    __vlibm_vcos_big
2963         mov     %l7,%o5                 ! delay slot
2964 
2965 .exit:
2966         ret
2967         restore
2968 
2969 
2970         .align  32
2971 .SKIP0:
             ! Drops the current lane-0 element and moves on to the next one.
             ! Decrements the count in %i0; if nothing is left, goes to
             ! .ENDLOOP0. Otherwise reuses the high word already held in
             ! %l1/%f10 as the new lane-0 high word, loads the low word from
             ! [%i1+4], and re-enters .LOOP0.
2972         addcc   %i0,-1,%i0
2973         ble,pn  %icc,.ENDLOOP0
2974 ! delay slot, harmless if branch taken
2975         add     %i3,%i4,%i3             ! y += stridey
2976         andn    %l1,%i5,%l0             ! hx &= ~0x80000000
2977         fmovs   %f10,%f0
2978         ld      [%i1+4],%f1
2979         ba,pt   %icc,.LOOP0
2980 ! delay slot
2981         add     %i1,%i2,%i1             ! x += stridex
2982 
2983 
2984         .align  32
2985 .SKIP1:
             ! Drops the current lane-1 element and moves on to the next one.
             ! Decrements the count in %i0; if nothing is left, goes to
             ! .ENDLOOP1. Otherwise reuses the high word already held in
             ! %l2/%f20 as the new lane-1 high word, loads the low word from
             ! [%i1+4], and re-enters .LOOP1.
2986         addcc   %i0,-1,%i0
2987         ble,pn  %icc,.ENDLOOP1
2988 ! delay slot, harmless if branch taken
2989         add     %i3,%i4,%i3             ! y += stridey
2990         andn    %l2,%i5,%l1             ! hx &= ~0x80000000
2991         fmovs   %f20,%f10
2992         ld      [%i1+4],%f11
2993         ba,pt   %icc,.LOOP1
2994 ! delay slot
2995         add     %i1,%i2,%i1             ! x += stridex
2996 
2997 
2998         .align  32
2999 .SKIP2:
             ! Drops the current lane-2 element and moves on to the next one.
             ! Decrements the count in %i0; if nothing is left, goes to
             ! .ENDLOOP2. Unlike .SKIP0/.SKIP1 there is no preloaded value to
             ! reuse, so both words of the next x are loaded from [%i1]
             ! (the high word into %l2 as well) before re-entering .LOOP2.
3000         addcc   %i0,-1,%i0
3001         ble,pn  %icc,.ENDLOOP2
3002 ! delay slot, harmless if branch taken
3003         add     %i3,%i4,%i3             ! y += stridey
3004         ld      [%i1],%l2
3005         ld      [%i1],%f20
3006         ld      [%i1+4],%f21
3007         andn    %l2,%i5,%l2             ! hx &= ~0x80000000
3008         ba,pt   %icc,.LOOP2
3009 ! delay slot
3010         add     %i1,%i2,%i1             ! x += stridex
3011 
3012 
3013         .align  32
3014 .BIG0:
             ! Lane-0 argument outside the range handled inline (%l0 holds
             ! its high word with the sign cleared).
             !   hx <  0x7ff00000: set LIM_l6 from %l7 so .ENDLOOP0 calls
             !                     the big-argument routine, and store
             !                     nothing here.
             !   hx >= 0x7ff00000: store x - x through %o0.
             ! Then advance to the next element, or go to .ENDLOOP0 if the
             ! count in %i0 is exhausted.
3015         sethi   %hi(0x7ff00000),%o7
3016         cmp     %l0,%o7
3017         bl,a,pt %icc,1f                 ! if hx < 0x7ff00000
3018 ! delay slot, annulled if branch not taken
3019         mov     %l7,LIM_l6      ! set biguns flag or
3020         fsubd   %f0,%f0,%f0             ! y = x - x
3021         st      %f0,[%o0]
3022         st      %f1,[%o0+4]
3023 1:
3024         addcc   %i0,-1,%i0
3025         ble,pn  %icc,.ENDLOOP0
3026 ! delay slot, harmless if branch taken
3027         andn    %l1,%i5,%l0             ! hx &= ~0x80000000
3028         fmovd   %f10,%f0                ! next lane-0 value comes from %f10
3029         ba,pt   %icc,.LOOP0
3030 ! delay slot
3031         add     %i1,%i2,%i1             ! x += stridex
3032 
3033 
3034         .align  32
3035 .BIG1:
             ! Lane-1 argument outside the range handled inline (%l1 holds
             ! its high word with the sign cleared).
             !   hx <  0x7ff00000: set LIM_l6 from %l7 so .ENDLOOP0 calls
             !                     the big-argument routine, and store
             !                     nothing here.
             !   hx >= 0x7ff00000: store x - x through %o1.
             ! Then advance to the next element, or go to .ENDLOOP1 if the
             ! count in %i0 is exhausted.
3036         sethi   %hi(0x7ff00000),%o7
3037         cmp     %l1,%o7
3038         bl,a,pt %icc,1f                 ! if hx < 0x7ff00000
3039 ! delay slot, annulled if branch not taken
3040         mov     %l7,LIM_l6              ! set biguns flag or
3041         fsubd   %f10,%f10,%f10          ! y = x - x
3042         st      %f10,[%o1]
3043         st      %f11,[%o1+4]
3044 1:
3045         addcc   %i0,-1,%i0
3046         ble,pn  %icc,.ENDLOOP1
3047 ! delay slot, harmless if branch taken
3048         andn    %l2,%i5,%l1             ! hx &= ~0x80000000
3049         fmovd   %f20,%f10               ! next lane-1 value comes from %f20
3050         ba,pt   %icc,.LOOP1
3051 ! delay slot
3052         add     %i1,%i2,%i1             ! x += stridex
3053 
3054 
3055         .align  32
3056 .BIG2:
             ! Lane-2 argument outside the range handled inline (%l2 holds
             ! its high word with the sign cleared).
             !   hx <  0x7ff00000: set LIM_l6 from %l7 so .ENDLOOP0 calls
             !                     the big-argument routine, and store
             !                     nothing here.
             !   hx >= 0x7ff00000: store x - x through %o2.
             ! Then load the next element from [%i1] and re-enter .LOOP2, or
             ! go to .ENDLOOP2 if the count in %i0 is exhausted.
3057         sethi   %hi(0x7ff00000),%o7
3058         cmp     %l2,%o7
3059         bl,a,pt %icc,1f                 ! if hx < 0x7ff00000
3060 ! delay slot, annulled if branch not taken
3061         mov     %l7,LIM_l6              ! set biguns flag or
3062         fsubd   %f20,%f20,%f20          ! y = x - x
3063         st      %f20,[%o2]
3064         st      %f21,[%o2+4]
3065 1:
3066         addcc   %i0,-1,%i0
3067         ble,pn  %icc,.ENDLOOP2
3068 ! delay slot
3069         nop
3070         ld      [%i1],%l2
3071         ld      [%i1],%f20
3072         ld      [%i1+4],%f21
3073         andn    %l2,%i5,%l2             ! hx &= ~0x80000000
3074         ba,pt   %icc,.LOOP2
3075 ! delay slot
3076         add     %i1,%i2,%i1             ! x += stridex
3077 
3078         SET_SIZE(__vcos)
3079