1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 
  10 # October 2005.
  11 #
  12 # Montgomery multiplication routine for x86_64. While it gives modest
  13 # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
  14 # than twice, >2x, as fast. Most common rsa1024 sign is improved by
  15 # respectful 50%. It remains to be seen if loop unrolling and
  16 # dedicated squaring routine can provide further improvement...
  17 
  18 # July 2011.
  19 #
  20 # Add dedicated squaring procedure. Performance improvement varies
  21 # from platform to platform, but in average it's ~5%/15%/25%/33%
  22 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
  23 
  24 # August 2011.
  25 #
  26 # Unroll and modulo-schedule inner loops in such manner that they
  27 # are "fallen through" for input lengths of 8, which is critical for
  28 # 1024-bit RSA *sign*. Average performance improvement in comparison
  29 # to *initial* version of this module from 2005 is ~0%/30%/40%/45%
  30 # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
  31 
  32 $flavour = shift;
  33 $output  = shift;
  34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  35 
  36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  37 
  38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  41 die "can't locate x86_64-xlate.pl";
  42 
  43 open OUT,"| \"$^X\" $xlate $flavour $output";
  44 *STDOUT=*OUT;
  45 
  46 # int bn_mul_mont(
  47 $rp="%rdi";     # BN_ULONG *rp,
  48 $ap="%rsi";     # const BN_ULONG *ap,
  49 $bp="%rdx";     # const BN_ULONG *bp,
  50 $np="%rcx";     # const BN_ULONG *np,
  51 $n0="%r8";      # const BN_ULONG *n0,
  52 $num="%r9";     # int num);
  53 $lo0="%r10";
  54 $hi0="%r11";
  55 $hi1="%r13";
  56 $i="%r14";
  57 $j="%r15";
  58 $m0="%rbx";
  59 $m1="%rbp";
  60 
  61 $code=<<___;
  62 .text
  63 
  64 .globl  bn_mul_mont
  65 .type   bn_mul_mont,\@function,6
  66 .align  16
  67 bn_mul_mont:
  68         test    \$3,${num}d
  69         jnz     .Lmul_enter
  70         cmp     \$8,${num}d
  71         jb      .Lmul_enter
  72         cmp     $ap,$bp
  73         jne     .Lmul4x_enter
  74         jmp     .Lsqr4x_enter
  75 
  76 .align  16
  77 .Lmul_enter:
  78         push    %rbx
  79         push    %rbp
  80         push    %r12
  81         push    %r13
  82         push    %r14
  83         push    %r15
  84 
  85         mov     ${num}d,${num}d
  86         lea     2($num),%r10
  87         mov     %rsp,%r11
  88         neg     %r10
  89         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+2))
  90         and     \$-1024,%rsp            # minimize TLB usage
  91 
  92         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
  93 .Lmul_body:
  94         mov     $bp,%r12                # reassign $bp
  95 ___
  96                 $bp="%r12";
  97 $code.=<<___;
  98         mov     ($n0),$n0               # pull n0[0] value
  99         mov     ($bp),$m0               # m0=bp[0]
 100         mov     ($ap),%rax
 101 
 102         xor     $i,$i                   # i=0
 103         xor     $j,$j                   # j=0
 104 
 105         mov     $n0,$m1
 106         mulq    $m0                     # ap[0]*bp[0]
 107         mov     %rax,$lo0
 108         mov     ($np),%rax
 109 
 110         imulq   $lo0,$m1                # "tp[0]"*n0
 111         mov     %rdx,$hi0
 112 
 113         mulq    $m1                     # np[0]*m1
 114         add     %rax,$lo0               # discarded
 115         mov     8($ap),%rax
 116         adc     \$0,%rdx
 117         mov     %rdx,$hi1
 118 
 119         lea     1($j),$j                # j++
 120         jmp     .L1st_enter
 121 
 122 .align  16
 123 .L1st:
 124         add     %rax,$hi1
 125         mov     ($ap,$j,8),%rax
 126         adc     \$0,%rdx
 127         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
 128         mov     $lo0,$hi0
 129         adc     \$0,%rdx
 130         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 131         mov     %rdx,$hi1
 132 
 133 .L1st_enter:
 134         mulq    $m0                     # ap[j]*bp[0]
 135         add     %rax,$hi0
 136         mov     ($np,$j,8),%rax
 137         adc     \$0,%rdx
 138         lea     1($j),$j                # j++
 139         mov     %rdx,$lo0
 140 
 141         mulq    $m1                     # np[j]*m1
 142         cmp     $num,$j
 143         jne     .L1st
 144 
 145         add     %rax,$hi1
 146         mov     ($ap),%rax              # ap[0]
 147         adc     \$0,%rdx
 148         add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
 149         adc     \$0,%rdx
 150         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 151         mov     %rdx,$hi1
 152         mov     $lo0,$hi0
 153 
 154         xor     %rdx,%rdx
 155         add     $hi0,$hi1
 156         adc     \$0,%rdx
 157         mov     $hi1,-8(%rsp,$num,8)
 158         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 159 
 160         lea     1($i),$i                # i++
 161         jmp     .Louter
 162 .align  16
 163 .Louter:
 164         mov     ($bp,$i,8),$m0          # m0=bp[i]
 165         xor     $j,$j                   # j=0
 166         mov     $n0,$m1
 167         mov     (%rsp),$lo0
 168         mulq    $m0                     # ap[0]*bp[i]
 169         add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
 170         mov     ($np),%rax
 171         adc     \$0,%rdx
 172 
 173         imulq   $lo0,$m1                # tp[0]*n0
 174         mov     %rdx,$hi0
 175 
 176         mulq    $m1                     # np[0]*m1
 177         add     %rax,$lo0               # discarded
 178         mov     8($ap),%rax
 179         adc     \$0,%rdx
 180         mov     8(%rsp),$lo0            # tp[1]
 181         mov     %rdx,$hi1
 182 
 183         lea     1($j),$j                # j++
 184         jmp     .Linner_enter
 185 
 186 .align  16
 187 .Linner:
 188         add     %rax,$hi1
 189         mov     ($ap,$j,8),%rax
 190         adc     \$0,%rdx
 191         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
 192         mov     (%rsp,$j,8),$lo0
 193         adc     \$0,%rdx
 194         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 195         mov     %rdx,$hi1
 196 
 197 .Linner_enter:
 198         mulq    $m0                     # ap[j]*bp[i]
 199         add     %rax,$hi0
 200         mov     ($np,$j,8),%rax
 201         adc     \$0,%rdx
 202         add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
 203         mov     %rdx,$hi0
 204         adc     \$0,$hi0
 205         lea     1($j),$j                # j++
 206 
 207         mulq    $m1                     # np[j]*m1
 208         cmp     $num,$j
 209         jne     .Linner
 210 
 211         add     %rax,$hi1
 212         mov     ($ap),%rax              # ap[0]
 213         adc     \$0,%rdx
 214         add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
 215         mov     (%rsp,$j,8),$lo0
 216         adc     \$0,%rdx
 217         mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
 218         mov     %rdx,$hi1
 219 
 220         xor     %rdx,%rdx
 221         add     $hi0,$hi1
 222         adc     \$0,%rdx
 223         add     $lo0,$hi1               # pull upmost overflow bit
 224         adc     \$0,%rdx
 225         mov     $hi1,-8(%rsp,$num,8)
 226         mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 227 
 228         lea     1($i),$i                # i++
 229         cmp     $num,$i
 230         jl      .Louter
 231 
 232         xor     $i,$i                   # i=0 and clear CF!
 233         mov     (%rsp),%rax             # tp[0]
 234         lea     (%rsp),$ap              # borrow ap for tp
 235         mov     $num,$j                 # j=num
 236         jmp     .Lsub
 237 .align  16
 238 .Lsub:  sbb     ($np,$i,8),%rax
 239         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
 240         mov     8($ap,$i,8),%rax        # tp[i+1]
 241         lea     1($i),$i                # i++
 242         dec     $j                      # doesnn't affect CF!
 243         jnz     .Lsub
 244 
 245         sbb     \$0,%rax                # handle upmost overflow bit
 246         xor     $i,$i
 247         and     %rax,$ap
 248         not     %rax
 249         mov     $rp,$np
 250         and     %rax,$np
 251         mov     $num,$j                 # j=num
 252         or      $np,$ap                 # ap=borrow?tp:rp
 253 .align  16
 254 .Lcopy:                                 # copy or in-place refresh
 255         mov     ($ap,$i,8),%rax
 256         mov     $i,(%rsp,$i,8)          # zap temporary vector
 257         mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
 258         lea     1($i),$i
 259         sub     \$1,$j
 260         jnz     .Lcopy
 261 
 262         mov     8(%rsp,$num,8),%rsi     # restore %rsp
 263         mov     \$1,%rax
 264         mov     (%rsi),%r15
 265         mov     8(%rsi),%r14
 266         mov     16(%rsi),%r13
 267         mov     24(%rsi),%r12
 268         mov     32(%rsi),%rbp
 269         mov     40(%rsi),%rbx
 270         lea     48(%rsi),%rsp
 271 .Lmul_epilogue:
 272         ret
 273 .size   bn_mul_mont,.-bn_mul_mont
 274 ___
 275 {{{
 276 my @A=("%r10","%r11");
 277 my @N=("%r13","%rdi");
 278 $code.=<<___;
 279 .type   bn_mul4x_mont,\@function,6
 280 .align  16
 281 bn_mul4x_mont:
 282 .Lmul4x_enter:
 283         push    %rbx
 284         push    %rbp
 285         push    %r12
 286         push    %r13
 287         push    %r14
 288         push    %r15
 289 
 290         mov     ${num}d,${num}d
 291         lea     4($num),%r10
 292         mov     %rsp,%r11
 293         neg     %r10
 294         lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
 295         and     \$-1024,%rsp            # minimize TLB usage
 296 
 297         mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
 298 .Lmul4x_body:
 299         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
 300         mov     %rdx,%r12               # reassign $bp
 301 ___
 302                 $bp="%r12";
 303 $code.=<<___;
 304         mov     ($n0),$n0               # pull n0[0] value
 305         mov     ($bp),$m0               # m0=bp[0]
 306         mov     ($ap),%rax
 307 
 308         xor     $i,$i                   # i=0
 309         xor     $j,$j                   # j=0
 310 
 311         mov     $n0,$m1
 312         mulq    $m0                     # ap[0]*bp[0]
 313         mov     %rax,$A[0]
 314         mov     ($np),%rax
 315 
 316         imulq   $A[0],$m1               # "tp[0]"*n0
 317         mov     %rdx,$A[1]
 318 
 319         mulq    $m1                     # np[0]*m1
 320         add     %rax,$A[0]              # discarded
 321         mov     8($ap),%rax
 322         adc     \$0,%rdx
 323         mov     %rdx,$N[1]
 324 
 325         mulq    $m0
 326         add     %rax,$A[1]
 327         mov     8($np),%rax
 328         adc     \$0,%rdx
 329         mov     %rdx,$A[0]
 330 
 331         mulq    $m1
 332         add     %rax,$N[1]
 333         mov     16($ap),%rax
 334         adc     \$0,%rdx
 335         add     $A[1],$N[1]
 336         lea     4($j),$j                # j++
 337         adc     \$0,%rdx
 338         mov     $N[1],(%rsp)
 339         mov     %rdx,$N[0]
 340         jmp     .L1st4x
 341 .align  16
 342 .L1st4x:
 343         mulq    $m0                     # ap[j]*bp[0]
 344         add     %rax,$A[0]
 345         mov     -16($np,$j,8),%rax
 346         adc     \$0,%rdx
 347         mov     %rdx,$A[1]
 348 
 349         mulq    $m1                     # np[j]*m1
 350         add     %rax,$N[0]
 351         mov     -8($ap,$j,8),%rax
 352         adc     \$0,%rdx
 353         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 354         adc     \$0,%rdx
 355         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 356         mov     %rdx,$N[1]
 357 
 358         mulq    $m0                     # ap[j]*bp[0]
 359         add     %rax,$A[1]
 360         mov     -8($np,$j,8),%rax
 361         adc     \$0,%rdx
 362         mov     %rdx,$A[0]
 363 
 364         mulq    $m1                     # np[j]*m1
 365         add     %rax,$N[1]
 366         mov     ($ap,$j,8),%rax
 367         adc     \$0,%rdx
 368         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 369         adc     \$0,%rdx
 370         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 371         mov     %rdx,$N[0]
 372 
 373         mulq    $m0                     # ap[j]*bp[0]
 374         add     %rax,$A[0]
 375         mov     ($np,$j,8),%rax
 376         adc     \$0,%rdx
 377         mov     %rdx,$A[1]
 378 
 379         mulq    $m1                     # np[j]*m1
 380         add     %rax,$N[0]
 381         mov     8($ap,$j,8),%rax
 382         adc     \$0,%rdx
 383         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 384         adc     \$0,%rdx
 385         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
 386         mov     %rdx,$N[1]
 387 
 388         mulq    $m0                     # ap[j]*bp[0]
 389         add     %rax,$A[1]
 390         mov     8($np,$j,8),%rax
 391         adc     \$0,%rdx
 392         lea     4($j),$j                # j++
 393         mov     %rdx,$A[0]
 394 
 395         mulq    $m1                     # np[j]*m1
 396         add     %rax,$N[1]
 397         mov     -16($ap,$j,8),%rax
 398         adc     \$0,%rdx
 399         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 400         adc     \$0,%rdx
 401         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
 402         mov     %rdx,$N[0]
 403         cmp     $num,$j
 404         jl      .L1st4x
 405 
 406         mulq    $m0                     # ap[j]*bp[0]
 407         add     %rax,$A[0]
 408         mov     -16($np,$j,8),%rax
 409         adc     \$0,%rdx
 410         mov     %rdx,$A[1]
 411 
 412         mulq    $m1                     # np[j]*m1
 413         add     %rax,$N[0]
 414         mov     -8($ap,$j,8),%rax
 415         adc     \$0,%rdx
 416         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 417         adc     \$0,%rdx
 418         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 419         mov     %rdx,$N[1]
 420 
 421         mulq    $m0                     # ap[j]*bp[0]
 422         add     %rax,$A[1]
 423         mov     -8($np,$j,8),%rax
 424         adc     \$0,%rdx
 425         mov     %rdx,$A[0]
 426 
 427         mulq    $m1                     # np[j]*m1
 428         add     %rax,$N[1]
 429         mov     ($ap),%rax              # ap[0]
 430         adc     \$0,%rdx
 431         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 432         adc     \$0,%rdx
 433         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 434         mov     %rdx,$N[0]
 435 
 436         xor     $N[1],$N[1]
 437         add     $A[0],$N[0]
 438         adc     \$0,$N[1]
 439         mov     $N[0],-8(%rsp,$j,8)
 440         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
 441 
 442         lea     1($i),$i                # i++
 443 .align  4
 444 .Louter4x:
 445         mov     ($bp,$i,8),$m0          # m0=bp[i]
 446         xor     $j,$j                   # j=0
 447         mov     (%rsp),$A[0]
 448         mov     $n0,$m1
 449         mulq    $m0                     # ap[0]*bp[i]
 450         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
 451         mov     ($np),%rax
 452         adc     \$0,%rdx
 453 
 454         imulq   $A[0],$m1               # tp[0]*n0
 455         mov     %rdx,$A[1]
 456 
 457         mulq    $m1                     # np[0]*m1
 458         add     %rax,$A[0]              # "$N[0]", discarded
 459         mov     8($ap),%rax
 460         adc     \$0,%rdx
 461         mov     %rdx,$N[1]
 462 
 463         mulq    $m0                     # ap[j]*bp[i]
 464         add     %rax,$A[1]
 465         mov     8($np),%rax
 466         adc     \$0,%rdx
 467         add     8(%rsp),$A[1]           # +tp[1]
 468         adc     \$0,%rdx
 469         mov     %rdx,$A[0]
 470 
 471         mulq    $m1                     # np[j]*m1
 472         add     %rax,$N[1]
 473         mov     16($ap),%rax
 474         adc     \$0,%rdx
 475         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
 476         lea     4($j),$j                # j+=2
 477         adc     \$0,%rdx
 478         mov     $N[1],(%rsp)            # tp[j-1]
 479         mov     %rdx,$N[0]
 480         jmp     .Linner4x
 481 .align  16
 482 .Linner4x:
 483         mulq    $m0                     # ap[j]*bp[i]
 484         add     %rax,$A[0]
 485         mov     -16($np,$j,8),%rax
 486         adc     \$0,%rdx
 487         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
 488         adc     \$0,%rdx
 489         mov     %rdx,$A[1]
 490 
 491         mulq    $m1                     # np[j]*m1
 492         add     %rax,$N[0]
 493         mov     -8($ap,$j,8),%rax
 494         adc     \$0,%rdx
 495         add     $A[0],$N[0]
 496         adc     \$0,%rdx
 497         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 498         mov     %rdx,$N[1]
 499 
 500         mulq    $m0                     # ap[j]*bp[i]
 501         add     %rax,$A[1]
 502         mov     -8($np,$j,8),%rax
 503         adc     \$0,%rdx
 504         add     -8(%rsp,$j,8),$A[1]
 505         adc     \$0,%rdx
 506         mov     %rdx,$A[0]
 507 
 508         mulq    $m1                     # np[j]*m1
 509         add     %rax,$N[1]
 510         mov     ($ap,$j,8),%rax
 511         adc     \$0,%rdx
 512         add     $A[1],$N[1]
 513         adc     \$0,%rdx
 514         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 515         mov     %rdx,$N[0]
 516 
 517         mulq    $m0                     # ap[j]*bp[i]
 518         add     %rax,$A[0]
 519         mov     ($np,$j,8),%rax
 520         adc     \$0,%rdx
 521         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
 522         adc     \$0,%rdx
 523         mov     %rdx,$A[1]
 524 
 525         mulq    $m1                     # np[j]*m1
 526         add     %rax,$N[0]
 527         mov     8($ap,$j,8),%rax
 528         adc     \$0,%rdx
 529         add     $A[0],$N[0]
 530         adc     \$0,%rdx
 531         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
 532         mov     %rdx,$N[1]
 533 
 534         mulq    $m0                     # ap[j]*bp[i]
 535         add     %rax,$A[1]
 536         mov     8($np,$j,8),%rax
 537         adc     \$0,%rdx
 538         add     8(%rsp,$j,8),$A[1]
 539         adc     \$0,%rdx
 540         lea     4($j),$j                # j++
 541         mov     %rdx,$A[0]
 542 
 543         mulq    $m1                     # np[j]*m1
 544         add     %rax,$N[1]
 545         mov     -16($ap,$j,8),%rax
 546         adc     \$0,%rdx
 547         add     $A[1],$N[1]
 548         adc     \$0,%rdx
 549         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
 550         mov     %rdx,$N[0]
 551         cmp     $num,$j
 552         jl      .Linner4x
 553 
 554         mulq    $m0                     # ap[j]*bp[i]
 555         add     %rax,$A[0]
 556         mov     -16($np,$j,8),%rax
 557         adc     \$0,%rdx
 558         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
 559         adc     \$0,%rdx
 560         mov     %rdx,$A[1]
 561 
 562         mulq    $m1                     # np[j]*m1
 563         add     %rax,$N[0]
 564         mov     -8($ap,$j,8),%rax
 565         adc     \$0,%rdx
 566         add     $A[0],$N[0]
 567         adc     \$0,%rdx
 568         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 569         mov     %rdx,$N[1]
 570 
 571         mulq    $m0                     # ap[j]*bp[i]
 572         add     %rax,$A[1]
 573         mov     -8($np,$j,8),%rax
 574         adc     \$0,%rdx
 575         add     -8(%rsp,$j,8),$A[1]
 576         adc     \$0,%rdx
 577         lea     1($i),$i                # i++
 578         mov     %rdx,$A[0]
 579 
 580         mulq    $m1                     # np[j]*m1
 581         add     %rax,$N[1]
 582         mov     ($ap),%rax              # ap[0]
 583         adc     \$0,%rdx
 584         add     $A[1],$N[1]
 585         adc     \$0,%rdx
 586         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 587         mov     %rdx,$N[0]
 588 
 589         xor     $N[1],$N[1]
 590         add     $A[0],$N[0]
 591         adc     \$0,$N[1]
 592         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
 593         adc     \$0,$N[1]
 594         mov     $N[0],-8(%rsp,$j,8)
 595         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
 596 
 597         cmp     $num,$i
 598         jl      .Louter4x
 599 ___
 600 {
 601 my @ri=("%rax","%rdx",$m0,$m1);
 602 $code.=<<___;
 603         mov     16(%rsp,$num,8),$rp     # restore $rp
 604         mov     0(%rsp),@ri[0]          # tp[0]
 605         pxor    %xmm0,%xmm0
 606         mov     8(%rsp),@ri[1]          # tp[1]
 607         shr     \$2,$num                # num/=4
 608         lea     (%rsp),$ap              # borrow ap for tp
 609         xor     $i,$i                   # i=0 and clear CF!
 610 
 611         sub     0($np),@ri[0]
 612         mov     16($ap),@ri[2]          # tp[2]
 613         mov     24($ap),@ri[3]          # tp[3]
 614         sbb     8($np),@ri[1]
 615         lea     -1($num),$j             # j=num/4-1
 616         jmp     .Lsub4x
 617 .align  16
 618 .Lsub4x:
 619         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
 620         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
 621         sbb     16($np,$i,8),@ri[2]
 622         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
 623         mov     40($ap,$i,8),@ri[1]
 624         sbb     24($np,$i,8),@ri[3]
 625         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
 626         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
 627         sbb     32($np,$i,8),@ri[0]
 628         mov     48($ap,$i,8),@ri[2]
 629         mov     56($ap,$i,8),@ri[3]
 630         sbb     40($np,$i,8),@ri[1]
 631         lea     4($i),$i                # i++
 632         dec     $j                      # doesnn't affect CF!
 633         jnz     .Lsub4x
 634 
 635         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
 636         mov     32($ap,$i,8),@ri[0]     # load overflow bit
 637         sbb     16($np,$i,8),@ri[2]
 638         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
 639         sbb     24($np,$i,8),@ri[3]
 640         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
 641 
 642         sbb     \$0,@ri[0]              # handle upmost overflow bit
 643         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
 644         xor     $i,$i                   # i=0
 645         and     @ri[0],$ap
 646         not     @ri[0]
 647         mov     $rp,$np
 648         and     @ri[0],$np
 649         lea     -1($num),$j
 650         or      $np,$ap                 # ap=borrow?tp:rp
 651 
 652         movdqu  ($ap),%xmm1
 653         movdqa  %xmm0,(%rsp)
 654         movdqu  %xmm1,($rp)
 655         jmp     .Lcopy4x
 656 .align  16
 657 .Lcopy4x:                                       # copy or in-place refresh
 658         movdqu  16($ap,$i),%xmm2
 659         movdqu  32($ap,$i),%xmm1
 660         movdqa  %xmm0,16(%rsp,$i)
 661         movdqu  %xmm2,16($rp,$i)
 662         movdqa  %xmm0,32(%rsp,$i)
 663         movdqu  %xmm1,32($rp,$i)
 664         lea     32($i),$i
 665         dec     $j
 666         jnz     .Lcopy4x
 667 
 668         shl     \$2,$num
 669         movdqu  16($ap,$i),%xmm2
 670         movdqa  %xmm0,16(%rsp,$i)
 671         movdqu  %xmm2,16($rp,$i)
 672 ___
 673 }
 674 $code.=<<___;
 675         mov     8(%rsp,$num,8),%rsi     # restore %rsp
 676         mov     \$1,%rax
 677         mov     (%rsi),%r15
 678         mov     8(%rsi),%r14
 679         mov     16(%rsi),%r13
 680         mov     24(%rsi),%r12
 681         mov     32(%rsi),%rbp
 682         mov     40(%rsi),%rbx
 683         lea     48(%rsi),%rsp
 684 .Lmul4x_epilogue:
 685         ret
 686 .size   bn_mul4x_mont,.-bn_mul4x_mont
 687 ___
 688 }}}
 689 {{{
 690 ######################################################################
 691 # void bn_sqr4x_mont(
 692 my $rptr="%rdi";        # const BN_ULONG *rptr,
 693 my $aptr="%rsi";        # const BN_ULONG *aptr,
 694 my $bptr="%rdx";        # not used
 695 my $nptr="%rcx";        # const BN_ULONG *nptr,
 696 my $n0  ="%r8";         # const BN_ULONG *n0);
 697 my $num ="%r9";         # int num, has to be divisible by 4 and
 698                         # not less than 8
 699 
 700 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
 701 my @A0=("%r10","%r11");
 702 my @A1=("%r12","%r13");
 703 my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
 704 
 705 $code.=<<___;
 706 .type   bn_sqr4x_mont,\@function,6
 707 .align  16
 708 bn_sqr4x_mont:
 709 .Lsqr4x_enter:
 710         push    %rbx
 711         push    %rbp
 712         push    %r12
 713         push    %r13
 714         push    %r14
 715         push    %r15
 716 
 717         shl     \$3,${num}d             # convert $num to bytes
 718         xor     %r10,%r10
 719         mov     %rsp,%r11               # put aside %rsp
 720         sub     $num,%r10               # -$num
 721         mov     ($n0),$n0               # *n0
 722         lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
 723         and     \$-1024,%rsp            # minimize TLB usage
 724         ##############################################################
 725         # Stack layout
 726         #
 727         # +0    saved $num, used in reduction section
 728         # +8    &t[2*$num], used in reduction section
 729         # +32   saved $rptr
 730         # +40   saved $nptr
 731         # +48   saved *n0
 732         # +56   saved %rsp
 733         # +64   t[2*$num]
 734         #
 735         mov     $rptr,32(%rsp)          # save $rptr
 736         mov     $nptr,40(%rsp)
 737         mov     $n0,  48(%rsp)
 738         mov     %r11, 56(%rsp)          # save original %rsp
 739 .Lsqr4x_body:
 740         ##############################################################
 741         # Squaring part:
 742         #
 743         # a) multiply-n-add everything but a[i]*a[i];
 744         # b) shift result of a) by 1 to the left and accumulate
 745         #    a[i]*a[i] products;
 746         #
 747         lea     32(%r10),$i             # $i=-($num-32)
 748         lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
 749 
 750         mov     $num,$j                 # $j=$num
 751 
 752                                         # comments apply to $num==8 case
 753         mov     -32($aptr,$i),$a0       # a[0]
 754         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
 755         mov     -24($aptr,$i),%rax      # a[1]
 756         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
 757         mov     -16($aptr,$i),$ai       # a[2]
 758         mov     %rax,$a1
 759 
 760         mul     $a0                     # a[1]*a[0]
 761         mov     %rax,$A0[0]             # a[1]*a[0]
 762          mov    $ai,%rax                # a[2]
 763         mov     %rdx,$A0[1]
 764         mov     $A0[0],-24($tptr,$i)    # t[1]
 765 
 766         xor     $A0[0],$A0[0]
 767         mul     $a0                     # a[2]*a[0]
 768         add     %rax,$A0[1]
 769          mov    $ai,%rax
 770         adc     %rdx,$A0[0]
 771         mov     $A0[1],-16($tptr,$i)    # t[2]
 772 
 773         lea     -16($i),$j              # j=-16
 774 
 775 
 776          mov    8($aptr,$j),$ai         # a[3]
 777         mul     $a1                     # a[2]*a[1]
 778         mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
 779          mov    $ai,%rax
 780         mov     %rdx,$A1[1]
 781 
 782         xor     $A0[1],$A0[1]
 783         add     $A1[0],$A0[0]
 784          lea    16($j),$j
 785         adc     \$0,$A0[1]
 786         mul     $a0                     # a[3]*a[0]
 787         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
 788          mov    $ai,%rax
 789         adc     %rdx,$A0[1]
 790         mov     $A0[0],-8($tptr,$j)     # t[3]
 791         jmp     .Lsqr4x_1st
 792 
 793 .align  16
 794 .Lsqr4x_1st:
 795          mov    ($aptr,$j),$ai          # a[4]
 796         xor     $A1[0],$A1[0]
 797         mul     $a1                     # a[3]*a[1]
 798         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
 799          mov    $ai,%rax
 800         adc     %rdx,$A1[0]
 801 
 802         xor     $A0[0],$A0[0]
 803         add     $A1[1],$A0[1]
 804         adc     \$0,$A0[0]
 805         mul     $a0                     # a[4]*a[0]
 806         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
 807          mov    $ai,%rax                # a[3]
 808         adc     %rdx,$A0[0]
 809         mov     $A0[1],($tptr,$j)       # t[4]
 810 
 811 
 812          mov    8($aptr,$j),$ai         # a[5]
 813         xor     $A1[1],$A1[1]
 814         mul     $a1                     # a[4]*a[3]
 815         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
 816          mov    $ai,%rax
 817         adc     %rdx,$A1[1]
 818 
 819         xor     $A0[1],$A0[1]
 820         add     $A1[0],$A0[0]
 821         adc     \$0,$A0[1]
 822         mul     $a0                     # a[5]*a[2]
 823         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
 824          mov    $ai,%rax
 825         adc     %rdx,$A0[1]
 826         mov     $A0[0],8($tptr,$j)      # t[5]
 827 
 828          mov    16($aptr,$j),$ai        # a[6]
 829         xor     $A1[0],$A1[0]
 830         mul     $a1                     # a[5]*a[3]
 831         add     %rax,$A1[1]             # a[5]*a[3]+t[6]
 832          mov    $ai,%rax
 833         adc     %rdx,$A1[0]
 834 
 835         xor     $A0[0],$A0[0]
 836         add     $A1[1],$A0[1]
 837         adc     \$0,$A0[0]
 838         mul     $a0                     # a[6]*a[2]
 839         add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
 840          mov    $ai,%rax                # a[3]
 841         adc     %rdx,$A0[0]
 842         mov     $A0[1],16($tptr,$j)     # t[6]
 843 
 844 
 845          mov    24($aptr,$j),$ai        # a[7]
 846         xor     $A1[1],$A1[1]
 847         mul     $a1                     # a[6]*a[5]
 848         add     %rax,$A1[0]             # a[6]*a[5]+t[7]
 849          mov    $ai,%rax
 850         adc     %rdx,$A1[1]
 851 
 852         xor     $A0[1],$A0[1]
 853         add     $A1[0],$A0[0]
 854          lea    32($j),$j
 855         adc     \$0,$A0[1]
 856         mul     $a0                     # a[7]*a[4]
 857         add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
 858          mov    $ai,%rax
 859         adc     %rdx,$A0[1]
 860         mov     $A0[0],-8($tptr,$j)     # t[7]
 861 
 862         cmp     \$0,$j
 863         jne     .Lsqr4x_1st
 864 
 865         xor     $A1[0],$A1[0]
 866         add     $A0[1],$A1[1]
 867         adc     \$0,$A1[0]
 868         mul     $a1                     # a[7]*a[5]
 869         add     %rax,$A1[1]
 870         adc     %rdx,$A1[0]
 871 
 872         mov     $A1[1],($tptr)          # t[8]
 873         lea     16($i),$i
 874         mov     $A1[0],8($tptr)         # t[9]
 875         jmp     .Lsqr4x_outer
 876 
 877 .align  16
 878 .Lsqr4x_outer:                          # comments apply to $num==6 case
 879         mov     -32($aptr,$i),$a0       # a[0]
 880         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
 881         mov     -24($aptr,$i),%rax      # a[1]
 882         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
 883         mov     -16($aptr,$i),$ai       # a[2]
 884         mov     %rax,$a1
 885 
 886         mov     -24($tptr,$i),$A0[0]    # t[1]
 887         xor     $A0[1],$A0[1]
 888         mul     $a0                     # a[1]*a[0]
 889         add     %rax,$A0[0]             # a[1]*a[0]+t[1]
 890          mov    $ai,%rax                # a[2]
 891         adc     %rdx,$A0[1]
 892         mov     $A0[0],-24($tptr,$i)    # t[1]
 893 
 894         xor     $A0[0],$A0[0]
 895         add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
 896         adc     \$0,$A0[0]
 897         mul     $a0                     # a[2]*a[0]
 898         add     %rax,$A0[1]
 899          mov    $ai,%rax
 900         adc     %rdx,$A0[0]
 901         mov     $A0[1],-16($tptr,$i)    # t[2]
 902 
 903         lea     -16($i),$j              # j=-16
 904         xor     $A1[0],$A1[0]
 905 
 906 
 907          mov    8($aptr,$j),$ai         # a[3]
 908         xor     $A1[1],$A1[1]
 909         add     8($tptr,$j),$A1[0]
 910         adc     \$0,$A1[1]
 911         mul     $a1                     # a[2]*a[1]
 912         add     %rax,$A1[0]             # a[2]*a[1]+t[3]
 913          mov    $ai,%rax
 914         adc     %rdx,$A1[1]
 915 
 916         xor     $A0[1],$A0[1]
 917         add     $A1[0],$A0[0]
 918         adc     \$0,$A0[1]
 919         mul     $a0                     # a[3]*a[0]
 920         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
 921          mov    $ai,%rax
 922         adc     %rdx,$A0[1]
 923         mov     $A0[0],8($tptr,$j)      # t[3]
 924 
 925         lea     16($j),$j
 926         jmp     .Lsqr4x_inner
 927 
 928 .align  16
 929 .Lsqr4x_inner:
 930          mov    ($aptr,$j),$ai          # a[4]
 931         xor     $A1[0],$A1[0]
 932         add     ($tptr,$j),$A1[1]
 933         adc     \$0,$A1[0]
 934         mul     $a1                     # a[3]*a[1]
 935         add     %rax,$A1[1]             # a[3]*a[1]+t[4]
 936          mov    $ai,%rax
 937         adc     %rdx,$A1[0]
 938 
 939         xor     $A0[0],$A0[0]
 940         add     $A1[1],$A0[1]
 941         adc     \$0,$A0[0]
 942         mul     $a0                     # a[4]*a[0]
 943         add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
 944          mov    $ai,%rax                # a[3]
 945         adc     %rdx,$A0[0]
 946         mov     $A0[1],($tptr,$j)       # t[4]
 947 
 948          mov    8($aptr,$j),$ai         # a[5]
 949         xor     $A1[1],$A1[1]
 950         add     8($tptr,$j),$A1[0]
 951         adc     \$0,$A1[1]
 952         mul     $a1                     # a[4]*a[3]
 953         add     %rax,$A1[0]             # a[4]*a[3]+t[5]
 954          mov    $ai,%rax
 955         adc     %rdx,$A1[1]
 956 
 957         xor     $A0[1],$A0[1]
 958         add     $A1[0],$A0[0]
 959         lea     16($j),$j               # j++
 960         adc     \$0,$A0[1]
 961         mul     $a0                     # a[5]*a[2]
 962         add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
 963          mov    $ai,%rax
 964         adc     %rdx,$A0[1]
 965         mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
 966 
 967         cmp     \$0,$j
 968         jne     .Lsqr4x_inner
 969 
 970         xor     $A1[0],$A1[0]
 971         add     $A0[1],$A1[1]
 972         adc     \$0,$A1[0]
 973         mul     $a1                     # a[5]*a[3]
 974         add     %rax,$A1[1]
 975         adc     %rdx,$A1[0]
 976 
 977         mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
 978         mov     $A1[0],8($tptr)         # t[7], "preloaded t[3]" below
 979 
 980         add     \$16,$i
 981         jnz     .Lsqr4x_outer
 982 
 983                                         # comments apply to $num==4 case
 984         mov     -32($aptr),$a0          # a[0]
 985         lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
 986         mov     -24($aptr),%rax         # a[1]
 987         lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
 988         mov     -16($aptr),$ai          # a[2]
 989         mov     %rax,$a1
 990 
 991         xor     $A0[1],$A0[1]
 992         mul     $a0                     # a[1]*a[0]
 993         add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
 994          mov    $ai,%rax                # a[2]
 995         adc     %rdx,$A0[1]
 996         mov     $A0[0],-24($tptr)       # t[1]
 997 
 998         xor     $A0[0],$A0[0]
 999         add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
1000         adc     \$0,$A0[0]
1001         mul     $a0                     # a[2]*a[0]
1002         add     %rax,$A0[1]
1003          mov    $ai,%rax
1004         adc     %rdx,$A0[0]
1005         mov     $A0[1],-16($tptr)       # t[2]
1006 
1007          mov    -8($aptr),$ai           # a[3]
1008         mul     $a1                     # a[2]*a[1]
1009         add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
1010          mov    $ai,%rax
1011         adc     \$0,%rdx
1012 
1013         xor     $A0[1],$A0[1]
1014         add     $A1[0],$A0[0]
1015          mov    %rdx,$A1[1]
1016         adc     \$0,$A0[1]
1017         mul     $a0                     # a[3]*a[0]
1018         add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
1019          mov    $ai,%rax
1020         adc     %rdx,$A0[1]
1021         mov     $A0[0],-8($tptr)        # t[3]
1022 
1023         xor     $A1[0],$A1[0]
1024         add     $A0[1],$A1[1]
1025         adc     \$0,$A1[0]
1026         mul     $a1                     # a[3]*a[1]
1027         add     %rax,$A1[1]
1028          mov    -16($aptr),%rax         # a[2]
1029         adc     %rdx,$A1[0]
1030 
1031         mov     $A1[1],($tptr)          # t[4]
1032         mov     $A1[0],8($tptr)         # t[5]
1033 
1034         mul     $ai                     # a[2]*a[3]
1035 ___
1036 {
1037 my ($shift,$carry)=($a0,$a1);
1038 my @S=(@A1,$ai,$n0);
1039 $code.=<<___;
1040          add    \$16,$i
1041          xor    $shift,$shift
1042          sub    $num,$i                 # $i=16-$num
1043          xor    $carry,$carry
1044 
1045         add     $A1[0],%rax             # t[5]
1046         adc     \$0,%rdx
1047         mov     %rax,8($tptr)           # t[5]
1048         mov     %rdx,16($tptr)          # t[6]
1049         mov     $carry,24($tptr)        # t[7]
1050 
1051          mov    -16($aptr,$i),%rax      # a[0]
1052         lea     64(%rsp,$num,2),$tptr
1053          xor    $A0[0],$A0[0]           # t[0]
1054          mov    -24($tptr,$i,2),$A0[1]  # t[1]
1055 
1056         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1057         shr     \$63,$A0[0]
1058         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1059         shr     \$63,$A0[1]
1060         or      $A0[0],$S[1]            # | t[2*i]>>63
1061          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1062         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1063         mul     %rax                    # a[i]*a[i]
1064         neg     $carry                  # mov $carry,cf
1065          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1066         adc     %rax,$S[0]
1067          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1068         mov     $S[0],-32($tptr,$i,2)
1069         adc     %rdx,$S[1]
1070 
1071         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1072          mov    $S[1],-24($tptr,$i,2)
1073          sbb    $carry,$carry           # mov cf,$carry
1074         shr     \$63,$A0[0]
1075         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1076         shr     \$63,$A0[1]
1077         or      $A0[0],$S[3]            # | t[2*i]>>63
1078          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1079         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1080         mul     %rax                    # a[i]*a[i]
1081         neg     $carry                  # mov $carry,cf
1082          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1083         adc     %rax,$S[2]
1084          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1085         mov     $S[2],-16($tptr,$i,2)
1086         adc     %rdx,$S[3]
1087         lea     16($i),$i
1088         mov     $S[3],-40($tptr,$i,2)
1089         sbb     $carry,$carry           # mov cf,$carry
1090         jmp     .Lsqr4x_shift_n_add
1091 
1092 .align  16
1093 .Lsqr4x_shift_n_add:
1094         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1095         shr     \$63,$A0[0]
1096         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1097         shr     \$63,$A0[1]
1098         or      $A0[0],$S[1]            # | t[2*i]>>63
1099          mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
1100         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1101         mul     %rax                    # a[i]*a[i]
1102         neg     $carry                  # mov $carry,cf
1103          mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1104         adc     %rax,$S[0]
1105          mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
1106         mov     $S[0],-32($tptr,$i,2)
1107         adc     %rdx,$S[1]
1108 
1109         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1110          mov    $S[1],-24($tptr,$i,2)
1111          sbb    $carry,$carry           # mov cf,$carry
1112         shr     \$63,$A0[0]
1113         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1114         shr     \$63,$A0[1]
1115         or      $A0[0],$S[3]            # | t[2*i]>>63
1116          mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
1117         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1118         mul     %rax                    # a[i]*a[i]
1119         neg     $carry                  # mov $carry,cf
1120          mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
1121         adc     %rax,$S[2]
1122          mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
1123         mov     $S[2],-16($tptr,$i,2)
1124         adc     %rdx,$S[3]
1125 
1126         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1127          mov    $S[3],-8($tptr,$i,2)
1128          sbb    $carry,$carry           # mov cf,$carry
1129         shr     \$63,$A0[0]
1130         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1131         shr     \$63,$A0[1]
1132         or      $A0[0],$S[1]            # | t[2*i]>>63
1133          mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1134         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1135         mul     %rax                    # a[i]*a[i]
1136         neg     $carry                  # mov $carry,cf
1137          mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1138         adc     %rax,$S[0]
1139          mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
1140         mov     $S[0],0($tptr,$i,2)
1141         adc     %rdx,$S[1]
1142 
1143         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1144          mov    $S[1],8($tptr,$i,2)
1145          sbb    $carry,$carry           # mov cf,$carry
1146         shr     \$63,$A0[0]
1147         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1148         shr     \$63,$A0[1]
1149         or      $A0[0],$S[3]            # | t[2*i]>>63
1150          mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
1151         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1152         mul     %rax                    # a[i]*a[i]
1153         neg     $carry                  # mov $carry,cf
1154          mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
1155         adc     %rax,$S[2]
1156          mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
1157         mov     $S[2],16($tptr,$i,2)
1158         adc     %rdx,$S[3]
1159         mov     $S[3],24($tptr,$i,2)
1160         sbb     $carry,$carry           # mov cf,$carry
1161         add     \$32,$i
1162         jnz     .Lsqr4x_shift_n_add
1163 
1164         lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1165         shr     \$63,$A0[0]
1166         lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
1167         shr     \$63,$A0[1]
1168         or      $A0[0],$S[1]            # | t[2*i]>>63
1169          mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
1170         mov     $A0[1],$shift           # shift=t[2*i+1]>>63
1171         mul     %rax                    # a[i]*a[i]
1172         neg     $carry                  # mov $carry,cf
1173          mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
1174         adc     %rax,$S[0]
1175          mov    -8($aptr),%rax          # a[i+1]        # prefetch
1176         mov     $S[0],-32($tptr)
1177         adc     %rdx,$S[1]
1178 
1179         lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1180          mov    $S[1],-24($tptr)
1181          sbb    $carry,$carry           # mov cf,$carry
1182         shr     \$63,$A0[0]
1183         lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
1184         shr     \$63,$A0[1]
1185         or      $A0[0],$S[3]            # | t[2*i]>>63
1186         mul     %rax                    # a[i]*a[i]
1187         neg     $carry                  # mov $carry,cf
1188         adc     %rax,$S[2]
1189         adc     %rdx,$S[3]
1190         mov     $S[2],-16($tptr)
1191         mov     $S[3],-8($tptr)
1192 ___
1193 }
1194 ##############################################################
1195 # Montgomery reduction part, "word-by-word" algorithm.
1196 #
1197 {
1198 my ($topbit,$nptr)=("%rbp",$aptr);
1199 my ($m0,$m1)=($a0,$a1);
1200 my @Ni=("%rbx","%r9");
1201 $code.=<<___;
1202         mov     40(%rsp),$nptr          # restore $nptr
1203         mov     48(%rsp),$n0            # restore *n0
1204         xor     $j,$j
1205         mov     $num,0(%rsp)            # save $num
1206         sub     $num,$j                 # $j=-$num
1207          mov    64(%rsp),$A0[0]         # t[0]          # modsched #
1208          mov    $n0,$m0                 #               # modsched #
1209         lea     64(%rsp,$num,2),%rax    # end of t[] buffer
1210         lea     64(%rsp,$num),$tptr     # end of t[] window
1211         mov     %rax,8(%rsp)            # save end of t[] buffer
1212         lea     ($nptr,$num),$nptr      # end of n[] buffer
1213         xor     $topbit,$topbit         # $topbit=0
1214 
1215         mov     0($nptr,$j),%rax        # n[0]          # modsched #
1216         mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1217          imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
1218          mov    %rax,$Ni[0]             #               # modsched #
1219         jmp     .Lsqr4x_mont_outer
1220 
1221 .align  16
1222 .Lsqr4x_mont_outer:
1223         xor     $A0[1],$A0[1]
1224         mul     $m0                     # n[0]*m0
1225         add     %rax,$A0[0]             # n[0]*m0+t[0]
1226          mov    $Ni[1],%rax
1227         adc     %rdx,$A0[1]
1228         mov     $n0,$m1
1229 
1230         xor     $A0[0],$A0[0]
1231         add     8($tptr,$j),$A0[1]
1232         adc     \$0,$A0[0]
1233         mul     $m0                     # n[1]*m0
1234         add     %rax,$A0[1]             # n[1]*m0+t[1]
1235          mov    $Ni[0],%rax
1236         adc     %rdx,$A0[0]
1237 
1238         imulq   $A0[1],$m1
1239 
1240         mov     16($nptr,$j),$Ni[0]     # n[2]
1241         xor     $A1[1],$A1[1]
1242         add     $A0[1],$A1[0]
1243         adc     \$0,$A1[1]
1244         mul     $m1                     # n[0]*m1
1245         add     %rax,$A1[0]             # n[0]*m1+"t[1]"
1246          mov    $Ni[0],%rax
1247         adc     %rdx,$A1[1]
1248         mov     $A1[0],8($tptr,$j)      # "t[1]"
1249 
1250         xor     $A0[1],$A0[1]
1251         add     16($tptr,$j),$A0[0]
1252         adc     \$0,$A0[1]
1253         mul     $m0                     # n[2]*m0
1254         add     %rax,$A0[0]             # n[2]*m0+t[2]
1255          mov    $Ni[1],%rax
1256         adc     %rdx,$A0[1]
1257 
1258         mov     24($nptr,$j),$Ni[1]     # n[3]
1259         xor     $A1[0],$A1[0]
1260         add     $A0[0],$A1[1]
1261         adc     \$0,$A1[0]
1262         mul     $m1                     # n[1]*m1
1263         add     %rax,$A1[1]             # n[1]*m1+"t[2]"
1264          mov    $Ni[1],%rax
1265         adc     %rdx,$A1[0]
1266         mov     $A1[1],16($tptr,$j)     # "t[2]"
1267 
1268         xor     $A0[0],$A0[0]
1269         add     24($tptr,$j),$A0[1]
1270         lea     32($j),$j
1271         adc     \$0,$A0[0]
1272         mul     $m0                     # n[3]*m0
1273         add     %rax,$A0[1]             # n[3]*m0+t[3]
1274          mov    $Ni[0],%rax
1275         adc     %rdx,$A0[0]
1276         jmp     .Lsqr4x_mont_inner
1277 
1278 .align  16
1279 .Lsqr4x_mont_inner:
1280         mov     ($nptr,$j),$Ni[0]       # n[4]
1281         xor     $A1[1],$A1[1]
1282         add     $A0[1],$A1[0]
1283         adc     \$0,$A1[1]
1284         mul     $m1                     # n[2]*m1
1285         add     %rax,$A1[0]             # n[2]*m1+"t[3]"
1286          mov    $Ni[0],%rax
1287         adc     %rdx,$A1[1]
1288         mov     $A1[0],-8($tptr,$j)     # "t[3]"
1289 
1290         xor     $A0[1],$A0[1]
1291         add     ($tptr,$j),$A0[0]
1292         adc     \$0,$A0[1]
1293         mul     $m0                     # n[4]*m0
1294         add     %rax,$A0[0]             # n[4]*m0+t[4]
1295          mov    $Ni[1],%rax
1296         adc     %rdx,$A0[1]
1297 
1298         mov     8($nptr,$j),$Ni[1]      # n[5]
1299         xor     $A1[0],$A1[0]
1300         add     $A0[0],$A1[1]
1301         adc     \$0,$A1[0]
1302         mul     $m1                     # n[3]*m1
1303         add     %rax,$A1[1]             # n[3]*m1+"t[4]"
1304          mov    $Ni[1],%rax
1305         adc     %rdx,$A1[0]
1306         mov     $A1[1],($tptr,$j)       # "t[4]"
1307 
1308         xor     $A0[0],$A0[0]
1309         add     8($tptr,$j),$A0[1]
1310         adc     \$0,$A0[0]
1311         mul     $m0                     # n[5]*m0
1312         add     %rax,$A0[1]             # n[5]*m0+t[5]
1313          mov    $Ni[0],%rax
1314         adc     %rdx,$A0[0]
1315 
1316 
1317         mov     16($nptr,$j),$Ni[0]     # n[6]
1318         xor     $A1[1],$A1[1]
1319         add     $A0[1],$A1[0]
1320         adc     \$0,$A1[1]
1321         mul     $m1                     # n[4]*m1
1322         add     %rax,$A1[0]             # n[4]*m1+"t[5]"
1323          mov    $Ni[0],%rax
1324         adc     %rdx,$A1[1]
1325         mov     $A1[0],8($tptr,$j)      # "t[5]"
1326 
1327         xor     $A0[1],$A0[1]
1328         add     16($tptr,$j),$A0[0]
1329         adc     \$0,$A0[1]
1330         mul     $m0                     # n[6]*m0
1331         add     %rax,$A0[0]             # n[6]*m0+t[6]
1332          mov    $Ni[1],%rax
1333         adc     %rdx,$A0[1]
1334 
1335         mov     24($nptr,$j),$Ni[1]     # n[7]
1336         xor     $A1[0],$A1[0]
1337         add     $A0[0],$A1[1]
1338         adc     \$0,$A1[0]
1339         mul     $m1                     # n[5]*m1
1340         add     %rax,$A1[1]             # n[5]*m1+"t[6]"
1341          mov    $Ni[1],%rax
1342         adc     %rdx,$A1[0]
1343         mov     $A1[1],16($tptr,$j)     # "t[6]"
1344 
1345         xor     $A0[0],$A0[0]
1346         add     24($tptr,$j),$A0[1]
1347         lea     32($j),$j
1348         adc     \$0,$A0[0]
1349         mul     $m0                     # n[7]*m0
1350         add     %rax,$A0[1]             # n[7]*m0+t[7]
1351          mov    $Ni[0],%rax
1352         adc     %rdx,$A0[0]
1353         cmp     \$0,$j
1354         jne     .Lsqr4x_mont_inner
1355 
1356          sub    0(%rsp),$j              # $j=-$num      # modsched #
1357          mov    $n0,$m0                 #               # modsched #
1358 
1359         xor     $A1[1],$A1[1]
1360         add     $A0[1],$A1[0]
1361         adc     \$0,$A1[1]
1362         mul     $m1                     # n[6]*m1
1363         add     %rax,$A1[0]             # n[6]*m1+"t[7]"
1364         mov     $Ni[1],%rax
1365         adc     %rdx,$A1[1]
1366         mov     $A1[0],-8($tptr)        # "t[7]"
1367 
1368         xor     $A0[1],$A0[1]
1369         add     ($tptr),$A0[0]          # +t[8]
1370         adc     \$0,$A0[1]
1371          mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
1372         add     $topbit,$A0[0]
1373         adc     \$0,$A0[1]
1374 
1375          imulq  16($tptr,$j),$m0        # m0=t[0]*n0    # modsched #
1376         xor     $A1[0],$A1[0]
1377          mov    8($nptr,$j),$Ni[1]      # n[1]          # modsched #
1378         add     $A0[0],$A1[1]
1379          mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
1380         adc     \$0,$A1[0]
1381         mul     $m1                     # n[7]*m1
1382         add     %rax,$A1[1]             # n[7]*m1+"t[8]"
1383          mov    $Ni[0],%rax             #               # modsched #
1384         adc     %rdx,$A1[0]
1385         mov     $A1[1],($tptr)          # "t[8]"
1386 
1387         xor     $topbit,$topbit
1388         add     8($tptr),$A1[0]         # +t[9]
1389         adc     $topbit,$topbit
1390         add     $A0[1],$A1[0]
1391         lea     16($tptr),$tptr         # "t[$num]>>128"
1392         adc     \$0,$topbit
1393         mov     $A1[0],-8($tptr)        # "t[9]"
1394         cmp     8(%rsp),$tptr           # are we done?
1395         jb      .Lsqr4x_mont_outer
1396 
1397         mov     0(%rsp),$num            # restore $num
1398         mov     $topbit,($tptr)         # save $topbit
1399 ___
1400 }
1401 ##############################################################
1402 # Post-condition, 4x unrolled copy from bn_mul_mont
1403 #
1404 {
1405 my ($tptr,$nptr)=("%rbx",$aptr);
1406 my @ri=("%rax","%rdx","%r10","%r11");
1407 $code.=<<___;
1408         mov     64(%rsp,$num),@ri[0]    # tp[0]
1409         lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
1410         mov     40(%rsp),$nptr          # restore $nptr
1411         shr     \$5,$num                # num/4
1412         mov     8($tptr),@ri[1]         # t[1]
1413         xor     $i,$i                   # i=0 and clear CF!
1414 
1415         mov     32(%rsp),$rptr          # restore $rptr
1416         sub     0($nptr),@ri[0]
1417         mov     16($tptr),@ri[2]        # t[2]
1418         mov     24($tptr),@ri[3]        # t[3]
1419         sbb     8($nptr),@ri[1]
1420         lea     -1($num),$j             # j=num/4-1
1421         jmp     .Lsqr4x_sub
1422 .align  16
1423 .Lsqr4x_sub:
1424         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1425         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1426         sbb     16($nptr,$i,8),@ri[2]
1427         mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
1428         mov     40($tptr,$i,8),@ri[1]
1429         sbb     24($nptr,$i,8),@ri[3]
1430         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1431         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1432         sbb     32($nptr,$i,8),@ri[0]
1433         mov     48($tptr,$i,8),@ri[2]
1434         mov     56($tptr,$i,8),@ri[3]
1435         sbb     40($nptr,$i,8),@ri[1]
1436         lea     4($i),$i                # i++
1437         dec     $j                      # doesn't affect CF!
1438         jnz     .Lsqr4x_sub
1439 
1440         mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1441         mov     32($tptr,$i,8),@ri[0]   # load overflow bit
1442         sbb     16($nptr,$i,8),@ri[2]
1443         mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
1444         sbb     24($nptr,$i,8),@ri[3]
1445         mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1446 
1447         sbb     \$0,@ri[0]              # handle upmost overflow bit
1448         mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
1449         xor     $i,$i                   # i=0
1450         and     @ri[0],$tptr
1451         not     @ri[0]
1452         mov     $rptr,$nptr
1453         and     @ri[0],$nptr
1454         lea     -1($num),$j
1455         or      $nptr,$tptr             # tp=borrow?tp:rp
1456 
1457         pxor    %xmm0,%xmm0
1458         lea     64(%rsp,$num,8),$nptr
1459         movdqu  ($tptr),%xmm1
1460         lea     ($nptr,$num,8),$nptr
1461         movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
1462         movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
1463         movdqu  %xmm1,($rptr)
1464         jmp     .Lsqr4x_copy
1465 .align  16
1466 .Lsqr4x_copy:                           # copy or in-place refresh
1467         movdqu  16($tptr,$i),%xmm2
1468         movdqu  32($tptr,$i),%xmm1
1469         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1470         movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
1471         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1472         movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
1473         movdqu  %xmm2,16($rptr,$i)
1474         movdqu  %xmm1,32($rptr,$i)
1475         lea     32($i),$i
1476         dec     $j
1477         jnz     .Lsqr4x_copy
1478 
1479         movdqu  16($tptr,$i),%xmm2
1480         movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
1481         movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
1482         movdqu  %xmm2,16($rptr,$i)
1483 ___
1484 }
1485 $code.=<<___;
1486         mov     56(%rsp),%rsi           # restore %rsp
1487         mov     \$1,%rax
1488         mov     0(%rsi),%r15
1489         mov     8(%rsi),%r14
1490         mov     16(%rsi),%r13
1491         mov     24(%rsi),%r12
1492         mov     32(%rsi),%rbp
1493         mov     40(%rsi),%rbx
1494         lea     48(%rsi),%rsp
1495 .Lsqr4x_epilogue:
1496         ret
1497 .size   bn_sqr4x_mont,.-bn_sqr4x_mont
1498 ___
1499 }}}
1500 $code.=<<___;
1501 .asciz  "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1502 .align  16
1503 ___
1504 
1505 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1506 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1507 if ($win64) {
1508 $rec="%rcx";
1509 $frame="%rdx";
1510 $context="%r8";
1511 $disp="%r9";
1512 
1513 $code.=<<___;
1514 .extern __imp_RtlVirtualUnwind
1515 .type   mul_handler,\@abi-omnipotent
1516 .align  16
1517 mul_handler:
1518         push    %rsi
1519         push    %rdi
1520         push    %rbx
1521         push    %rbp
1522         push    %r12
1523         push    %r13
1524         push    %r14
1525         push    %r15
1526         pushfq
1527         sub     \$64,%rsp
1528 
1529         mov     120($context),%rax      # pull context->Rax
1530         mov     248($context),%rbx      # pull context->Rip
1531 
1532         mov     8($disp),%rsi           # disp->ImageBase
1533         mov     56($disp),%r11          # disp->HandlerData
1534 
1535         mov     0(%r11),%r10d           # HandlerData[0]
1536         lea     (%rsi,%r10),%r10        # end of prologue label
1537         cmp     %r10,%rbx               # context->Rip<end of prologue label
1538         jb      .Lcommon_seh_tail
1539 
1540         mov     152($context),%rax      # pull context->Rsp
1541 
1542         mov     4(%r11),%r10d           # HandlerData[1]
1543         lea     (%rsi,%r10),%r10        # epilogue label
1544         cmp     %r10,%rbx               # context->Rip>=epilogue label
1545         jae     .Lcommon_seh_tail
1546 
1547         mov     192($context),%r10      # pull $num
1548         mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
1549         lea     48(%rax),%rax
1550 
1551         mov     -8(%rax),%rbx
1552         mov     -16(%rax),%rbp
1553         mov     -24(%rax),%r12
1554         mov     -32(%rax),%r13
1555         mov     -40(%rax),%r14
1556         mov     -48(%rax),%r15
1557         mov     %rbx,144($context)      # restore context->Rbx
1558         mov     %rbp,160($context)      # restore context->Rbp
1559         mov     %r12,216($context)      # restore context->R12
1560         mov     %r13,224($context)      # restore context->R13
1561         mov     %r14,232($context)      # restore context->R14
1562         mov     %r15,240($context)      # restore context->R15
1563 
1564         jmp     .Lcommon_seh_tail
1565 .size   mul_handler,.-mul_handler
1566 
1567 .type   sqr_handler,\@abi-omnipotent
1568 .align  16
1569 sqr_handler:
1570         push    %rsi
1571         push    %rdi
1572         push    %rbx
1573         push    %rbp
1574         push    %r12
1575         push    %r13
1576         push    %r14
1577         push    %r15
1578         pushfq
1579         sub     \$64,%rsp
1580 
1581         mov     120($context),%rax      # pull context->Rax
1582         mov     248($context),%rbx      # pull context->Rip
1583 
1584         lea     .Lsqr4x_body(%rip),%r10
1585         cmp     %r10,%rbx               # context->Rip<.Lsqr_body
1586         jb      .Lcommon_seh_tail
1587 
1588         mov     152($context),%rax      # pull context->Rsp
1589 
1590         lea     .Lsqr4x_epilogue(%rip),%r10
1591         cmp     %r10,%rbx               # context->Rip>=.Lsqr_epilogue
1592         jae     .Lcommon_seh_tail
1593 
1594         mov     56(%rax),%rax           # pull saved stack pointer
1595         lea     48(%rax),%rax
1596 
1597         mov     -8(%rax),%rbx
1598         mov     -16(%rax),%rbp
1599         mov     -24(%rax),%r12
1600         mov     -32(%rax),%r13
1601         mov     -40(%rax),%r14
1602         mov     -48(%rax),%r15
1603         mov     %rbx,144($context)      # restore context->Rbx
1604         mov     %rbp,160($context)      # restore context->Rbp
1605         mov     %r12,216($context)      # restore context->R12
1606         mov     %r13,224($context)      # restore context->R13
1607         mov     %r14,232($context)      # restore context->R14
1608         mov     %r15,240($context)      # restore context->R15
1609 
1610 .Lcommon_seh_tail:
1611         mov     8(%rax),%rdi
1612         mov     16(%rax),%rsi
1613         mov     %rax,152($context)      # restore context->Rsp
1614         mov     %rsi,168($context)      # restore context->Rsi
1615         mov     %rdi,176($context)      # restore context->Rdi
1616 
1617         mov     40($disp),%rdi          # disp->ContextRecord
1618         mov     $context,%rsi           # context
1619         mov     \$154,%ecx              # sizeof(CONTEXT)
1620         .long   0xa548f3fc              # cld; rep movsq
1621 
1622         mov     $disp,%rsi
1623         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1624         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1625         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1626         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1627         mov     40(%rsi),%r10           # disp->ContextRecord
1628         lea     56(%rsi),%r11           # &disp->HandlerData
1629         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1630         mov     %r10,32(%rsp)           # arg5
1631         mov     %r11,40(%rsp)           # arg6
1632         mov     %r12,48(%rsp)           # arg7
1633         mov     %rcx,56(%rsp)           # arg8, (NULL)
1634         call    *__imp_RtlVirtualUnwind(%rip)
1635 
1636         mov     \$1,%eax                # ExceptionContinueSearch
1637         add     \$64,%rsp
1638         popfq
1639         pop     %r15
1640         pop     %r14
1641         pop     %r13
1642         pop     %r12
1643         pop     %rbp
1644         pop     %rbx
1645         pop     %rdi
1646         pop     %rsi
1647         ret
1648 .size   sqr_handler,.-sqr_handler
1649 
1650 .section        .pdata
1651 .align  4
1652         .rva    .LSEH_begin_bn_mul_mont
1653         .rva    .LSEH_end_bn_mul_mont
1654         .rva    .LSEH_info_bn_mul_mont
1655 
1656         .rva    .LSEH_begin_bn_mul4x_mont
1657         .rva    .LSEH_end_bn_mul4x_mont
1658         .rva    .LSEH_info_bn_mul4x_mont
1659 
1660         .rva    .LSEH_begin_bn_sqr4x_mont
1661         .rva    .LSEH_end_bn_sqr4x_mont
1662         .rva    .LSEH_info_bn_sqr4x_mont
1663 
1664 .section        .xdata
1665 .align  8
1666 .LSEH_info_bn_mul_mont:
1667         .byte   9,0,0,0
1668         .rva    mul_handler
1669         .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
1670 .LSEH_info_bn_mul4x_mont:
1671         .byte   9,0,0,0
1672         .rva    mul_handler
1673         .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
1674 .LSEH_info_bn_sqr4x_mont:
1675         .byte   9,0,0,0
1676         .rva    sqr_handler
1677 ___
1678 }
1679 
1680 print $code;
1681 close STDOUT;