1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 
  10 # August 2011.
  11 #
  12 # Companion to x86_64-mont.pl that optimizes cache-timing attack
  13 # countermeasures. The subroutines are produced by replacing bp[i]
  14 # references in their x86_64-mont.pl counterparts with cache-neutral
  15 # references to powers table computed in BN_mod_exp_mont_consttime.
  16 # In addition subroutine that scatters elements of the powers table
  17 # is implemented, so that scatter-/gathering can be tuned without
  18 # bn_exp.c modifications.
  19 
  20 $flavour = shift;
  21 $output  = shift;
  22 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  23 
  24 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  25 
  26 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  27 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  28 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  29 die "can't locate x86_64-xlate.pl";
  30 
  31 open OUT,"| \"$^X\" $xlate $flavour $output";
  32 *STDOUT=*OUT;
  33 
  34 # int bn_mul_mont_gather5(
  35 $rp="%rdi";     # BN_ULONG *rp,
  36 $ap="%rsi";     # const BN_ULONG *ap,
  37 $bp="%rdx";     # const BN_ULONG *bp,
  38 $np="%rcx";     # const BN_ULONG *np,
  39 $n0="%r8";      # const BN_ULONG *n0,
  40 $num="%r9";     # int num,
  41                 # int idx);     # 0 to 2^5-1, "index" in $bp holding
  42                                 # pre-computed powers of a', interlaced
  43                                 # in such manner that b[0] is $bp[idx],
  44                                 # b[1] is [2^5+idx], etc.
  45 $lo0="%r10";
  46 $hi0="%r11";
  47 $hi1="%r13";
  48 $i="%r14";
  49 $j="%r15";
  50 $m0="%rbx";
  51 $m1="%rbp";
  52 
# Generate bn_mul_mont_gather5: scalar (one limb per iteration)
# Montgomery multiplication with a cache-neutral SSE2 gather of bp[i]
# from the interlaced powers table.  num%4==0 && num>=8 is dispatched
# to the 4x-unrolled variant below.
$code=<<___;
.text

.globl  bn_mul_mont_gather5
.type   bn_mul_mont_gather5,\@function,6
.align  64
bn_mul_mont_gather5:
        test    \$3,${num}d
        jnz     .Lmul_enter
        cmp     \$8,${num}d
        jb      .Lmul_enter
        jmp     .Lmul4x_enter

.align  16
.Lmul_enter:
        mov     ${num}d,${num}d
        mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
___
# Win64 ABI: %xmm6/%xmm7 are callee-saved, so spill them around the
# SSE2 gather code.
$code.=<<___ if ($win64);
        lea     -0x28(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
.Lmul_alloca:
___
$code.=<<___;
        mov     %rsp,%rax
        lea     2($num),%r11
        neg     %r11
        lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+2))
        and     \$-1024,%rsp            # minimize TLB usage

        mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
.Lmul_body:
        mov     $bp,%r12                # reassign $bp
___
                # From here on $bp refers to %r12; the original %rdx is
                # clobbered by every mulq below.
                $bp="%r12";
                $STRIDE=2**5*8;         # 5 is "window size"
                $N=$STRIDE/4;           # should match cache line size
# Split the 7th argument (idx) into cache-line number (%r10) and
# position within the line (%r11), then load four 64-bit masks that
# select the wanted quarter-line without a data-dependent load address.
$code.=<<___;
        mov     %r10,%r11
        shr     \$`log($N/8)/log(2)`,%r10
        and     \$`$N/8-1`,%r11
        not     %r10
        lea     .Lmagic_masks(%rip),%rax
        and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
        lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
        movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
        movq    8(%rax,%r10,8),%xmm5    # cache line contains element
        movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
        movq    24(%rax,%r10,8),%xmm7

        movq    `0*$STRIDE/4-96`($bp),%xmm0
        movq    `1*$STRIDE/4-96`($bp),%xmm1
        pand    %xmm4,%xmm0
        movq    `2*$STRIDE/4-96`($bp),%xmm2
        pand    %xmm5,%xmm1
        movq    `3*$STRIDE/4-96`($bp),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3
        por     %xmm2,%xmm0
        lea     $STRIDE($bp),$bp
        por     %xmm3,%xmm0

        movq    %xmm0,$m0               # m0=bp[0]

        mov     ($n0),$n0               # pull n0[0] value
        mov     ($ap),%rax

        xor     $i,$i                   # i=0
        xor     $j,$j                   # j=0

        movq    `0*$STRIDE/4-96`($bp),%xmm0
        movq    `1*$STRIDE/4-96`($bp),%xmm1
        pand    %xmm4,%xmm0
        movq    `2*$STRIDE/4-96`($bp),%xmm2
        pand    %xmm5,%xmm1

        mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[0]
        mov     %rax,$lo0
        mov     ($np),%rax

        movq    `3*$STRIDE/4-96`($bp),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   $lo0,$m1                # "tp[0]"*n0
        mov     %rdx,$hi0

        por     %xmm2,%xmm0
        lea     $STRIDE($bp),$bp
        por     %xmm3,%xmm0

        mulq    $m1                     # np[0]*m1
        add     %rax,$lo0               # discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$hi1

        lea     1($j),$j                # j++
        jmp     .L1st_enter

.align  16
.L1st:
        add     %rax,$hi1
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        mov     $lo0,$hi0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

.L1st_enter:
        mulq    $m0                     # ap[j]*bp[0]
        add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        lea     1($j),$j                # j++
        mov     %rdx,$lo0

        mulq    $m1                     # np[j]*m1
        cmp     $num,$j
        jne     .L1st

        movq    %xmm0,$m0               # bp[1]

        add     %rax,$hi1
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
        mov     $lo0,$hi0

        xor     %rdx,%rdx
        add     $hi0,$hi1
        adc     \$0,%rdx
        mov     $hi1,-8(%rsp,$num,8)
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit

        lea     1($i),$i                # i++
        jmp     .Louter
.align  16
.Louter:
        xor     $j,$j                   # j=0
        mov     $n0,$m1
        mov     (%rsp),$lo0

        movq    `0*$STRIDE/4-96`($bp),%xmm0
        movq    `1*$STRIDE/4-96`($bp),%xmm1
        pand    %xmm4,%xmm0
        movq    `2*$STRIDE/4-96`($bp),%xmm2
        pand    %xmm5,%xmm1

        mulq    $m0                     # ap[0]*bp[i]
        add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
        mov     ($np),%rax
        adc     \$0,%rdx

        movq    `3*$STRIDE/4-96`($bp),%xmm3
        pand    %xmm6,%xmm2
        por     %xmm1,%xmm0
        pand    %xmm7,%xmm3

        imulq   $lo0,$m1                # tp[0]*n0
        mov     %rdx,$hi0

        por     %xmm2,%xmm0
        lea     $STRIDE($bp),$bp
        por     %xmm3,%xmm0

        mulq    $m1                     # np[0]*m1
        add     %rax,$lo0               # discarded
        mov     8($ap),%rax
        adc     \$0,%rdx
        mov     8(%rsp),$lo0            # tp[1]
        mov     %rdx,$hi1

        lea     1($j),$j                # j++
        jmp     .Linner_enter

.align  16
.Linner:
        add     %rax,$hi1
        mov     ($ap,$j,8),%rax
        adc     \$0,%rdx
        add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

.Linner_enter:
        mulq    $m0                     # ap[j]*bp[i]
        add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
        add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
        mov     %rdx,$hi0
        adc     \$0,$hi0
        lea     1($j),$j                # j++

        mulq    $m1                     # np[j]*m1
        cmp     $num,$j
        jne     .Linner

        movq    %xmm0,$m0               # bp[i+1]

        add     %rax,$hi1
        mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
        adc     \$0,%rdx
        mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1

        xor     %rdx,%rdx
        add     $hi0,$hi1
        adc     \$0,%rdx
        add     $lo0,$hi1               # pull upmost overflow bit
        adc     \$0,%rdx
        mov     $hi1,-8(%rsp,$num,8)
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit

        lea     1($i),$i                # i++
        cmp     $num,$i
        jl      .Louter

        xor     $i,$i                   # i=0 and clear CF!
        mov     (%rsp),%rax             # tp[0]
        lea     (%rsp),$ap              # borrow ap for tp
        mov     $num,$j                 # j=num
        jmp     .Lsub
.align  16
.Lsub:  sbb     ($np,$i,8),%rax
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
        mov     8($ap,$i,8),%rax        # tp[i+1]
        lea     1($i),$i                # i++
        dec     $j                      # doesnn't affect CF!
        jnz     .Lsub

        sbb     \$0,%rax                # handle upmost overflow bit
        xor     $i,$i
        and     %rax,$ap
        not     %rax
        mov     $rp,$np
        and     %rax,$np
        mov     $num,$j                 # j=num
        or      $np,$ap                 # ap=borrow?tp:rp
.align  16
.Lcopy:                                 # copy or in-place refresh
        mov     ($ap,$i,8),%rax
        mov     $i,(%rsp,$i,8)          # zap temporary vector
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
        lea     1($i),$i
        sub     \$1,$j
        jnz     .Lcopy

        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
___
# Win64 epilogue: restore the spilled xmm registers before the GPRs.
$code.=<<___ if ($win64);
        movaps  (%rsi),%xmm6
        movaps  0x10(%rsi),%xmm7
        lea     0x28(%rsi),%rsi
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lmul_epilogue:
        ret
.size   bn_mul_mont_gather5,.-bn_mul_mont_gather5
___
 342 {{{
 343 my @A=("%r10","%r11");
 344 my @N=("%r13","%rdi");
 345 $code.=<<___;
 346 .type   bn_mul4x_mont_gather5,\@function,6
 347 .align  16
 348 bn_mul4x_mont_gather5:
 349 .Lmul4x_enter:
 350         mov     ${num}d,${num}d
 351         mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
 352         push    %rbx
 353         push    %rbp
 354         push    %r12
 355         push    %r13
 356         push    %r14
 357         push    %r15
 358 ___
 359 $code.=<<___ if ($win64);
 360         lea     -0x28(%rsp),%rsp
 361         movaps  %xmm6,(%rsp)
 362         movaps  %xmm7,0x10(%rsp)
 363 .Lmul4x_alloca:
 364 ___
 365 $code.=<<___;
 366         mov     %rsp,%rax
 367         lea     4($num),%r11
 368         neg     %r11
 369         lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+4))
 370         and     \$-1024,%rsp            # minimize TLB usage
 371 
 372         mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
 373 .Lmul4x_body:
 374         mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
 375         mov     %rdx,%r12               # reassign $bp
 376 ___
 377                 $bp="%r12";
 378                 $STRIDE=2**5*8;         # 5 is "window size"
 379                 $N=$STRIDE/4;           # should match cache line size
 380 $code.=<<___;
 381         mov     %r10,%r11
 382         shr     \$`log($N/8)/log(2)`,%r10
 383         and     \$`$N/8-1`,%r11
 384         not     %r10
 385         lea     .Lmagic_masks(%rip),%rax
 386         and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
 387         lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
 388         movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
 389         movq    8(%rax,%r10,8),%xmm5    # cache line contains element
 390         movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
 391         movq    24(%rax,%r10,8),%xmm7
 392 
 393         movq    `0*$STRIDE/4-96`($bp),%xmm0
 394         movq    `1*$STRIDE/4-96`($bp),%xmm1
 395         pand    %xmm4,%xmm0
 396         movq    `2*$STRIDE/4-96`($bp),%xmm2
 397         pand    %xmm5,%xmm1
 398         movq    `3*$STRIDE/4-96`($bp),%xmm3
 399         pand    %xmm6,%xmm2
 400         por     %xmm1,%xmm0
 401         pand    %xmm7,%xmm3
 402         por     %xmm2,%xmm0
 403         lea     $STRIDE($bp),$bp
 404         por     %xmm3,%xmm0
 405 
 406         movq    %xmm0,$m0               # m0=bp[0]
 407         mov     ($n0),$n0               # pull n0[0] value
 408         mov     ($ap),%rax
 409 
 410         xor     $i,$i                   # i=0
 411         xor     $j,$j                   # j=0
 412 
 413         movq    `0*$STRIDE/4-96`($bp),%xmm0
 414         movq    `1*$STRIDE/4-96`($bp),%xmm1
 415         pand    %xmm4,%xmm0
 416         movq    `2*$STRIDE/4-96`($bp),%xmm2
 417         pand    %xmm5,%xmm1
 418 
 419         mov     $n0,$m1
 420         mulq    $m0                     # ap[0]*bp[0]
 421         mov     %rax,$A[0]
 422         mov     ($np),%rax
 423 
 424         movq    `3*$STRIDE/4-96`($bp),%xmm3
 425         pand    %xmm6,%xmm2
 426         por     %xmm1,%xmm0
 427         pand    %xmm7,%xmm3
 428 
 429         imulq   $A[0],$m1               # "tp[0]"*n0
 430         mov     %rdx,$A[1]
 431 
 432         por     %xmm2,%xmm0
 433         lea     $STRIDE($bp),$bp
 434         por     %xmm3,%xmm0
 435 
 436         mulq    $m1                     # np[0]*m1
 437         add     %rax,$A[0]              # discarded
 438         mov     8($ap),%rax
 439         adc     \$0,%rdx
 440         mov     %rdx,$N[1]
 441 
 442         mulq    $m0
 443         add     %rax,$A[1]
 444         mov     8($np),%rax
 445         adc     \$0,%rdx
 446         mov     %rdx,$A[0]
 447 
 448         mulq    $m1
 449         add     %rax,$N[1]
 450         mov     16($ap),%rax
 451         adc     \$0,%rdx
 452         add     $A[1],$N[1]
 453         lea     4($j),$j                # j++
 454         adc     \$0,%rdx
 455         mov     $N[1],(%rsp)
 456         mov     %rdx,$N[0]
 457         jmp     .L1st4x
 458 .align  16
 459 .L1st4x:
 460         mulq    $m0                     # ap[j]*bp[0]
 461         add     %rax,$A[0]
 462         mov     -16($np,$j,8),%rax
 463         adc     \$0,%rdx
 464         mov     %rdx,$A[1]
 465 
 466         mulq    $m1                     # np[j]*m1
 467         add     %rax,$N[0]
 468         mov     -8($ap,$j,8),%rax
 469         adc     \$0,%rdx
 470         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 471         adc     \$0,%rdx
 472         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 473         mov     %rdx,$N[1]
 474 
 475         mulq    $m0                     # ap[j]*bp[0]
 476         add     %rax,$A[1]
 477         mov     -8($np,$j,8),%rax
 478         adc     \$0,%rdx
 479         mov     %rdx,$A[0]
 480 
 481         mulq    $m1                     # np[j]*m1
 482         add     %rax,$N[1]
 483         mov     ($ap,$j,8),%rax
 484         adc     \$0,%rdx
 485         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 486         adc     \$0,%rdx
 487         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 488         mov     %rdx,$N[0]
 489 
 490         mulq    $m0                     # ap[j]*bp[0]
 491         add     %rax,$A[0]
 492         mov     ($np,$j,8),%rax
 493         adc     \$0,%rdx
 494         mov     %rdx,$A[1]
 495 
 496         mulq    $m1                     # np[j]*m1
 497         add     %rax,$N[0]
 498         mov     8($ap,$j,8),%rax
 499         adc     \$0,%rdx
 500         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 501         adc     \$0,%rdx
 502         mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
 503         mov     %rdx,$N[1]
 504 
 505         mulq    $m0                     # ap[j]*bp[0]
 506         add     %rax,$A[1]
 507         mov     8($np,$j,8),%rax
 508         adc     \$0,%rdx
 509         lea     4($j),$j                # j++
 510         mov     %rdx,$A[0]
 511 
 512         mulq    $m1                     # np[j]*m1
 513         add     %rax,$N[1]
 514         mov     -16($ap,$j,8),%rax
 515         adc     \$0,%rdx
 516         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 517         adc     \$0,%rdx
 518         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
 519         mov     %rdx,$N[0]
 520         cmp     $num,$j
 521         jl      .L1st4x
 522 
 523         mulq    $m0                     # ap[j]*bp[0]
 524         add     %rax,$A[0]
 525         mov     -16($np,$j,8),%rax
 526         adc     \$0,%rdx
 527         mov     %rdx,$A[1]
 528 
 529         mulq    $m1                     # np[j]*m1
 530         add     %rax,$N[0]
 531         mov     -8($ap,$j,8),%rax
 532         adc     \$0,%rdx
 533         add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
 534         adc     \$0,%rdx
 535         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 536         mov     %rdx,$N[1]
 537 
 538         mulq    $m0                     # ap[j]*bp[0]
 539         add     %rax,$A[1]
 540         mov     -8($np,$j,8),%rax
 541         adc     \$0,%rdx
 542         mov     %rdx,$A[0]
 543 
 544         mulq    $m1                     # np[j]*m1
 545         add     %rax,$N[1]
 546         mov     ($ap),%rax              # ap[0]
 547         adc     \$0,%rdx
 548         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
 549         adc     \$0,%rdx
 550         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 551         mov     %rdx,$N[0]
 552 
 553         movq    %xmm0,$m0               # bp[1]
 554 
 555         xor     $N[1],$N[1]
 556         add     $A[0],$N[0]
 557         adc     \$0,$N[1]
 558         mov     $N[0],-8(%rsp,$j,8)
 559         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
 560 
 561         lea     1($i),$i                # i++
 562 .align  4
 563 .Louter4x:
 564         xor     $j,$j                   # j=0
 565         movq    `0*$STRIDE/4-96`($bp),%xmm0
 566         movq    `1*$STRIDE/4-96`($bp),%xmm1
 567         pand    %xmm4,%xmm0
 568         movq    `2*$STRIDE/4-96`($bp),%xmm2
 569         pand    %xmm5,%xmm1
 570 
 571         mov     (%rsp),$A[0]
 572         mov     $n0,$m1
 573         mulq    $m0                     # ap[0]*bp[i]
 574         add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
 575         mov     ($np),%rax
 576         adc     \$0,%rdx
 577 
 578         movq    `3*$STRIDE/4-96`($bp),%xmm3
 579         pand    %xmm6,%xmm2
 580         por     %xmm1,%xmm0
 581         pand    %xmm7,%xmm3
 582 
 583         imulq   $A[0],$m1               # tp[0]*n0
 584         mov     %rdx,$A[1]
 585 
 586         por     %xmm2,%xmm0
 587         lea     $STRIDE($bp),$bp
 588         por     %xmm3,%xmm0
 589 
 590         mulq    $m1                     # np[0]*m1
 591         add     %rax,$A[0]              # "$N[0]", discarded
 592         mov     8($ap),%rax
 593         adc     \$0,%rdx
 594         mov     %rdx,$N[1]
 595 
 596         mulq    $m0                     # ap[j]*bp[i]
 597         add     %rax,$A[1]
 598         mov     8($np),%rax
 599         adc     \$0,%rdx
 600         add     8(%rsp),$A[1]           # +tp[1]
 601         adc     \$0,%rdx
 602         mov     %rdx,$A[0]
 603 
 604         mulq    $m1                     # np[j]*m1
 605         add     %rax,$N[1]
 606         mov     16($ap),%rax
 607         adc     \$0,%rdx
 608         add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
 609         lea     4($j),$j                # j+=2
 610         adc     \$0,%rdx
 611         mov     %rdx,$N[0]
 612         jmp     .Linner4x
 613 .align  16
 614 .Linner4x:
 615         mulq    $m0                     # ap[j]*bp[i]
 616         add     %rax,$A[0]
 617         mov     -16($np,$j,8),%rax
 618         adc     \$0,%rdx
 619         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
 620         adc     \$0,%rdx
 621         mov     %rdx,$A[1]
 622 
 623         mulq    $m1                     # np[j]*m1
 624         add     %rax,$N[0]
 625         mov     -8($ap,$j,8),%rax
 626         adc     \$0,%rdx
 627         add     $A[0],$N[0]
 628         adc     \$0,%rdx
 629         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
 630         mov     %rdx,$N[1]
 631 
 632         mulq    $m0                     # ap[j]*bp[i]
 633         add     %rax,$A[1]
 634         mov     -8($np,$j,8),%rax
 635         adc     \$0,%rdx
 636         add     -8(%rsp,$j,8),$A[1]
 637         adc     \$0,%rdx
 638         mov     %rdx,$A[0]
 639 
 640         mulq    $m1                     # np[j]*m1
 641         add     %rax,$N[1]
 642         mov     ($ap,$j,8),%rax
 643         adc     \$0,%rdx
 644         add     $A[1],$N[1]
 645         adc     \$0,%rdx
 646         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 647         mov     %rdx,$N[0]
 648 
 649         mulq    $m0                     # ap[j]*bp[i]
 650         add     %rax,$A[0]
 651         mov     ($np,$j,8),%rax
 652         adc     \$0,%rdx
 653         add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
 654         adc     \$0,%rdx
 655         mov     %rdx,$A[1]
 656 
 657         mulq    $m1                     # np[j]*m1
 658         add     %rax,$N[0]
 659         mov     8($ap,$j,8),%rax
 660         adc     \$0,%rdx
 661         add     $A[0],$N[0]
 662         adc     \$0,%rdx
 663         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 664         mov     %rdx,$N[1]
 665 
 666         mulq    $m0                     # ap[j]*bp[i]
 667         add     %rax,$A[1]
 668         mov     8($np,$j,8),%rax
 669         adc     \$0,%rdx
 670         add     8(%rsp,$j,8),$A[1]
 671         adc     \$0,%rdx
 672         lea     4($j),$j                # j++
 673         mov     %rdx,$A[0]
 674 
 675         mulq    $m1                     # np[j]*m1
 676         add     %rax,$N[1]
 677         mov     -16($ap,$j,8),%rax
 678         adc     \$0,%rdx
 679         add     $A[1],$N[1]
 680         adc     \$0,%rdx
 681         mov     $N[0],-40(%rsp,$j,8)    # tp[j-1]
 682         mov     %rdx,$N[0]
 683         cmp     $num,$j
 684         jl      .Linner4x
 685 
 686         mulq    $m0                     # ap[j]*bp[i]
 687         add     %rax,$A[0]
 688         mov     -16($np,$j,8),%rax
 689         adc     \$0,%rdx
 690         add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
 691         adc     \$0,%rdx
 692         mov     %rdx,$A[1]
 693 
 694         mulq    $m1                     # np[j]*m1
 695         add     %rax,$N[0]
 696         mov     -8($ap,$j,8),%rax
 697         adc     \$0,%rdx
 698         add     $A[0],$N[0]
 699         adc     \$0,%rdx
 700         mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
 701         mov     %rdx,$N[1]
 702 
 703         mulq    $m0                     # ap[j]*bp[i]
 704         add     %rax,$A[1]
 705         mov     -8($np,$j,8),%rax
 706         adc     \$0,%rdx
 707         add     -8(%rsp,$j,8),$A[1]
 708         adc     \$0,%rdx
 709         lea     1($i),$i                # i++
 710         mov     %rdx,$A[0]
 711 
 712         mulq    $m1                     # np[j]*m1
 713         add     %rax,$N[1]
 714         mov     ($ap),%rax              # ap[0]
 715         adc     \$0,%rdx
 716         add     $A[1],$N[1]
 717         adc     \$0,%rdx
 718         mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
 719         mov     %rdx,$N[0]
 720 
 721         movq    %xmm0,$m0               # bp[i+1]
 722         mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
 723 
 724         xor     $N[1],$N[1]
 725         add     $A[0],$N[0]
 726         adc     \$0,$N[1]
 727         add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
 728         adc     \$0,$N[1]
 729         mov     $N[0],-8(%rsp,$j,8)
 730         mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
 731 
 732         cmp     $num,$i
 733         jl      .Louter4x
 734 ___
 735 {
 736 my @ri=("%rax","%rdx",$m0,$m1);
 737 $code.=<<___;
 738         mov     16(%rsp,$num,8),$rp     # restore $rp
 739         mov     0(%rsp),@ri[0]          # tp[0]
 740         pxor    %xmm0,%xmm0
 741         mov     8(%rsp),@ri[1]          # tp[1]
 742         shr     \$2,$num                # num/=4
 743         lea     (%rsp),$ap              # borrow ap for tp
 744         xor     $i,$i                   # i=0 and clear CF!
 745 
 746         sub     0($np),@ri[0]
 747         mov     16($ap),@ri[2]          # tp[2]
 748         mov     24($ap),@ri[3]          # tp[3]
 749         sbb     8($np),@ri[1]
 750         lea     -1($num),$j             # j=num/4-1
 751         jmp     .Lsub4x
 752 .align  16
 753 .Lsub4x:
 754         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
 755         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
 756         sbb     16($np,$i,8),@ri[2]
 757         mov     32($ap,$i,8),@ri[0]     # tp[i+1]
 758         mov     40($ap,$i,8),@ri[1]
 759         sbb     24($np,$i,8),@ri[3]
 760         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
 761         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
 762         sbb     32($np,$i,8),@ri[0]
 763         mov     48($ap,$i,8),@ri[2]
 764         mov     56($ap,$i,8),@ri[3]
 765         sbb     40($np,$i,8),@ri[1]
 766         lea     4($i),$i                # i++
 767         dec     $j                      # doesnn't affect CF!
 768         jnz     .Lsub4x
 769 
 770         mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
 771         mov     32($ap,$i,8),@ri[0]     # load overflow bit
 772         sbb     16($np,$i,8),@ri[2]
 773         mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
 774         sbb     24($np,$i,8),@ri[3]
 775         mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
 776 
 777         sbb     \$0,@ri[0]              # handle upmost overflow bit
 778         mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
 779         xor     $i,$i                   # i=0
 780         and     @ri[0],$ap
 781         not     @ri[0]
 782         mov     $rp,$np
 783         and     @ri[0],$np
 784         lea     -1($num),$j
 785         or      $np,$ap                 # ap=borrow?tp:rp
 786 
 787         movdqu  ($ap),%xmm1
 788         movdqa  %xmm0,(%rsp)
 789         movdqu  %xmm1,($rp)
 790         jmp     .Lcopy4x
 791 .align  16
 792 .Lcopy4x:                                       # copy or in-place refresh
 793         movdqu  16($ap,$i),%xmm2
 794         movdqu  32($ap,$i),%xmm1
 795         movdqa  %xmm0,16(%rsp,$i)
 796         movdqu  %xmm2,16($rp,$i)
 797         movdqa  %xmm0,32(%rsp,$i)
 798         movdqu  %xmm1,32($rp,$i)
 799         lea     32($i),$i
 800         dec     $j
 801         jnz     .Lcopy4x
 802 
 803         shl     \$2,$num
 804         movdqu  16($ap,$i),%xmm2
 805         movdqa  %xmm0,16(%rsp,$i)
 806         movdqu  %xmm2,16($rp,$i)
 807 ___
 808 }
 809 $code.=<<___;
 810         mov     8(%rsp,$num,8),%rsi     # restore %rsp
 811         mov     \$1,%rax
 812 ___
 813 $code.=<<___ if ($win64);
 814         movaps  (%rsi),%xmm6
 815         movaps  0x10(%rsi),%xmm7
 816         lea     0x28(%rsi),%rsi
 817 ___
 818 $code.=<<___;
 819         mov     (%rsi),%r15
 820         mov     8(%rsi),%r14
 821         mov     16(%rsi),%r13
 822         mov     24(%rsi),%r12
 823         mov     32(%rsi),%rbp
 824         mov     40(%rsi),%rbx
 825         lea     48(%rsi),%rsp
 826 .Lmul4x_epilogue:
 827         ret
 828 .size   bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
 829 ___
 830 }}}
 831 
 832 {
 833 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
 834                                 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
 835 my $out=$inp;
 836 my $STRIDE=2**5*8;
 837 my $N=$STRIDE/4;
 838 
 839 $code.=<<___;
 840 .globl  bn_scatter5
 841 .type   bn_scatter5,\@abi-omnipotent
 842 .align  16
 843 bn_scatter5:
 844         cmp     \$0, $num
 845         jz      .Lscatter_epilogue
 846         lea     ($tbl,$idx,8),$tbl
 847 .Lscatter:
 848         mov     ($inp),%rax
 849         lea     8($inp),$inp
 850         mov     %rax,($tbl)
 851         lea     32*8($tbl),$tbl
 852         sub     \$1,$num
 853         jnz     .Lscatter
 854 .Lscatter_epilogue:
 855         ret
 856 .size   bn_scatter5,.-bn_scatter5
 857 
 858 .globl  bn_gather5
 859 .type   bn_gather5,\@abi-omnipotent
 860 .align  16
 861 bn_gather5:
 862 ___
 863 $code.=<<___ if ($win64);
 864 .LSEH_begin_bn_gather5:
 865         # I can't trust assembler to use specific encoding:-(
 866         .byte   0x48,0x83,0xec,0x28             #sub    \$0x28,%rsp
 867         .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
 868         .byte   0x0f,0x29,0x7c,0x24,0x10        #movdqa %xmm7,0x10(%rsp)
 869 ___
# Constant-time gather: idx is split into a word index within a cache
# line (low bits, kept in r11) and a cache-line selector (high bits).
# Every iteration reads the same offset in all four candidate cache
# lines and masks off all but the selected one, so the memory access
# pattern is independent of idx (cache-timing countermeasure).
$code.=<<___;
	mov	$idx,%r11
	shr	\$`log($N/8)/log(2)`,$idx
	and	\$`$N/8-1`,%r11		# word index within a cache line
	not	$idx			# complement selector before masking
	lea	.Lmagic_masks(%rip),%rax
	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
	lea	96($tbl,%r11,8),$tbl	# pointer within 1st cache line
	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
	movq	24(%rax,$idx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	`0*$STRIDE/4-96`($tbl),%xmm0	# read same offset in all
	movq	`1*$STRIDE/4-96`($tbl),%xmm1	# four candidate lines...
	pand	%xmm4,%xmm0
	movq	`2*$STRIDE/4-96`($tbl),%xmm2
	pand	%xmm5,%xmm1
	movq	`3*$STRIDE/4-96`($tbl),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0		# ...and OR together; only the word
	pand	%xmm7,%xmm3		# from the selected line survives
	por	%xmm2,%xmm0		# the masking
	lea	$STRIDE($tbl),$tbl	# advance to next table row
	por	%xmm3,%xmm0

	movq	%xmm0,($out)		# m0=bp[0]
	lea	8($out),$out
	sub	\$1,$num
	jnz	.Lgather
___
# Win64 epilogue: restore the xmm registers saved by the raw-byte
# prologue and release the 0x28-byte frame.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	lea	0x28(%rsp),%rsp
___
$code.=<<___;
	ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
___
 913 }
# Mask table for bn_gather5: four quadword masks are loaded from
# consecutive slots starting at an idx-dependent offset, so exactly one
# of xmm4..xmm7 picks up the all-ones quadword (the -1,-1 pair) and the
# rest are zero.  The second, all-zero row covers reads that start past
# the -1,-1 pair.
$code.=<<___;
.align	64
.Lmagic_masks:
	.long	0,0, 0,0, 0,0, -1,-1
	.long	0,0, 0,0, 0,0,  0,0
.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
 921 
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handler shared by the mont_gather5
# subroutines: if the fault hit between the "alloca" and epilogue
# labels, it recovers the caller's stack pointer and the callee-saved
# registers from the subroutine's frame, then asks RtlVirtualUnwind to
# continue the unwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp		# scratch + home space for call below

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	# HandlerData[] holds three image-relative labels per function:
	# end-of-prologue, end-of-alloca, epilogue (see .xdata below).
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail	# nothing saved yet, Rsp is valid

	lea	`40+48`(%rax),%rax

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# end of alloca label
	cmp	%r10,%rbx		# context->Rip<end of alloca label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail	# frame already torn down

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	movaps	(%rax),%xmm0
	movaps	16(%rax),%xmm1
	lea	`40+48`(%rax),%rax

	# reload callee-saved registers from the frame and propagate
	# them into the CONTEXT record
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	movups	%xmm0,512($context)	# restore context->Xmm6
	movups	%xmm1,528($context)	# restore context->Xmm7

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	# hand off to RtlVirtualUnwind to continue the unwind
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	mul_handler,.-mul_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont_gather5
	.rva	.LSEH_end_bn_mul_mont_gather5
	.rva	.LSEH_info_bn_mul_mont_gather5

	.rva	.LSEH_begin_bn_mul4x_mont_gather5
	.rva	.LSEH_end_bn_mul4x_mont_gather5
	.rva	.LSEH_info_bn_mul4x_mont_gather5

	.rva	.LSEH_begin_bn_gather5
	.rva	.LSEH_end_bn_gather5
	.rva	.LSEH_info_bn_gather5

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont_gather5:
	.byte	9,0,0,0			# flags: UNW_FLAG_EHANDLER|UNW_FLAG_UHANDLER
	.rva	mul_handler
	.rva	.Lmul_alloca,.Lmul_body,.Lmul_epilogue		# HandlerData[]
.align	8
.LSEH_info_bn_mul4x_mont_gather5:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.align	8
.LSEH_info_bn_gather5:
	# hand-encoded unwind codes matching the raw-byte prologue above
	.byte	0x01,0x0d,0x05,0x00
	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
.align	8
___
}
1067 
# Expand backtick-quoted Perl expressions embedded in the assembly
# template (computed offsets, shift counts) and emit the result.
# STDOUT is a pipe into the xlate post-processor, so a failed close
# would otherwise silently hide write errors or a failing child.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";