1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # June 2011
  11 #
  12 # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
  13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
  17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
# AESNI code is woven into it. Below are performance numbers in
  19 # cycles per processed byte, less is better, for standalone AESNI-CBC
  20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
  21 # subroutine:
  22 #
  23 #               AES-128-CBC     +SHA1           stitch      gain
  24 # Westmere      3.77[+5.6]      9.37            6.65        +41%
  25 # Sandy Bridge  5.05[+5.2(6.3)] 10.25(11.35)    6.16(7.08)  +67%(+60%)
  26 #
  27 #               AES-192-CBC
  28 # Westmere      4.51            10.11           6.97        +45%
  29 # Sandy Bridge  6.05            11.25(12.35)    6.34(7.27)  +77%(+70%)
  30 #
  31 #               AES-256-CBC
  32 # Westmere      5.25            10.85           7.25        +50%
  33 # Sandy Bridge  7.05            12.25(13.35)    7.06(7.70)  +74%(+73%)
  34 #
# (*)   There are two code paths: SSSE3 and AVX. See sha1-586.pl for
  36 #       background information. Above numbers in parentheses are SSSE3
  37 #       results collected on AVX-capable CPU, i.e. apply on OSes that
  38 #       don't support AVX.
  39 #
# Needless to mention that it makes no sense to implement a "stitched"
# *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 already
# fully utilize parallelism, so stitching would not give any gain
# anyway. Well, there might be some, e.g. because of better cache
  44 # locality... For reference, here are performance results for
  45 # standalone AESNI-CBC decrypt:
  46 #
  47 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
  48 # Westmere      1.31            1.55            1.80
  49 # Sandy Bridge  0.93            1.06            1.22
  50 
  51 $flavour = shift;
  52 $output  = shift;
  53 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  54 
  55 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  56 
  57 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  58 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  59 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  60 die "can't locate x86_64-xlate.pl";
  61 
  62 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  63                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  64            $1>=2.19);
  65 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  66            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  67            $1>=2.09);
  68 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  69            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  70            $1>=10);
  71 
  72 open OUT,"| \"$^X\" $xlate $flavour $output";
  73 *STDOUT=*OUT;
  74 
# void aesni_cbc_sha1_enc(const void *inp,
#                       void *out,
#                       size_t length,
#                       const AES_KEY *key,
#                       unsigned char *iv,
#                       SHA_CTX *ctx,
#                       const void *in0);

# Public entry point.  It is a dispatcher only: it loads the first two
# 32-bit words of OPENSSL_ia32cap_P and jumps to one of the
# implementations generated further below.
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

.globl  aesni_cbc_sha1_enc
.type   aesni_cbc_sha1_enc,\@abi-omnipotent
.align  16
aesni_cbc_sha1_enc:
        # caller should check for SSSE3 and AES-NI bits
        mov     OPENSSL_ia32cap_P+0(%rip),%r10d
        mov     OPENSSL_ia32cap_P+4(%rip),%r11d
___
# Emitted only when the assembler probe above set $avx: the AVX variant
# is taken when bit 28 of the second capability word and bit 30 of the
# first one are both set.
$code.=<<___ if ($avx);
        and     \$`1<<28`,%r11d           # mask AVX bit
        and     \$`1<<30`,%r10d           # mask "Intel CPU" bit
        or      %r11d,%r10d
        cmp     \$`1<<28|1<<30`,%r10d
        je      aesni_cbc_sha1_enc_avx
___
# Default: the SSSE3 variant.  The ret following the unconditional jmp is
# not reachable.
$code.=<<___;
        jmp     aesni_cbc_sha1_enc_ssse3
        ret
.size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___
 107 
# Registers holding the arguments on entry.  $inp has no argument
# register; the prologue loads it from the stack ("load 7th argument").
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

# Generator state shared by the subroutines below.
my $Xi=4;                               # advanced by the Xupdate_*/Xloop_* generators
my @X=map("%xmm$_",(4..7,0..3));        # vector registers, rotated by the generators
my @Tx=map("%xmm$_",(8..10));           # vector temporaries, rotated likewise
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
my @T=("%esi","%edi");                  # scalar temporaries, swapped every round
# $j  - incremented once per round by the round-body strings
# $jj - incremented once per body_* call
# $r  - incremented once per $aesenc call
# $sn - serial number used in the .Laesenclast labels
my $j=0; my $jj=0; my $r=0; my $sn=0;
my $K_XX_XX="%r11";                     # base of the K_XX_XX table
my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");         # the two round-key registers, swapped by $aesenc
 119 
 120 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 121 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 122   my $arg = pop;
 123     $arg = "\$$arg" if ($arg*1 eq $arg);
 124     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 125 }
 126 
# Rotate helpers invoked as &$_rol / &$_ror by the round-body strings; in
# this code path they forward to plain rol/ror via the AUTOLOAD thunk.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

# Function prologue: fetch the 7th argument from the stack, push six
# general-purpose registers and reserve the frame.  As used in this
# file, 0..63(%rsp) is the X[]+K transfer area, 88(%rsp) keeps $ivp and,
# on Win64, 96..255(%rsp) keeps %xmm6-%xmm15.
$code.=<<___;
.type   aesni_cbc_sha1_enc_ssse3,\@function,6
.align  16
aesni_cbc_sha1_enc_ssse3:
        mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
        #shr    \$6,$len                        # debugging artefact
        #jz     .Lepilogue_ssse3                # debugging artefact
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     `-104-($win64?10*16:0)`(%rsp),%rsp
        #mov    $in0,$inp                       # debugging artefact
        #lea    64(%rsp),$ctx                   # debugging artefact
___
# Win64 only: save %xmm6-%xmm15, all of which this function uses.
$code.=<<___ if ($win64);
        movaps  %xmm6,96+0(%rsp)
        movaps  %xmm7,96+16(%rsp)
        movaps  %xmm8,96+32(%rsp)
        movaps  %xmm9,96+48(%rsp)
        movaps  %xmm10,96+64(%rsp)
        movaps  %xmm11,96+80(%rsp)
        movaps  %xmm12,96+96(%rsp)
        movaps  %xmm13,96+112(%rsp)
        movaps  %xmm14,96+128(%rsp)
        movaps  %xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
# Move the arguments to %r12-%r15 and load the IV.  $ivp is spilled to the
# frame because its register is reused for the round count below.
$code.=<<___;
        mov     $in0,%r12                       # reassign arguments
        mov     $out,%r13
        mov     $len,%r14
        mov     $key,%r15
        movdqu  ($ivp),$iv                      # load IV
        mov     $ivp,88(%rsp)                   # save $ivp
___
# From here on the Perl names refer to the new homes of the arguments.
my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
my $rounds="${ivp}d";                           # 32-bit view of $ivp's register
# $len is multiplied by 64 and added to $inp to form the end-of-input
# pointer; $out becomes an offset relative to $in0 so that stores can use
# ($out,$in0) addressing.  Then the hash state is loaded, the first 64
# bytes at $inp are loaded and byte-swapped, the first three X[]+K
# vectors are stored to the stack and the first two round keys fetched.
$code.=<<___;
        shl     \$6,$len
        sub     $in0,$out
        mov     240($key),$rounds
        add     $inp,$len               # end of input

        lea     K_XX_XX(%rip),$K_XX_XX
        mov     0($ctx),$A              # load context
        mov     4($ctx),$B
        mov     8($ctx),$C
        mov     12($ctx),$D
        mov     $B,@T[0]                # magic seed
        mov     16($ctx),$E

        movdqa  64($K_XX_XX),@X[2]      # pbswap mask
        movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
        movdqu  0($inp),@X[-4&7]    # load input to %xmm[0-3]
        movdqu  16($inp),@X[-3&7]
        movdqu  32($inp),@X[-2&7]
        movdqu  48($inp),@X[-1&7]
        pshufb  @X[2],@X[-4&7]              # byte swap
        add     \$64,$inp
        pshufb  @X[2],@X[-3&7]
        pshufb  @X[2],@X[-2&7]
        pshufb  @X[2],@X[-1&7]
        paddd   @Tx[1],@X[-4&7]             # add K_00_19
        paddd   @Tx[1],@X[-3&7]
        paddd   @Tx[1],@X[-2&7]
        movdqa  @X[-4&7],0(%rsp)    # X[]+K xfer to IALU
        psubd   @Tx[1],@X[-4&7]             # restore X[]
        movdqa  @X[-3&7],16(%rsp)
        psubd   @Tx[1],@X[-3&7]
        movdqa  @X[-2&7],32(%rsp)
        psubd   @Tx[1],@X[-2&7]
        movups  ($key),$rndkey0         # $key[0]
        movups  16($key),$rndkey[0]     # forward reference
        jmp     .Loop_ssse3
___
 208 
 209 my $aesenc=sub {
 210   use integer;
 211   my ($n,$k)=($r/10,$r%10);
 212     if ($k==0) {
 213       $code.=<<___;
 214         movups          `16*$n`($in0),$in               # load input
 215         xorps           $rndkey0,$in
 216 ___
 217       $code.=<<___ if ($n);
 218         movups          $iv,`16*($n-1)`($out,$in0)      # write output
 219 ___
 220       $code.=<<___;
 221         xorps           $in,$iv
 222         aesenc          $rndkey[0],$iv
 223         movups          `32+16*$k`($key),$rndkey[1]
 224 ___
 225     } elsif ($k==9) {
 226       $sn++;
 227       $code.=<<___;
 228         cmp             \$11,$rounds
 229         jb              .Laesenclast$sn
 230         movups          `32+16*($k+0)`($key),$rndkey[1]
 231         aesenc          $rndkey[0],$iv
 232         movups          `32+16*($k+1)`($key),$rndkey[0]
 233         aesenc          $rndkey[1],$iv
 234         je              .Laesenclast$sn
 235         movups          `32+16*($k+2)`($key),$rndkey[1]
 236         aesenc          $rndkey[0],$iv
 237         movups          `32+16*($k+3)`($key),$rndkey[0]
 238         aesenc          $rndkey[1],$iv
 239 .Laesenclast$sn:
 240         aesenclast      $rndkey[0],$iv
 241         movups          16($key),$rndkey[1]             # forward reference
 242 ___
 243     } else {
 244       $code.=<<___;
 245         aesenc          $rndkey[0],$iv
 246         movups          `32+16*$k`($key),$rndkey[1]
 247 ___
 248     }
 249     $r++;       unshift(@rndkey,pop(@rndkey));
 250 };
 251 
# Emit one vector X[] update (SSSE3 flavour used while $Xi is in the
# 16..31 range), interleaved with scalar round code.
#   $body - code ref, called four times; every call returns a list of
#           instruction strings which are eval'ed here one at a time
#           between the vector instructions
# The quoted names in the trailing comments ("X[-16]", "X[-3]", ...)
# identify the terms being combined.  Side effects: appends to $code via
# the AUTOLOAD thunk, increments $Xi and rotates @X and @Tx by one.
sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  my ($a,$b,$c,$d,$e);          # assigned from @V by the eval'ed strings

        &movdqa     (@X[0],@X[-3&7]);
         eval(shift(@insns));
         eval(shift(@insns));
        &movdqa     (@Tx[0],@X[-1&7]);
        &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
         eval(shift(@insns));
         eval(shift(@insns));

          &paddd    (@Tx[1],@X[-1&7]);
         eval(shift(@insns));
         eval(shift(@insns));
        &psrldq     (@Tx[0],4);             # "X[-3]", 3 dwords
         eval(shift(@insns));
         eval(shift(@insns));
        &pxor       (@X[0],@X[-4&7]);   # "X[0]"^="X[-16]"
         eval(shift(@insns));
         eval(shift(@insns));

        &pxor       (@Tx[0],@X[-2&7]);  # "X[-3]"^"X[-8]"
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &pxor       (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
         eval(shift(@insns));
         eval(shift(@insns));
          &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer to IALU
         eval(shift(@insns));
         eval(shift(@insns));

        &movdqa     (@Tx[2],@X[0]);
        &movdqa     (@Tx[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &pslldq     (@Tx[2],12);            # "X[0]"<<96, extract one dword
        &paddd      (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &psrld      (@Tx[0],31);
         eval(shift(@insns));
         eval(shift(@insns));
        &movdqa     (@Tx[1],@Tx[2]);
         eval(shift(@insns));
         eval(shift(@insns));

        &psrld      (@Tx[2],30);
        &por        (@X[0],@Tx[0]);         # "X[0]"<<<=1
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &pslld      (@Tx[1],2);
        &pxor       (@X[0],@Tx[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &movdqa   (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
         eval(shift(@insns));
         eval(shift(@insns));

        &pxor       (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2

         foreach (@insns) { eval; }     # remaining instructions [if any]

  $Xi++;        push(@X,shift(@X));     # "rotate" X[]
                push(@Tx,shift(@Tx));
}
 332 
# Emit one vector X[] update (SSSE3 flavour used from $Xi==8 onwards),
# interleaved with scalar round code.
#   $body - code ref, called four times; the returned instruction
#           strings are eval'ed one at a time between the vector
#           instructions
# The K_XX_XX vector is either carried over from the previous call or,
# when $Xi is a multiple of 5, loaded from the table at offset
# 16*($Xi/5).  Side effects: appends to $code via the AUTOLOAD thunk,
# increments $Xi and rotates @X and @Tx by one.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
  my ($a,$b,$c,$d,$e);          # assigned from @V by the eval'ed strings

        &movdqa     (@Tx[0],@X[-1&7])   if ($Xi==8);
         eval(shift(@insns));           # body_20_39
        &pxor       (@X[0],@X[-4&7]);   # "X[0]"="X[-32]"^"X[-16]"
        &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # rol

        &pxor       (@X[0],@X[-7&7]);   # "X[0]"^="X[-28]"
         eval(shift(@insns));
         # skipped when the next string is a rotate, so that the
         # interleaving pattern stays the same for bodies of other lengths
         eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
        if ($Xi%5) {
          &movdqa   (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
        } else {                        # ... or load next one
          &movdqa   (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
        }
          &paddd    (@Tx[1],@X[-1&7]);
         eval(shift(@insns));           # ror
         eval(shift(@insns));

        &pxor       (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
         eval(shift(@insns));           # body_20_39
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # rol

        &movdqa     (@Tx[0],@X[0]);
          &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer to IALU
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # ror
         eval(shift(@insns));

        &pslld      (@X[0],2);
         eval(shift(@insns));           # body_20_39
         eval(shift(@insns));
        &psrld      (@Tx[0],30);
         eval(shift(@insns));
         eval(shift(@insns));           # rol
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # ror
         eval(shift(@insns));

        &por        (@X[0],@Tx[0]);         # "X[0]"<<<=2
         eval(shift(@insns));           # body_20_39
         eval(shift(@insns));
          &movdqa   (@Tx[1],@X[0])  if ($Xi<19);
         eval(shift(@insns));
         eval(shift(@insns));           # rol
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));           # rol
         eval(shift(@insns));

         foreach (@insns) { eval; }     # remaining instructions

  $Xi++;        push(@X,shift(@X));     # "rotate" X[]
                push(@Tx,shift(@Tx));
}
 399 
# Emit the last X[]+K transfer of a 64-byte block, then the end-of-input
# test and the loads for the next block.
#   $body - code ref, called four times; the returned instruction
#           strings are eval'ed around the vector instructions
# After the interleaved part the generated code compares $inp with $len
# and branches to .Ldone_ssse3 when they are equal; otherwise it reloads
# the byte-swap mask and K_00_19, loads the next 64 bytes at $inp and
# advances $inp.  Side effects: appends to $code, rotates @Tx back by
# one and resets $Xi to 0.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);          # assigned from @V by the eval'ed strings

         eval(shift(@insns));
          &paddd    (@Tx[1],@X[-1&7]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

          &movdqa   (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer IALU

         foreach (@insns) { eval; }             # remaining instructions

        &cmp        ($inp,$len);
        &je (".Ldone_ssse3");

        unshift(@Tx,pop(@Tx));

        &movdqa     (@X[2],"64($K_XX_XX)");         # pbswap mask
        &movdqa     (@Tx[1],"0($K_XX_XX)");         # K_00_19
        &movdqu     (@X[-4&7],"0($inp)");               # load input
        &movdqu     (@X[-3&7],"16($inp)");
        &movdqu     (@X[-2&7],"32($inp)");
        &movdqu     (@X[-1&7],"48($inp)");
        &pshufb     (@X[-4&7],@X[2]);           # byte swap
        &add        ($inp,64);

  $Xi=0;
}
 433 
# Emit scalar round code interleaved with the preparation of one vector
# of the next input block.
#   $body - code ref, called four times; the returned instruction
#           strings are eval'ed around the vector instructions
# The vector work byte-swaps X[($Xi-3)&7], adds K to X[($Xi-4)&7],
# stores the sum at 16*$Xi(%rsp) and subtracts K again.  Unlike the
# Xupdate_* generators this one does not rotate @X or @Tx; it only
# increments $Xi.
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  my ($a,$b,$c,$d,$e);          # assigned from @V by the eval'ed strings

         eval(shift(@insns));
         eval(shift(@insns));
        &pshufb     (@X[($Xi-3)&7],@X[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &paddd      (@X[($Xi-4)&7],@Tx[1]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &movdqa     (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);      # X[]+K xfer to IALU
         eval(shift(@insns));
         eval(shift(@insns));
        &psubd      (@X[($Xi-4)&7],@Tx[1]);

        foreach (@insns) { eval; }
  $Xi++;
}
 458 
 459 sub Xtail_ssse3()
 460 { use integer;
 461   my $body = shift;
 462   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 463   my ($a,$b,$c,$d,$e);
 464 
 465         foreach (@insns) { eval; }
 466 }
 467 
# Return the scalar code of one round (variant named for rounds 0..19)
# as a list of Perl statement strings, one emitted instruction per
# element.  Callers eval the strings one at a time so they can place
# vector instructions in between.  The first string also loads
# ($a,$b,$c,$d,$e) from @V; the last one increments $j and rotates @V
# and @T.  On selected calls, chosen by the formula on $jj below, one
# element additionally gets '&$aesenc();' appended so that AES steps are
# spread across the rounds.  Each call increments $jj.
sub body_00_19 () {
  use integer;
  my ($k,$n);
  my @r=(
        '($a,$b,$c,$d,$e)=@V;'.
        '&add       ($e,eval(4*($j&15))."(%rsp)");',    # X[]+K xfer
        '&xor       ($c,$d);',
        '&mov       (@T[1],$a);',   # $b in next round
        '&$_rol     ($a,5);',
        '&and       (@T[0],$c);',   # ($b&($c^$d))
        '&xor       ($c,$d);',      # restore $c
        '&xor       (@T[0],$d);',
        '&add       ($e,$a);',
        '&$_ror     ($b,$j?7:2);',  # $b>>>2
        '&add       ($e,@T[0]);'    .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
        );
        $n = scalar(@r);                # instruction strings per round
        $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
        # $k/$n is the call and $k%$n the element that receives the AES step
        @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n);
        $jj++;
    return @r;
}
 490 
# Return the scalar code of one round (variant named for rounds 20..39)
# as a list of Perl statement strings, one emitted instruction per
# element, for callers to eval one at a time.  The first string loads
# ($a,$b,$c,$d,$e) from @V and post-increments $j; the last one rotates
# @V and @T.  On selected calls, chosen by the formula on $jj below, one
# element additionally gets '&$aesenc();' appended.  Each call
# increments $jj.
sub body_20_39 () {
  use integer;
  my ($k,$n);
  my @r=(
        '($a,$b,$c,$d,$e)=@V;'.
        '&add       ($e,eval(4*($j++&15))."(%rsp)");',  # X[]+K xfer
        '&xor       (@T[0],$d);',   # ($b^$d)
        '&mov       (@T[1],$a);',   # $b in next round
        '&$_rol     ($a,5);',
        '&xor       (@T[0],$c);',   # ($b^$d^$c)
        '&add       ($e,$a);',
        '&$_ror     ($b,7);',       # $b>>>2
        '&add       ($e,@T[0]);'    .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
        );
        $n = scalar(@r);                # instruction strings per round
        $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
        # $k/$n is the call and $k%$n the element that receives the AES step
        @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n);
        $jj++;
    return @r;
}
 511 
# Return the scalar code of one round (variant named for rounds 40..59)
# as a list of Perl statement strings, one emitted instruction per
# element, for callers to eval one at a time.  The first string loads
# ($a,$b,$c,$d,$e) from @V; the third post-increments $j; the last one
# rotates @V and @T.  On selected calls, chosen by the formula on $jj
# below, one element additionally gets '&$aesenc();' appended.  Each
# call increments $jj.
sub body_40_59 () {
  use integer;
  my ($k,$n);
  my @r=(
        '($a,$b,$c,$d,$e)=@V;'.
        '&mov       (@T[1],$c);',
        '&xor       ($c,$d);',
        '&add       ($e,eval(4*($j++&15))."(%rsp)");',  # X[]+K xfer
        '&and       (@T[1],$d);',
        '&and       (@T[0],$c);',   # ($b&($c^$d))
        '&$_ror     ($b,7);',       # $b>>>2
        '&add       ($e,@T[1]);',
        '&mov       (@T[1],$a);',   # $b in next round
        '&$_rol     ($a,5);',
        '&add       ($e,@T[0]);',
        '&xor       ($c,$d);',      # restore $c
        '&add       ($e,$a);'       .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
        );
        $n = scalar(@r);                # instruction strings per round
        $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
        # $k/$n is the call and $k%$n the element that receives the AES step
        @r[$k%$n].='&$aesenc();'    if ($jj==$k/$n);
        $jj++;
    return @r;
}
# Main loop of the SSSE3 code path: one iteration handles 64 bytes.
$code.=<<___;
.align  16
.Loop_ssse3:
___
        # 17 generator calls with four rounds each; the body passed in
        # selects the scalar round variant
        &Xupdate_ssse3_16_31(\&body_00_19);
        &Xupdate_ssse3_16_31(\&body_00_19);
        &Xupdate_ssse3_16_31(\&body_00_19);
        &Xupdate_ssse3_16_31(\&body_00_19);
        &Xupdate_ssse3_32_79(\&body_00_19);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xupdate_ssse3_32_79(\&body_40_59);
        &Xupdate_ssse3_32_79(\&body_40_59);
        &Xupdate_ssse3_32_79(\&body_40_59);
        &Xupdate_ssse3_32_79(\&body_40_59);
        &Xupdate_ssse3_32_79(\&body_40_59);
        &Xupdate_ssse3_32_79(\&body_20_39);
        &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"

                                # Snapshot the generator state at the point
                                # where the emitted code may branch to
                                # .Ldone_ssse3; the tail below is generated
                                # from this same state.
                                $saved_j=$j; @saved_V=@V;
                                $saved_r=$r; @saved_rndkey=@rndkey;

        # remaining rounds, overlapped with preparing the next block
        &Xloop_ssse3(\&body_20_39);
        &Xloop_ssse3(\&body_20_39);
        &Xloop_ssse3(\&body_20_39);

# Store the last ciphertext block of this iteration, advance $in0, add
# the working variables into the context and loop.
$code.=<<___;
        movups  $iv,48($out,$in0)               # write output
        lea     64($in0),$in0

        add     0($ctx),$A                      # update context
        add     4($ctx),@T[0]
        add     8($ctx),$C
        add     12($ctx),$D
        mov     $A,0($ctx)
        add     16($ctx),$E
        mov     @T[0],4($ctx)
        mov     @T[0],$B                        # magic seed
        mov     $C,8($ctx)
        mov     $D,12($ctx)
        mov     $E,16($ctx)
        jmp     .Loop_ssse3

.align  16
.Ldone_ssse3:
___
                                # Rewind to the snapshot taken before the
                                # Xloop_ssse3 calls.
                                $jj=$j=$saved_j; @V=@saved_V;
                                $r=$saved_r;     @rndkey=@saved_rndkey;

        # same remaining rounds, this time without next-block preparation
        &Xtail_ssse3(\&body_20_39);
        &Xtail_ssse3(\&body_20_39);
        &Xtail_ssse3(\&body_20_39);

# Final block: store the last ciphertext, update the context and write
# the final $iv back through the pointer saved at 88(%rsp).
$code.=<<___;
        movups  $iv,48($out,$in0)               # write output
        mov     88(%rsp),$ivp                   # restore $ivp

        add     0($ctx),$A                      # update context
        add     4($ctx),@T[0]
        add     8($ctx),$C
        mov     $A,0($ctx)
        add     12($ctx),$D
        mov     @T[0],4($ctx)
        add     16($ctx),$E
        mov     $C,8($ctx)
        mov     $D,12($ctx)
        mov     $E,16($ctx)
        movups  $iv,($ivp)                      # write IV
___
# Win64 only: restore %xmm6-%xmm15 saved by the prologue.
$code.=<<___ if ($win64);
        movaps  96+0(%rsp),%xmm6
        movaps  96+16(%rsp),%xmm7
        movaps  96+32(%rsp),%xmm8
        movaps  96+48(%rsp),%xmm9
        movaps  96+64(%rsp),%xmm10
        movaps  96+80(%rsp),%xmm11
        movaps  96+96(%rsp),%xmm12
        movaps  96+112(%rsp),%xmm13
        movaps  96+128(%rsp),%xmm14
        movaps  96+144(%rsp),%xmm15
___
# Epilogue: reload the six registers pushed by the prologue (in reverse
# push order), release the frame and return.
$code.=<<___;
        lea     `104+($win64?10*16:0)`(%rsp),%rsi
        mov     0(%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_ssse3:
        ret
.size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

# Reset the generator counters before the next code path is generated.
$j=$jj=$r=$sn=0;
 635 
 636 if ($avx) {
# Fresh register assignments and generator state for the AVX code path;
# these shadow the declarations used for the SSSE3 path.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
my @T=("%esi","%edi");

# In this code path the rotates are emitted as shld/shrd with the
# register given as both operands.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

# Function prologue: fetch the 7th argument from the stack, push six
# general-purpose registers and reserve the frame (same layout as in the
# SSSE3 function: 0..63(%rsp) transfer area, 88(%rsp) saved $ivp, and on
# Win64 96..255(%rsp) for %xmm6-%xmm15).
$code.=<<___;
.type   aesni_cbc_sha1_enc_avx,\@function,6
.align  16
aesni_cbc_sha1_enc_avx:
        mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
        #shr    \$6,$len                        # debugging artefact
        #jz     .Lepilogue_avx                  # debugging artefact
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     `-104-($win64?10*16:0)`(%rsp),%rsp
        #mov    $in0,$inp                       # debugging artefact
        #lea    64(%rsp),$ctx                   # debugging artefact
___
# Win64 only: save %xmm6-%xmm15.
$code.=<<___ if ($win64);
        movaps  %xmm6,96+0(%rsp)
        movaps  %xmm7,96+16(%rsp)
        movaps  %xmm8,96+32(%rsp)
        movaps  %xmm9,96+48(%rsp)
        movaps  %xmm10,96+64(%rsp)
        movaps  %xmm11,96+80(%rsp)
        movaps  %xmm12,96+96(%rsp)
        movaps  %xmm13,96+112(%rsp)
        movaps  %xmm14,96+128(%rsp)
        movaps  %xmm15,96+144(%rsp)
.Lprologue_avx:
___
# Move the arguments to %r12-%r15 and load the IV.  $ivp is spilled to the
# frame because its register is reused for the round count below.
$code.=<<___;
        vzeroall
        mov     $in0,%r12                       # reassign arguments
        mov     $out,%r13
        mov     $len,%r14
        mov     $key,%r15
        vmovdqu ($ivp),$iv                      # load IV
        mov     $ivp,88(%rsp)                   # save $ivp
___
# From here on the Perl names refer to the new homes of the arguments.
my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
my $rounds="${ivp}d";                           # 32-bit view of $ivp's register
# Same set-up as in the SSSE3 function, except that $key is biased by
# 112, so every later key reference carries a -112 displacement, and the
# three-operand vpaddd leaves the original X[] vectors intact, so no
# subtraction is needed to restore them.
$code.=<<___;
        shl     \$6,$len
        sub     $in0,$out
        mov     240($key),$rounds
        add     \$112,$key              # size optimization
        add     $inp,$len               # end of input

        lea     K_XX_XX(%rip),$K_XX_XX
        mov     0($ctx),$A              # load context
        mov     4($ctx),$B
        mov     8($ctx),$C
        mov     12($ctx),$D
        mov     $B,@T[0]                # magic seed
        mov     16($ctx),$E

        vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
        vmovdqa 0($K_XX_XX),@Tx[1]      # K_00_19
        vmovdqu 0($inp),@X[-4&7]    # load input to %xmm[0-3]
        vmovdqu 16($inp),@X[-3&7]
        vmovdqu 32($inp),@X[-2&7]
        vmovdqu 48($inp),@X[-1&7]
        vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
        add     \$64,$inp
        vpshufb @X[2],@X[-3&7],@X[-3&7]
        vpshufb @X[2],@X[-2&7],@X[-2&7]
        vpshufb @X[2],@X[-1&7],@X[-1&7]
        vpaddd  @Tx[1],@X[-4&7],@X[0]       # add K_00_19
        vpaddd  @Tx[1],@X[-3&7],@X[1]
        vpaddd  @Tx[1],@X[-2&7],@X[2]
        vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
        vmovdqa @X[1],16(%rsp)
        vmovdqa @X[2],32(%rsp)
        vmovups -112($key),$rndkey0     # $key[0]
        vmovups 16-112($key),$rndkey[0] # forward reference
        jmp     .Loop_avx
___
 725 
 726 my $aesenc=sub {
 727   use integer;
 728   my ($n,$k)=($r/10,$r%10);
 729     if ($k==0) {
 730       $code.=<<___;
 731         vmovups         `16*$n`($in0),$in               # load input
 732         vxorps          $rndkey0,$in,$in
 733 ___
 734       $code.=<<___ if ($n);
 735         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
 736 ___
 737       $code.=<<___;
 738         vxorps          $in,$iv,$iv
 739         vaesenc         $rndkey[0],$iv,$iv
 740         vmovups         `32+16*$k-112`($key),$rndkey[1]
 741 ___
 742     } elsif ($k==9) {
 743       $sn++;
 744       $code.=<<___;
 745         cmp             \$11,$rounds
 746         jb              .Lvaesenclast$sn
 747         vaesenc         $rndkey[0],$iv,$iv
 748         vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
 749         vaesenc         $rndkey[1],$iv,$iv
 750         vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
 751         je              .Lvaesenclast$sn
 752         vaesenc         $rndkey[0],$iv,$iv
 753         vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
 754         vaesenc         $rndkey[1],$iv,$iv
 755         vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
 756 .Lvaesenclast$sn:
 757         vaesenclast     $rndkey[0],$iv,$iv
 758         vmovups         16-112($key),$rndkey[1]         # forward reference
 759 ___
 760     } else {
 761       $code.=<<___;
 762         vaesenc         $rndkey[0],$iv,$iv
 763         vmovups         `32+16*$k-112`($key),$rndkey[1]
 764 ___
 765     }
 766     $r++;       unshift(@rndkey,pop(@rndkey));
 767 };
 768 
# Emit one vector X[] update (AVX flavour used while $Xi is in the
# 16..31 range), interleaved with scalar round code.
#   $body - code ref, called four times; every call returns a list of
#           instruction strings which are eval'ed here one at a time
#           between the vector instructions
# The three-operand AVX forms replace the register copies needed by the
# SSSE3 version.  Side effects: appends to $code via the AUTOLOAD thunk,
# increments $Xi and rotates @X and @Tx by one.
sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  my ($a,$b,$c,$d,$e);          # assigned from @V by the eval'ed strings

         eval(shift(@insns));
         eval(shift(@insns));
        &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);       # compose "X[-14]" in "X[0]"
         eval(shift(@insns));
         eval(shift(@insns));

          &vpaddd   (@Tx[1],@Tx[1],@X[-1&7]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vpsrldq(@Tx[0],@X[-1&7],4);    # "X[-3]", 3 dwords
         eval(shift(@insns));
         eval(shift(@insns));
        &vpxor      (@X[0],@X[0],@X[-4&7]);             # "X[0]"^="X[-16]"
         eval(shift(@insns));
         eval(shift(@insns));

        &vpxor      (@Tx[0],@Tx[0],@X[-2&7]);   # "X[-3]"^"X[-8]"
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &vpxor      (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
         eval(shift(@insns));
         eval(shift(@insns));
          &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer to IALU
         eval(shift(@insns));
         eval(shift(@insns));

        &vpsrld     (@Tx[0],@X[0],31);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &vpslldq(@Tx[2],@X[0],12);          # "X[0]"<<96, extract one dword
        &vpaddd     (@X[0],@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &vpsrld     (@Tx[1],@Tx[2],30);
        &vpor       (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &vpslld     (@Tx[2],@Tx[2],2);
        &vpxor      (@X[0],@X[0],@Tx[1]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));

        &vpxor      (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
         eval(shift(@insns));
         eval(shift(@insns));
          &vmovdqa  (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
         eval(shift(@insns));
         eval(shift(@insns));


         foreach (@insns) { eval; }     # remaining instructions [if any]

  $Xi++;        push(@X,shift(@X));     # "rotate" X[]
                push(@Tx,shift(@Tx));
}
 844 
 845 sub Xupdate_avx_32_79()
 846 { use integer;
       # Emit one AVX message-schedule update step, interleaved with the
       # code produced by four invocations of a round-body generator.
       #
       # Argument: a code reference.  It is called four times and every
       # value it returns is queued in @insns; the queued strings are
       # eval'ed one by one between the vector instructions below, which is
       # how the scalar and vector instruction streams get woven together.
       #
       # Vector work done on @X[0] (per the inline comments): XOR with
       # @X[-4&7], @X[-7&7] and the vpalignr of @X[-1&7]/@X[-2&7], then a
       # rotate left by 2 built from vpsrld 30 / vpslld 2 / vpor.
       # @Tx[1]+@X[-1&7] is stored to the stack slot 16*(($Xi-1)&3)(%rsp).
       #
       # Side effects on file-level state: $Xi is incremented and both @X
       # and @Tx are rotated by one element before returning.
 847   my $body = shift;
 848   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
       # Not referenced directly in this sub; presumably these are the
       # lexicals the eval'ed strings assign to and read -- TODO confirm
       # against the body_* generators.
 849   my ($a,$b,$c,$d,$e);
 850 
 851         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);      # compose "X[-6]"
 852         &vpxor      (@X[0],@X[0],@X[-4&7]);             # "X[0]"="X[-32]"^"X[-16]"
 853          eval(shift(@insns));           # body_20_39
 854          eval(shift(@insns));
 855          eval(shift(@insns));
 856          eval(shift(@insns));           # rol
 857 
 858         &vpxor      (@X[0],@X[0],@X[-7&7]);             # "X[0]"^="X[-28]"
 859          eval(shift(@insns));
       # Consume one more queued string here only when the next one does not
       # contain "&ror" or "&rol"; this is why the total count varies.
 860          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
       # $Xi/5 selects the 16-byte row of the constant table; a new row is
       # loaded only when $Xi is a multiple of 5, otherwise @Tx[1] is copied.
 861         if ($Xi%5) {
 862           &vmovdqa  (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
 863         } else {                        # ... or load next one
 864           &vmovdqa  (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
 865         }
 866           &vpaddd   (@Tx[1],@Tx[1],@X[-1&7]);
 867          eval(shift(@insns));           # ror
 868          eval(shift(@insns));
 869 
 870         &vpxor      (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
 871          eval(shift(@insns));           # body_20_39
 872          eval(shift(@insns));
 873          eval(shift(@insns));
 874          eval(shift(@insns));           # rol
 875 
 876         &vpsrld     (@Tx[0],@X[0],30);
 877           &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer to IALU
 878          eval(shift(@insns));
 879          eval(shift(@insns));
 880          eval(shift(@insns));           # ror
 881          eval(shift(@insns));
 882 
 883         &vpslld     (@X[0],@X[0],2);
 884          eval(shift(@insns));           # body_20_39
 885          eval(shift(@insns));
 886          eval(shift(@insns));
 887          eval(shift(@insns));           # rol
 888          eval(shift(@insns));
 889          eval(shift(@insns));
 890          eval(shift(@insns));           # ror
 891          eval(shift(@insns));
 892 
 893         &vpor       (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
 894          eval(shift(@insns));           # body_20_39
 895          eval(shift(@insns));
       # The copy of the fresh @X[0] into @Tx[1] is emitted only while
       # $Xi is below 19.
 896           &vmovdqa  (@Tx[1],@X[0])  if ($Xi<19);
 897          eval(shift(@insns));
 898          eval(shift(@insns));           # rol
 899          eval(shift(@insns));
 900          eval(shift(@insns));
 901          eval(shift(@insns));           # rol
 902          eval(shift(@insns));
 903 
 904          foreach (@insns) { eval; }     # remaining instructions
 905 
 906   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 907                 push(@Tx,shift(@Tx));
 908 }
 909 
 910 sub Xuplast_avx_80()
 911 { use integer;
       # Emit the last schedule step of a 64-byte block and the code that
       # starts the next block.
       #
       # Argument: a code reference, called four times; every string it
       # returns is queued in @insns and eval'ed between/after the vector
       # instructions below.
       #
       # Emitted in order: the final @Tx[1]+@X[-1&7] sum and its store to
       # the stack slot 16*(($Xi-1)&3)(%rsp), the remaining queued strings,
       # a compare of $inp with $len that branches to .Ldone_avx when they
       # are equal, and otherwise the loads for the next block (byte-swap
       # mask into @X[2], first constant row into @Tx[1], 64 input bytes
       # into @X[-4&7]..@X[-1&7], byte swap of the first vector, $inp+=64).
       #
       # Side effects on file-level state: @Tx is rotated back by one
       # element and $Xi is reset to 0.
 912   my $body = shift;
 913   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably these are the
       # lexicals the eval'ed strings assign to and read -- TODO confirm
       # against the body_* generators.
 914   my ($a,$b,$c,$d,$e);
 915 
 916          eval(shift(@insns));
 917           &vpaddd   (@Tx[1],@Tx[1],@X[-1&7]);
 918          eval(shift(@insns));
 919          eval(shift(@insns));
 920          eval(shift(@insns));
 921          eval(shift(@insns));
 922 
       # VEX-encoded store, matching every other X[]+K transfer in the AVX
       # path; a legacy-SSE movdqa here would mix encodings inside AVX code.
 923           &vmovdqa  (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);     # X[]+K xfer IALU
 924 
 925          foreach (@insns) { eval; }             # remaining instructions
 926 
 927         &cmp        ($inp,$len);
 928         &je (".Ldone_avx");
 929 
 930         unshift(@Tx,pop(@Tx));
 931 
 932         &vmovdqa(@X[2],"64($K_XX_XX)");             # pbswap mask
 933         &vmovdqa(@Tx[1],"0($K_XX_XX)");             # K_00_19
 934         &vmovdqu(@X[-4&7],"0($inp)");           # load input
 935         &vmovdqu(@X[-3&7],"16($inp)");
 936         &vmovdqu(@X[-2&7],"32($inp)");
 937         &vmovdqu(@X[-1&7],"48($inp)");
 938         &vpshufb(@X[-4&7],@X[-4&7],@X[2]);  # byte swap
 939         &add        ($inp,64);
 940 
 941   $Xi=0;
 942 }
 943 
 944 sub Xloop_avx()
 945 { use integer;
       # Emit the preparation of one input vector for the next block,
       # interleaved with the code produced by four invocations of a
       # round-body generator.
       #
       # Argument: a code reference, called four times; every string it
       # returns is queued in @insns and eval'ed around the three vector
       # instructions below.
       #
       # Vector work: byte-swap @X[($Xi-3)&7] with the mask held in @X[2],
       # add @Tx[1] to @X[($Xi-4)&7] into @X[$Xi&7], and store that sum to
       # the stack slot 16*$Xi(%rsp).
       #
       # Side effect on file-level state: $Xi is incremented.
 946   my $body = shift;
 947   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably these are the
       # lexicals the eval'ed strings assign to and read -- TODO confirm
       # against the body_* generators.
 948   my ($a,$b,$c,$d,$e);
 949 
 950          eval(shift(@insns));
 951          eval(shift(@insns));
 952         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
 953          eval(shift(@insns));
 954          eval(shift(@insns));
 955         &vpaddd     (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
 956          eval(shift(@insns));
 957          eval(shift(@insns));
 958          eval(shift(@insns));
 959          eval(shift(@insns));
 960         &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
 961          eval(shift(@insns));
 962          eval(shift(@insns));
 963 
       # Evaluate whatever is still queued.
 964         foreach (@insns) { eval; }
 965   $Xi++;
 966 }
 967 
 968 sub Xtail_avx()
 969 { use integer;
       # Emit the code produced by four invocations of a round-body
       # generator with no vector instructions woven in.
       #
       # Argument: a code reference, called four times; every string it
       # returns is eval'ed in order.  No file-level state is touched by
       # this sub itself (the eval'ed strings may do so).
 970   my $body = shift;
 971   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Not referenced directly in this sub; presumably these are the
       # lexicals the eval'ed strings assign to and read -- TODO confirm
       # against the body_* generators.
 972   my ($a,$b,$c,$d,$e);
 973 
 974         foreach (@insns) { eval; }
 975 }
 976 
 977 $code.=<<___;
 978 .align  16
 979 .Loop_avx:
 980 ___
     # Generate the body of .Loop_avx.  Each call below consumes four
     # invocations of the round-body generator it is given (see the
     # "(&$body,&$body,&$body,&$body)" lists in the subs), so the 17
     # schedule calls plus the 3 Xloop_avx calls cover 20 groups of four.
     # The order of the calls is the order of the generated code.
 981         &Xupdate_avx_16_31(\&body_00_19);
 982         &Xupdate_avx_16_31(\&body_00_19);
 983         &Xupdate_avx_16_31(\&body_00_19);
 984         &Xupdate_avx_16_31(\&body_00_19);
 985         &Xupdate_avx_32_79(\&body_00_19);
 986         &Xupdate_avx_32_79(\&body_20_39);
 987         &Xupdate_avx_32_79(\&body_20_39);
 988         &Xupdate_avx_32_79(\&body_20_39);
 989         &Xupdate_avx_32_79(\&body_20_39);
 990         &Xupdate_avx_32_79(\&body_20_39);
 991         &Xupdate_avx_32_79(\&body_40_59);
 992         &Xupdate_avx_32_79(\&body_40_59);
 993         &Xupdate_avx_32_79(\&body_40_59);
 994         &Xupdate_avx_32_79(\&body_40_59);
 995         &Xupdate_avx_32_79(\&body_40_59);
 996         &Xupdate_avx_32_79(\&body_20_39);
 997         &Xuplast_avx_80(\&body_20_39);  # can jump to "done"
 998 
     # Snapshot the generator state at the point where the emitted code can
     # branch to .Ldone_avx.  The same variables are restored before the
     # .Ldone_avx tail is generated, so both continuations start from the
     # same state.
 999                                 $saved_j=$j; @saved_V=@V;
1000                                 $saved_r=$r; @saved_rndkey=@rndkey;
1001 
     # Fall-through continuation: the last three groups, interleaved with
     # the preparation of the next block's input vectors.
1002         &Xloop_avx(\&body_20_39);
1003         &Xloop_avx(\&body_20_39);
1004         &Xloop_avx(\&body_20_39);
1005 
# End of one .Loop_avx iteration: store the vector in $iv to
# 48($out,$in0), advance $in0 by 64, add the working registers into the
# five 32-bit context words at 0..16($ctx), copy @T[0] into $B, and jump
# back to .Loop_avx.  The .Ldone_avx label that follows is the target of
# the branch emitted by Xuplast_avx_80.
1006 $code.=<<___;
1007         vmovups $iv,48($out,$in0)               # write output
1008         lea     64($in0),$in0
1009 
1010         add     0($ctx),$A                      # update context
1011         add     4($ctx),@T[0]
1012         add     8($ctx),$C
1013         add     12($ctx),$D
1014         mov     $A,0($ctx)
1015         add     16($ctx),$E
1016         mov     @T[0],4($ctx)
1017         mov     @T[0],$B                        # magic seed
1018         mov     $C,8($ctx)
1019         mov     $D,12($ctx)
1020         mov     $E,16($ctx)
1021         jmp     .Loop_avx
1022 
1023 .align  16
1024 .Ldone_avx:
1025 ___
     # Rewind the generator state to the snapshot taken right after
     # Xuplast_avx_80, so the tail below is generated from the same state as
     # the Xloop_avx continuation.  $jj is set together with $j.
1026                                 $jj=$j=$saved_j; @V=@saved_V;
1027                                 $r=$saved_r;     @rndkey=@saved_rndkey;
1028 
     # Last three groups of four, without any next-block preparation.
1029         &Xtail_avx(\&body_20_39);
1030         &Xtail_avx(\&body_20_39);
1031         &Xtail_avx(\&body_20_39);
1032 
# Function tail reached through .Ldone_avx: store the vector in $iv to
# 48($out,$in0), reload $ivp from 88(%rsp), add the working registers into
# the five context words, write $iv through $ivp, and clear the vector
# registers with vzeroall.
1033 $code.=<<___;
1034         vmovups $iv,48($out,$in0)               # write output
1035         mov     88(%rsp),$ivp                   # restore $ivp
1036 
1037         add     0($ctx),$A                      # update context
1038         add     4($ctx),@T[0]
1039         add     8($ctx),$C
1040         mov     $A,0($ctx)
1041         add     12($ctx),$D
1042         mov     @T[0],4($ctx)
1043         add     16($ctx),$E
1044         mov     $C,8($ctx)
1045         mov     $D,12($ctx)
1046         mov     $E,16($ctx)
1047         vmovups $iv,($ivp)                      # write IV
1048         vzeroall
1049 ___
# Win64 only: reload %xmm6-%xmm15 from the ten 16-byte slots that start at
# 96(%rsp).
1050 $code.=<<___ if ($win64);
1051         movaps  96+0(%rsp),%xmm6
1052         movaps  96+16(%rsp),%xmm7
1053         movaps  96+32(%rsp),%xmm8
1054         movaps  96+48(%rsp),%xmm9
1055         movaps  96+64(%rsp),%xmm10
1056         movaps  96+80(%rsp),%xmm11
1057         movaps  96+96(%rsp),%xmm12
1058         movaps  96+112(%rsp),%xmm13
1059         movaps  96+128(%rsp),%xmm14
1060         movaps  96+144(%rsp),%xmm15
1061 ___
# Reload %r15, %r14, %r13, %r12, %rbp and %rbx from the six quadwords that
# start at 104(%rsp) (shifted by 10*16 on Win64 to skip the xmm save area),
# then point %rsp just past them and return.  The backtick expression is
# evaluated by the post-processing substitution at the end of the script.
1062 $code.=<<___;
1063         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1064         mov     0(%rsi),%r15
1065         mov     8(%rsi),%r14
1066         mov     16(%rsi),%r13
1067         mov     24(%rsi),%r12
1068         mov     32(%rsi),%rbp
1069         mov     40(%rsi),%rbx
1070         lea     48(%rsi),%rsp
1071 .Lepilogue_avx:
1072         ret
1073 .size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1074 ___
1075 }
# Constant table shared by the generated code.  The first four 16-byte
# rows each hold one round constant repeated in all four dwords and are
# addressed as 16*n($K_XX_XX); the fifth row, at offset 64, is the
# byte-swap shuffle mask.  An identification string follows the table.
1076 $code.=<<___;
1077 .align  64
1078 K_XX_XX:
1079 .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1080 .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1081 .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1082 .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1083 .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
1084 
1085 .asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1086 .align  64
1087 ___
1088 
1089 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1090 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1091 if ($win64) {
     # Registers carrying the four handler arguments.  Only $context and
     # $disp are referenced by the handler text below.
1092 $rec="%rcx";
1093 $frame="%rdx";
1094 $context="%r8";
1095 $disp="%r9";
1096 
     # Handler body.  HandlerData[0] and HandlerData[1] are image-relative
     # addresses of a prologue and an epilogue label.  When context->Rip
     # lies before the former or at/after the latter, the handler goes
     # straight to .Lcommon_seh_tail.  Otherwise it first copies 20
     # quadwords from 96(Rsp) into the context at offset 512 and reloads
     # %rbx, %rbp and %r12-%r15 from the frame at 104+10*16(Rsp), the same
     # offsets the function epilogue uses.  The tail then fixes up
     # Rsp/Rsi/Rdi in the context, copies the context into
     # disp->ContextRecord, calls RtlVirtualUnwind, and returns 1
     # (ExceptionContinueSearch).
1097 $code.=<<___;
1098 .extern __imp_RtlVirtualUnwind
1099 .type   ssse3_handler,\@abi-omnipotent
1100 .align  16
1101 ssse3_handler:
1102         push    %rsi
1103         push    %rdi
1104         push    %rbx
1105         push    %rbp
1106         push    %r12
1107         push    %r13
1108         push    %r14
1109         push    %r15
1110         pushfq
1111         sub     \$64,%rsp
1112 
1113         mov     120($context),%rax      # pull context->Rax
1114         mov     248($context),%rbx      # pull context->Rip
1115 
1116         mov     8($disp),%rsi           # disp->ImageBase
1117         mov     56($disp),%r11          # disp->HandlerData
1118 
1119         mov     0(%r11),%r10d           # HandlerData[0]
1120         lea     (%rsi,%r10),%r10        # prologue label
1121         cmp     %r10,%rbx               # context->Rip<prologue label
1122         jb      .Lcommon_seh_tail
1123 
1124         mov     152($context),%rax      # pull context->Rsp
1125 
1126         mov     4(%r11),%r10d           # HandlerData[1]
1127         lea     (%rsi,%r10),%r10        # epilogue label
1128         cmp     %r10,%rbx               # context->Rip>=epilogue label
1129         jae     .Lcommon_seh_tail
1130 
1131         lea     96(%rax),%rsi
1132         lea     512($context),%rdi      # &context.Xmm6
1133         mov     \$20,%ecx
1134         .long   0xa548f3fc              # cld; rep movsq
1135         lea     `104+10*16`(%rax),%rax  # adjust stack pointer
1136 
1137         mov     0(%rax),%r15
1138         mov     8(%rax),%r14
1139         mov     16(%rax),%r13
1140         mov     24(%rax),%r12
1141         mov     32(%rax),%rbp
1142         mov     40(%rax),%rbx
1143         lea     48(%rax),%rax
1144         mov     %rbx,144($context)      # restore context->Rbx
1145         mov     %rbp,160($context)      # restore context->Rbp
1146         mov     %r12,216($context)      # restore context->R12
1147         mov     %r13,224($context)      # restore context->R13
1148         mov     %r14,232($context)      # restore context->R14
1149         mov     %r15,240($context)      # restore context->R15
1150 
1151 .Lcommon_seh_tail:
1152         mov     8(%rax),%rdi
1153         mov     16(%rax),%rsi
1154         mov     %rax,152($context)      # restore context->Rsp
1155         mov     %rsi,168($context)      # restore context->Rsi
1156         mov     %rdi,176($context)      # restore context->Rdi
1157 
1158         mov     40($disp),%rdi          # disp->ContextRecord
1159         mov     $context,%rsi           # context
1160         mov     \$154,%ecx              # sizeof(CONTEXT)
1161         .long   0xa548f3fc              # cld; rep movsq
1162 
1163         mov     $disp,%rsi
1164         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1165         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1166         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1167         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1168         mov     40(%rsi),%r10           # disp->ContextRecord
1169         lea     56(%rsi),%r11           # &disp->HandlerData
1170         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1171         mov     %r10,32(%rsp)           # arg5
1172         mov     %r11,40(%rsp)           # arg6
1173         mov     %r12,48(%rsp)           # arg7
1174         mov     %rcx,56(%rsp)           # arg8, (NULL)
1175         call    *__imp_RtlVirtualUnwind(%rip)
1176 
1177         mov     \$1,%eax                # ExceptionContinueSearch
1178         add     \$64,%rsp
1179         popfq
1180         pop     %r15
1181         pop     %r14
1182         pop     %r13
1183         pop     %r12
1184         pop     %rbp
1185         pop     %rbx
1186         pop     %rdi
1187         pop     %rsi
1188         ret
1189 .size   ssse3_handler,.-ssse3_handler
1190 
1191 .section        .pdata
1192 .align  4
1193         .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1194         .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
1195         .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
1196 ___
     # The AVX function gets its own .pdata entry only when the AVX code
     # path is generated.
1197 $code.=<<___ if ($avx);
1198         .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
1199         .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
1200         .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
1201 ___
     # Unwind info records.  Each names ssse3_handler and supplies the
     # prologue/epilogue label pair the handler reads as HandlerData[].
1202 $code.=<<___;
1203 .section        .xdata
1204 .align  8
1205 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
1206         .byte   9,0,0,0
1207         .rva    ssse3_handler
1208         .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
1209 ___
     # The AVX function reuses ssse3_handler with its own label pair.
1210 $code.=<<___ if ($avx);
1211 .LSEH_info_aesni_cbc_sha1_enc_avx:
1212         .byte   9,0,0,0
1213         .rva    ssse3_handler
1214         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1215 ___
1216 }
1217 
1218 ####################################################################
# Append a REX prefix byte to an opcode sequence when one is required.
#
#   $opcode - reference to the array of opcode bytes built so far; it is
#             modified in place
#   $dst    - number of the register that goes into the ModR/M reg field;
#             sets bit 0x04 of the prefix when it is 8 or above
#   $src    - number of the register that goes into the ModR/M r/m field;
#             sets bit 0x01 of the prefix when it is 8 or above
#
# Nothing is appended when both register numbers are below 8.  The array is
# reached through the reference itself rather than by aliasing a typeglob,
# so no package variable is touched.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    push @$opcode,$rex|0x40     if($rex);      # 0x40 marks the byte as REX
    return;
}
1228 
# Hand-encode one register-to-register AES-NI instruction as a ".byte"
# directive, so the output assembles even with tools that do not know the
# mnemonics.
#
#   $line - instruction text, e.g. "aesenc %xmm1,%xmm2"
#
# Returns ".byte\t" followed by the comma-separated opcode bytes (0x66,
# optional REX, 0x0f, 0x38, opcode, ModR/M) for aesenc and aesenclast with
# two %xmm operands.  Anything else is returned unchanged: both text that
# does not match the two-register form and a matching form whose mnemonic
# is not in the table.  Returning the text instead of undef matters because
# the caller substitutes the return value into the output; undef would
# silently delete the instruction.
sub aesni {
  my $line=shift;

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # name the captures before anything else can disturb them
        my ($mnemonic,$src,$dst)=($1,$2,$3);
        my %opcodelet = (
                "aesenc" => 0xdc,    "aesenclast" => 0xdd
        );
        # unknown mnemonic: leave it for the assembler to accept or reject
        return $line if (!defined($opcodelet{$mnemonic}));

        my @opcode=(0x66);
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);     # ModR/M
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1245 
# Post-process the accumulated assembly text and write it out.
# First expand every `...` span by evaluating its contents as Perl.
1246 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Then pass each text span running from "aes" to the last %xmmN on a line
# through aesni(); the rest of that line is dropped by the substitution.
1247 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1248 
1249 print $code;
# Output is buffered, so a write failure may only surface when the handle is
# closed; fail loudly instead of leaving a truncated file with exit status 0.
1250 close STDOUT or die "error closing STDOUT: $!";