1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # This module implements support for Intel AES-NI extension. In
  11 # OpenSSL context it's used with Intel engine, but can also be used as
  12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
  13 # details].
  14 #
  15 # Performance.
  16 #
  17 # To start with see corresponding paragraph in aesni-x86_64.pl...
  18 # Instead of filling table similar to one found there I've chosen to
  19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
  20 # The simplified table below represents 32-bit performance relative
  21 # to 64-bit one in every given point. Ratios vary for different
  22 # encryption modes, therefore interval values.
  23 #
  24 #       16-byte     64-byte     256-byte    1-KB        8-KB
  25 #       53-67%      67-84%      91-94%      95-98%      97-99.5%
  26 #
  27 # Lower ratios for smaller block sizes are perfectly understandable,
  28 # because function call overhead is higher in 32-bit mode. Largest
  29 # 8-KB block performance is virtually same: 32-bit code is less than
  30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
  31 
  32 # January 2011
  33 #
  34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
  35 # interleaves at most 6 aes[enc|dec] instructions, because there are
  36 # not enough registers for 8x interleave [which should be optimal for
  37 # Sandy Bridge]. Actually, performance results for 6x interleave
  38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
  39 # module.
  40 
  41 # April 2011
  42 #
  43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
  44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
  45 
  46 $PREFIX="aesni";        # if $PREFIX is set to "AES", the script
  47                         # generates drop-in replacement for
  48                         # crypto/aes/asm/aes-586.pl:-)
  49 $inline=1;              # inline _aesni_[en|de]crypt
  50 
     # Take the directory part of $0 (everything up to the last / or \) and
     # add it, together with its ../../perlasm sibling, to @INC so that the
     # require of x86asm.pl below can be resolved.
  51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  52 push(@INC,"${dir}","${dir}../../perlasm");
  53 require "x86asm.pl";
  54 
     # asm_init is not defined in this file (presumably it comes from
     # x86asm.pl); it receives the first command-line argument and $0.
  55 &asm_init($ARGV[0],$0);
  56 
     # $movekey is the instruction used for the round-key loads below.
     # Both branches currently select movups, so the result does not
     # depend on $PREFIX.
  57 if ($PREFIX eq "aesni") { $movekey=*movups; }
  58 else                    { $movekey=*movups; }
  59 
     # General-purpose register assignment.
  60 $len="eax";
  61 $rounds="ecx";
  62 $key="edx";
  63 $inp="esi";
  64 $out="edi";
  65 $rounds_="ebx"; # backup copy for $rounds
  66 $key_="ebp";    # backup copy for $key
  67 
     # XMM register assignment: xmm0/xmm1 carry round keys, xmm2..xmm7
     # carry data blocks.  $in1, $in0 and $ivec are aliases for xmm5, xmm6
     # and xmm7, i.e. the same registers as $inout3, $inout4 and $inout5.
  68 $rndkey0="xmm0";
  69 $rndkey1="xmm1";
  70 $inout0="xmm2";
  71 $inout1="xmm3";
  72 $inout2="xmm4";
  73 $inout3="xmm5"; $in1="xmm5";
  74 $inout4="xmm6"; $in0="xmm6";
  75 $inout5="xmm7"; $ivec="xmm7";
  76 
  77 # AESNI extension
     #
     # The helpers below hand-encode AES-NI instructions as raw bytes via
     # &data_byte.  Only the register-to-register form with xmm0..xmm7
     # operands is supported: the ModR/M byte is built as
     # 0xc0|(dst<<3)|src.  Any other operand combination is a programming
     # error and dies, so that an instruction can never be dropped from
     # the generated code without notice.
     #
     # aeskeygenassist($dst,$src,$imm): bytes 66 0f 3a df, ModR/M, $imm.
  78 sub aeskeygenassist
  79 { my($dst,$src,$imm)=@_;
  80     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
  81     {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
         else
         {   die "aeskeygenassist: unsupported operands '$dst','$src'"; }
  82 }
     # aescommon($opcodelet,$dst,$src): bytes 66 0f 38, $opcodelet, ModR/M.
     # Shared encoder behind the aesimc/aesenc/aesenclast/aesdec/aesdeclast
     # wrappers, which differ only in $opcodelet.
  83 sub aescommon
  84 { my($opcodelet,$dst,$src)=@_;
  85     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
  86     {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
         else
         {   die sprintf("aescommon(0x%02x): unsupported operands '%s','%s'",
                         $opcodelet,$dst,$src); }
  87 }
  88 sub aesimc      { aescommon(0xdb,@_); }
  89 sub aesenc      { aescommon(0xdc,@_); }
  90 sub aesenclast  { aescommon(0xdd,@_); }
  91 sub aesdec      { aescommon(0xde,@_); }
  92 sub aesdeclast  { aescommon(0xdf,@_); }
  93 
  94 # Inline version of internal aesni_[en|de]crypt1
     #
     # aesni_inline_generate1($p,$inout,$ivec) emits, at the point of the
     # call, a loop that runs one block through the aes${p} rounds.
     #   $p     - "enc" or "dec"; selects aes${p}/aes${p}last and is part
     #            of the loop label name
     #   $inout - register holding the block; defaults to $inout0
     #   $ivec  - optional register; when given, round key 0 is xored into
     #            it and the result is xored into $inout, instead of
     #            xoring round key 0 into $inout directly (so the $ivec
     #            register is modified as well)
     # The emitted code reads the key schedule through $key and counts
     # $rounds down to zero; both registers are modified.  $sn numbers the
     # loop labels so that every expansion gets a distinct one.
  95 { my $sn;
  96 sub aesni_inline_generate1
  97 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
       # Resolve the round instructions once by name instead of going
       # through a string eval, so that a failure inside the emitter is
       # not silently discarded.
       my $aes=\&{"aes${p}"}; my $aeslast=\&{"aes${p}last"};
  98   $sn++;
  99 
 100     &$movekey               ($rndkey0,&QWP(0,$key));
 101     &$movekey               ($rndkey1,&QWP(16,$key));
 102     &xorps          ($ivec,$rndkey0)        if (defined($ivec));
 103     &lea            ($key,&DWP(32,$key));
 104     &xorps          ($inout,$ivec)          if (defined($ivec));
 105     &xorps          ($inout,$rndkey0)       if (!defined($ivec));
 106     &set_label("${p}1_loop_$sn");
 107         &$aes               ($inout,$rndkey1);
 108         &dec                ($rounds);
 109         &$movekey   ($rndkey1,&QWP(0,$key));
 110         &lea                ($key,&DWP(16,$key));
 111     &jnz            (&label("${p}1_loop_$sn"));
 112     &$aeslast               ($inout,$rndkey1);
 113 }}
 114 
 115 sub aesni_generate1     # fully unrolled loop
     # aesni_generate1($p,$inout) emits the private routine
     # _aesni_${p}rypt1, which runs the block in $inout (default $inout0)
     # through the aes${p} rounds without a loop.  $p is "enc" or "dec".
     # The emitted code compares $rounds with 11: below 11 it jumps to the
     # ${p}128 label, equal to 11 it jumps to ${p}192, otherwise it falls
     # through the longest path.  $key is advanced before each branch,
     # which is why the extra rounds use negative displacements.
 116 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
       # Resolve the round instructions once by name instead of going
       # through a string eval, so that a failure inside the emitter is
       # not silently discarded.
       my $aes=\&{"aes${p}"}; my $aeslast=\&{"aes${p}last"};
 117 
 118     &function_begin_B("_aesni_${p}rypt1");
 119         &movups             ($rndkey0,&QWP(0,$key));
 120         &$movekey   ($rndkey1,&QWP(0x10,$key));
 121         &xorps              ($inout,$rndkey0);
 122         &$movekey   ($rndkey0,&QWP(0x20,$key));
 123         &lea                ($key,&DWP(0x30,$key));
 124         &cmp                ($rounds,11);
 125         &jb         (&label("${p}128"));
 126         &lea                ($key,&DWP(0x20,$key));
 127         &je         (&label("${p}192"));
 128         &lea                ($key,&DWP(0x20,$key));
 129         &$aes               ($inout,$rndkey1);
 130         &$movekey   ($rndkey1,&QWP(-0x40,$key));
 131         &$aes               ($inout,$rndkey0);
 132         &$movekey   ($rndkey0,&QWP(-0x30,$key));
 133     &set_label("${p}192");
 134         &$aes               ($inout,$rndkey1);
 135         &$movekey   ($rndkey1,&QWP(-0x20,$key));
 136         &$aes               ($inout,$rndkey0);
 137         &$movekey   ($rndkey0,&QWP(-0x10,$key));
 138     &set_label("${p}128");
 139         &$aes               ($inout,$rndkey1);
 140         &$movekey   ($rndkey1,&QWP(0,$key));
 141         &$aes               ($inout,$rndkey0);
 142         &$movekey   ($rndkey0,&QWP(0x10,$key));
 143         &$aes               ($inout,$rndkey1);
 144         &$movekey   ($rndkey1,&QWP(0x20,$key));
 145         &$aes               ($inout,$rndkey0);
 146         &$movekey   ($rndkey0,&QWP(0x30,$key));
 147         &$aes               ($inout,$rndkey1);
 148         &$movekey   ($rndkey1,&QWP(0x40,$key));
 149         &$aes               ($inout,$rndkey0);
 150         &$movekey   ($rndkey0,&QWP(0x50,$key));
 151         &$aes               ($inout,$rndkey1);
 152         &$movekey   ($rndkey1,&QWP(0x60,$key));
 153         &$aes               ($inout,$rndkey0);
 154         &$movekey   ($rndkey0,&QWP(0x70,$key));
 155         &$aes               ($inout,$rndkey1);
 156     &$aeslast               ($inout,$rndkey0);
 157     &ret();
 158     &function_end_B("_aesni_${p}rypt1");
 159 }
 160 
 161 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
     #
     # Single-block entry point: loads 16 bytes from inp, runs them through
     # the one-block encrypt sequence (expanded in place when $inline is
     # set, otherwise a call to _aesni_encrypt1, which is generated just
     # before the function) and stores the 16-byte result to out.
 162 &aesni_generate1("enc") if (!$inline);
 163 &function_begin_B("${PREFIX}_encrypt");
 164         &mov        ("eax",&wparam(0));             # eax = inp
 165         &mov        ($key,&wparam(2));
 166         &movups     ($inout0,&QWP(0,"eax"));        # load the input block
 167         &mov        ($rounds,&DWP(240,$key));       # dword at offset 240 of *key
 168         &mov        ("eax",&wparam(1));             # eax is reused for out
 169         if ($inline)
 170         {   &aesni_inline_generate1("enc"); }
 171         else
 172         {   &call   ("_aesni_encrypt1");    }
 173         &movups     (&QWP(0,"eax"),$inout0);        # store the result
 174         &ret        ();
 175 &function_end_B("${PREFIX}_encrypt");
 176 
 177 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
     #
     # Single-block entry point: loads 16 bytes from inp, runs them through
     # the one-block decrypt sequence (expanded in place when $inline is
     # set, otherwise a call to _aesni_decrypt1, which is generated just
     # before the function) and stores the 16-byte result to out.
 178 &aesni_generate1("dec") if(!$inline);
 179 &function_begin_B("${PREFIX}_decrypt");
 180         &mov        ("eax",&wparam(0));             # eax = inp
 181         &mov        ($key,&wparam(2));
 182         &movups     ($inout0,&QWP(0,"eax"));        # load the input block
 183         &mov        ($rounds,&DWP(240,$key));       # dword at offset 240 of *key
 184         &mov        ("eax",&wparam(1));             # eax is reused for out
 185         if ($inline)
 186         {   &aesni_inline_generate1("dec"); }
 187         else
 188         {   &call   ("_aesni_decrypt1");    }
 189         &movups     (&QWP(0,"eax"),$inout0);        # store the result
 190         &ret        ();
 191 &function_end_B("${PREFIX}_decrypt");
 192 
 193 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
 194 # factor. Why were 3x subroutines originally used in loops? Even though
 195 # aes[enc|dec] latency was originally 6, it could be scheduled only
 196 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
 197 # utilization, i.e. when subroutine's throughput is virtually same as
 198 # of non-interleaved subroutine [for number of input blocks up to 3].
 199 # This is why it makes no sense to implement 2x subroutine.
 200 # aes[enc|dec] latency in next processor generation is 8, but the
 201 # instructions can be scheduled every cycle. Optimal interleave for
 202 # new processor is therefore 8x, but it's unfeasible to accommodate it
 203 # in XMM registers addressable in 32-bit mode and therefore 6x is
 204 # used instead...
 205 
 206 sub aesni_generate3
     # aesni_generate3($p) emits the private routine _aesni_${p}rypt3,
     # which runs $inout0..$inout2 through the aes${p} rounds side by
     # side.  $p is "enc" or "dec".  The emitted code halves $rounds
     # because every loop iteration applies two round keys ($rndkey1,
     # then $rndkey0); it advances $key and leaves the results in
     # $inout0..$inout2.
 207 { my $p=shift;
       # Resolve the round instructions once by name instead of going
       # through a string eval, so that a failure inside the emitter is
       # not silently discarded.
       my $aes=\&{"aes${p}"}; my $aeslast=\&{"aes${p}last"};
 208 
 209     &function_begin_B("_aesni_${p}rypt3");
 210         &$movekey   ($rndkey0,&QWP(0,$key));
 211         &shr                ($rounds,1);
 212         &$movekey   ($rndkey1,&QWP(16,$key));
 213         &lea                ($key,&DWP(32,$key));
 214         &xorps              ($inout0,$rndkey0);
 215         &pxor               ($inout1,$rndkey0);
 216         &pxor               ($inout2,$rndkey0);
 217         &$movekey   ($rndkey0,&QWP(0,$key));
 218 
 219     &set_label("${p}3_loop");
 220         &$aes               ($inout0,$rndkey1);
 221         &$aes               ($inout1,$rndkey1);
 222         &dec                ($rounds);
 223         &$aes               ($inout2,$rndkey1);
 224         &$movekey   ($rndkey1,&QWP(16,$key));
 225         &$aes               ($inout0,$rndkey0);
 226         &$aes               ($inout1,$rndkey0);
 227         &lea                ($key,&DWP(32,$key));
 228         &$aes               ($inout2,$rndkey0);
 229         &$movekey   ($rndkey0,&QWP(0,$key));
 230         &jnz                (&label("${p}3_loop"));
 231     &$aes           ($inout0,$rndkey1);
 232     &$aes           ($inout1,$rndkey1);
 233     &$aes           ($inout2,$rndkey1);
 234     &$aeslast               ($inout0,$rndkey0);
 235     &$aeslast               ($inout1,$rndkey0);
 236     &$aeslast               ($inout2,$rndkey0);
 237     &ret();
 238     &function_end_B("_aesni_${p}rypt3");
 239 }
 240 
 241 # 4x interleave is implemented to improve small block performance,
 242 # most notably [and naturally] 4 block by ~30%. One can argue that one
 243 # should have implemented 5x as well, but improvement  would be <20%,
 244 # so it's not worth it...
 245 sub aesni_generate4
     # aesni_generate4($p) emits the private routine _aesni_${p}rypt4,
     # which runs $inout0..$inout3 through the aes${p} rounds side by
     # side.  $p is "enc" or "dec".  The emitted code halves $rounds
     # because every loop iteration applies two round keys ($rndkey1,
     # then $rndkey0); it advances $key and leaves the results in
     # $inout0..$inout3.
 246 { my $p=shift;
       # Resolve the round instructions once by name instead of going
       # through a string eval, so that a failure inside the emitter is
       # not silently discarded.
       my $aes=\&{"aes${p}"}; my $aeslast=\&{"aes${p}last"};
 247 
 248     &function_begin_B("_aesni_${p}rypt4");
 249         &$movekey   ($rndkey0,&QWP(0,$key));
 250         &$movekey   ($rndkey1,&QWP(16,$key));
 251         &shr                ($rounds,1);
 252         &lea                ($key,&DWP(32,$key));
 253         &xorps              ($inout0,$rndkey0);
 254         &pxor               ($inout1,$rndkey0);
 255         &pxor               ($inout2,$rndkey0);
 256         &pxor               ($inout3,$rndkey0);
 257         &$movekey   ($rndkey0,&QWP(0,$key));
 258 
 259     &set_label("${p}4_loop");
 260         &$aes               ($inout0,$rndkey1);
 261         &$aes               ($inout1,$rndkey1);
 262         &dec                ($rounds);
 263         &$aes               ($inout2,$rndkey1);
 264         &$aes               ($inout3,$rndkey1);
 265         &$movekey   ($rndkey1,&QWP(16,$key));
 266         &$aes               ($inout0,$rndkey0);
 267         &$aes               ($inout1,$rndkey0);
 268         &lea                ($key,&DWP(32,$key));
 269         &$aes               ($inout2,$rndkey0);
 270         &$aes               ($inout3,$rndkey0);
 271         &$movekey   ($rndkey0,&QWP(0,$key));
 272     &jnz            (&label("${p}4_loop"));
 273 
 274     &$aes           ($inout0,$rndkey1);
 275     &$aes           ($inout1,$rndkey1);
 276     &$aes           ($inout2,$rndkey1);
 277     &$aes           ($inout3,$rndkey1);
 278     &$aeslast               ($inout0,$rndkey0);
 279     &$aeslast               ($inout1,$rndkey0);
 280     &$aeslast               ($inout2,$rndkey0);
 281     &$aeslast               ($inout3,$rndkey0);
 282     &ret();
 283     &function_end_B("_aesni_${p}rypt4");
 284 }
 285 
 286 sub aesni_generate6
     # aesni_generate6($p) emits the private routine _aesni_${p}rypt6,
     # which runs $inout0..$inout5 through the aes${p} rounds side by
     # side.  $p is "enc" or "dec".  The prologue interleaves the xor
     # with round key 0 and the first aes${p} round, then jumps into the
     # middle of the loop at the _aesni_${p}rypt6_enter label; because
     # that first round has already been counted with dec, the loop runs
     # one iteration fewer than in the 3x/4x routines.
     # NOTE(review): the _enter label is registered through
     # &static_label, presumably so that code outside this function can
     # branch to it - confirm against x86asm.pl and the callers.
 287 { my $p=shift;
       # Resolve the round instructions once by name instead of going
       # through a string eval, so that a failure inside the emitter is
       # not silently discarded.
       my $aes=\&{"aes${p}"}; my $aeslast=\&{"aes${p}last"};
 288 
 289     &function_begin_B("_aesni_${p}rypt6");
 290     &static_label("_aesni_${p}rypt6_enter");
 291         &$movekey   ($rndkey0,&QWP(0,$key));
 292         &shr                ($rounds,1);
 293         &$movekey   ($rndkey1,&QWP(16,$key));
 294         &lea                ($key,&DWP(32,$key));
 295         &xorps              ($inout0,$rndkey0);
 296         &pxor               ($inout1,$rndkey0);     # pxor does better here
 297         &$aes               ($inout0,$rndkey1);
 298         &pxor               ($inout2,$rndkey0);
 299         &$aes               ($inout1,$rndkey1);
 300         &pxor               ($inout3,$rndkey0);
 301         &dec                ($rounds);
 302         &$aes               ($inout2,$rndkey1);
 303         &pxor               ($inout4,$rndkey0);
 304         &$aes               ($inout3,$rndkey1);
 305         &pxor               ($inout5,$rndkey0);
 306         &$aes               ($inout4,$rndkey1);
 307         &$movekey   ($rndkey0,&QWP(0,$key));
 308         &$aes               ($inout5,$rndkey1);
 309         &jmp                (&label("_aesni_${p}rypt6_enter"));
 310 
 311     &set_label("${p}6_loop",16);
 312         &$aes               ($inout0,$rndkey1);
 313         &$aes               ($inout1,$rndkey1);
 314         &dec                ($rounds);
 315         &$aes               ($inout2,$rndkey1);
 316         &$aes               ($inout3,$rndkey1);
 317         &$aes               ($inout4,$rndkey1);
 318         &$aes               ($inout5,$rndkey1);
 319     &set_label("_aesni_${p}rypt6_enter",16);
 320         &$movekey   ($rndkey1,&QWP(16,$key));
 321         &$aes               ($inout0,$rndkey0);
 322         &$aes               ($inout1,$rndkey0);
 323         &lea                ($key,&DWP(32,$key));
 324         &$aes               ($inout2,$rndkey0);
 325         &$aes               ($inout3,$rndkey0);
 326         &$aes               ($inout4,$rndkey0);
 327         &$aes               ($inout5,$rndkey0);
 328         &$movekey   ($rndkey0,&QWP(0,$key));
 329     &jnz            (&label("${p}6_loop"));
 330 
 331     &$aes           ($inout0,$rndkey1);
 332     &$aes           ($inout1,$rndkey1);
 333     &$aes           ($inout2,$rndkey1);
 334     &$aes           ($inout3,$rndkey1);
 335     &$aes           ($inout4,$rndkey1);
 336     &$aes           ($inout5,$rndkey1);
 337     &$aeslast               ($inout0,$rndkey0);
 338     &$aeslast               ($inout1,$rndkey0);
 339     &$aeslast               ($inout2,$rndkey0);
 340     &$aeslast               ($inout3,$rndkey0);
 341     &$aeslast               ($inout4,$rndkey0);
 342     &$aeslast               ($inout5,$rndkey0);
 343     &ret();
 344     &function_end_B("_aesni_${p}rypt6");
 345 }
 346 &aesni_generate3("enc") if ($PREFIX eq "aesni");    # "enc" variants only when $PREFIX is "aesni"
 347 &aesni_generate3("dec");                            # "dec" variants are always emitted
 348 &aesni_generate4("enc") if ($PREFIX eq "aesni");
 349 &aesni_generate4("dec");
 350 &aesni_generate6("enc") if ($PREFIX eq "aesni");
 351 &aesni_generate6("dec");
 352 
 353 if ($PREFIX eq "aesni") {
 354 ######################################################################
 355 # void aesni_ecb_encrypt (const void *in, void *out,
 356 #                         size_t length, const AES_KEY *key,
 357 #                         int enc);
     #
     # length is rounded down to a multiple of 16 and nothing is done if
     # the result is zero.  enc==0 selects the decrypt half.  Each half
     # runs a main loop that handles 6 blocks (0x60 bytes) per call to
     # _aesni_[en|de]crypt6, followed by a tail for the remaining 1..5
     # blocks.  $key_/$rounds_ keep copies of $key/$rounds, which are
     # restored after every call inside the loop.
 358 &function_begin("aesni_ecb_encrypt");
 359         &mov        ($inp,&wparam(0));
 360         &mov        ($out,&wparam(1));
 361         &mov        ($len,&wparam(2));
 362         &mov        ($key,&wparam(3));
 363         &mov        ($rounds_,&wparam(4));          # enc flag, tested below
 364         &and        ($len,-16);                     # whole blocks only
 365         &jz (&label("ecb_ret"));
 366         &mov        ($rounds,&DWP(240,$key));
 367         &test       ($rounds_,$rounds_);
 368         &jz (&label("ecb_decrypt"));
 369 
 370         &mov        ($key_,$key);           # backup $key
 371         &mov        ($rounds_,$rounds);     # backup $rounds
 372         &cmp        ($len,0x60);
 373         &jb (&label("ecb_enc_tail"));        # fewer than 6 blocks
 374 
 375         &movdqu     ($inout0,&QWP(0,$inp));
 376         &movdqu     ($inout1,&QWP(0x10,$inp));
 377         &movdqu     ($inout2,&QWP(0x20,$inp));
 378         &movdqu     ($inout3,&QWP(0x30,$inp));
 379         &movdqu     ($inout4,&QWP(0x40,$inp));
 380         &movdqu     ($inout5,&QWP(0x50,$inp));
 381         &lea        ($inp,&DWP(0x60,$inp));
 382         &sub        ($len,0x60);
 383         &jmp        (&label("ecb_enc_loop6_enter"));
 384 
         # Loop body: store the previous 6 results while loading the next
         # 6 input blocks.
 385 &set_label("ecb_enc_loop6",16);
 386         &movups     (&QWP(0,$out),$inout0);
 387         &movdqu     ($inout0,&QWP(0,$inp));
 388         &movups     (&QWP(0x10,$out),$inout1);
 389         &movdqu     ($inout1,&QWP(0x10,$inp));
 390         &movups     (&QWP(0x20,$out),$inout2);
 391         &movdqu     ($inout2,&QWP(0x20,$inp));
 392         &movups     (&QWP(0x30,$out),$inout3);
 393         &movdqu     ($inout3,&QWP(0x30,$inp));
 394         &movups     (&QWP(0x40,$out),$inout4);
 395         &movdqu     ($inout4,&QWP(0x40,$inp));
 396         &movups     (&QWP(0x50,$out),$inout5);
 397         &lea        ($out,&DWP(0x60,$out));
 398         &movdqu     ($inout5,&QWP(0x50,$inp));
 399         &lea        ($inp,&DWP(0x60,$inp));
 400 &set_label("ecb_enc_loop6_enter");
 401 
 402         &call       ("_aesni_encrypt6");
 403 
 404         &mov        ($key,$key_);           # restore $key
 405         &mov        ($rounds,$rounds_);     # restore $rounds
 406         &sub        ($len,0x60);
 407         &jnc        (&label("ecb_enc_loop6"));       # at least 6 more blocks
 408 
 409         &movups     (&QWP(0,$out),$inout0);
 410         &movups     (&QWP(0x10,$out),$inout1);
 411         &movups     (&QWP(0x20,$out),$inout2);
 412         &movups     (&QWP(0x30,$out),$inout3);
 413         &movups     (&QWP(0x40,$out),$inout4);
 414         &movups     (&QWP(0x50,$out),$inout5);
 415         &lea        ($out,&DWP(0x60,$out));
 416         &add        ($len,0x60);                    # undo the last sub: bytes left
 417         &jz (&label("ecb_ret"));
 418 
         # Tail: 1..5 blocks remain; dispatch on the byte count.
 419 &set_label("ecb_enc_tail");
 420         &movups     ($inout0,&QWP(0,$inp));
 421         &cmp        ($len,0x20);
 422         &jb (&label("ecb_enc_one"));
 423         &movups     ($inout1,&QWP(0x10,$inp));
 424         &je (&label("ecb_enc_two"));
 425         &movups     ($inout2,&QWP(0x20,$inp));
 426         &cmp        ($len,0x40);
 427         &jb (&label("ecb_enc_three"));
 428         &movups     ($inout3,&QWP(0x30,$inp));
 429         &je (&label("ecb_enc_four"));
 430         &movups     ($inout4,&QWP(0x40,$inp));
 431         &xorps      ($inout5,$inout5);              # 5 blocks: 6th lane zeroed, result unused
 432         &call       ("_aesni_encrypt6");
 433         &movups     (&QWP(0,$out),$inout0);
 434         &movups     (&QWP(0x10,$out),$inout1);
 435         &movups     (&QWP(0x20,$out),$inout2);
 436         &movups     (&QWP(0x30,$out),$inout3);
 437         &movups     (&QWP(0x40,$out),$inout4);
 438         &jmp        (&label("ecb_ret"));
 439 
 440 &set_label("ecb_enc_one",16);
 441         if ($inline)
 442         {   &aesni_inline_generate1("enc"); }
 443         else
 444         {   &call   ("_aesni_encrypt1");    }
 445         &movups     (&QWP(0,$out),$inout0);
 446         &jmp        (&label("ecb_ret"));
 447 
 448 &set_label("ecb_enc_two",16);
 449         &xorps      ($inout2,$inout2);              # 2 blocks: 3rd lane zeroed, result unused
 450         &call       ("_aesni_encrypt3");
 451         &movups     (&QWP(0,$out),$inout0);
 452         &movups     (&QWP(0x10,$out),$inout1);
 453         &jmp        (&label("ecb_ret"));
 454 
 455 &set_label("ecb_enc_three",16);
 456         &call       ("_aesni_encrypt3");
 457         &movups     (&QWP(0,$out),$inout0);
 458         &movups     (&QWP(0x10,$out),$inout1);
 459         &movups     (&QWP(0x20,$out),$inout2);
 460         &jmp        (&label("ecb_ret"));
 461 
 462 &set_label("ecb_enc_four",16);
 463         &call       ("_aesni_encrypt4");
 464         &movups     (&QWP(0,$out),$inout0);
 465         &movups     (&QWP(0x10,$out),$inout1);
 466         &movups     (&QWP(0x20,$out),$inout2);
 467         &movups     (&QWP(0x30,$out),$inout3);
 468         &jmp        (&label("ecb_ret"));
 469 ######################################################################
     # Decrypt half: same structure as above, using the _aesni_decryptN
     # routines.
 470 &set_label("ecb_decrypt",16);
 471         &mov        ($key_,$key);           # backup $key
 472         &mov        ($rounds_,$rounds);     # backup $rounds
 473         &cmp        ($len,0x60);
 474         &jb (&label("ecb_dec_tail"));        # fewer than 6 blocks
 475 
 476         &movdqu     ($inout0,&QWP(0,$inp));
 477         &movdqu     ($inout1,&QWP(0x10,$inp));
 478         &movdqu     ($inout2,&QWP(0x20,$inp));
 479         &movdqu     ($inout3,&QWP(0x30,$inp));
 480         &movdqu     ($inout4,&QWP(0x40,$inp));
 481         &movdqu     ($inout5,&QWP(0x50,$inp));
 482         &lea        ($inp,&DWP(0x60,$inp));
 483         &sub        ($len,0x60);
 484         &jmp        (&label("ecb_dec_loop6_enter"));
 485 
 486 &set_label("ecb_dec_loop6",16);
 487         &movups     (&QWP(0,$out),$inout0);
 488         &movdqu     ($inout0,&QWP(0,$inp));
 489         &movups     (&QWP(0x10,$out),$inout1);
 490         &movdqu     ($inout1,&QWP(0x10,$inp));
 491         &movups     (&QWP(0x20,$out),$inout2);
 492         &movdqu     ($inout2,&QWP(0x20,$inp));
 493         &movups     (&QWP(0x30,$out),$inout3);
 494         &movdqu     ($inout3,&QWP(0x30,$inp));
 495         &movups     (&QWP(0x40,$out),$inout4);
 496         &movdqu     ($inout4,&QWP(0x40,$inp));
 497         &movups     (&QWP(0x50,$out),$inout5);
 498         &lea        ($out,&DWP(0x60,$out));
 499         &movdqu     ($inout5,&QWP(0x50,$inp));
 500         &lea        ($inp,&DWP(0x60,$inp));
 501 &set_label("ecb_dec_loop6_enter");
 502 
 503         &call       ("_aesni_decrypt6");
 504 
 505         &mov        ($key,$key_);           # restore $key
 506         &mov        ($rounds,$rounds_);     # restore $rounds
 507         &sub        ($len,0x60);
 508         &jnc        (&label("ecb_dec_loop6"));       # at least 6 more blocks
 509 
 510         &movups     (&QWP(0,$out),$inout0);
 511         &movups     (&QWP(0x10,$out),$inout1);
 512         &movups     (&QWP(0x20,$out),$inout2);
 513         &movups     (&QWP(0x30,$out),$inout3);
 514         &movups     (&QWP(0x40,$out),$inout4);
 515         &movups     (&QWP(0x50,$out),$inout5);
 516         &lea        ($out,&DWP(0x60,$out));
 517         &add        ($len,0x60);                    # undo the last sub: bytes left
 518         &jz (&label("ecb_ret"));
 519 
 520 &set_label("ecb_dec_tail");
 521         &movups     ($inout0,&QWP(0,$inp));
 522         &cmp        ($len,0x20);
 523         &jb (&label("ecb_dec_one"));
 524         &movups     ($inout1,&QWP(0x10,$inp));
 525         &je (&label("ecb_dec_two"));
 526         &movups     ($inout2,&QWP(0x20,$inp));
 527         &cmp        ($len,0x40);
 528         &jb (&label("ecb_dec_three"));
 529         &movups     ($inout3,&QWP(0x30,$inp));
 530         &je (&label("ecb_dec_four"));
 531         &movups     ($inout4,&QWP(0x40,$inp));
 532         &xorps      ($inout5,$inout5);              # 5 blocks: 6th lane zeroed, result unused
 533         &call       ("_aesni_decrypt6");
 534         &movups     (&QWP(0,$out),$inout0);
 535         &movups     (&QWP(0x10,$out),$inout1);
 536         &movups     (&QWP(0x20,$out),$inout2);
 537         &movups     (&QWP(0x30,$out),$inout3);
 538         &movups     (&QWP(0x40,$out),$inout4);
 539         &jmp        (&label("ecb_ret"));
 540 
 541 &set_label("ecb_dec_one",16);
 542         if ($inline)
 543         {   &aesni_inline_generate1("dec"); }
 544         else
 545         {   &call   ("_aesni_decrypt1");    }
 546         &movups     (&QWP(0,$out),$inout0);
 547         &jmp        (&label("ecb_ret"));
 548 
 549 &set_label("ecb_dec_two",16);
 550         &xorps      ($inout2,$inout2);              # 2 blocks: 3rd lane zeroed, result unused
 551         &call       ("_aesni_decrypt3");
 552         &movups     (&QWP(0,$out),$inout0);
 553         &movups     (&QWP(0x10,$out),$inout1);
 554         &jmp        (&label("ecb_ret"));
 555 
 556 &set_label("ecb_dec_three",16);
 557         &call       ("_aesni_decrypt3");
 558         &movups     (&QWP(0,$out),$inout0);
 559         &movups     (&QWP(0x10,$out),$inout1);
 560         &movups     (&QWP(0x20,$out),$inout2);
 561         &jmp        (&label("ecb_ret"));
 562 
 563 &set_label("ecb_dec_four",16);
 564         &call       ("_aesni_decrypt4");
 565         &movups     (&QWP(0,$out),$inout0);
 566         &movups     (&QWP(0x10,$out),$inout1);
 567         &movups     (&QWP(0x20,$out),$inout2);
 568         &movups     (&QWP(0x30,$out),$inout3);
 569 
 570 &set_label("ecb_ret");
 571 &function_end("aesni_ecb_encrypt");
 572 
 573 ######################################################################
 574 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
 575 #                         size_t blocks, const AES_KEY *key,
 576 #                         const char *ivec,char *cmac);
 577 #
 578 # Handles only complete blocks, operates on 64-bit counter and
 579 # does not update *ivec! Nor does it finalize CMAC value
 580 # (see engine/eng_aesni.c for details)
 581 #
 582 { my $cmac=$inout1;     # the running cmac value is kept in $inout1 (xmm3)
     # Encrypt direction.  For every block the counter block ($inout0) and
     # cmac^inp ($cmac) are pushed through the aesenc rounds side by side;
     # the output block is inp ^ E(counter).
 583 &function_begin("aesni_ccm64_encrypt_blocks");
 584         &mov        ($inp,&wparam(0));
 585         &mov        ($out,&wparam(1));
 586         &mov        ($len,&wparam(2));
 587         &mov        ($key,&wparam(3));
 588         &mov        ($rounds_,&wparam(4));          # ivec pointer, temporarily
 589         &mov        ($rounds,&wparam(5));           # cmac pointer, temporarily
 590         &mov        ($key_,"esp");                  # remember the incoming esp
 591         &sub        ("esp",60);
 592         &and        ("esp",-16);                    # align stack
 593         &mov        (&DWP(48,"esp"),$key_);         # saved esp, reloaded before return
 594 
 595         &movdqu     ($ivec,&QWP(0,$rounds_));   # load ivec
 596         &movdqu     ($cmac,&QWP(0,$rounds));    # load cmac
 597         &mov        ($rounds,&DWP(240,$key));       # dword at offset 240 of *key
 598 
 599         # compose byte-swap control mask for pshufb on stack
 600         &mov        (&DWP(0,"esp"),0x0c0d0e0f);
 601         &mov        (&DWP(4,"esp"),0x08090a0b);
 602         &mov        (&DWP(8,"esp"),0x04050607);
 603         &mov        (&DWP(12,"esp"),0x00010203);
 604 
 605         # compose counter increment vector on stack
 606         &mov        ($rounds_,1);
 607         &xor        ($key_,$key_);
 608         &mov        (&DWP(16,"esp"),$rounds_);
 609         &mov        (&DWP(20,"esp"),$key_);
 610         &mov        (&DWP(24,"esp"),$key_);
 611         &mov        (&DWP(28,"esp"),$key_);
 612 
 613         &shr        ($rounds,1);                    # the loop below applies two round keys per pass
 614         &lea        ($key_,&DWP(0,$key));           # $key_ = $key, reloaded for every block
 615         &movdqa     ($inout3,&QWP(0,"esp"));        # $inout3 = byte-swap mask
 616         &movdqa     ($inout0,$ivec);                # first counter block, as loaded
 617         &mov        ($rounds_,$rounds);             # $rounds_ = halved count
 618         &pshufb     ($ivec,$inout3);                # $ivec is kept byte-swapped for paddq
 619 
 620 &set_label("ccm64_enc_outer");
 621         &$movekey   ($rndkey0,&QWP(0,$key_));
 622         &mov                ($rounds,$rounds_);
 623         &movups             ($in0,&QWP(0,$inp));
 624 
 625         &xorps              ($inout0,$rndkey0);
 626         &$movekey   ($rndkey1,&QWP(16,$key_));
 627         &xorps              ($rndkey0,$in0);
 628         &lea                ($key,&DWP(32,$key_));
 629         &xorps              ($cmac,$rndkey0);               # cmac^=inp
 630         &$movekey   ($rndkey0,&QWP(0,$key));
 631 
 632 &set_label("ccm64_enc2_loop");
 633         &aesenc             ($inout0,$rndkey1);
 634         &dec                ($rounds);
 635         &aesenc             ($cmac,$rndkey1);
 636         &$movekey   ($rndkey1,&QWP(16,$key));
 637         &aesenc             ($inout0,$rndkey0);
 638         &lea                ($key,&DWP(32,$key));
 639         &aesenc             ($cmac,$rndkey0);
 640         &$movekey   ($rndkey0,&QWP(0,$key));
 641         &jnz                (&label("ccm64_enc2_loop"));
 642         &aesenc             ($inout0,$rndkey1);
 643         &aesenc             ($cmac,$rndkey1);
 644         &paddq              ($ivec,&QWP(16,"esp"));     # add the increment vector composed above
 645         &aesenclast ($inout0,$rndkey0);
 646         &aesenclast ($cmac,$rndkey0);
 647 
 648         &dec        ($len);
 649         &lea        ($inp,&DWP(16,$inp));
 650         &xorps      ($in0,$inout0);                 # inp^=E(ivec)
 651         &movdqa     ($inout0,$ivec);                # next counter block ...
 652         &movups     (&QWP(0,$out),$in0);                # save output
 653         &lea        ($out,&DWP(16,$out));
 654         &pshufb     ($inout0,$inout3);              # ... swapped back with the mask
 655         &jnz        (&label("ccm64_enc_outer"));    # flags are still those of dec($len)
 656 
 657         &mov        ("esp",&DWP(48,"esp"));         # back to the saved esp
 658         &mov        ($out,&wparam(5));
 659         &movups     (&QWP(0,$out),$cmac);           # write cmac back through wparam(5)
 660 &function_end("aesni_ccm64_encrypt_blocks");
 661 
     # Decrypt direction.  The first counter block is encrypted on its own;
     # afterwards each pass of the two-lane loop encrypts the next counter
     # block together with cmac^out for the block just produced.  The last
     # cmac update happens after the loop, at ccm64_dec_break.
 662 &function_begin("aesni_ccm64_decrypt_blocks");
 663         &mov        ($inp,&wparam(0));
 664         &mov        ($out,&wparam(1));
 665         &mov        ($len,&wparam(2));
 666         &mov        ($key,&wparam(3));
 667         &mov        ($rounds_,&wparam(4));          # ivec pointer, temporarily
 668         &mov        ($rounds,&wparam(5));           # cmac pointer, temporarily
 669         &mov        ($key_,"esp");                  # remember the incoming esp
 670         &sub        ("esp",60);
 671         &and        ("esp",-16);                    # align stack
 672         &mov        (&DWP(48,"esp"),$key_);         # saved esp, reloaded before return
 673 
 674         &movdqu     ($ivec,&QWP(0,$rounds_));   # load ivec
 675         &movdqu     ($cmac,&QWP(0,$rounds));    # load cmac
 676         &mov        ($rounds,&DWP(240,$key));       # dword at offset 240 of *key
 677 
 678         # compose byte-swap control mask for pshufb on stack
 679         &mov        (&DWP(0,"esp"),0x0c0d0e0f);
 680         &mov        (&DWP(4,"esp"),0x08090a0b);
 681         &mov        (&DWP(8,"esp"),0x04050607);
 682         &mov        (&DWP(12,"esp"),0x00010203);
 683 
 684         # compose counter increment vector on stack
 685         &mov        ($rounds_,1);
 686         &xor        ($key_,$key_);
 687         &mov        (&DWP(16,"esp"),$rounds_);
 688         &mov        (&DWP(20,"esp"),$key_);
 689         &mov        (&DWP(24,"esp"),$key_);
 690         &mov        (&DWP(28,"esp"),$key_);
 691 
 692         &movdqa     ($inout3,&QWP(0,"esp"));    # bswap mask
 693         &movdqa     ($inout0,$ivec);                # first counter block, as loaded
 694 
         # The one-block sequence below advances $key and counts $rounds
         # down, so copies are taken first.  Unlike the encrypt direction,
         # $rounds_ holds the full count here; it is halved per block.
 695         &mov        ($key_,$key);
 696         &mov        ($rounds_,$rounds);
 697 
 698         &pshufb     ($ivec,$inout3);                # $ivec is kept byte-swapped for paddq
 699         if ($inline)
 700         {   &aesni_inline_generate1("enc"); }
 701         else
 702         {   &call   ("_aesni_encrypt1");    }
 703         &movups     ($in0,&QWP(0,$inp));                # load inp
 704         &paddq      ($ivec,&QWP(16,"esp"));         # add the increment vector composed above
 705         &lea        ($inp,&QWP(16,$inp));
 706         &jmp        (&label("ccm64_dec_outer"));
 707 
 708 &set_label("ccm64_dec_outer",16);
 709         &xorps      ($in0,$inout0);                 # inp ^= E(ivec)
 710         &movdqa     ($inout0,$ivec);                # next counter block ...
 711         &mov        ($rounds,$rounds_);
 712         &movups     (&QWP(0,$out),$in0);                # save output
 713         &lea        ($out,&DWP(16,$out));
 714         &pshufb     ($inout0,$inout3);              # ... swapped back with the mask
 715 
 716         &sub        ($len,1);
 717         &jz (&label("ccm64_dec_break"));         # that was the last block
 718 
 719         &$movekey   ($rndkey0,&QWP(0,$key_));
 720         &shr                ($rounds,1);            # two round keys per loop pass
 721         &$movekey   ($rndkey1,&QWP(16,$key_));
 722         &xorps              ($in0,$rndkey0);
 723         &lea                ($key,&DWP(32,$key_));
 724         &xorps              ($inout0,$rndkey0);
 725         &xorps              ($cmac,$in0);           # cmac^=out
 726         &$movekey   ($rndkey0,&QWP(0,$key));
 727 
 728 &set_label("ccm64_dec2_loop");
 729         &aesenc             ($inout0,$rndkey1);
 730         &dec                ($rounds);
 731         &aesenc             ($cmac,$rndkey1);
 732         &$movekey   ($rndkey1,&QWP(16,$key));
 733         &aesenc             ($inout0,$rndkey0);
 734         &lea                ($key,&DWP(32,$key));
 735         &aesenc             ($cmac,$rndkey0);
 736         &$movekey   ($rndkey0,&QWP(0,$key));
 737         &jnz                (&label("ccm64_dec2_loop"));
 738         &movups             ($in0,&QWP(0,$inp));        # load inp
 739         &paddq              ($ivec,&QWP(16,"esp"));
 740         &aesenc             ($inout0,$rndkey1);
 741         &aesenc             ($cmac,$rndkey1);
 742         &lea                ($inp,&QWP(16,$inp));
 743         &aesenclast ($inout0,$rndkey0);
 744         &aesenclast ($cmac,$rndkey0);
 745         &jmp        (&label("ccm64_dec_outer"));
 746 
         # Last block: $in0 holds the block just written to out.  The
         # inline expansion xors round key 0 into $in0, xors that into
         # $cmac and runs $cmac through the rounds.
         # NOTE(review): the non-inline branch passes $cmac as a second
         # argument to &call and does not fold $in0 into $cmac, while
         # aesni_generate1("enc") is generated for $inout0 - this branch
         # does not look equivalent to the inline one; confirm before
         # ever building with $inline=0.
 747 &set_label("ccm64_dec_break",16);
 748         &mov        ($key,$key_);
 749         if ($inline)
 750         {   &aesni_inline_generate1("enc",$cmac,$in0);      }
 751         else
 752         {   &call   ("_aesni_encrypt1",$cmac);      }
 753 
 754         &mov        ("esp",&DWP(48,"esp"));         # back to the saved esp
 755         &mov        ($out,&wparam(5));
 756         &movups     (&QWP(0,$out),$cmac);           # write cmac back through wparam(5)
 757 &function_end("aesni_ccm64_decrypt_blocks");
 758 }
 759 
 760 ######################################################################
 761 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
 762 #                         size_t blocks, const AES_KEY *key,
 763 #                         const char *ivec);
 764 #
 765 # Handles only complete blocks, operates on 32-bit counter and
 766 # does not update *ivec! (see engine/eng_aesni.c for details)
 767 #
 768 # stack layout:
 769 #       0       pshufb mask
 770 #       16      vector addend: dwords 6,6,6,0 (lowest address first)
 771 #       32      counter-less ivec
 772 #       48      1st triplet of counter vector
 773 #       64      2nd triplet of counter vector
 774 #       80      saved %esp
 775 
 776 &function_begin("aesni_ctr32_encrypt_blocks");     # emit CTR-mode routine: 32-bit counter, 6 blocks per main-loop pass
 777         &mov        ($inp,&wparam(0));                  # arg 0: input pointer
 778         &mov        ($out,&wparam(1));                  # arg 1: output pointer
 779         &mov        ($len,&wparam(2));                  # arg 2: count of 16-byte blocks
 780         &mov        ($key,&wparam(3));                  # arg 3: AES_KEY pointer
 781         &mov        ($rounds_,&wparam(4));              # arg 4: ivec pointer, parked in $rounds_ for now
 782         &mov        ($key_,"esp");                      # remember caller's %esp
 783         &sub        ("esp",88);                         # 80 bytes of vectors + slot for saved %esp
 784         &and        ("esp",-16);                    # align stack
 785         &mov        (&DWP(80,"esp"),$key_);             # saved %esp lives at 80(%esp)
 786         # a single block needs none of the counter-vector setup below
 787         &cmp        ($len,1);
 788         &je (&label("ctr32_one_shortcut"));
 789
 790         &movdqu     ($inout5,&QWP(0,$rounds_)); # load ivec
 791
 792         # compose byte-swap control mask for pshufb on stack
 793         &mov        (&DWP(0,"esp"),0x0c0d0e0f);         # in memory the 16 mask bytes read 0f,0e,...,01,00,
 794         &mov        (&DWP(4,"esp"),0x08090a0b);         # i.e. pshufb with this mask reverses all 16 bytes
 795         &mov        (&DWP(8,"esp"),0x04050607);
 796         &mov        (&DWP(12,"esp"),0x00010203);
 797
 798         # compose counter increment vector on stack
 799         &mov        ($rounds,6);                        # each triplet advances by 6 per main-loop pass
 800         &xor        ($key_,$key_);                      # $key_ = 0
 801         &mov        (&DWP(16,"esp"),$rounds);
 802         &mov        (&DWP(20,"esp"),$rounds);
 803         &mov        (&DWP(24,"esp"),$rounds);
 804         &mov        (&DWP(28,"esp"),$key_);             # 4th dword is never incremented
 805
 806         &pextrd     ($rounds_,$inout5,3);           # pull 32-bit counter
 807         &pinsrd     ($inout5,$key_,3);              # wipe 32-bit counter
 808
 809         &mov        ($rounds,&DWP(240,$key));   # key->rounds
 810
 811         # compose 2 vectors of 3x32-bit counters
 812         &bswap      ($rounds_);                         # byte-swap counter so inc/lea below do plain integer arithmetic
 813         &pxor       ($rndkey1,$rndkey1);
 814         &pxor       ($rndkey0,$rndkey0);
 815         &movdqa     ($inout0,&QWP(0,"esp"));    # load byte-swap mask
 816         &pinsrd     ($rndkey1,$rounds_,0);              # 1st triplet dword 0 = counter
 817         &lea        ($key_,&DWP(3,$rounds_));           # $key_ = counter+3
 818         &pinsrd     ($rndkey0,$key_,0);                 # 2nd triplet dword 0 = counter+3
 819         &inc        ($rounds_);
 820         &pinsrd     ($rndkey1,$rounds_,1);              # counter+1
 821         &inc        ($key_);
 822         &pinsrd     ($rndkey0,$key_,1);                 # counter+4
 823         &inc        ($rounds_);
 824         &pinsrd     ($rndkey1,$rounds_,2);              # counter+2
 825         &inc        ($key_);
 826         &pinsrd     ($rndkey0,$key_,2);                 # counter+5
 827         &movdqa     (&QWP(48,"esp"),$rndkey1);  # save 1st triplet
 828         &pshufb     ($rndkey1,$inout0);             # byte swap
 829         &movdqa     (&QWP(64,"esp"),$rndkey0);  # save 2nd triplet
 830         &pshufb     ($rndkey0,$inout0);             # byte swap
 831         # after the 16-byte reversal the triplet sits in dwords 3,2,1 and dword 0 is zero
 832         &pshufd     ($inout0,$rndkey1,3<<6);  # place counter to upper dword
 833         &pshufd     ($inout1,$rndkey1,2<<6);            # counter+1 to upper dword, zeros below
 834         &cmp        ($len,6);
 835         &jb (&label("ctr32_tail"));                     # fewer than 6 blocks: $inout5 still holds counter-less ivec
 836         &movdqa     (&QWP(32,"esp"),$inout5);   # save counter-less ivec
 837         &shr        ($rounds,1);                        # halve the round count for the 6x code path
 838         &mov        ($key_,$key);                   # backup $key
 839         &mov        ($rounds_,$rounds);             # backup $rounds
 840         &sub        ($len,6);                           # pre-decrement; loop runs while this does not borrow
 841         &jmp        (&label("ctr32_loop6"));
 842
 843 &set_label("ctr32_loop6",16);
 844         &pshufd     ($inout2,$rndkey1,1<<6);            # counter+2 to upper dword
 845         &movdqa     ($rndkey1,&QWP(32,"esp"));  # pull counter-less ivec
 846         &pshufd     ($inout3,$rndkey0,3<<6);            # counter+3
 847         &por        ($inout0,$rndkey1);             # merge counter-less ivec
 848         &pshufd     ($inout4,$rndkey0,2<<6);            # counter+4
 849         &por        ($inout1,$rndkey1);
 850         &pshufd     ($inout5,$rndkey0,1<<6);            # counter+5
 851         &por        ($inout2,$rndkey1);
 852         &por        ($inout3,$rndkey1);
 853         &por        ($inout4,$rndkey1);
 854         &por        ($inout5,$rndkey1);
 855
 856         # inlining _aesni_encrypt6's prologue gives ~4% improvement...
 857         &$movekey   ($rndkey0,&QWP(0,$key_));           # round key 0 ($movekey is chosen elsewhere in the file)
 858         &$movekey   ($rndkey1,&QWP(16,$key_));          # round key 1
 859         &lea                ($key,&DWP(32,$key_));      # $key walks the schedule; $key_ keeps its start
 860         &dec                ($rounds);
 861         &pxor               ($inout0,$rndkey0);         # whitening xor, interleaved with the first aesenc round
 862         &pxor               ($inout1,$rndkey0);
 863         &aesenc             ($inout0,$rndkey1);
 864         &pxor               ($inout2,$rndkey0);
 865         &aesenc             ($inout1,$rndkey1);
 866         &pxor               ($inout3,$rndkey0);
 867         &aesenc             ($inout2,$rndkey1);
 868         &pxor               ($inout4,$rndkey0);
 869         &aesenc             ($inout3,$rndkey1);
 870         &pxor               ($inout5,$rndkey0);
 871         &aesenc             ($inout4,$rndkey1);
 872         &$movekey   ($rndkey0,&QWP(0,$key));            # preload next round key for the shared loop
 873         &aesenc             ($inout5,$rndkey1);
 874         # NOTE(review): presumably enters _aesni_encrypt6 past its prologue; label is defined outside this section
 875         &call               (&label("_aesni_encrypt6_enter"));
 876         # xor the six key-stream blocks with input and store, interleaved with the counter update
 877         &movups     ($rndkey1,&QWP(0,$inp));            # round-key registers double as input temporaries here
 878         &movups     ($rndkey0,&QWP(0x10,$inp));
 879         &xorps      ($inout0,$rndkey1);
 880         &movups     ($rndkey1,&QWP(0x20,$inp));
 881         &xorps      ($inout1,$rndkey0);
 882         &movups     (&QWP(0,$out),$inout0);
 883         &movdqa     ($rndkey0,&QWP(16,"esp"));  # load increment
 884         &xorps      ($inout2,$rndkey1);
 885         &movdqa     ($rndkey1,&QWP(48,"esp"));  # load 1st triplet
 886         &movups     (&QWP(0x10,$out),$inout1);
 887         &movups     (&QWP(0x20,$out),$inout2);
 888
 889         &paddd      ($rndkey1,$rndkey0);            # 1st triplet increment
 890         &paddd      ($rndkey0,&QWP(64,"esp"));  # 2nd triplet increment
 891         &movdqa     ($inout0,&QWP(0,"esp"));    # load byte swap mask
 892         # $inout0..2 are already stored, so $inout1/$inout2 serve as input temporaries
 893         &movups     ($inout1,&QWP(0x30,$inp));
 894         &movups     ($inout2,&QWP(0x40,$inp));
 895         &xorps      ($inout3,$inout1);
 896         &movups     ($inout1,&QWP(0x50,$inp));
 897         &lea        ($inp,&DWP(0x60,$inp));             # advance input by 6 blocks
 898         &movdqa     (&QWP(48,"esp"),$rndkey1);  # save 1st triplet
 899         &pshufb     ($rndkey1,$inout0);             # byte swap
 900         &xorps      ($inout4,$inout2);
 901         &movups     (&QWP(0x30,$out),$inout3);
 902         &xorps      ($inout5,$inout1);
 903         &movdqa     (&QWP(64,"esp"),$rndkey0);  # save 2nd triplet
 904         &pshufb     ($rndkey0,$inout0);             # byte swap
 905         &movups     (&QWP(0x40,$out),$inout4);
 906         &pshufd     ($inout0,$rndkey1,3<<6);            # next pass: first counter to upper dword
 907         &movups     (&QWP(0x50,$out),$inout5);
 908         &lea        ($out,&DWP(0x60,$out));             # advance output by 6 blocks
 909
 910         &mov        ($rounds,$rounds_);                 # reload halved round count
 911         &pshufd     ($inout1,$rndkey1,2<<6);            # next pass: second counter
 912         &sub        ($len,6);
 913         &jnc        (&label("ctr32_loop6"));            # at least 6 more blocks remained
 914
 915         &add        ($len,6);                           # undo the borrow: $len = 0..5 leftover blocks
 916         &jz (&label("ctr32_ret"));
 917         &mov        ($key,$key_);                       # restore $key for the non-inlined helpers
 918         &lea        ($rounds,&DWP(1,"",$rounds,2));     # restore $rounds: 2*$rounds+1 (assumes original value was odd - TODO confirm)
 919         &movdqa     ($inout5,&QWP(32,"esp"));   # pull counter-less ivec
 920         # tail: 1..5 blocks; $inout0/$inout1 hold the first two counters, $inout5 the counter-less ivec
 921 &set_label("ctr32_tail");
 922         &por        ($inout0,$inout5);
 923         &cmp        ($len,2);
 924         &jb (&label("ctr32_one"));
 925
 926         &pshufd     ($inout2,$rndkey1,1<<6);
 927         &por        ($inout1,$inout5);
 928         &je (&label("ctr32_two"));                      # flags still from cmp: pshufd/por leave EFLAGS alone
 929
 930         &pshufd     ($inout3,$rndkey0,3<<6);
 931         &por        ($inout2,$inout5);
 932         &cmp        ($len,4);
 933         &jb (&label("ctr32_three"));
 934
 935         &pshufd     ($inout4,$rndkey0,2<<6);
 936         &por        ($inout3,$inout5);
 937         &je (&label("ctr32_four"));                     # flags still from cmp ($len,4)
 938         # five blocks: run the 6x helper; the sixth lane ($inout5) is computed but never stored
 939         &por        ($inout4,$inout5);
 940         &call       ("_aesni_encrypt6");
 941         &movups     ($rndkey1,&QWP(0,$inp));
 942         &movups     ($rndkey0,&QWP(0x10,$inp));
 943         &xorps      ($inout0,$rndkey1);
 944         &movups     ($rndkey1,&QWP(0x20,$inp));
 945         &xorps      ($inout1,$rndkey0);
 946         &movups     ($rndkey0,&QWP(0x30,$inp));
 947         &xorps      ($inout2,$rndkey1);
 948         &movups     ($rndkey1,&QWP(0x40,$inp));
 949         &xorps      ($inout3,$rndkey0);
 950         &movups     (&QWP(0,$out),$inout0);
 951         &xorps      ($inout4,$rndkey1);
 952         &movups     (&QWP(0x10,$out),$inout1);
 953         &movups     (&QWP(0x20,$out),$inout2);
 954         &movups     (&QWP(0x30,$out),$inout3);
 955         &movups     (&QWP(0x40,$out),$inout4);
 956         &jmp        (&label("ctr32_ret"));
 957         # exactly one block requested: the ivec is used as the counter block unmodified
 958 &set_label("ctr32_one_shortcut",16);
 959         &movups     ($inout0,&QWP(0,$rounds_)); # load ivec
 960         &mov        ($rounds,&DWP(240,$key));           # key->rounds
 961         # one block; $inline (set elsewhere) picks generation-time inlining versus a call
 962 &set_label("ctr32_one");
 963         if ($inline)
 964         {   &aesni_inline_generate1("enc"); }
 965         else
 966         {   &call   ("_aesni_encrypt1");    }
 967         &movups     ($in0,&QWP(0,$inp));
 968         &xorps      ($in0,$inout0);                     # input ^= key stream
 969         &movups     (&QWP(0,$out),$in0);
 970         &jmp        (&label("ctr32_ret"));
 971         # two blocks: the 3x helper is used, the third lane's result is ignored
 972 &set_label("ctr32_two",16);
 973         &call       ("_aesni_encrypt3");
 974         &movups     ($inout3,&QWP(0,$inp));             # unused lanes serve as input temporaries
 975         &movups     ($inout4,&QWP(0x10,$inp));
 976         &xorps      ($inout0,$inout3);
 977         &xorps      ($inout1,$inout4);
 978         &movups     (&QWP(0,$out),$inout0);
 979         &movups     (&QWP(0x10,$out),$inout1);
 980         &jmp        (&label("ctr32_ret"));
 981         # three blocks
 982 &set_label("ctr32_three",16);
 983         &call       ("_aesni_encrypt3");
 984         &movups     ($inout3,&QWP(0,$inp));
 985         &movups     ($inout4,&QWP(0x10,$inp));
 986         &xorps      ($inout0,$inout3);
 987         &movups     ($inout5,&QWP(0x20,$inp));
 988         &xorps      ($inout1,$inout4);
 989         &movups     (&QWP(0,$out),$inout0);
 990         &xorps      ($inout2,$inout5);
 991         &movups     (&QWP(0x10,$out),$inout1);
 992         &movups     (&QWP(0x20,$out),$inout2);
 993         &jmp        (&label("ctr32_ret"));
 994         # four blocks
 995 &set_label("ctr32_four",16);
 996         &call       ("_aesni_encrypt4");
 997         &movups     ($inout4,&QWP(0,$inp));
 998         &movups     ($inout5,&QWP(0x10,$inp));
 999         &movups     ($rndkey1,&QWP(0x20,$inp));
1000         &xorps      ($inout0,$inout4);
1001         &movups     ($rndkey0,&QWP(0x30,$inp));
1002         &xorps      ($inout1,$inout5);
1003         &movups     (&QWP(0,$out),$inout0);
1004         &xorps      ($inout2,$rndkey1);
1005         &movups     (&QWP(0x10,$out),$inout1);
1006         &xorps      ($inout3,$rndkey0);
1007         &movups     (&QWP(0x20,$out),$inout2);
1008         &movups     (&QWP(0x30,$out),$inout3);
1009         # common exit (the four-block case falls through to here)
1010 &set_label("ctr32_ret");
1011         &mov        ("esp",&DWP(80,"esp"));             # restore caller's %esp saved at entry
1012 &function_end("aesni_ctr32_encrypt_blocks");
1013 
1014 ######################################################################
1015 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1016 #       const AES_KEY *key1, const AES_KEY *key2,
1017 #       const unsigned char iv[16]);
1018 #
1019 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1020 
1021 &function_begin("aesni_xts_encrypt");               # emit XTS encryption; a trailing partial block is handled at xts_enc_steal
1022         &mov        ($key,&wparam(4));          # key2
1023         &mov        ($inp,&wparam(5));          # clear-text tweak
1024         # step 1: encrypt the clear-text tweak with key2; the result is the first tweak
1025         &mov        ($rounds,&DWP(240,$key));   # key2->rounds
1026         &movups     ($inout0,&QWP(0,$inp));
1027         if ($inline)
1028         {   &aesni_inline_generate1("enc"); }
1029         else
1030         {   &call   ("_aesni_encrypt1");    }
1031         # step 2: load the real arguments; $len is a byte count
1032         &mov        ($inp,&wparam(0));
1033         &mov        ($out,&wparam(1));
1034         &mov        ($len,&wparam(2));
1035         &mov        ($key,&wparam(3));          # key1
1036         # frame: 6 tweak slots at 16*0..16*5, magic constant at 16*6, saved $len and %esp at 16*7
1037         &mov        ($key_,"esp");
1038         &sub        ("esp",16*7+8);
1039         &mov        ($rounds,&DWP(240,$key));   # key1->rounds
1040         &and        ("esp",-16);                    # align stack
1041
1042         &mov        (&DWP(16*6+0,"esp"),0x87);  # compose the magic constant
1043         &mov        (&DWP(16*6+4,"esp"),0);
1044         &mov        (&DWP(16*6+8,"esp"),1);             # dword 2 = 1: carry out of the low qword into the high one
1045         &mov        (&DWP(16*6+12,"esp"),0);
1046         &mov        (&DWP(16*7+0,"esp"),$len);  # save original $len
1047         &mov        (&DWP(16*7+4,"esp"),$key_); # save original %esp
1048         # aliases: $tweak=$rndkey1, $twtmp=$rndkey0, $twres=$inout0, $twmask=$inout1
1049         &movdqa     ($tweak,$inout0);
1050         &pxor       ($twtmp,$twtmp);
1051         &movdqa     ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1052         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1053
1054         &and        ($len,-16);                         # whole blocks only; the remainder is handled at the end
1055         &mov        ($key_,$key);                   # backup $key
1056         &mov        ($rounds_,$rounds);             # backup $rounds
1057         &sub        ($len,16*6);
1058         &jc (&label("xts_enc_short"));                  # fewer than 6 whole blocks
1059
1060         &shr        ($rounds,1);                        # halved round count for the 6x code path
1061         &mov        ($rounds_,$rounds);
1062         &jmp        (&label("xts_enc_loop6"));
1063         # the Perl for-loop runs at generation time: it emits the tweak-doubling sequence four times
1064 &set_label("xts_enc_loop6",16);
1065         for ($i=0;$i<4;$i++) {
1066             &pshufd ($twres,$twtmp,0x13);               # move sign masks: dword 3 -> dword 0, dword 1 -> dword 2
1067             &pxor   ($twtmp,$twtmp);
1068             &movdqa (&QWP(16*$i,"esp"),$tweak);         # store tweak number $i
1069             &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
1070             &pand   ($twres,$twmask);       # isolate carry and residue
1071             &pcmpgtd        ($twtmp,$tweak);        # broadcast upper bits
1072             &pxor   ($tweak,$twres);                    # fold carry and 0x87 residue back in
1073         }
1074         &pshufd     ($inout5,$twtmp,0x13);              # sixth tweak is built directly in $inout5
1075         &movdqa     (&QWP(16*$i++,"esp"),$tweak);       # store fifth tweak at slot 4; $i becomes 5
1076         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1077          &$movekey  ($rndkey0,&QWP(0,$key_));           # ($movekey is chosen elsewhere in the file)
1078         &pand       ($inout5,$twmask);              # isolate carry and residue
1079          &movups    ($inout0,&QWP(0,$inp));     # load input
1080         &pxor       ($inout5,$tweak);
1081
1082         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1083         &movdqu     ($inout1,&QWP(16*1,$inp));          # overwrites $twmask; it is reloaded after the call
1084          &xorps             ($inout0,$rndkey0);     # input^=rndkey[0]
1085         &movdqu     ($inout2,&QWP(16*2,$inp));
1086          &pxor              ($inout1,$rndkey0);
1087         &movdqu     ($inout3,&QWP(16*3,$inp));
1088          &pxor              ($inout2,$rndkey0);
1089         &movdqu     ($inout4,&QWP(16*4,$inp));
1090          &pxor              ($inout3,$rndkey0);
1091         &movdqu     ($rndkey1,&QWP(16*5,$inp));         # sixth input block goes through $rndkey1 ($inout5 holds the tweak)
1092          &pxor              ($inout4,$rndkey0);
1093         &lea        ($inp,&DWP(16*6,$inp));             # advance input by 6 blocks
1094         &pxor       ($inout0,&QWP(16*0,"esp")); # input^=tweak
1095         &movdqa     (&QWP(16*$i,"esp"),$inout5);        # save last tweak
1096         &pxor       ($inout5,$rndkey1);                 # tweak ^ input for the sixth lane
1097
1098          &$movekey  ($rndkey1,&QWP(16,$key_));
1099          &lea               ($key,&DWP(32,$key_));
1100         &pxor       ($inout1,&QWP(16*1,"esp"));
1101          &aesenc    ($inout0,$rndkey1);
1102         &pxor       ($inout2,&QWP(16*2,"esp"));
1103          &aesenc    ($inout1,$rndkey1);
1104         &pxor       ($inout3,&QWP(16*3,"esp"));
1105          &dec               ($rounds);
1106          &aesenc    ($inout2,$rndkey1);
1107         &pxor       ($inout4,&QWP(16*4,"esp"));
1108          &aesenc    ($inout3,$rndkey1);
1109         &pxor               ($inout5,$rndkey0);         # sixth lane gets its key[0] xor last
1110          &aesenc    ($inout4,$rndkey1);
1111          &$movekey  ($rndkey0,&QWP(0,$key));
1112          &aesenc    ($inout5,$rndkey1);
1113         &call               (&label("_aesni_encrypt6_enter"));  # label is defined outside this section
1114         # xor outputs with the saved tweaks and store, interleaved with computing the next tweak
1115         &movdqa     ($tweak,&QWP(16*5,"esp"));  # last tweak
1116        &pxor        ($twtmp,$twtmp);
1117         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1118        &pcmpgtd     ($twtmp,$tweak);                # broadcast upper bits
1119         &xorps      ($inout1,&QWP(16*1,"esp"));
1120         &movups     (&QWP(16*0,$out),$inout0);  # write output
1121         &xorps      ($inout2,&QWP(16*2,"esp"));
1122         &movups     (&QWP(16*1,$out),$inout1);
1123         &xorps      ($inout3,&QWP(16*3,"esp"));
1124         &movups     (&QWP(16*2,$out),$inout2);
1125         &xorps      ($inout4,&QWP(16*4,"esp"));
1126         &movups     (&QWP(16*3,$out),$inout3);
1127         &xorps      ($inout5,$tweak);
1128         &movups     (&QWP(16*4,$out),$inout4);
1129        &pshufd      ($twres,$twtmp,0x13);               # $inout0 is free again once its output is stored
1130         &movups     (&QWP(16*5,$out),$inout5);
1131         &lea        ($out,&DWP(16*6,$out));             # advance output by 6 blocks
1132        &movdqa      ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1133
1134         &pxor       ($twtmp,$twtmp);
1135         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1136         &pand       ($twres,$twmask);               # isolate carry and residue
1137         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1138         &mov        ($rounds,$rounds_);             # restore $rounds
1139         &pxor       ($tweak,$twres);                    # $tweak is now the tweak for the next block
1140
1141         &sub        ($len,16*6);
1142         &jnc        (&label("xts_enc_loop6"));          # at least 6 more whole blocks remained
1143
1144         &lea        ($rounds,&DWP(1,"",$rounds,2));     # restore $rounds: 2*$rounds+1 (assumes original value was odd - TODO confirm)
1145         &mov        ($key,$key_);                   # restore $key
1146         &mov        ($rounds_,$rounds);
1147         # 0..5 whole blocks left; $tweak is the tweak for the next block, $twtmp its sign masks
1148 &set_label("xts_enc_short");
1149         &add        ($len,16*6);                        # undo the borrow: $len = remaining whole-block bytes
1150         &jz (&label("xts_enc_done6x"));
1151
1152         &movdqa     ($inout3,$tweak);               # put aside previous tweak
1153         &cmp        ($len,0x20);
1154         &jb (&label("xts_enc_one"));
1155
1156         &pshufd     ($twres,$twtmp,0x13);
1157         &pxor       ($twtmp,$twtmp);
1158         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1159         &pand       ($twres,$twmask);               # isolate carry and residue
1160         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1161         &pxor       ($tweak,$twres);
1162         &je (&label("xts_enc_two"));                    # flags still from cmp ($len,0x20): SSE ops leave EFLAGS alone
1163
1164         &pshufd     ($twres,$twtmp,0x13);
1165         &pxor       ($twtmp,$twtmp);
1166         &movdqa     ($inout4,$tweak);               # put aside previous tweak
1167         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1168         &pand       ($twres,$twmask);               # isolate carry and residue
1169         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1170         &pxor       ($tweak,$twres);
1171         &cmp        ($len,0x40);
1172         &jb (&label("xts_enc_three"));
1173
1174         &pshufd     ($twres,$twtmp,0x13);
1175         &pxor       ($twtmp,$twtmp);
1176         &movdqa     ($inout5,$tweak);               # put aside previous tweak
1177         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1178         &pand       ($twres,$twmask);               # isolate carry and residue
1179         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1180         &pxor       ($tweak,$twres);
1181         &movdqa     (&QWP(16*0,"esp"),$inout3);         # spill 1st and 2nd tweaks: their registers will hold data
1182         &movdqa     (&QWP(16*1,"esp"),$inout4);
1183         &je (&label("xts_enc_four"));                   # flags still from cmp ($len,0x40)
1184         # five blocks: spill the 3rd and 4th tweaks too and build the 5th in $inout5
1185         &movdqa     (&QWP(16*2,"esp"),$inout5);
1186         &pshufd     ($inout5,$twtmp,0x13);
1187         &movdqa     (&QWP(16*3,"esp"),$tweak);
1188         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1189         &pand       ($inout5,$twmask);              # isolate carry and residue
1190         &pxor       ($inout5,$tweak);
1191
1192         &movdqu     ($inout0,&QWP(16*0,$inp));  # load input
1193         &movdqu     ($inout1,&QWP(16*1,$inp));
1194         &movdqu     ($inout2,&QWP(16*2,$inp));
1195         &pxor       ($inout0,&QWP(16*0,"esp")); # input^=tweak
1196         &movdqu     ($inout3,&QWP(16*3,$inp));
1197         &pxor       ($inout1,&QWP(16*1,"esp"));
1198         &movdqu     ($inout4,&QWP(16*4,$inp));
1199         &pxor       ($inout2,&QWP(16*2,"esp"));
1200         &lea        ($inp,&DWP(16*5,$inp));
1201         &pxor       ($inout3,&QWP(16*3,"esp"));
1202         &movdqa     (&QWP(16*4,"esp"),$inout5); # save last tweak
1203         &pxor       ($inout4,$inout5);
1204         # the sixth lane ($inout5) is processed by the helper but its result is never stored
1205         &call       ("_aesni_encrypt6");
1206
1207         &movaps     ($tweak,&QWP(16*4,"esp"));  # last tweak
1208         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1209         &xorps      ($inout1,&QWP(16*1,"esp"));
1210         &xorps      ($inout2,&QWP(16*2,"esp"));
1211         &movups     (&QWP(16*0,$out),$inout0);  # write output
1212         &xorps      ($inout3,&QWP(16*3,"esp"));
1213         &movups     (&QWP(16*1,$out),$inout1);
1214         &xorps      ($inout4,$tweak);
1215         &movups     (&QWP(16*2,$out),$inout2);
1216         &movups     (&QWP(16*3,$out),$inout3);
1217         &movups     (&QWP(16*4,$out),$inout4);
1218         &lea        ($out,&DWP(16*5,$out));
1219         &jmp        (&label("xts_enc_done"));
1220         # one block: its tweak is in $inout3
1221 &set_label("xts_enc_one",16);
1222         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1223         &lea        ($inp,&DWP(16*1,$inp));
1224         &xorps      ($inout0,$inout3);              # input^=tweak
1225         if ($inline)
1226         {   &aesni_inline_generate1("enc"); }
1227         else
1228         {   &call   ("_aesni_encrypt1");    }
1229         &xorps      ($inout0,$inout3);              # output^=tweak
1230         &movups     (&QWP(16*0,$out),$inout0);  # write output
1231         &lea        ($out,&DWP(16*1,$out));
1232
1233         &movdqa     ($tweak,$inout3);               # last tweak
1234         &jmp        (&label("xts_enc_done"));
1235         # two blocks: tweaks in $inout3 and $tweak
1236 &set_label("xts_enc_two",16);
1237         &movaps     ($inout4,$tweak);               # put aside last tweak
1238
1239         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1240         &movups     ($inout1,&QWP(16*1,$inp));
1241         &lea        ($inp,&DWP(16*2,$inp));
1242         &xorps      ($inout0,$inout3);              # input^=tweak
1243         &xorps      ($inout1,$inout4);
1244         &xorps      ($inout2,$inout2);                  # zero the unused third lane of the 3x helper
1245
1246         &call       ("_aesni_encrypt3");
1247
1248         &xorps      ($inout0,$inout3);              # output^=tweak
1249         &xorps      ($inout1,$inout4);
1250         &movups     (&QWP(16*0,$out),$inout0);  # write output
1251         &movups     (&QWP(16*1,$out),$inout1);
1252         &lea        ($out,&DWP(16*2,$out));
1253
1254         &movdqa     ($tweak,$inout4);               # last tweak
1255         &jmp        (&label("xts_enc_done"));
1256         # three blocks: tweaks in $inout3, $inout4 and $tweak
1257 &set_label("xts_enc_three",16);
1258         &movaps     ($inout5,$tweak);               # put aside last tweak
1259         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1260         &movups     ($inout1,&QWP(16*1,$inp));
1261         &movups     ($inout2,&QWP(16*2,$inp));
1262         &lea        ($inp,&DWP(16*3,$inp));
1263         &xorps      ($inout0,$inout3);              # input^=tweak
1264         &xorps      ($inout1,$inout4);
1265         &xorps      ($inout2,$inout5);
1266
1267         &call       ("_aesni_encrypt3");
1268
1269         &xorps      ($inout0,$inout3);              # output^=tweak
1270         &xorps      ($inout1,$inout4);
1271         &xorps      ($inout2,$inout5);
1272         &movups     (&QWP(16*0,$out),$inout0);  # write output
1273         &movups     (&QWP(16*1,$out),$inout1);
1274         &movups     (&QWP(16*2,$out),$inout2);
1275         &lea        ($out,&DWP(16*3,$out));
1276
1277         &movdqa     ($tweak,$inout5);               # last tweak
1278         &jmp        (&label("xts_enc_done"));
1279         # four blocks: tweaks 1-2 spilled at 16*0/16*1, third in $inout5, fourth in $tweak
1280 &set_label("xts_enc_four",16);
1281         &movaps     ($inout4,$tweak);               # put aside last tweak
1282
1283         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1284         &movups     ($inout1,&QWP(16*1,$inp));
1285         &movups     ($inout2,&QWP(16*2,$inp));
1286         &xorps      ($inout0,&QWP(16*0,"esp")); # input^=tweak
1287         &movups     ($inout3,&QWP(16*3,$inp));
1288         &lea        ($inp,&DWP(16*4,$inp));
1289         &xorps      ($inout1,&QWP(16*1,"esp"));
1290         &xorps      ($inout2,$inout5);
1291         &xorps      ($inout3,$inout4);
1292
1293         &call       ("_aesni_encrypt4");
1294
1295         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1296         &xorps      ($inout1,&QWP(16*1,"esp"));
1297         &xorps      ($inout2,$inout5);
1298         &movups     (&QWP(16*0,$out),$inout0);  # write output
1299         &xorps      ($inout3,$inout4);
1300         &movups     (&QWP(16*1,$out),$inout1);
1301         &movups     (&QWP(16*2,$out),$inout2);
1302         &movups     (&QWP(16*3,$out),$inout3);
1303         &lea        ($out,&DWP(16*4,$out));
1304
1305         &movdqa     ($tweak,$inout4);               # last tweak
1306         &jmp        (&label("xts_enc_done"));
1307         # no whole blocks left after the 6x loop: $tweak already is the tweak for the next block
1308 &set_label("xts_enc_done6x",16);            # $tweak is pre-calculated
1309         &mov        ($len,&DWP(16*7+0,"esp"));  # restore original $len
1310         &and        ($len,15);
1311         &jz (&label("xts_enc_ret"));                    # length was a multiple of 16: finished
1312         &movdqa     ($inout3,$tweak);                   # stealing code expects its tweak in $inout3
1313         &mov        (&DWP(16*7+0,"esp"),$len);  # save $len%16
1314         &jmp        (&label("xts_enc_steal"));
1315         # after a 1..5 block tail: $tweak is the last tweak used, so advance it once if a partial block follows
1316 &set_label("xts_enc_done",16);
1317         &mov        ($len,&DWP(16*7+0,"esp"));  # restore original $len
1318         &pxor       ($twtmp,$twtmp);
1319         &and        ($len,15);
1320         &jz (&label("xts_enc_ret"));
1321
1322         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1323         &mov        (&DWP(16*7+0,"esp"),$len);  # save $len%16
1324         &pshufd     ($inout3,$twtmp,0x13);
1325         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1326         &pand       ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1327         &pxor       ($inout3,$tweak);
1328         # stealing: byte-wise, copy trailing input over the last output block and move the displaced bytes to the final partial block
1329 &set_label("xts_enc_steal");
1330         &movz       ($rounds,&BP(0,$inp));              # $rounds and $key are scratch here; restored below
1331         &movz       ($key,&BP(-16,$out));
1332         &lea        ($inp,&DWP(1,$inp));
1333         &mov        (&BP(-16,$out),&LB($rounds));
1334         &mov        (&BP(0,$out),&LB($key));
1335         &lea        ($out,&DWP(1,$out));
1336         &sub        ($len,1);
1337         &jnz        (&label("xts_enc_steal"));
1338
1339         &sub        ($out,&DWP(16*7+0,"esp"));  # rewind $out
1340         &mov        ($key,$key_);                   # restore $key
1341         &mov        ($rounds,$rounds_);             # restore $rounds
1342         # re-encrypt the patched last full block in place with the tweak in $inout3
1343         &movups     ($inout0,&QWP(-16,$out));   # load input
1344         &xorps      ($inout0,$inout3);              # input^=tweak
1345         if ($inline)
1346         {   &aesni_inline_generate1("enc"); }
1347         else
1348         {   &call   ("_aesni_encrypt1");    }
1349         &xorps      ($inout0,$inout3);              # output^=tweak
1350         &movups     (&QWP(-16,$out),$inout0);   # write output
1351
1352 &set_label("xts_enc_ret");
1353         &mov        ("esp",&DWP(16*7+4,"esp")); # restore %esp
1354 &function_end("aesni_xts_encrypt");
1355 
1356 &function_begin("aesni_xts_decrypt");
1357         &mov        ($key,&wparam(4));          # key2
1358         &mov        ($inp,&wparam(5));          # clear-text tweak
1359 
1360         &mov        ($rounds,&DWP(240,$key));   # key2->rounds
1361         &movups     ($inout0,&QWP(0,$inp));
1362         if ($inline)
1363         {   &aesni_inline_generate1("enc"); }
1364         else
1365         {   &call   ("_aesni_encrypt1");    }
1366 
1367         &mov        ($inp,&wparam(0));
1368         &mov        ($out,&wparam(1));
1369         &mov        ($len,&wparam(2));
1370         &mov        ($key,&wparam(3));          # key1
1371 
1372         &mov        ($key_,"esp");
1373         &sub        ("esp",16*7+8);
1374         &and        ("esp",-16);                    # align stack
1375 
1376         &xor        ($rounds_,$rounds_);            # if(len%16) len-=16;
1377         &test       ($len,15);
1378         &setnz      (&LB($rounds_));
1379         &shl        ($rounds_,4);
1380         &sub        ($len,$rounds_);
1381 
1382         &mov        (&DWP(16*6+0,"esp"),0x87);  # compose the magic constant
1383         &mov        (&DWP(16*6+4,"esp"),0);
1384         &mov        (&DWP(16*6+8,"esp"),1);
1385         &mov        (&DWP(16*6+12,"esp"),0);
1386         &mov        (&DWP(16*7+0,"esp"),$len);  # save original $len
1387         &mov        (&DWP(16*7+4,"esp"),$key_); # save original %esp
1388 
1389         &mov        ($rounds,&DWP(240,$key));   # key1->rounds
1390         &mov        ($key_,$key);                   # backup $key
1391         &mov        ($rounds_,$rounds);             # backup $rounds
1392 
1393         &movdqa     ($tweak,$inout0);
1394         &pxor       ($twtmp,$twtmp);
1395         &movdqa     ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1396         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1397 
1398         &and        ($len,-16);
1399         &sub        ($len,16*6);
1400         &jc (&label("xts_dec_short"));
1401 
1402         &shr        ($rounds,1);
1403         &mov        ($rounds_,$rounds);
1404         &jmp        (&label("xts_dec_loop6"));
1405 
1406 &set_label("xts_dec_loop6",16);
1407         for ($i=0;$i<4;$i++) {
1408             &pshufd ($twres,$twtmp,0x13);
1409             &pxor   ($twtmp,$twtmp);
1410             &movdqa (&QWP(16*$i,"esp"),$tweak);
1411             &paddq  ($tweak,$tweak);        # &psllq($tweak,1);
1412             &pand   ($twres,$twmask);       # isolate carry and residue
1413             &pcmpgtd        ($twtmp,$tweak);        # broadcast upper bits
1414             &pxor   ($tweak,$twres);
1415         }
1416         &pshufd     ($inout5,$twtmp,0x13);
1417         &movdqa     (&QWP(16*$i++,"esp"),$tweak);
1418         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1419          &$movekey  ($rndkey0,&QWP(0,$key_));
1420         &pand       ($inout5,$twmask);              # isolate carry and residue
1421          &movups    ($inout0,&QWP(0,$inp));     # load input
1422         &pxor       ($inout5,$tweak);
1423 
1424         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1425         &movdqu     ($inout1,&QWP(16*1,$inp));
1426          &xorps             ($inout0,$rndkey0);     # input^=rndkey[0]
1427         &movdqu     ($inout2,&QWP(16*2,$inp));
1428          &pxor              ($inout1,$rndkey0);
1429         &movdqu     ($inout3,&QWP(16*3,$inp));
1430          &pxor              ($inout2,$rndkey0);
1431         &movdqu     ($inout4,&QWP(16*4,$inp));
1432          &pxor              ($inout3,$rndkey0);
1433         &movdqu     ($rndkey1,&QWP(16*5,$inp));
1434          &pxor              ($inout4,$rndkey0);
1435         &lea        ($inp,&DWP(16*6,$inp));
1436         &pxor       ($inout0,&QWP(16*0,"esp")); # input^=tweak
1437         &movdqa     (&QWP(16*$i,"esp"),$inout5);        # save last tweak
1438         &pxor       ($inout5,$rndkey1);
1439 
1440          &$movekey  ($rndkey1,&QWP(16,$key_));
1441          &lea               ($key,&DWP(32,$key_));
1442         &pxor       ($inout1,&QWP(16*1,"esp"));
1443          &aesdec    ($inout0,$rndkey1);
1444         &pxor       ($inout2,&QWP(16*2,"esp"));
1445          &aesdec    ($inout1,$rndkey1);
1446         &pxor       ($inout3,&QWP(16*3,"esp"));
1447          &dec               ($rounds);
1448          &aesdec    ($inout2,$rndkey1);
1449         &pxor       ($inout4,&QWP(16*4,"esp"));
1450          &aesdec    ($inout3,$rndkey1);
1451         &pxor               ($inout5,$rndkey0);
1452          &aesdec    ($inout4,$rndkey1);
1453          &$movekey  ($rndkey0,&QWP(0,$key));
1454          &aesdec    ($inout5,$rndkey1);
1455         &call               (&label("_aesni_decrypt6_enter"));
1456 
1457         &movdqa     ($tweak,&QWP(16*5,"esp"));  # last tweak
1458        &pxor        ($twtmp,$twtmp);
1459         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1460        &pcmpgtd     ($twtmp,$tweak);                # broadcast upper bits
1461         &xorps      ($inout1,&QWP(16*1,"esp"));
1462         &movups     (&QWP(16*0,$out),$inout0);  # write output
1463         &xorps      ($inout2,&QWP(16*2,"esp"));
1464         &movups     (&QWP(16*1,$out),$inout1);
1465         &xorps      ($inout3,&QWP(16*3,"esp"));
1466         &movups     (&QWP(16*2,$out),$inout2);
1467         &xorps      ($inout4,&QWP(16*4,"esp"));
1468         &movups     (&QWP(16*3,$out),$inout3);
1469         &xorps      ($inout5,$tweak);
1470         &movups     (&QWP(16*4,$out),$inout4);
1471        &pshufd      ($twres,$twtmp,0x13);
1472         &movups     (&QWP(16*5,$out),$inout5);
1473         &lea        ($out,&DWP(16*6,$out));
1474        &movdqa      ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1475 
1476         &pxor       ($twtmp,$twtmp);
1477         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1478         &pand       ($twres,$twmask);               # isolate carry and residue
1479         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1480         &mov        ($rounds,$rounds_);             # restore $rounds
1481         &pxor       ($tweak,$twres);
1482 
1483         &sub        ($len,16*6);
1484         &jnc        (&label("xts_dec_loop6"));
1485 
1486         &lea        ($rounds,&DWP(1,"",$rounds,2));     # restore $rounds
1487         &mov        ($key,$key_);                   # restore $key
1488         &mov        ($rounds_,$rounds);
1489 
1490 &set_label("xts_dec_short");
1491         &add        ($len,16*6);
1492         &jz (&label("xts_dec_done6x"));
1493 
1494         &movdqa     ($inout3,$tweak);               # put aside previous tweak
1495         &cmp        ($len,0x20);
1496         &jb (&label("xts_dec_one"));
1497 
1498         &pshufd     ($twres,$twtmp,0x13);
1499         &pxor       ($twtmp,$twtmp);
1500         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1501         &pand       ($twres,$twmask);               # isolate carry and residue
1502         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1503         &pxor       ($tweak,$twres);
1504         &je (&label("xts_dec_two"));
1505 
1506         &pshufd     ($twres,$twtmp,0x13);
1507         &pxor       ($twtmp,$twtmp);
1508         &movdqa     ($inout4,$tweak);               # put aside previous tweak
1509         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1510         &pand       ($twres,$twmask);               # isolate carry and residue
1511         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1512         &pxor       ($tweak,$twres);
1513         &cmp        ($len,0x40);
1514         &jb (&label("xts_dec_three"));
1515 
1516         &pshufd     ($twres,$twtmp,0x13);
1517         &pxor       ($twtmp,$twtmp);
1518         &movdqa     ($inout5,$tweak);               # put aside previous tweak
1519         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1520         &pand       ($twres,$twmask);               # isolate carry and residue
1521         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1522         &pxor       ($tweak,$twres);
1523         &movdqa     (&QWP(16*0,"esp"),$inout3);
1524         &movdqa     (&QWP(16*1,"esp"),$inout4);
1525         &je (&label("xts_dec_four"));
1526 
1527         &movdqa     (&QWP(16*2,"esp"),$inout5);
1528         &pshufd     ($inout5,$twtmp,0x13);
1529         &movdqa     (&QWP(16*3,"esp"),$tweak);
1530         &paddq      ($tweak,$tweak);                # &psllq($inout0,1);
1531         &pand       ($inout5,$twmask);              # isolate carry and residue
1532         &pxor       ($inout5,$tweak);
1533 
1534         &movdqu     ($inout0,&QWP(16*0,$inp));  # load input
1535         &movdqu     ($inout1,&QWP(16*1,$inp));
1536         &movdqu     ($inout2,&QWP(16*2,$inp));
1537         &pxor       ($inout0,&QWP(16*0,"esp")); # input^=tweak
1538         &movdqu     ($inout3,&QWP(16*3,$inp));
1539         &pxor       ($inout1,&QWP(16*1,"esp"));
1540         &movdqu     ($inout4,&QWP(16*4,$inp));
1541         &pxor       ($inout2,&QWP(16*2,"esp"));
1542         &lea        ($inp,&DWP(16*5,$inp));
1543         &pxor       ($inout3,&QWP(16*3,"esp"));
1544         &movdqa     (&QWP(16*4,"esp"),$inout5); # save last tweak
1545         &pxor       ($inout4,$inout5);
1546 
1547         &call       ("_aesni_decrypt6");
1548 
1549         &movaps     ($tweak,&QWP(16*4,"esp"));  # last tweak
1550         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1551         &xorps      ($inout1,&QWP(16*1,"esp"));
1552         &xorps      ($inout2,&QWP(16*2,"esp"));
1553         &movups     (&QWP(16*0,$out),$inout0);  # write output
1554         &xorps      ($inout3,&QWP(16*3,"esp"));
1555         &movups     (&QWP(16*1,$out),$inout1);
1556         &xorps      ($inout4,$tweak);
1557         &movups     (&QWP(16*2,$out),$inout2);
1558         &movups     (&QWP(16*3,$out),$inout3);
1559         &movups     (&QWP(16*4,$out),$inout4);
1560         &lea        ($out,&DWP(16*5,$out));
1561         &jmp        (&label("xts_dec_done"));
1562 
1563 &set_label("xts_dec_one",16);
1564         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1565         &lea        ($inp,&DWP(16*1,$inp));
1566         &xorps      ($inout0,$inout3);              # input^=tweak
1567         if ($inline)
1568         {   &aesni_inline_generate1("dec"); }
1569         else
1570         {   &call   ("_aesni_decrypt1");    }
1571         &xorps      ($inout0,$inout3);              # output^=tweak
1572         &movups     (&QWP(16*0,$out),$inout0);  # write output
1573         &lea        ($out,&DWP(16*1,$out));
1574 
1575         &movdqa     ($tweak,$inout3);               # last tweak
1576         &jmp        (&label("xts_dec_done"));
1577 
1578 &set_label("xts_dec_two",16);
1579         &movaps     ($inout4,$tweak);               # put aside last tweak
1580 
1581         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1582         &movups     ($inout1,&QWP(16*1,$inp));
1583         &lea        ($inp,&DWP(16*2,$inp));
1584         &xorps      ($inout0,$inout3);              # input^=tweak
1585         &xorps      ($inout1,$inout4);
1586 
1587         &call       ("_aesni_decrypt3");
1588 
1589         &xorps      ($inout0,$inout3);              # output^=tweak
1590         &xorps      ($inout1,$inout4);
1591         &movups     (&QWP(16*0,$out),$inout0);  # write output
1592         &movups     (&QWP(16*1,$out),$inout1);
1593         &lea        ($out,&DWP(16*2,$out));
1594 
1595         &movdqa     ($tweak,$inout4);               # last tweak
1596         &jmp        (&label("xts_dec_done"));
1597 
1598 &set_label("xts_dec_three",16);
1599         &movaps     ($inout5,$tweak);               # put aside last tweak
1600         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1601         &movups     ($inout1,&QWP(16*1,$inp));
1602         &movups     ($inout2,&QWP(16*2,$inp));
1603         &lea        ($inp,&DWP(16*3,$inp));
1604         &xorps      ($inout0,$inout3);              # input^=tweak
1605         &xorps      ($inout1,$inout4);
1606         &xorps      ($inout2,$inout5);
1607 
1608         &call       ("_aesni_decrypt3");
1609 
1610         &xorps      ($inout0,$inout3);              # output^=tweak
1611         &xorps      ($inout1,$inout4);
1612         &xorps      ($inout2,$inout5);
1613         &movups     (&QWP(16*0,$out),$inout0);  # write output
1614         &movups     (&QWP(16*1,$out),$inout1);
1615         &movups     (&QWP(16*2,$out),$inout2);
1616         &lea        ($out,&DWP(16*3,$out));
1617 
1618         &movdqa     ($tweak,$inout5);               # last tweak
1619         &jmp        (&label("xts_dec_done"));
1620 
1621 &set_label("xts_dec_four",16);
1622         &movaps     ($inout4,$tweak);               # put aside last tweak
1623 
1624         &movups     ($inout0,&QWP(16*0,$inp));  # load input
1625         &movups     ($inout1,&QWP(16*1,$inp));
1626         &movups     ($inout2,&QWP(16*2,$inp));
1627         &xorps      ($inout0,&QWP(16*0,"esp")); # input^=tweak
1628         &movups     ($inout3,&QWP(16*3,$inp));
1629         &lea        ($inp,&DWP(16*4,$inp));
1630         &xorps      ($inout1,&QWP(16*1,"esp"));
1631         &xorps      ($inout2,$inout5);
1632         &xorps      ($inout3,$inout4);
1633 
1634         &call       ("_aesni_decrypt4");
1635 
1636         &xorps      ($inout0,&QWP(16*0,"esp")); # output^=tweak
1637         &xorps      ($inout1,&QWP(16*1,"esp"));
1638         &xorps      ($inout2,$inout5);
1639         &movups     (&QWP(16*0,$out),$inout0);  # write output
1640         &xorps      ($inout3,$inout4);
1641         &movups     (&QWP(16*1,$out),$inout1);
1642         &movups     (&QWP(16*2,$out),$inout2);
1643         &movups     (&QWP(16*3,$out),$inout3);
1644         &lea        ($out,&DWP(16*4,$out));
1645 
1646         &movdqa     ($tweak,$inout4);               # last tweak
1647         &jmp        (&label("xts_dec_done"));
1648 
1649 &set_label("xts_dec_done6x",16);            # $tweak is pre-calculated
1650         &mov        ($len,&DWP(16*7+0,"esp"));  # restore original $len
1651         &and        ($len,15);
1652         &jz (&label("xts_dec_ret"));
1653         &mov        (&DWP(16*7+0,"esp"),$len);  # save $len%16
1654         &jmp        (&label("xts_dec_only_one_more"));
1655 
1656 &set_label("xts_dec_done",16);
1657         &mov        ($len,&DWP(16*7+0,"esp"));  # restore original $len
1658         &pxor       ($twtmp,$twtmp);
1659         &and        ($len,15);
1660         &jz (&label("xts_dec_ret"));
1661 
1662         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1663         &mov        (&DWP(16*7+0,"esp"),$len);  # save $len%16
1664         &pshufd     ($twres,$twtmp,0x13);
1665         &pxor       ($twtmp,$twtmp);
1666         &movdqa     ($twmask,&QWP(16*6,"esp"));
1667         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1668         &pand       ($twres,$twmask);               # isolate carry and residue
1669         &pcmpgtd($twtmp,$tweak);            # broadcast upper bits
1670         &pxor       ($tweak,$twres);
1671 
1672 &set_label("xts_dec_only_one_more");
1673         &pshufd     ($inout3,$twtmp,0x13);
1674         &movdqa     ($inout4,$tweak);               # put aside previous tweak
1675         &paddq      ($tweak,$tweak);                # &psllq($tweak,1);
1676         &pand       ($inout3,$twmask);              # isolate carry and residue
1677         &pxor       ($inout3,$tweak);
1678 
1679         &mov        ($key,$key_);                   # restore $key
1680         &mov        ($rounds,$rounds_);             # restore $rounds
1681 
1682         &movups     ($inout0,&QWP(0,$inp));             # load input
1683         &xorps      ($inout0,$inout3);              # input^=tweak
1684         if ($inline)
1685         {   &aesni_inline_generate1("dec"); }
1686         else
1687         {   &call   ("_aesni_decrypt1");    }
1688         &xorps      ($inout0,$inout3);              # output^=tweak
1689         &movups     (&QWP(0,$out),$inout0);             # write output
1690 
1691 &set_label("xts_dec_steal");
1692         &movz       ($rounds,&BP(16,$inp));
1693         &movz       ($key,&BP(0,$out));
1694         &lea        ($inp,&DWP(1,$inp));
1695         &mov        (&BP(0,$out),&LB($rounds));
1696         &mov        (&BP(16,$out),&LB($key));
1697         &lea        ($out,&DWP(1,$out));
1698         &sub        ($len,1);
1699         &jnz        (&label("xts_dec_steal"));
1700 
1701         &sub        ($out,&DWP(16*7+0,"esp"));  # rewind $out
1702         &mov        ($key,$key_);                   # restore $key
1703         &mov        ($rounds,$rounds_);             # restore $rounds
1704 
1705         &movups     ($inout0,&QWP(0,$out));             # load input
1706         &xorps      ($inout0,$inout4);              # input^=tweak
1707         if ($inline)
1708         {   &aesni_inline_generate1("dec"); }
1709         else
1710         {   &call   ("_aesni_decrypt1");    }
1711         &xorps      ($inout0,$inout4);              # output^=tweak
1712         &movups     (&QWP(0,$out),$inout0);             # write output
1713 
1714 &set_label("xts_dec_ret");
1715         &mov        ("esp",&DWP(16*7+4,"esp")); # restore %esp
1716 &function_end("aesni_xts_decrypt");
1717 }
1718 }
1719 
1720 ######################################################################
1721 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1722 #                           size_t length, const AES_KEY *key,
1723 #                           unsigned char *ivp,const int enc);
# Emits ${PREFIX}_cbc_encrypt, the CBC-mode bulk routine.
#
# Stack arguments, in &wparam order:
#   0  inp     input buffer
#   1  out     output buffer
#   2  length  byte count; zero returns immediately and leaves *ivp alone
#   3  key     AES_KEY; the round count is read from offset 240
#   4  ivp     16-byte IV, read on entry and rewritten on exit
#   5  enc     zero selects decryption, anything else encryption
#
# Scratch frame (16-byte aligned, carved out below the caller's %esp):
#   0(%esp)    16-byte slot: parked IV, or the final block for a partial
#              copy-out
#   16(%esp)   caller's %esp
#
# Encryption is strictly serial, one block per iteration; a trailing
# partial block is zero-padded inside the output buffer and encrypted
# in place.  Decryption runs six blocks per iteration while more than
# 0x50 bytes remain and then dispatches on the remaining length.
#
# The register behind each $name is assigned outside this block.  The
# instruction order below is load-bearing wherever two names could share
# a register, so do not reorder without checking that map.
&function_begin("${PREFIX}_cbc_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($rounds_,"esp");
        &mov    ($out,&wparam(1));
        &sub    ($rounds_,24);                  # room for 16-byte slot + saved %esp
        &mov    ($len,&wparam(2));
        &and    ($rounds_,-16);                 # align the future frame
        &mov    ($key,&wparam(3));
        &mov    ($key_,&wparam(4));             # ivp, only until the IV is loaded
        &test   ($len,$len);
        &jz     (&label("cbc_abort"));          # nothing to do, stack not switched yet

        &cmp    (&wparam(5),0);                 # enc?  consumed by the je below;
                                                # xchg/mov/movups leave EFLAGS alone
        &xchg   ($rounds_,"esp");               # alloca
        &movups ($ivec,&QWP(0,$key_));          # load IV
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);                   # backup $key
        &mov    (&DWP(16,"esp"),$rounds_);      # save original %esp
        &mov    ($rounds_,$rounds);             # backup $rounds
        &je     (&label("cbc_decrypt"));

        # ---- encryption ----
        # $inout0 carries the chaining value (IV, then previous ciphertext).
        &movaps ($inout0,$ivec);
        &cmp    ($len,16);
        &jb     (&label("cbc_enc_tail"));       # less than one full block
        &sub    ($len,16);                      # $len runs one block ahead
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
        &movups ($ivec,&QWP(0,$inp));           # input actually
        &lea    ($inp,&DWP(16,$inp));
        # Non-inline form: xor plaintext into the chaining value, then
        # encrypt.  The inline generator is handed the same two operands;
        # presumably it folds the xor in (see its definition).
        if ($inline)
        {   &aesni_inline_generate1("enc",$inout0,$ivec);   }
        else
        {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
        &mov    ($rounds,$rounds_);             # restore $rounds
        &mov    ($key,$key_);                   # restore $key
        &movups (&QWP(0,$out),$inout0);         # store output
        &lea    ($out,&DWP(16,$out));
        &sub    ($len,16);
        &jnc    (&label("cbc_enc_loop"));       # no borrow: another full block
        &add    ($len,16);                      # undo the look-ahead
        &jnz    (&label("cbc_enc_tail"));       # 1..15 bytes left over
        &movaps ($ivec,$inout0);                # last ciphertext is the new IV
        &jmp    (&label("cbc_ret"));

        # Partial final block: copy the $len leftover bytes to the output,
        # zero-fill up to 16, then run the loop once more over that block
        # in place.  $len is zero by then, so the loop exits after it.
&set_label("cbc_enc_tail");
        &mov    ("ecx",$len);                   # zaps $rounds
        &data_word(0xA4F3F689);                 # rep movsb (89 F6 = mov esi,esi filler)
        &mov    ("ecx",16);                     # zero tail
        &sub    ("ecx",$len);
        &xor    ("eax","eax");                  # zaps $len
        &data_word(0xAAF3F689);                 # rep stosb (same filler prefix)
        &lea    ($out,&DWP(-16,$out));          # rewind $out by 1 block
        &mov    ($rounds,$rounds_);             # restore $rounds
        &mov    ($inp,$out);                    # $inp and $out are the same
        &mov    ($key,$key_);                   # restore $key
        &jmp    (&label("cbc_enc_loop"));
######################################################################
        # ---- decryption ----
&set_label("cbc_decrypt",16);
        &cmp    ($len,0x50);
        &jbe    (&label("cbc_dec_tail"));       # five blocks or fewer
        &movaps (&QWP(0,"esp"),$ivec);          # save IV
        &sub    ($len,0x50);
        &jmp    (&label("cbc_dec_loop6_enter"));

        # Each pass stores only five of its six results; the sixth stays
        # in $inout5 and is flushed here at the top of the next pass (or
        # after the loop), together with parking the next IV.
&set_label("cbc_dec_loop6",16);
        &movaps (&QWP(0,"esp"),$rndkey0);       # save IV
        &movups (&QWP(0,$out),$inout5);
        &lea    ($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
        &movdqu ($inout0,&QWP(0,$inp));
        &movdqu ($inout1,&QWP(0x10,$inp));
        &movdqu ($inout2,&QWP(0x20,$inp));
        &movdqu ($inout3,&QWP(0x30,$inp));
        &movdqu ($inout4,&QWP(0x40,$inp));
        &movdqu ($inout5,&QWP(0x50,$inp));

        &call   ("_aesni_decrypt6");

        # Re-read the ciphertext to undo the chaining: block i is xored
        # with ciphertext i-1, block 0 with the parked IV.
        &movups ($rndkey1,&QWP(0,$inp));
        &movups ($rndkey0,&QWP(0x10,$inp));
        &xorps  ($inout0,&QWP(0,"esp"));        # ^=IV
        &xorps  ($inout1,$rndkey1);
        &movups ($rndkey1,&QWP(0x20,$inp));
        &xorps  ($inout2,$rndkey0);
        &movups ($rndkey0,&QWP(0x30,$inp));
        &xorps  ($inout3,$rndkey1);
        &movups ($rndkey1,&QWP(0x40,$inp));
        &xorps  ($inout4,$rndkey0);
        &movups ($rndkey0,&QWP(0x50,$inp));     # IV
        &xorps  ($inout5,$rndkey1);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &lea    ($inp,&DWP(0x60,$inp));
        &movups (&QWP(0x20,$out),$inout2);
        # Explicit terminator: without it Perl parses this call and the
        # following &movups as a single "A & B" expression.
        &mov    ($rounds,$rounds_);             # restore $rounds
        &movups (&QWP(0x30,$out),$inout3);
        &mov    ($key,$key_);                   # restore $key
        &movups (&QWP(0x40,$out),$inout4);
        &lea    ($out,&DWP(0x50,$out));
        &sub    ($len,0x60);
        &ja     (&label("cbc_dec_loop6"));

        &movaps ($inout0,$inout5);              # pending sixth block
        &movaps ($ivec,$rndkey0);               # last ciphertext is the new IV
        &add    ($len,0x50);
        &jle    (&label("cbc_dec_tail_collected"));     # input exhausted
        &movups (&QWP(0,$out),$inout0);
        &lea    ($out,&DWP(0x10,$out));

        # At most 0x50 bytes remain.  Blocks are loaded one at a time and
        # the length is compared after each load to pick a handler; $in0
        # and $in1 keep copies of ciphertext blocks 0 and 1 for chaining.
&set_label("cbc_dec_tail");
        &movups ($inout0,&QWP(0,$inp));
        &movaps ($in0,$inout0);
        &cmp    ($len,0x10);
        &jbe    (&label("cbc_dec_one"));

        &movups ($inout1,&QWP(0x10,$inp));
        &movaps ($in1,$inout1);
        &cmp    ($len,0x20);
        &jbe    (&label("cbc_dec_two"));

        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x30);
        &jbe    (&label("cbc_dec_three"));

        &movups ($inout3,&QWP(0x30,$inp));
        &cmp    ($len,0x40);
        &jbe    (&label("cbc_dec_four"));

        # Five blocks: reuse the 6-way routine with a zeroed sixth lane.
        # The IV is parked on the stack before that lane is cleared --
        # presumably because $ivec shares a register with an $inout lane;
        # confirm against the register map.
        &movups ($inout4,&QWP(0x40,$inp));
        &movaps (&QWP(0,"esp"),$ivec);          # save IV
        &movups ($inout0,&QWP(0,$inp));         # reload first block
        &xorps  ($inout5,$inout5);
        &call   ("_aesni_decrypt6");
        &movups ($rndkey1,&QWP(0,$inp));
        &movups ($rndkey0,&QWP(0x10,$inp));
        &xorps  ($inout0,&QWP(0,"esp"));        # ^= IV
        &xorps  ($inout1,$rndkey1);
        &movups ($rndkey1,&QWP(0x20,$inp));
        &xorps  ($inout2,$rndkey0);
        &movups ($rndkey0,&QWP(0x30,$inp));
        &xorps  ($inout3,$rndkey1);
        &movups ($ivec,&QWP(0x40,$inp));        # IV
        &xorps  ($inout4,$rndkey0);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &lea    ($out,&DWP(0x40,$out));
        &movaps ($inout0,$inout4);              # last block stays pending
        &sub    ($len,0x50);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
        if ($inline)
        {   &aesni_inline_generate1("dec"); }
        else
        {   &call   ("_aesni_decrypt1");    }
        &xorps  ($inout0,$ivec);
        &movaps ($ivec,$in0);                   # ciphertext 0 is the new IV
        &sub    ($len,0x10);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
        &xorps  ($inout2,$inout2);              # unused third lane
        &call   ("_aesni_decrypt3");
        &xorps  ($inout0,$ivec);
        &xorps  ($inout1,$in0);
        &movups (&QWP(0,$out),$inout0);
        &movaps ($inout0,$inout1);              # last block stays pending
        &lea    ($out,&DWP(0x10,$out));
        &movaps ($ivec,$in1);                   # ciphertext 1 is the new IV
        &sub    ($len,0x20);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
        &call   ("_aesni_decrypt3");
        &xorps  ($inout0,$ivec);
        &xorps  ($inout1,$in0);
        &xorps  ($inout2,$in1);
        &movups (&QWP(0,$out),$inout0);
        &movaps ($inout0,$inout2);              # last block stays pending
        &movups (&QWP(0x10,$out),$inout1);
        &lea    ($out,&DWP(0x20,$out));
        &movups ($ivec,&QWP(0x20,$inp));        # ciphertext 2 is the new IV
        &sub    ($len,0x30);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
        &call   ("_aesni_decrypt4");
        &movups ($rndkey1,&QWP(0x10,$inp));
        &movups ($rndkey0,&QWP(0x20,$inp));
        &xorps  ($inout0,$ivec);
        &movups ($ivec,&QWP(0x30,$inp));        # ciphertext 3 is the new IV
        &xorps  ($inout1,$in0);
        &movups (&QWP(0,$out),$inout0);
        &xorps  ($inout2,$rndkey1);
        &movups (&QWP(0x10,$out),$inout1);
        &xorps  ($inout3,$rndkey0);
        &movups (&QWP(0x20,$out),$inout2);
        &lea    ($out,&DWP(0x30,$out));
        &movaps ($inout0,$inout3);              # last block stays pending
        &sub    ($len,0x40);

        # Every decrypt path arrives here with its final plaintext block
        # in $inout0 and $out pointing at where it belongs.
&set_label("cbc_dec_tail_collected");
        &and    ($len,15);
        &jnz    (&label("cbc_dec_tail_partial"));
        &movups (&QWP(0,$out),$inout0);         # whole block
        &jmp    (&label("cbc_ret"));

        # Length was not a multiple of 16: bounce the block through the
        # stack slot and copy 16-($len&15) bytes of it with rep movsb.
        # NOTE(review): $len is a non-positive residual (length minus
        # bytes consumed) on every path into this label; confirm that
        # 16-($len&15) is the intended byte count for all such lengths.
&set_label("cbc_dec_tail_partial",16);
        &movaps (&QWP(0,"esp"),$inout0);
        &mov    ("ecx",16);
        &mov    ($inp,"esp");
        &sub    ("ecx",$len);
        &data_word(0xA4F3F689);                 # rep movsb

&set_label("cbc_ret");
        &mov    ("esp",&DWP(16,"esp"));         # pull original %esp
        &mov    ($key_,&wparam(4));             # ivp again, now that %esp is back
        &movups (&QWP(0,$key_),$ivec);          # output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
1946 
1947 ######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is a private interface,
# input:
#       "eax"   const unsigned char *userKey
#       $rounds int bits
#       $key    AES_KEY *key
# output:
#       "eax"   return code
#       $rounds rounds
1958 
1959 &function_begin_B("_aesni_set_encrypt_key");
1960         &test       ("eax","eax");
1961         &jz (&label("bad_pointer"));
1962         &test       ($key,$key);
1963         &jz (&label("bad_pointer"));
1964 
1965         &movups     ("xmm0",&QWP(0,"eax"));     # pull first 128 bits of *userKey
1966         &xorps      ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
1967         &lea        ($key,&DWP(16,$key));
1968         &cmp        ($rounds,256);
1969         &je (&label("14rounds"));
1970         &cmp        ($rounds,192);
1971         &je (&label("12rounds"));
1972         &cmp        ($rounds,128);
1973         &jne        (&label("bad_keybits"));
1974 
1975 &set_label("10rounds",16);
1976         &mov                ($rounds,9);
1977         &$movekey   (&QWP(-16,$key),"xmm0");    # round 0
1978         &aeskeygenassist("xmm1","xmm0",0x01);               # round 1
1979         &call               (&label("key_128_cold"));
1980         &aeskeygenassist("xmm1","xmm0",0x2);                # round 2
1981         &call               (&label("key_128"));
1982         &aeskeygenassist("xmm1","xmm0",0x04);               # round 3
1983         &call               (&label("key_128"));
1984         &aeskeygenassist("xmm1","xmm0",0x08);               # round 4
1985         &call               (&label("key_128"));
1986         &aeskeygenassist("xmm1","xmm0",0x10);               # round 5
1987         &call               (&label("key_128"));
1988         &aeskeygenassist("xmm1","xmm0",0x20);               # round 6
1989         &call               (&label("key_128"));
1990         &aeskeygenassist("xmm1","xmm0",0x40);               # round 7
1991         &call               (&label("key_128"));
1992         &aeskeygenassist("xmm1","xmm0",0x80);               # round 8
1993         &call               (&label("key_128"));
1994         &aeskeygenassist("xmm1","xmm0",0x1b);               # round 9
1995         &call               (&label("key_128"));
1996         &aeskeygenassist("xmm1","xmm0",0x36);               # round 10
1997         &call               (&label("key_128"));
1998         &$movekey   (&QWP(0,$key),"xmm0");
1999         &mov                (&DWP(80,$key),$rounds);
2000         &xor                ("eax","eax");
2001         &ret();
2002 
2003 &set_label("key_128",16);
2004         &$movekey   (&QWP(0,$key),"xmm0");
2005         &lea                ($key,&DWP(16,$key));
2006 &set_label("key_128_cold");
2007         &shufps             ("xmm4","xmm0",0b00010000);
2008         &xorps              ("xmm0","xmm4");
2009         &shufps             ("xmm4","xmm0",0b10001100);
2010         &xorps              ("xmm0","xmm4");
2011         &shufps             ("xmm1","xmm1",0b11111111);     # critical path
2012         &xorps              ("xmm0","xmm1");
2013         &ret();
2014 
2015 &set_label("12rounds",16);
2016         &movq               ("xmm2",&QWP(16,"eax"));    # remaining 1/3 of *userKey
2017         &mov                ($rounds,11);
2018         &$movekey   (&QWP(-16,$key),"xmm0")             # round 0
2019         &aeskeygenassist("xmm1","xmm2",0x01);               # round 1,2
2020         &call               (&label("key_192a_cold"));
2021         &aeskeygenassist("xmm1","xmm2",0x02);               # round 2,3
2022         &call               (&label("key_192b"));
2023         &aeskeygenassist("xmm1","xmm2",0x04);               # round 4,5
2024         &call               (&label("key_192a"));
2025         &aeskeygenassist("xmm1","xmm2",0x08);               # round 5,6
2026         &call               (&label("key_192b"));
2027         &aeskeygenassist("xmm1","xmm2",0x10);               # round 7,8
2028         &call               (&label("key_192a"));
2029         &aeskeygenassist("xmm1","xmm2",0x20);               # round 8,9
2030         &call               (&label("key_192b"));
2031         &aeskeygenassist("xmm1","xmm2",0x40);               # round 10,11
2032         &call               (&label("key_192a"));
2033         &aeskeygenassist("xmm1","xmm2",0x80);               # round 11,12
2034         &call               (&label("key_192b"));
2035         &$movekey   (&QWP(0,$key),"xmm0");
2036         &mov                (&DWP(48,$key),$rounds);
2037         &xor                ("eax","eax");
2038         &ret();
2039 
2040 &set_label("key_192a",16);
2041         &$movekey   (&QWP(0,$key),"xmm0");
2042         &lea                ($key,&DWP(16,$key));
2043 &set_label("key_192a_cold",16);
2044         &movaps             ("xmm5","xmm2");
2045 &set_label("key_192b_warm");
2046         &shufps             ("xmm4","xmm0",0b00010000);
2047         &movdqa             ("xmm3","xmm2");
2048         &xorps              ("xmm0","xmm4");
2049         &shufps             ("xmm4","xmm0",0b10001100);
2050         &pslldq             ("xmm3",4);
2051         &xorps              ("xmm0","xmm4");
2052         &pshufd             ("xmm1","xmm1",0b01010101);     # critical path
2053         &pxor               ("xmm2","xmm3");
2054         &pxor               ("xmm0","xmm1");
2055         &pshufd             ("xmm3","xmm0",0b11111111);
2056         &pxor               ("xmm2","xmm3");
2057         &ret();
2058 
2059 &set_label("key_192b",16);
2060         &movaps             ("xmm3","xmm0");
2061         &shufps             ("xmm5","xmm0",0b01000100);
2062         &$movekey   (&QWP(0,$key),"xmm5");
2063         &shufps             ("xmm3","xmm2",0b01001110);
2064         &$movekey   (&QWP(16,$key),"xmm3");
2065         &lea                ($key,&DWP(32,$key));
2066         &jmp                (&label("key_192b_warm"));
2067 
2068 &set_label("14rounds",16);
2069         &movups             ("xmm2",&QWP(16,"eax"));    # remaining half of *userKey
2070         &mov                ($rounds,13);
2071         &lea                ($key,&DWP(16,$key));
2072         &$movekey   (&QWP(-32,$key),"xmm0");    # round 0
2073         &$movekey   (&QWP(-16,$key),"xmm2");    # round 1
2074         &aeskeygenassist("xmm1","xmm2",0x01);               # round 2
2075         &call               (&label("key_256a_cold"));
2076         &aeskeygenassist("xmm1","xmm0",0x01);               # round 3
2077         &call               (&label("key_256b"));
2078         &aeskeygenassist("xmm1","xmm2",0x02);               # round 4
2079         &call               (&label("key_256a"));
2080         &aeskeygenassist("xmm1","xmm0",0x02);               # round 5
2081         &call               (&label("key_256b"));
2082         &aeskeygenassist("xmm1","xmm2",0x04);               # round 6
2083         &call               (&label("key_256a"));
2084         &aeskeygenassist("xmm1","xmm0",0x04);               # round 7
2085         &call               (&label("key_256b"));
2086         &aeskeygenassist("xmm1","xmm2",0x08);               # round 8
2087         &call               (&label("key_256a"));
2088         &aeskeygenassist("xmm1","xmm0",0x08);               # round 9
2089         &call               (&label("key_256b"));
2090         &aeskeygenassist("xmm1","xmm2",0x10);               # round 10
2091         &call               (&label("key_256a"));
2092         &aeskeygenassist("xmm1","xmm0",0x10);               # round 11
2093         &call               (&label("key_256b"));
2094         &aeskeygenassist("xmm1","xmm2",0x20);               # round 12
2095         &call               (&label("key_256a"));
2096         &aeskeygenassist("xmm1","xmm0",0x20);               # round 13
2097         &call               (&label("key_256b"));
2098         &aeskeygenassist("xmm1","xmm2",0x40);               # round 14
2099         &call               (&label("key_256a"));
2100         &$movekey   (&QWP(0,$key),"xmm0");
2101         &mov                (&DWP(16,$key),$rounds);
2102         &xor                ("eax","eax");
2103         &ret();
2104 
2105 &set_label("key_256a",16);
2106         &$movekey   (&QWP(0,$key),"xmm2");
2107         &lea                ($key,&DWP(16,$key));
2108 &set_label("key_256a_cold");
2109         &shufps             ("xmm4","xmm0",0b00010000);
2110         &xorps              ("xmm0","xmm4");
2111         &shufps             ("xmm4","xmm0",0b10001100);
2112         &xorps              ("xmm0","xmm4");
2113         &shufps             ("xmm1","xmm1",0b11111111);     # critical path
2114         &xorps              ("xmm0","xmm1");
2115         &ret();
2116 
2117 &set_label("key_256b",16);
2118         &$movekey   (&QWP(0,$key),"xmm0");
2119         &lea                ($key,&DWP(16,$key));
2120 
2121         &shufps             ("xmm4","xmm2",0b00010000);
2122         &xorps              ("xmm2","xmm4");
2123         &shufps             ("xmm4","xmm2",0b10001100);
2124         &xorps              ("xmm2","xmm4");
2125         &shufps             ("xmm1","xmm1",0b10101010);     # critical path
2126         &xorps              ("xmm2","xmm1");
2127         &ret();
2128 
2129 &set_label("bad_pointer",4);
2130         &mov        ("eax",-1);
2131         &ret        ();
2132 &set_label("bad_keybits",4);
2133         &mov        ("eax",-2);
2134         &ret        ();
2135 &function_end_B("_aesni_set_encrypt_key");
2136 
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper around _aesni_set_encrypt_key.  The three stack arguments
# are copied into the registers the internal routine reads (userKey -> eax,
# bits -> $rounds, key -> $key) and whatever the routine leaves in eax is
# handed straight back to the caller.
&function_begin_B("${PREFIX}_set_encrypt_key");
{
        # Destination register for stack argument 0, 1 and 2 respectively;
        # the loads are emitted in argument order.
        my @arg_regs = ("eax", $rounds, $key);
        for my $idx (0 .. $#arg_regs) {
                &mov    ($arg_regs[$idx], &wparam($idx));
        }
}
        &call   ("_aesni_set_encrypt_key");
        &ret    ();
&function_end_B("${PREFIX}_set_encrypt_key");
2146 
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Expands the key with _aesni_set_encrypt_key and, when that leaves zero in
# eax, rewrites the schedule in place for decryption: the round keys are
# reversed end-for-end and every key except the two outermost ones is run
# through aesimc.  A non-zero eax from _aesni_set_encrypt_key is returned
# to the caller unchanged and the schedule is not modified any further.
&function_begin_B("${PREFIX}_set_decrypt_key");
        &mov        ("eax",&wparam(0));
        &mov        ($rounds,&wparam(1));
        &mov        ($key,&wparam(2));
        &call       ("_aesni_set_encrypt_key");
        &mov        ($key,&wparam(2));      # reload, the callee advances $key
        # The terminating semicolon matters: without it Perl parses the
        # following "&test(...)" as the right operand of a bitwise AND.
        # Shifting by 4 scales the count to a byte offset (16 bytes per key).
        &shl        ($rounds,4);            # rounds-1 after _aesni_set_encrypt_key
        &test       ("eax","eax");
        &jnz        (&label("dec_key_ret"));
        &lea        ("eax",&DWP(16,$key,$rounds));      # end of key schedule

        # Outermost pair: exchanged as-is, no aesimc.
        &$movekey   ("xmm0",&QWP(0,$key));      # just swap
        &$movekey   ("xmm1",&QWP(0,"eax"));
        &$movekey   (&QWP(0,"eax"),"xmm0");
        &$movekey   (&QWP(0,$key),"xmm1");
        &lea        ($key,&DWP(16,$key));
        &lea        ("eax",&DWP(-16,"eax"));

        # Walk both pointers towards the middle, one pair per iteration.
        # The stores use +16/-16 because the pointers are stepped before
        # the results are written back.
&set_label("dec_key_inverse");
        &$movekey   ("xmm0",&QWP(0,$key));      # swap and inverse
        &$movekey   ("xmm1",&QWP(0,"eax"));
        &aesimc     ("xmm0","xmm0");
        &aesimc     ("xmm1","xmm1");
        &lea        ($key,&DWP(16,$key));
        &lea        ("eax",&DWP(-16,"eax"));
        &$movekey   (&QWP(16,"eax"),"xmm0");
        &$movekey   (&QWP(-16,$key),"xmm1");
        &cmp        ("eax",$key);
        &ja         (&label("dec_key_inverse"));

        # The pointers have met: one key is left and is transformed in place.
        &$movekey   ("xmm0",&QWP(0,$key));      # inverse middle
        &aesimc     ("xmm0","xmm0");
        &$movekey   (&QWP(0,$key),"xmm0");

        &xor        ("eax","eax");              # return success
&set_label("dec_key_ret");
        &ret        ();
&function_end_B("${PREFIX}_set_decrypt_key");
# Module trailer: pass the identification banner to asciz (presumably
# emitted as a NUL-terminated string in the generated output -- confirm
# against the perlasm helper), then let asm_finish complete the run.
{
        my $banner = "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>";
        &asciz($banner);
}

&asm_finish();