1 #!/usr/bin/env perl
   2 
   3 ###################################################################
   4 ### AES-128 [originally in CTR mode]                            ###
   5 ### bitsliced implementation for Intel Core 2 processors        ###
   6 ### requires support of SSE extensions up to SSSE3              ###
### Authors: Emilia Käsper and Peter Schwabe                   ###
   8 ### Date: 2009-03-19                                            ###
   9 ### Public domain                                               ###
  10 ###                                                             ###
  11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
  12 ### further information.                                        ###
  13 ###################################################################
  14 #
  15 # September 2011.
  16 #
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
  19 #
  20 # - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
  26 # - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
  33 # - support for 192- and 256-bit keys was added;
  34 #
  35 # Resulting performance in CPU cycles spent to encrypt one byte out
  36 # of 4096-byte buffer with 128-bit key is:
  37 #
  38 #               Emilia's        this(*)         difference
  39 #
  40 # Core 2        9.30            8.69            +7%
  41 # Nehalem(**)   7.63            6.98            +9%
  42 # Atom          17.1            17.4            -2%(***)
  43 #
# (*)   The comparison is not entirely fair, because "this" is ECB,
#       i.e. no extra processing such as counter value calculation
#       and xor-ing of input, as in Emilia's CTR implementation, is
#       performed. However, the CTR calculations account for no more
#       than 1% of total time, so the comparison is *rather* fair.
  49 #
  50 # (**)  Results were collected on Westmere, which is considered to
  51 #       be equivalent to Nehalem for this code.
  52 #
# (***) The slowdown on Atom is rather strange per se, because the
#       original implementation has a number of 9+-byte instructions,
#       which are bad for the Atom front-end and which I eliminated
#       completely. In an attempt to address the deterioration, sbox()
#       was tested in the FP SIMD "domain" (movaps instead of movdqa,
#       xorps instead of pxor, etc.). While that resulted in a nominal
#       4% improvement on Atom, it hurt Westmere by more than a factor
#       of 2.
  60 #
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles, and its ratio to the CPU cycles spent
# in the 8x block function, is:
  66 #
  67 #               conversion      conversion/8x block
  68 # Core 2        240             0.22
  69 # Nehalem       180             0.20
  70 # Atom          430             0.19
  71 #
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Also keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones; e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272-byte ones - 29%, 400-byte ones - 22%, etc. Yet, despite all
# these "shortcomings", it's still faster than [the
# "hyper-threading-safe" code path in] aes-x86_64.pl on all lengths
# above 64 bytes...
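#
# In other words (an approximate cost model, stated here only for
# illustration and not used anywhere in this module), an n-byte input
# costs roughly
#
#       cycles(n) ~ conversion + ceil(n/128) * cycles(8x block)
#
# so the conversion overhead amortizes as n grows, while any tail
# shorter than 128 bytes still pays for about a full 8x-block
# invocation, which is what makes lengths not divisible by 128
# *effectively* slower.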
  80 #
  81 # October 2011.
  82 #
  83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
  84 # one byte out of 4096-byte buffer with 128-bit key is:
  85 #
  86 # Core 2        9.83
  87 # Nehalem       7.74
  88 # Atom          19.0
  89 #
  90 # November 2011.
  91 #
# Add bsaes_xts_[en|de]crypt. Performance on blocks shorter than 80
# bytes is suboptimal, but XTS is meant to be used with larger blocks
# anyway...
  94 #
  95 #                                               <appro@openssl.org>
  96 
  97 $flavour = shift;
  98 $output  = shift;
  99 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 100 
 101 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 102 
 103 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 104 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 105 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 106 die "can't locate x86_64-xlate.pl";
 107 
 108 open OUT,"| \"$^X\" $xlate $flavour $output";
 109 *STDOUT=*OUT;
 110 
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx","%r8");
 112 my @XMM=map("%xmm$_",(15,0..14));       # best on Atom, +10% over (0..15)
 113 my $ecb=0;      # suppress unreferenced ECB subroutines, spare some space...
 114 
 115 {
 116 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
 117 
 118 sub Sbox {
 119 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 120 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
 121 my @b=@_[0..7];
 122 my @t=@_[8..11];
 123 my @s=@_[12..15];
 124         &InBasisChange      (@b);
 125         &Inv_GF256  (@b[6,5,0,3,7,1,4,2],@t,@s);
 126         &OutBasisChange     (@b[7,1,4,2,6,5,0,3]);
 127 }
 128 
 129 sub InBasisChange {
 130 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 131 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
 132 my @b=@_[0..7];
 133 $code.=<<___;
 134         pxor    @b[6], @b[5]
 135         pxor    @b[1], @b[2]
 136         pxor    @b[0], @b[3]
 137         pxor    @b[2], @b[6]
 138         pxor    @b[0], @b[5]
 139 
 140         pxor    @b[3], @b[6]
 141         pxor    @b[7], @b[3]
 142         pxor    @b[5], @b[7]
 143         pxor    @b[4], @b[3]
 144         pxor    @b[5], @b[4]
 145         pxor    @b[1], @b[3]
 146 
 147         pxor    @b[7], @b[2]
 148         pxor    @b[5], @b[1]
 149 ___
 150 }
 151 
 152 sub OutBasisChange {
 153 # input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 154 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
 155 my @b=@_[0..7];
 156 $code.=<<___;
 157         pxor    @b[6], @b[0]
 158         pxor    @b[4], @b[1]
 159         pxor    @b[0], @b[2]
 160         pxor    @b[6], @b[4]
 161         pxor    @b[1], @b[6]
 162 
 163         pxor    @b[5], @b[1]
 164         pxor    @b[3], @b[5]
 165         pxor    @b[7], @b[3]
 166         pxor    @b[5], @b[7]
 167         pxor    @b[5], @b[2]
 168 
 169         pxor    @b[7], @b[4]
 170 ___
 171 }
 172 
 173 sub InvSbox {
 174 # input in lsb  > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
 175 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
 176 my @b=@_[0..7];
 177 my @t=@_[8..11];
 178 my @s=@_[12..15];
 179         &InvInBasisChange   (@b);
 180         &Inv_GF256          (@b[5,1,2,6,3,7,0,4],@t,@s);
 181         &InvOutBasisChange  (@b[3,7,0,4,5,1,2,6]);
 182 }
 183 
 184 sub InvInBasisChange {          # OutBasisChange in reverse
 185 my @b=@_[5,1,2,6,3,7,0,4];
 186 $code.=<<___
 187         pxor    @b[7], @b[4]
 188 
 189         pxor    @b[5], @b[7]
 190         pxor    @b[5], @b[2]
 191         pxor    @b[7], @b[3]
 192         pxor    @b[3], @b[5]
 193         pxor    @b[5], @b[1]
 194 
 195         pxor    @b[1], @b[6]
 196         pxor    @b[0], @b[2]
 197         pxor    @b[6], @b[4]
 198         pxor    @b[6], @b[0]
 199         pxor    @b[4], @b[1]
 200 ___
 201 }
 202 
 203 sub InvOutBasisChange {         # InBasisChange in reverse
 204 my @b=@_[2,5,7,3,6,1,0,4];
 205 $code.=<<___;
 206         pxor    @b[5], @b[1]
 207         pxor    @b[7], @b[2]
 208 
 209         pxor    @b[1], @b[3]
 210         pxor    @b[5], @b[4]
 211         pxor    @b[5], @b[7]
 212         pxor    @b[4], @b[3]
 213          pxor   @b[0], @b[5]
 214         pxor    @b[7], @b[3]
 215          pxor   @b[2], @b[6]
 216          pxor   @b[1], @b[2]
 217         pxor    @b[3], @b[6]
 218 
 219         pxor    @b[0], @b[3]
 220         pxor    @b[6], @b[5]
 221 ___
 222 }
 223 
 224 sub Mul_GF4 {
 225 #;*************************************************************
 226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
 227 #;*************************************************************
 228 my ($x0,$x1,$y0,$y1,$t0)=@_;
 229 $code.=<<___;
 230         movdqa  $y0, $t0
 231         pxor    $y1, $t0
 232         pand    $x0, $t0
 233         pxor    $x1, $x0
 234         pand    $y0, $x1
 235         pand    $y1, $x0
 236         pxor    $x1, $x0
 237         pxor    $t0, $x1
 238 ___
 239 }
 240 
 241 sub Mul_GF4_N {                         # not used, see next subroutine
 242 # multiply and scale by N
 243 my ($x0,$x1,$y0,$y1,$t0)=@_;
 244 $code.=<<___;
 245         movdqa  $y0, $t0
 246         pxor    $y1, $t0
 247         pand    $x0, $t0
 248         pxor    $x1, $x0
 249         pand    $y0, $x1
 250         pand    $y1, $x0
 251         pxor    $x0, $x1
 252         pxor    $t0, $x0
 253 ___
 254 }
 255 
 256 sub Mul_GF4_N_GF4 {
 257 # interleaved Mul_GF4_N and Mul_GF4
 258 my ($x0,$x1,$y0,$y1,$t0,
 259     $x2,$x3,$y2,$y3,$t1)=@_;
 260 $code.=<<___;
 261         movdqa  $y0, $t0
 262          movdqa $y2, $t1
 263         pxor    $y1, $t0
 264          pxor   $y3, $t1
 265         pand    $x0, $t0
 266          pand   $x2, $t1
 267         pxor    $x1, $x0
 268          pxor   $x3, $x2
 269         pand    $y0, $x1
 270          pand   $y2, $x3
 271         pand    $y1, $x0
 272          pand   $y3, $x2
 273         pxor    $x0, $x1
 274          pxor   $x3, $x2
 275         pxor    $t0, $x0
 276          pxor   $t1, $x3
 277 ___
 278 }
 279 sub Mul_GF16_2 {
 280 my @x=@_[0..7];
 281 my @y=@_[8..11];
 282 my @t=@_[12..15];
 283 $code.=<<___;
 284         movdqa  @x[0], @t[0]
 285         movdqa  @x[1], @t[1]
 286 ___
 287         &Mul_GF4    (@x[0], @x[1], @y[0], @y[1], @t[2]);
 288 $code.=<<___;
 289         pxor    @x[2], @t[0]
 290         pxor    @x[3], @t[1]
 291         pxor    @y[2], @y[0]
 292         pxor    @y[3], @y[1]
 293 ___
        &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
 295                          @x[2], @x[3], @y[2], @y[3], @t[2]);
 296 $code.=<<___;
 297         pxor    @t[0], @x[0]
 298         pxor    @t[0], @x[2]
 299         pxor    @t[1], @x[1]
 300         pxor    @t[1], @x[3]
 301 
 302         movdqa  @x[4], @t[0]
 303         movdqa  @x[5], @t[1]
 304         pxor    @x[6], @t[0]
 305         pxor    @x[7], @t[1]
 306 ___
 307         &Mul_GF4_N_GF4      (@t[0], @t[1], @y[0], @y[1], @t[3],
 308                          @x[6], @x[7], @y[2], @y[3], @t[2]);
 309 $code.=<<___;
 310         pxor    @y[2], @y[0]
 311         pxor    @y[3], @y[1]
 312 ___
 313         &Mul_GF4    (@x[4], @x[5], @y[0], @y[1], @t[3]);
 314 $code.=<<___;
 315         pxor    @t[0], @x[4]
 316         pxor    @t[0], @x[6]
 317         pxor    @t[1], @x[5]
 318         pxor    @t[1], @x[7]
 319 ___
 320 }
 321 sub Inv_GF256 {
 322 #;********************************************************************
 323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
 324 #;********************************************************************
 325 my @x=@_[0..7];
 326 my @t=@_[8..11];
 327 my @s=@_[12..15];
 328 # direct optimizations from hardware
 329 $code.=<<___;
 330         movdqa  @x[4], @t[3]
 331         movdqa  @x[5], @t[2]
 332         movdqa  @x[1], @t[1]
 333         movdqa  @x[7], @s[1]
 334         movdqa  @x[0], @s[0]
 335 
 336         pxor    @x[6], @t[3]
 337         pxor    @x[7], @t[2]
 338         pxor    @x[3], @t[1]
 339          movdqa @t[3], @s[2]
 340         pxor    @x[6], @s[1]
 341          movdqa @t[2], @t[0]
 342         pxor    @x[2], @s[0]
 343          movdqa @t[3], @s[3]
 344 
 345         por     @t[1], @t[2]
 346         por     @s[0], @t[3]
 347         pxor    @t[0], @s[3]
 348         pand    @s[0], @s[2]
 349         pxor    @t[1], @s[0]
 350         pand    @t[1], @t[0]
 351         pand    @s[0], @s[3]
 352         movdqa  @x[3], @s[0]
 353         pxor    @x[2], @s[0]
 354         pand    @s[0], @s[1]
 355         pxor    @s[1], @t[3]
 356         pxor    @s[1], @t[2]
 357         movdqa  @x[4], @s[1]
 358         movdqa  @x[1], @s[0]
 359         pxor    @x[5], @s[1]
 360         pxor    @x[0], @s[0]
 361         movdqa  @s[1], @t[1]
 362         pand    @s[0], @s[1]
 363         por     @s[0], @t[1]
 364         pxor    @s[1], @t[0]
 365         pxor    @s[3], @t[3]
 366         pxor    @s[2], @t[2]
 367         pxor    @s[3], @t[1]
 368         movdqa  @x[7], @s[0]
 369         pxor    @s[2], @t[0]
 370         movdqa  @x[6], @s[1]
 371         pxor    @s[2], @t[1]
 372         movdqa  @x[5], @s[2]
 373         pand    @x[3], @s[0]
 374         movdqa  @x[4], @s[3]
 375         pand    @x[2], @s[1]
 376         pand    @x[1], @s[2]
 377         por     @x[0], @s[3]
 378         pxor    @s[0], @t[3]
 379         pxor    @s[1], @t[2]
 380         pxor    @s[2], @t[1]
 381         pxor    @s[3], @t[0]
 382 
 383         #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
 384 
 385         # new smaller inversion
 386 
 387         movdqa  @t[3], @s[0]
 388         pand    @t[1], @t[3]
 389         pxor    @t[2], @s[0]
 390 
 391         movdqa  @t[0], @s[2]
 392         movdqa  @s[0], @s[3]
 393         pxor    @t[3], @s[2]
 394         pand    @s[2], @s[3]
 395 
 396         movdqa  @t[1], @s[1]
 397         pxor    @t[2], @s[3]
 398         pxor    @t[0], @s[1]
 399 
 400         pxor    @t[2], @t[3]
 401 
 402         pand    @t[3], @s[1]
 403 
 404         movdqa  @s[2], @t[2]
 405         pxor    @t[0], @s[1]
 406 
 407         pxor    @s[1], @t[2]
 408         pxor    @s[1], @t[1]
 409 
 410         pand    @t[0], @t[2]
 411 
 412         pxor    @t[2], @s[2]
 413         pxor    @t[2], @t[1]
 414 
 415         pand    @s[3], @s[2]
 416 
 417         pxor    @s[0], @s[2]
 418 ___
 419 # output in s3, s2, s1, t1
 420 
 421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
 422 
 423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
 424         &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
 425 
 426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
 427 }
 428 
 429 # AES linear components
 430 
 431 sub ShiftRows {
 432 my @x=@_[0..7];
 433 my $mask=pop;
 434 $code.=<<___;
 435         pxor    0x00($key),@x[0]
 436         pxor    0x10($key),@x[1]
 437         pshufb  $mask,@x[0]
 438         pxor    0x20($key),@x[2]
 439         pshufb  $mask,@x[1]
 440         pxor    0x30($key),@x[3]
 441         pshufb  $mask,@x[2]
 442         pxor    0x40($key),@x[4]
 443         pshufb  $mask,@x[3]
 444         pxor    0x50($key),@x[5]
 445         pshufb  $mask,@x[4]
 446         pxor    0x60($key),@x[6]
 447         pshufb  $mask,@x[5]
 448         pxor    0x70($key),@x[7]
 449         pshufb  $mask,@x[6]
 450         lea     0x80($key),$key
 451         pshufb  $mask,@x[7]
 452 ___
 453 }
 454 
 455 sub MixColumns {
 456 # modified to emit output in order suitable for feeding back to aesenc[last]
 457 my @x=@_[0..7];
 458 my @t=@_[8..15];
 459 my $inv=@_[16]; # optional
 460 $code.=<<___;
 461         pshufd  \$0x93, @x[0], @t[0]    # x0 <<< 32
 462         pshufd  \$0x93, @x[1], @t[1]
 463          pxor   @t[0], @x[0]            # x0 ^ (x0 <<< 32)
 464         pshufd  \$0x93, @x[2], @t[2]
 465          pxor   @t[1], @x[1]
 466         pshufd  \$0x93, @x[3], @t[3]
 467          pxor   @t[2], @x[2]
 468         pshufd  \$0x93, @x[4], @t[4]
 469          pxor   @t[3], @x[3]
 470         pshufd  \$0x93, @x[5], @t[5]
 471          pxor   @t[4], @x[4]
 472         pshufd  \$0x93, @x[6], @t[6]
 473          pxor   @t[5], @x[5]
 474         pshufd  \$0x93, @x[7], @t[7]
 475          pxor   @t[6], @x[6]
 476          pxor   @t[7], @x[7]
 477 
 478         pxor    @x[0], @t[1]
 479         pxor    @x[7], @t[0]
 480         pxor    @x[7], @t[1]
 481          pshufd \$0x4E, @x[0], @x[0]    # (x0 ^ (x0 <<< 32)) <<< 64)
 482         pxor    @x[1], @t[2]
 483          pshufd \$0x4E, @x[1], @x[1]
 484         pxor    @x[4], @t[5]
 485          pxor   @t[0], @x[0]
 486         pxor    @x[5], @t[6]
 487          pxor   @t[1], @x[1]
 488         pxor    @x[3], @t[4]
 489          pshufd \$0x4E, @x[4], @t[0]
 490         pxor    @x[6], @t[7]
 491          pshufd \$0x4E, @x[5], @t[1]
 492         pxor    @x[2], @t[3]
 493          pshufd \$0x4E, @x[3], @x[4]
 494         pxor    @x[7], @t[3]
 495          pshufd \$0x4E, @x[7], @x[5]
 496         pxor    @x[7], @t[4]
 497          pshufd \$0x4E, @x[6], @x[3]
 498         pxor    @t[4], @t[0]
 499          pshufd \$0x4E, @x[2], @x[6]
 500         pxor    @t[5], @t[1]
 501 ___
 502 $code.=<<___ if (!$inv);
 503         pxor    @t[3], @x[4]
 504         pxor    @t[7], @x[5]
 505         pxor    @t[6], @x[3]
 506          movdqa @t[0], @x[2]
 507         pxor    @t[2], @x[6]
 508          movdqa @t[1], @x[7]
 509 ___
 510 $code.=<<___ if ($inv);
 511         pxor    @x[4], @t[3]
 512         pxor    @t[7], @x[5]
 513         pxor    @x[3], @t[6]
 514          movdqa @t[0], @x[3]
 515         pxor    @t[2], @x[6]
 516          movdqa @t[6], @x[2]
 517          movdqa @t[1], @x[7]
 518          movdqa @x[6], @x[4]
 519          movdqa @t[3], @x[6]
 520 ___
 521 }
 522 
 523 sub InvMixColumns_orig {
 524 my @x=@_[0..7];
 525 my @t=@_[8..15];
 526 
 527 $code.=<<___;
 528         # multiplication by 0x0e
 529         pshufd  \$0x93, @x[7], @t[7]
 530         movdqa  @x[2], @t[2]
 531         pxor    @x[5], @x[7]            # 7 5
 532         pxor    @x[5], @x[2]            # 2 5
 533         pshufd  \$0x93, @x[0], @t[0]
 534         movdqa  @x[5], @t[5]
 535         pxor    @x[0], @x[5]            # 5 0           [1]
 536         pxor    @x[1], @x[0]            # 0 1
 537         pshufd  \$0x93, @x[1], @t[1]
 538         pxor    @x[2], @x[1]            # 1 25
 539         pxor    @x[6], @x[0]            # 01 6          [2]
 540         pxor    @x[3], @x[1]            # 125 3         [4]
 541         pshufd  \$0x93, @x[3], @t[3]
 542         pxor    @x[0], @x[2]            # 25 016        [3]
 543         pxor    @x[7], @x[3]            # 3 75
 544         pxor    @x[6], @x[7]            # 75 6          [0]
 545         pshufd  \$0x93, @x[6], @t[6]
 546         movdqa  @x[4], @t[4]
 547         pxor    @x[4], @x[6]            # 6 4
 548         pxor    @x[3], @x[4]            # 4 375         [6]
 549         pxor    @x[7], @x[3]            # 375 756=36
 550         pxor    @t[5], @x[6]            # 64 5          [7]
 551         pxor    @t[2], @x[3]            # 36 2
 552         pxor    @t[4], @x[3]            # 362 4         [5]
 553         pshufd  \$0x93, @t[5], @t[5]
 554 ___
 555                                         my @y = @x[7,5,0,2,1,3,4,6];
 556 $code.=<<___;
 557         # multiplication by 0x0b
 558         pxor    @y[0], @y[1]
 559         pxor    @t[0], @y[0]
 560         pxor    @t[1], @y[1]
 561         pshufd  \$0x93, @t[2], @t[2]
 562         pxor    @t[5], @y[0]
 563         pxor    @t[6], @y[1]
 564         pxor    @t[7], @y[0]
 565         pshufd  \$0x93, @t[4], @t[4]
 566         pxor    @t[6], @t[7]            # clobber t[7]
 567         pxor    @y[0], @y[1]
 568 
 569         pxor    @t[0], @y[3]
 570         pshufd  \$0x93, @t[0], @t[0]
 571         pxor    @t[1], @y[2]
 572         pxor    @t[1], @y[4]
 573         pxor    @t[2], @y[2]
 574         pshufd  \$0x93, @t[1], @t[1]
 575         pxor    @t[2], @y[3]
 576         pxor    @t[2], @y[5]
 577         pxor    @t[7], @y[2]
 578         pshufd  \$0x93, @t[2], @t[2]
 579         pxor    @t[3], @y[3]
 580         pxor    @t[3], @y[6]
 581         pxor    @t[3], @y[4]
 582         pshufd  \$0x93, @t[3], @t[3]
 583         pxor    @t[4], @y[7]
 584         pxor    @t[4], @y[5]
 585         pxor    @t[7], @y[7]
 586         pxor    @t[5], @y[3]
 587         pxor    @t[4], @y[4]
 588         pxor    @t[5], @t[7]            # clobber t[7] even more
 589 
 590         pxor    @t[7], @y[5]
 591         pshufd  \$0x93, @t[4], @t[4]
 592         pxor    @t[7], @y[6]
 593         pxor    @t[7], @y[4]
 594 
 595         pxor    @t[5], @t[7]
 596         pshufd  \$0x93, @t[5], @t[5]
 597         pxor    @t[6], @t[7]            # restore t[7]
 598 
 599         # multiplication by 0x0d
 600         pxor    @y[7], @y[4]
 601         pxor    @t[4], @y[7]
 602         pshufd  \$0x93, @t[6], @t[6]
 603         pxor    @t[0], @y[2]
 604         pxor    @t[5], @y[7]
 605         pxor    @t[2], @y[2]
 606         pshufd  \$0x93, @t[7], @t[7]
 607 
 608         pxor    @y[1], @y[3]
 609         pxor    @t[1], @y[1]
 610         pxor    @t[0], @y[0]
 611         pxor    @t[0], @y[3]
 612         pxor    @t[5], @y[1]
 613         pxor    @t[5], @y[0]
 614         pxor    @t[7], @y[1]
 615         pshufd  \$0x93, @t[0], @t[0]
 616         pxor    @t[6], @y[0]
 617         pxor    @y[1], @y[3]
 618         pxor    @t[1], @y[4]
 619         pshufd  \$0x93, @t[1], @t[1]
 620 
 621         pxor    @t[7], @y[7]
 622         pxor    @t[2], @y[4]
 623         pxor    @t[2], @y[5]
 624         pshufd  \$0x93, @t[2], @t[2]
 625         pxor    @t[6], @y[2]
 626         pxor    @t[3], @t[6]            # clobber t[6]
 627         pxor    @y[7], @y[4]
 628         pxor    @t[6], @y[3]
 629 
 630         pxor    @t[6], @y[6]
 631         pxor    @t[5], @y[5]
 632         pxor    @t[4], @y[6]
 633         pshufd  \$0x93, @t[4], @t[4]
 634         pxor    @t[6], @y[5]
 635         pxor    @t[7], @y[6]
 636         pxor    @t[3], @t[6]            # restore t[6]
 637 
 638         pshufd  \$0x93, @t[5], @t[5]
 639         pshufd  \$0x93, @t[6], @t[6]
 640         pshufd  \$0x93, @t[7], @t[7]
 641         pshufd  \$0x93, @t[3], @t[3]
 642 
 643         # multiplication by 0x09
 644         pxor    @y[1], @y[4]
 645         pxor    @y[1], @t[1]            # t[1]=y[1]
 646         pxor    @t[5], @t[0]            # clobber t[0]
 647         pxor    @t[5], @t[1]
 648         pxor    @t[0], @y[3]
 649         pxor    @y[0], @t[0]            # t[0]=y[0]
 650         pxor    @t[6], @t[1]
 651         pxor    @t[7], @t[6]            # clobber t[6]
 652         pxor    @t[1], @y[4]
 653         pxor    @t[4], @y[7]
 654         pxor    @y[4], @t[4]            # t[4]=y[4]
 655         pxor    @t[3], @y[6]
 656         pxor    @y[3], @t[3]            # t[3]=y[3]
 657         pxor    @t[2], @y[5]
 658         pxor    @y[2], @t[2]            # t[2]=y[2]
 659         pxor    @t[7], @t[3]
 660         pxor    @y[5], @t[5]            # t[5]=y[5]
 661         pxor    @t[6], @t[2]
 662         pxor    @t[6], @t[5]
 663         pxor    @y[6], @t[6]            # t[6]=y[6]
 664         pxor    @y[7], @t[7]            # t[7]=y[7]
 665 
 666         movdqa  @t[0],@XMM[0]
 667         movdqa  @t[1],@XMM[1]
 668         movdqa  @t[2],@XMM[2]
 669         movdqa  @t[3],@XMM[3]
 670         movdqa  @t[4],@XMM[4]
 671         movdqa  @t[5],@XMM[5]
 672         movdqa  @t[6],@XMM[6]
 673         movdqa  @t[7],@XMM[7]
 674 ___
 675 }
 676 
 677 sub InvMixColumns {
 678 my @x=@_[0..7];
 679 my @t=@_[8..15];
 680 
# Thanks to Jussi Kivilinna for providing a pointer to
 683 # | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
 684 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
 685 # | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
 686 # | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
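#
# Multiplication by the sparse 05-00-04-00 circulant reduces, per bit
# plane, to x ^= 4*(x ^ (x <<< 64)) (the pshufd 0x4E block below),
# after which the forward MixColumns supplies the 02-03-01-01 factor.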
 687 
 688 $code.=<<___;
 689         # multiplication by 0x05-0x00-0x04-0x00
 690         pshufd  \$0x4E, @x[0], @t[0]
 691         pshufd  \$0x4E, @x[6], @t[6]
 692         pxor    @x[0], @t[0]
 693         pshufd  \$0x4E, @x[7], @t[7]
 694         pxor    @x[6], @t[6]
 695         pshufd  \$0x4E, @x[1], @t[1]
 696         pxor    @x[7], @t[7]
 697         pshufd  \$0x4E, @x[2], @t[2]
 698         pxor    @x[1], @t[1]
 699         pshufd  \$0x4E, @x[3], @t[3]
 700         pxor    @x[2], @t[2]
 701          pxor   @t[6], @x[0]
 702          pxor   @t[6], @x[1]
 703         pshufd  \$0x4E, @x[4], @t[4]
 704         pxor    @x[3], @t[3]
 705          pxor   @t[0], @x[2]
 706          pxor   @t[1], @x[3]
 707         pshufd  \$0x4E, @x[5], @t[5]
 708         pxor    @x[4], @t[4]
 709          pxor   @t[7], @x[1]
 710          pxor   @t[2], @x[4]
 711         pxor    @x[5], @t[5]
 712 
 713          pxor   @t[7], @x[2]
 714          pxor   @t[6], @x[3]
 715          pxor   @t[6], @x[4]
 716          pxor   @t[3], @x[5]
 717          pxor   @t[4], @x[6]
 718          pxor   @t[7], @x[4]
 719          pxor   @t[7], @x[5]
 720          pxor   @t[5], @x[7]
 721 ___
 722         &MixColumns (@x,@t,1);      # flipped 2<->3 and 4<->6
 723 }
 724 
 725 sub aesenc {                            # not used
 726 my @b=@_[0..7];
 727 my @t=@_[8..15];
 728 $code.=<<___;
 729         movdqa  0x30($const),@t[0]      # .LSR
 730 ___
 731         &ShiftRows  (@b,@t[0]);
 732         &Sbox               (@b,@t);
 733         &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
 734 }
 735 
 736 sub aesenclast {                        # not used
 737 my @b=@_[0..7];
 738 my @t=@_[8..15];
 739 $code.=<<___;
 740         movdqa  0x40($const),@t[0]      # .LSRM0
 741 ___
 742         &ShiftRows  (@b,@t[0]);
 743         &Sbox               (@b,@t);
 744 $code.=<<___
 745         pxor    0x00($key),@b[0]
 746         pxor    0x10($key),@b[1]
 747         pxor    0x20($key),@b[4]
 748         pxor    0x30($key),@b[6]
 749         pxor    0x40($key),@b[3]
 750         pxor    0x50($key),@b[7]
 751         pxor    0x60($key),@b[2]
 752         pxor    0x70($key),@b[5]
 753 ___
 754 }
 755 
 756 sub swapmove {
 757 my ($a,$b,$n,$mask,$t)=@_;
 758 $code.=<<___;
 759         movdqa  $b,$t
 760         psrlq   \$$n,$b
 761         pxor    $a,$b
 762         pand    $mask,$b
 763         pxor    $b,$a
 764         psllq   \$$n,$b
 765         pxor    $t,$b
 766 ___
 767 }
 768 sub swapmove2x {
 769 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
 770 $code.=<<___;
 771         movdqa  $b0,$t0
 772         psrlq   \$$n,$b0
 773          movdqa $b1,$t1
 774          psrlq  \$$n,$b1
 775         pxor    $a0,$b0
 776          pxor   $a1,$b1
 777         pand    $mask,$b0
 778          pand   $mask,$b1
 779         pxor    $b0,$a0
 780         psllq   \$$n,$b0
 781          pxor   $b1,$a1
 782          psllq  \$$n,$b1
 783         pxor    $t0,$b0
 784          pxor   $t1,$b1
 785 ___
 786 }
 787 
 788 sub bitslice {
 789 my @x=reverse(@_[0..7]);
 790 my ($t0,$t1,$t2,$t3)=@_[8..11];
 791 $code.=<<___;
 792         movdqa  0x00($const),$t0        # .LBS0
 793         movdqa  0x10($const),$t1        # .LBS1
 794 ___
 795         &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
 796         &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 797 $code.=<<___;
 798         movdqa  0x20($const),$t0        # .LBS2
 799 ___
 800         &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
 801         &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 802 
 803         &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
 804         &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
 805 }
 806 
 807 $code.=<<___;
 808 .text
 809 
 810 .extern asm_AES_encrypt
 811 .extern asm_AES_decrypt
 812 
 813 .type   _bsaes_encrypt8,\@abi-omnipotent
 814 .align  64
 815 _bsaes_encrypt8:
 816         lea     .LBS0(%rip), $const     # constants table
 817 
 818         movdqa  ($key), @XMM[9]         # round 0 key
 819         lea     0x10($key), $key
 820         movdqa  0x50($const), @XMM[8]   # .LM0SR
 821         pxor    @XMM[9], @XMM[0]        # xor with round0 key
 822         pxor    @XMM[9], @XMM[1]
 823          pshufb @XMM[8], @XMM[0]
 824         pxor    @XMM[9], @XMM[2]
 825          pshufb @XMM[8], @XMM[1]
 826         pxor    @XMM[9], @XMM[3]
 827          pshufb @XMM[8], @XMM[2]
 828         pxor    @XMM[9], @XMM[4]
 829          pshufb @XMM[8], @XMM[3]
 830         pxor    @XMM[9], @XMM[5]
 831          pshufb @XMM[8], @XMM[4]
 832         pxor    @XMM[9], @XMM[6]
 833          pshufb @XMM[8], @XMM[5]
 834         pxor    @XMM[9], @XMM[7]
 835          pshufb @XMM[8], @XMM[6]
 836          pshufb @XMM[8], @XMM[7]
 837 _bsaes_encrypt8_bitslice:
 838 ___
 839         &bitslice   (@XMM[0..7, 8..11]);
 840 $code.=<<___;
 841         dec     $rounds
 842         jmp     .Lenc_sbox
 843 .align  16
 844 .Lenc_loop:
 845 ___
 846         &ShiftRows  (@XMM[0..7, 8]);
 847 $code.=".Lenc_sbox:\n";
 848         &Sbox               (@XMM[0..7, 8..15]);
 849 $code.=<<___;
 850         dec     $rounds
 851         jl      .Lenc_done
 852 ___
 853         &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
 854 $code.=<<___;
 855         movdqa  0x30($const), @XMM[8]   # .LSR
 856         jnz     .Lenc_loop
 857         movdqa  0x40($const), @XMM[8]   # .LSRM0
 858         jmp     .Lenc_loop
 859 .align  16
 860 .Lenc_done:
 861 ___
 862         # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
 863         &bitslice   (@XMM[0,1,4,6,3,7,2,5, 8..11]);
 864 $code.=<<___;
 865         movdqa  ($key), @XMM[8]         # last round key
 866         pxor    @XMM[8], @XMM[4]
 867         pxor    @XMM[8], @XMM[6]
 868         pxor    @XMM[8], @XMM[3]
 869         pxor    @XMM[8], @XMM[7]
 870         pxor    @XMM[8], @XMM[2]
 871         pxor    @XMM[8], @XMM[5]
 872         pxor    @XMM[8], @XMM[0]
 873         pxor    @XMM[8], @XMM[1]
 874         ret
 875 .size   _bsaes_encrypt8,.-_bsaes_encrypt8
 876 
 877 .type   _bsaes_decrypt8,\@abi-omnipotent
 878 .align  64
 879 _bsaes_decrypt8:
 880         lea     .LBS0(%rip), $const     # constants table
 881 
 882         movdqa  ($key), @XMM[9]         # round 0 key
 883         lea     0x10($key), $key
 884         movdqa  -0x30($const), @XMM[8]  # .LM0ISR
 885         pxor    @XMM[9], @XMM[0]        # xor with round0 key
 886         pxor    @XMM[9], @XMM[1]
 887          pshufb @XMM[8], @XMM[0]
 888         pxor    @XMM[9], @XMM[2]
 889          pshufb @XMM[8], @XMM[1]
 890         pxor    @XMM[9], @XMM[3]
 891          pshufb @XMM[8], @XMM[2]
 892         pxor    @XMM[9], @XMM[4]
 893          pshufb @XMM[8], @XMM[3]
 894         pxor    @XMM[9], @XMM[5]
 895          pshufb @XMM[8], @XMM[4]
 896         pxor    @XMM[9], @XMM[6]
 897          pshufb @XMM[8], @XMM[5]
 898         pxor    @XMM[9], @XMM[7]
 899          pshufb @XMM[8], @XMM[6]
 900          pshufb @XMM[8], @XMM[7]
 901 ___
 902         &bitslice   (@XMM[0..7, 8..11]);
 903 $code.=<<___;
 904         dec     $rounds
 905         jmp     .Ldec_sbox
 906 .align  16
 907 .Ldec_loop:
 908 ___
 909         &ShiftRows  (@XMM[0..7, 8]);
 910 $code.=".Ldec_sbox:\n";
 911         &InvSbox    (@XMM[0..7, 8..15]);
 912 $code.=<<___;
 913         dec     $rounds
 914         jl      .Ldec_done
 915 ___
 916         &InvMixColumns      (@XMM[0,1,6,4,2,7,3,5, 8..15]);
 917 $code.=<<___;
 918         movdqa  -0x10($const), @XMM[8]  # .LISR
 919         jnz     .Ldec_loop
 920         movdqa  -0x20($const), @XMM[8]  # .LISRM0
 921         jmp     .Ldec_loop
 922 .align  16
 923 .Ldec_done:
 924 ___
 925         &bitslice   (@XMM[0,1,6,4,2,7,3,5, 8..11]);
 926 $code.=<<___;
 927         movdqa  ($key), @XMM[8]         # last round key
 928         pxor    @XMM[8], @XMM[6]
 929         pxor    @XMM[8], @XMM[4]
 930         pxor    @XMM[8], @XMM[2]
 931         pxor    @XMM[8], @XMM[7]
 932         pxor    @XMM[8], @XMM[3]
 933         pxor    @XMM[8], @XMM[5]
 934         pxor    @XMM[8], @XMM[0]
 935         pxor    @XMM[8], @XMM[1]
 936         ret
 937 .size   _bsaes_decrypt8,.-_bsaes_decrypt8
 938 ___
 939 }
 940 {
 941 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
 942 
 943 sub bitslice_key {
 944 my @x=reverse(@_[0..7]);
 945 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
 946 
 947         &swapmove   (@x[0,1],1,$bs0,$t2,$t3);
 948 $code.=<<___;
 949         #&swapmove(@x[2,3],1,$t0,$t2,$t3);
 950         movdqa  @x[0], @x[2]
 951         movdqa  @x[1], @x[3]
 952 ___
 953         #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
 954 
 955         &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
 956 $code.=<<___;
 957         #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
 958         movdqa  @x[0], @x[4]
 959         movdqa  @x[2], @x[6]
 960         movdqa  @x[1], @x[5]
 961         movdqa  @x[3], @x[7]
 962 ___
 963         &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
 964         &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
 965 }
 966 
 967 $code.=<<___;
 968 .type   _bsaes_key_convert,\@abi-omnipotent
 969 .align  16
 970 _bsaes_key_convert:
 971         lea     .Lmasks(%rip), $const
 972         movdqu  ($inp), %xmm7           # load round 0 key
 973         lea     0x10($inp), $inp
 974         movdqa  0x00($const), %xmm0     # 0x01...
 975         movdqa  0x10($const), %xmm1     # 0x02...
 976         movdqa  0x20($const), %xmm2     # 0x04...
 977         movdqa  0x30($const), %xmm3     # 0x08...
 978         movdqa  0x40($const), %xmm4     # .LM0
 979         pcmpeqd %xmm5, %xmm5            # .LNOT
 980 
 981         movdqu  ($inp), %xmm6           # load round 1 key
 982         movdqa  %xmm7, ($out)           # save round 0 key
 983         lea     0x10($out), $out
 984         dec     $rounds
 985         jmp     .Lkey_loop
 986 .align  16
 987 .Lkey_loop:
 988         pshufb  %xmm4, %xmm6            # .LM0
 989 
 990         movdqa  %xmm0,  %xmm8
 991         movdqa  %xmm1,  %xmm9
 992 
 993         pand    %xmm6,  %xmm8
 994         pand    %xmm6,  %xmm9
 995         movdqa  %xmm2,  %xmm10
 996         pcmpeqb %xmm0,  %xmm8
 997         psllq   \$4,    %xmm0           # 0x10...
 998         movdqa  %xmm3,  %xmm11
 999         pcmpeqb %xmm1,  %xmm9
1000         psllq   \$4,    %xmm1           # 0x20...
1001 
1002         pand    %xmm6,  %xmm10
1003         pand    %xmm6,  %xmm11
1004         movdqa  %xmm0,  %xmm12
1005         pcmpeqb %xmm2,  %xmm10
1006         psllq   \$4,    %xmm2           # 0x40...
1007         movdqa  %xmm1,  %xmm13
1008         pcmpeqb %xmm3,  %xmm11
1009         psllq   \$4,    %xmm3           # 0x80...
1010 
1011         movdqa  %xmm2,  %xmm14
1012         movdqa  %xmm3,  %xmm15
1013          pxor   %xmm5,  %xmm8           # "pnot"
1014          pxor   %xmm5,  %xmm9
1015 
1016         pand    %xmm6,  %xmm12
1017         pand    %xmm6,  %xmm13
1018          movdqa %xmm8, 0x00($out)       # write bit-sliced round key
1019         pcmpeqb %xmm0,  %xmm12
1020         psrlq   \$4,    %xmm0           # 0x01...
1021          movdqa %xmm9, 0x10($out)
1022         pcmpeqb %xmm1,  %xmm13
1023         psrlq   \$4,    %xmm1           # 0x02...
1024          lea    0x10($inp), $inp
1025 
1026         pand    %xmm6,  %xmm14
1027         pand    %xmm6,  %xmm15
1028          movdqa %xmm10, 0x20($out)
1029         pcmpeqb %xmm2,  %xmm14
1030         psrlq   \$4,    %xmm2           # 0x04...
1031          movdqa %xmm11, 0x30($out)
1032         pcmpeqb %xmm3,  %xmm15
1033         psrlq   \$4,    %xmm3           # 0x08...
1034          movdqu ($inp), %xmm6           # load next round key
1035 
1036         pxor    %xmm5, %xmm13           # "pnot"
1037         pxor    %xmm5, %xmm14
1038         movdqa  %xmm12, 0x40($out)
1039         movdqa  %xmm13, 0x50($out)
1040         movdqa  %xmm14, 0x60($out)
1041         movdqa  %xmm15, 0x70($out)
1042         lea     0x80($out),$out
1043         dec     $rounds
1044         jnz     .Lkey_loop
1045 
1046         movdqa  0x50($const), %xmm7     # .L63
1047         #movdqa %xmm6, ($out)           # don't save last round key
1048         ret
1049 .size   _bsaes_key_convert,.-_bsaes_key_convert
1050 ___
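# In short: each iteration of .Lkey_loop takes one 16-byte round key,
# reorders it with .LM0, and expands it to eight 16-byte masks by
# AND-ing with the 0x01/0x02/.../0x80 patterns and comparing back with
# pcmpeqb, so that bit i of every key byte becomes a full 0x00/0xff
# byte in plane i. The selective "pnot" of planes 0, 1, 5 and 6 (the
# set bits of 0x63) folds the S-box's affine constant into the key
# schedule, so that Sbox() above need not add it.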
1051 }
1052 
if (0 && !$win64) {     # the following four functions are an unsupported
                        # interface, used only for benchmarking...
1055 $code.=<<___;
1056 .globl  bsaes_enc_key_convert
1057 .type   bsaes_enc_key_convert,\@function,2
1058 .align  16
1059 bsaes_enc_key_convert:
1060         mov     240($inp),%r10d         # pass rounds
1061         mov     $inp,%rcx               # pass key
1062         mov     $out,%rax               # pass key schedule
1063         call    _bsaes_key_convert
1064         pxor    %xmm6,%xmm7             # fix up last round key
1065         movdqa  %xmm7,(%rax)            # save last round key
1066         ret
1067 .size   bsaes_enc_key_convert,.-bsaes_enc_key_convert
1068 
1069 .globl  bsaes_encrypt_128
1070 .type   bsaes_encrypt_128,\@function,4
1071 .align  16
1072 bsaes_encrypt_128:
1073 .Lenc128_loop:
1074         movdqu  0x00($inp), @XMM[0]     # load input
1075         movdqu  0x10($inp), @XMM[1]
1076         movdqu  0x20($inp), @XMM[2]
1077         movdqu  0x30($inp), @XMM[3]
1078         movdqu  0x40($inp), @XMM[4]
1079         movdqu  0x50($inp), @XMM[5]
1080         movdqu  0x60($inp), @XMM[6]
1081         movdqu  0x70($inp), @XMM[7]
1082         mov     $key, %rax              # pass the $key
1083         lea     0x80($inp), $inp
1084         mov     \$10,%r10d
1085 
1086         call    _bsaes_encrypt8
1087 
1088         movdqu  @XMM[0], 0x00($out)     # write output
1089         movdqu  @XMM[1], 0x10($out)
1090         movdqu  @XMM[4], 0x20($out)
1091         movdqu  @XMM[6], 0x30($out)
1092         movdqu  @XMM[3], 0x40($out)
1093         movdqu  @XMM[7], 0x50($out)
1094         movdqu  @XMM[2], 0x60($out)
1095         movdqu  @XMM[5], 0x70($out)
1096         lea     0x80($out), $out
1097         sub     \$0x80,$len
1098         ja      .Lenc128_loop
1099         ret
1100 .size   bsaes_encrypt_128,.-bsaes_encrypt_128
1101 
1102 .globl  bsaes_dec_key_convert
1103 .type   bsaes_dec_key_convert,\@function,2
1104 .align  16
1105 bsaes_dec_key_convert:
1106         mov     240($inp),%r10d         # pass rounds
1107         mov     $inp,%rcx               # pass key
1108         mov     $out,%rax               # pass key schedule
1109         call    _bsaes_key_convert
1110         pxor    ($out),%xmm7            # fix up round 0 key
1111         movdqa  %xmm6,(%rax)            # save last round key
1112         movdqa  %xmm7,($out)
1113         ret
1114 .size   bsaes_dec_key_convert,.-bsaes_dec_key_convert
1115 
1116 .globl  bsaes_decrypt_128
1117 .type   bsaes_decrypt_128,\@function,4
1118 .align  16
1119 bsaes_decrypt_128:
1120 .Ldec128_loop:
1121         movdqu  0x00($inp), @XMM[0]     # load input
1122         movdqu  0x10($inp), @XMM[1]
1123         movdqu  0x20($inp), @XMM[2]
1124         movdqu  0x30($inp), @XMM[3]
1125         movdqu  0x40($inp), @XMM[4]
1126         movdqu  0x50($inp), @XMM[5]
1127         movdqu  0x60($inp), @XMM[6]
1128         movdqu  0x70($inp), @XMM[7]
1129         mov     $key, %rax              # pass the $key
1130         lea     0x80($inp), $inp
1131         mov     \$10,%r10d
1132 
1133         call    _bsaes_decrypt8
1134 
1135         movdqu  @XMM[0], 0x00($out)     # write output
1136         movdqu  @XMM[1], 0x10($out)
1137         movdqu  @XMM[6], 0x20($out)
1138         movdqu  @XMM[4], 0x30($out)
1139         movdqu  @XMM[2], 0x40($out)
1140         movdqu  @XMM[7], 0x50($out)
1141         movdqu  @XMM[3], 0x60($out)
1142         movdqu  @XMM[5], 0x70($out)
1143         lea     0x80($out), $out
1144         sub     \$0x80,$len
1145         ja      .Ldec128_loop
1146         ret
1147 .size   bsaes_decrypt_128,.-bsaes_decrypt_128
1148 ___
1149 }
1150 {
1151 ######################################################################
1152 #
1153 # OpenSSL interface
1154 #
1155 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1156                                                 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1157 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1158 
1159 if ($ecb) {
1160 $code.=<<___;
1161 .globl  bsaes_ecb_encrypt_blocks
1162 .type   bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1163 .align  16
1164 bsaes_ecb_encrypt_blocks:
1165         mov     %rsp, %rax
1166 .Lecb_enc_prologue:
1167         push    %rbp
1168         push    %rbx
1169         push    %r12
1170         push    %r13
1171         push    %r14
1172         push    %r15
1173         lea     -0x48(%rsp),%rsp
1174 ___
1175 $code.=<<___ if ($win64);
1176         lea     -0xa0(%rsp), %rsp
1177         movaps  %xmm6, 0x40(%rsp)
1178         movaps  %xmm7, 0x50(%rsp)
1179         movaps  %xmm8, 0x60(%rsp)
1180         movaps  %xmm9, 0x70(%rsp)
1181         movaps  %xmm10, 0x80(%rsp)
1182         movaps  %xmm11, 0x90(%rsp)
1183         movaps  %xmm12, 0xa0(%rsp)
1184         movaps  %xmm13, 0xb0(%rsp)
1185         movaps  %xmm14, 0xc0(%rsp)
1186         movaps  %xmm15, 0xd0(%rsp)
1187 .Lecb_enc_body:
1188 ___
1189 $code.=<<___;
1190         mov     %rsp,%rbp               # backup %rsp
1191         mov     240($arg4),%eax         # rounds
1192         mov     $arg1,$inp              # backup arguments
1193         mov     $arg2,$out
1194         mov     $arg3,$len
1195         mov     $arg4,$key
1196         cmp     \$8,$arg3
1197         jb      .Lecb_enc_short
1198 
1199         mov     %eax,%ebx               # backup rounds
1200         shl     \$7,%rax                # 128 bytes per inner round key
1201         sub     \$`128-32`,%rax         # size of bit-sliced key schedule
1202         sub     %rax,%rsp
1203         mov     %rsp,%rax               # pass key schedule
1204         mov     $key,%rcx               # pass key
1205         mov     %ebx,%r10d              # pass rounds
1206         call    _bsaes_key_convert
1207         pxor    %xmm6,%xmm7             # fix up last round key
1208         movdqa  %xmm7,(%rax)            # save last round key
1209 
1210         sub     \$8,$len
1211 .Lecb_enc_loop:
1212         movdqu  0x00($inp), @XMM[0]     # load input
1213         movdqu  0x10($inp), @XMM[1]
1214         movdqu  0x20($inp), @XMM[2]
1215         movdqu  0x30($inp), @XMM[3]
1216         movdqu  0x40($inp), @XMM[4]
1217         movdqu  0x50($inp), @XMM[5]
1218         mov     %rsp, %rax              # pass key schedule
1219         movdqu  0x60($inp), @XMM[6]
1220         mov     %ebx,%r10d              # pass rounds
1221         movdqu  0x70($inp), @XMM[7]
1222         lea     0x80($inp), $inp
1223 
1224         call    _bsaes_encrypt8
1225 
1226         movdqu  @XMM[0], 0x00($out)     # write output
1227         movdqu  @XMM[1], 0x10($out)
1228         movdqu  @XMM[4], 0x20($out)
1229         movdqu  @XMM[6], 0x30($out)
1230         movdqu  @XMM[3], 0x40($out)
1231         movdqu  @XMM[7], 0x50($out)
1232         movdqu  @XMM[2], 0x60($out)
1233         movdqu  @XMM[5], 0x70($out)
1234         lea     0x80($out), $out
1235         sub     \$8,$len
1236         jnc     .Lecb_enc_loop
1237 
1238         add     \$8,$len
1239         jz      .Lecb_enc_done
1240 
1241         movdqu  0x00($inp), @XMM[0]     # load input
1242         mov     %rsp, %rax              # pass key schedule
1243         mov     %ebx,%r10d              # pass rounds
1244         cmp     \$2,$len
1245         jb      .Lecb_enc_one
1246         movdqu  0x10($inp), @XMM[1]
1247         je      .Lecb_enc_two
1248         movdqu  0x20($inp), @XMM[2]
1249         cmp     \$4,$len
1250         jb      .Lecb_enc_three
1251         movdqu  0x30($inp), @XMM[3]
1252         je      .Lecb_enc_four
1253         movdqu  0x40($inp), @XMM[4]
1254         cmp     \$6,$len
1255         jb      .Lecb_enc_five
1256         movdqu  0x50($inp), @XMM[5]
1257         je      .Lecb_enc_six
1258         movdqu  0x60($inp), @XMM[6]
1259         call    _bsaes_encrypt8
1260         movdqu  @XMM[0], 0x00($out)     # write output
1261         movdqu  @XMM[1], 0x10($out)
1262         movdqu  @XMM[4], 0x20($out)
1263         movdqu  @XMM[6], 0x30($out)
1264         movdqu  @XMM[3], 0x40($out)
1265         movdqu  @XMM[7], 0x50($out)
1266         movdqu  @XMM[2], 0x60($out)
1267         jmp     .Lecb_enc_done
1268 .align  16
1269 .Lecb_enc_six:
1270         call    _bsaes_encrypt8
1271         movdqu  @XMM[0], 0x00($out)     # write output
1272         movdqu  @XMM[1], 0x10($out)
1273         movdqu  @XMM[4], 0x20($out)
1274         movdqu  @XMM[6], 0x30($out)
1275         movdqu  @XMM[3], 0x40($out)
1276         movdqu  @XMM[7], 0x50($out)
1277         jmp     .Lecb_enc_done
1278 .align  16
1279 .Lecb_enc_five:
1280         call    _bsaes_encrypt8
1281         movdqu  @XMM[0], 0x00($out)     # write output
1282         movdqu  @XMM[1], 0x10($out)
1283         movdqu  @XMM[4], 0x20($out)
1284         movdqu  @XMM[6], 0x30($out)
1285         movdqu  @XMM[3], 0x40($out)
1286         jmp     .Lecb_enc_done
1287 .align  16
1288 .Lecb_enc_four:
1289         call    _bsaes_encrypt8
1290         movdqu  @XMM[0], 0x00($out)     # write output
1291         movdqu  @XMM[1], 0x10($out)
1292         movdqu  @XMM[4], 0x20($out)
1293         movdqu  @XMM[6], 0x30($out)
1294         jmp     .Lecb_enc_done
1295 .align  16
1296 .Lecb_enc_three:
1297         call    _bsaes_encrypt8
1298         movdqu  @XMM[0], 0x00($out)     # write output
1299         movdqu  @XMM[1], 0x10($out)
1300         movdqu  @XMM[4], 0x20($out)
1301         jmp     .Lecb_enc_done
1302 .align  16
1303 .Lecb_enc_two:
1304         call    _bsaes_encrypt8
1305         movdqu  @XMM[0], 0x00($out)     # write output
1306         movdqu  @XMM[1], 0x10($out)
1307         jmp     .Lecb_enc_done
1308 .align  16
1309 .Lecb_enc_one:
1310         call    _bsaes_encrypt8
1311         movdqu  @XMM[0], 0x00($out)     # write output
1312         jmp     .Lecb_enc_done
1313 .align  16
1314 .Lecb_enc_short:
1315         lea     ($inp), $arg1
1316         lea     ($out), $arg2
1317         lea     ($key), $arg3
1318         call    asm_AES_encrypt
1319         lea     16($inp), $inp
1320         lea     16($out), $out
1321         dec     $len
1322         jnz     .Lecb_enc_short
1323 
1324 .Lecb_enc_done:
1325         lea     (%rsp),%rax
1326         pxor    %xmm0, %xmm0
1327 .Lecb_enc_bzero:                        # wipe key schedule [if any]
1328         movdqa  %xmm0, 0x00(%rax)
1329         movdqa  %xmm0, 0x10(%rax)
1330         lea     0x20(%rax), %rax
1331         cmp     %rax, %rbp
1332         jb      .Lecb_enc_bzero
1333 
1334         lea     (%rbp),%rsp             # restore %rsp
1335 ___
1336 $code.=<<___ if ($win64);
1337         movaps  0x40(%rbp), %xmm6
1338         movaps  0x50(%rbp), %xmm7
1339         movaps  0x60(%rbp), %xmm8
1340         movaps  0x70(%rbp), %xmm9
1341         movaps  0x80(%rbp), %xmm10
1342         movaps  0x90(%rbp), %xmm11
1343         movaps  0xa0(%rbp), %xmm12
1344         movaps  0xb0(%rbp), %xmm13
1345         movaps  0xc0(%rbp), %xmm14
1346         movaps  0xd0(%rbp), %xmm15
1347         lea     0xa0(%rbp), %rsp
1348 ___
1349 $code.=<<___;
1350         mov     0x48(%rsp), %r15
1351         mov     0x50(%rsp), %r14
1352         mov     0x58(%rsp), %r13
1353         mov     0x60(%rsp), %r12
1354         mov     0x68(%rsp), %rbx
1355         mov     0x70(%rsp), %rax
1356         lea     0x78(%rsp), %rsp
1357         mov     %rax, %rbp
1358 .Lecb_enc_epilogue:
1359         ret
1360 .size   bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1361 
1362 .globl  bsaes_ecb_decrypt_blocks
1363 .type   bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1364 .align  16
1365 bsaes_ecb_decrypt_blocks:
1366         mov     %rsp, %rax
1367 .Lecb_dec_prologue:
1368         push    %rbp
1369         push    %rbx
1370         push    %r12
1371         push    %r13
1372         push    %r14
1373         push    %r15
1374         lea     -0x48(%rsp),%rsp
1375 ___
1376 $code.=<<___ if ($win64);
1377         lea     -0xa0(%rsp), %rsp
1378         movaps  %xmm6, 0x40(%rsp)
1379         movaps  %xmm7, 0x50(%rsp)
1380         movaps  %xmm8, 0x60(%rsp)
1381         movaps  %xmm9, 0x70(%rsp)
1382         movaps  %xmm10, 0x80(%rsp)
1383         movaps  %xmm11, 0x90(%rsp)
1384         movaps  %xmm12, 0xa0(%rsp)
1385         movaps  %xmm13, 0xb0(%rsp)
1386         movaps  %xmm14, 0xc0(%rsp)
1387         movaps  %xmm15, 0xd0(%rsp)
1388 .Lecb_dec_body:
1389 ___
1390 $code.=<<___;
1391         mov     %rsp,%rbp               # backup %rsp
1392         mov     240($arg4),%eax         # rounds
1393         mov     $arg1,$inp              # backup arguments
1394         mov     $arg2,$out
1395         mov     $arg3,$len
1396         mov     $arg4,$key
1397         cmp     \$8,$arg3
1398         jb      .Lecb_dec_short
1399 
1400         mov     %eax,%ebx               # backup rounds
1401         shl     \$7,%rax                # 128 bytes per inner round key
1402         sub     \$`128-32`,%rax         # size of bit-sliced key schedule
1403         sub     %rax,%rsp
1404         mov     %rsp,%rax               # pass key schedule
1405         mov     $key,%rcx               # pass key
1406         mov     %ebx,%r10d              # pass rounds
1407         call    _bsaes_key_convert
        pxor    (%rsp),%xmm7            # fix up round 0 key
1409         movdqa  %xmm6,(%rax)            # save last round key
1410         movdqa  %xmm7,(%rsp)
1411 
1412         sub     \$8,$len
1413 .Lecb_dec_loop:
1414         movdqu  0x00($inp), @XMM[0]     # load input
1415         movdqu  0x10($inp), @XMM[1]
1416         movdqu  0x20($inp), @XMM[2]
1417         movdqu  0x30($inp), @XMM[3]
1418         movdqu  0x40($inp), @XMM[4]
1419         movdqu  0x50($inp), @XMM[5]
1420         mov     %rsp, %rax              # pass key schedule
1421         movdqu  0x60($inp), @XMM[6]
1422         mov     %ebx,%r10d              # pass rounds
1423         movdqu  0x70($inp), @XMM[7]
1424         lea     0x80($inp), $inp
1425 
1426         call    _bsaes_decrypt8
1427 
1428         movdqu  @XMM[0], 0x00($out)     # write output
1429         movdqu  @XMM[1], 0x10($out)
1430         movdqu  @XMM[6], 0x20($out)
1431         movdqu  @XMM[4], 0x30($out)
1432         movdqu  @XMM[2], 0x40($out)
1433         movdqu  @XMM[7], 0x50($out)
1434         movdqu  @XMM[3], 0x60($out)
1435         movdqu  @XMM[5], 0x70($out)
1436         lea     0x80($out), $out
1437         sub     \$8,$len
1438         jnc     .Lecb_dec_loop
1439 
1440         add     \$8,$len
1441         jz      .Lecb_dec_done
1442 
1443         movdqu  0x00($inp), @XMM[0]     # load input
1444         mov     %rsp, %rax              # pass key schedule
1445         mov     %ebx,%r10d              # pass rounds
1446         cmp     \$2,$len
1447         jb      .Lecb_dec_one
1448         movdqu  0x10($inp), @XMM[1]
1449         je      .Lecb_dec_two
1450         movdqu  0x20($inp), @XMM[2]
1451         cmp     \$4,$len
1452         jb      .Lecb_dec_three
1453         movdqu  0x30($inp), @XMM[3]
1454         je      .Lecb_dec_four
1455         movdqu  0x40($inp), @XMM[4]
1456         cmp     \$6,$len
1457         jb      .Lecb_dec_five
1458         movdqu  0x50($inp), @XMM[5]
1459         je      .Lecb_dec_six
1460         movdqu  0x60($inp), @XMM[6]
1461         call    _bsaes_decrypt8
1462         movdqu  @XMM[0], 0x00($out)     # write output
1463         movdqu  @XMM[1], 0x10($out)
1464         movdqu  @XMM[6], 0x20($out)
1465         movdqu  @XMM[4], 0x30($out)
1466         movdqu  @XMM[2], 0x40($out)
1467         movdqu  @XMM[7], 0x50($out)
1468         movdqu  @XMM[3], 0x60($out)
1469         jmp     .Lecb_dec_done
1470 .align  16
1471 .Lecb_dec_six:
1472         call    _bsaes_decrypt8
1473         movdqu  @XMM[0], 0x00($out)     # write output
1474         movdqu  @XMM[1], 0x10($out)
1475         movdqu  @XMM[6], 0x20($out)
1476         movdqu  @XMM[4], 0x30($out)
1477         movdqu  @XMM[2], 0x40($out)
1478         movdqu  @XMM[7], 0x50($out)
1479         jmp     .Lecb_dec_done
1480 .align  16
1481 .Lecb_dec_five:
1482         call    _bsaes_decrypt8
1483         movdqu  @XMM[0], 0x00($out)     # write output
1484         movdqu  @XMM[1], 0x10($out)
1485         movdqu  @XMM[6], 0x20($out)
1486         movdqu  @XMM[4], 0x30($out)
1487         movdqu  @XMM[2], 0x40($out)
1488         jmp     .Lecb_dec_done
1489 .align  16
1490 .Lecb_dec_four:
1491         call    _bsaes_decrypt8
1492         movdqu  @XMM[0], 0x00($out)     # write output
1493         movdqu  @XMM[1], 0x10($out)
1494         movdqu  @XMM[6], 0x20($out)
1495         movdqu  @XMM[4], 0x30($out)
1496         jmp     .Lecb_dec_done
1497 .align  16
1498 .Lecb_dec_three:
1499         call    _bsaes_decrypt8
1500         movdqu  @XMM[0], 0x00($out)     # write output
1501         movdqu  @XMM[1], 0x10($out)
1502         movdqu  @XMM[6], 0x20($out)
1503         jmp     .Lecb_dec_done
1504 .align  16
1505 .Lecb_dec_two:
1506         call    _bsaes_decrypt8
1507         movdqu  @XMM[0], 0x00($out)     # write output
1508         movdqu  @XMM[1], 0x10($out)
1509         jmp     .Lecb_dec_done
1510 .align  16
1511 .Lecb_dec_one:
1512         call    _bsaes_decrypt8
1513         movdqu  @XMM[0], 0x00($out)     # write output
1514         jmp     .Lecb_dec_done
1515 .align  16
1516 .Lecb_dec_short:
1517         lea     ($inp), $arg1
1518         lea     ($out), $arg2
1519         lea     ($key), $arg3
1520         call    asm_AES_decrypt
1521         lea     16($inp), $inp
1522         lea     16($out), $out
1523         dec     $len
1524         jnz     .Lecb_dec_short
1525 
1526 .Lecb_dec_done:
1527         lea     (%rsp),%rax
1528         pxor    %xmm0, %xmm0
1529 .Lecb_dec_bzero:                        # wipe key schedule [if any]
1530         movdqa  %xmm0, 0x00(%rax)
1531         movdqa  %xmm0, 0x10(%rax)
1532         lea     0x20(%rax), %rax
1533         cmp     %rax, %rbp
1534         jb      .Lecb_dec_bzero
1535 
1536         lea     (%rbp),%rsp             # restore %rsp
1537 ___
1538 $code.=<<___ if ($win64);
1539         movaps  0x40(%rbp), %xmm6
1540         movaps  0x50(%rbp), %xmm7
1541         movaps  0x60(%rbp), %xmm8
1542         movaps  0x70(%rbp), %xmm9
1543         movaps  0x80(%rbp), %xmm10
1544         movaps  0x90(%rbp), %xmm11
1545         movaps  0xa0(%rbp), %xmm12
1546         movaps  0xb0(%rbp), %xmm13
1547         movaps  0xc0(%rbp), %xmm14
1548         movaps  0xd0(%rbp), %xmm15
1549         lea     0xa0(%rbp), %rsp
1550 ___
1551 $code.=<<___;
1552         mov     0x48(%rsp), %r15
1553         mov     0x50(%rsp), %r14
1554         mov     0x58(%rsp), %r13
1555         mov     0x60(%rsp), %r12
1556         mov     0x68(%rsp), %rbx
1557         mov     0x70(%rsp), %rax
1558         lea     0x78(%rsp), %rsp
1559         mov     %rax, %rbp
1560 .Lecb_dec_epilogue:
1561         ret
1562 .size   bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1563 ___
1564 }
1565 $code.=<<___;
1566 .extern asm_AES_cbc_encrypt
1567 .globl  bsaes_cbc_encrypt
1568 .type   bsaes_cbc_encrypt,\@abi-omnipotent
1569 .align  16
1570 bsaes_cbc_encrypt:
1571 ___
1572 $code.=<<___ if ($win64);
1573         mov     48(%rsp),$arg6          # pull direction flag
1574 ___
1575 $code.=<<___;
        cmp     \$0,$arg6
        jne     asm_AES_cbc_encrypt
        cmp     \$128,$arg3
        jb      asm_AES_cbc_encrypt

        mov     %rsp, %rax
.Lcbc_dec_prologue:
        push    %rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
        mov     0xa0(%rsp),$arg5        # pull ivp
        lea     -0xa0(%rsp), %rsp
        movaps  %xmm6, 0x40(%rsp)
        movaps  %xmm7, 0x50(%rsp)
        movaps  %xmm8, 0x60(%rsp)
        movaps  %xmm9, 0x70(%rsp)
        movaps  %xmm10, 0x80(%rsp)
        movaps  %xmm11, 0x90(%rsp)
        movaps  %xmm12, 0xa0(%rsp)
        movaps  %xmm13, 0xb0(%rsp)
        movaps  %xmm14, 0xc0(%rsp)
        movaps  %xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
        mov     %rsp, %rbp              # backup %rsp
        mov     240($arg4), %eax        # rounds
        mov     $arg1, $inp             # backup arguments
        mov     $arg2, $out
        mov     $arg3, $len
        mov     $arg4, $key
        mov     $arg5, %rbx
        shr     \$4, $len               # bytes to blocks

        mov     %eax, %edx              # rounds
        shl     \$7, %rax               # 128 bytes per inner round key
        sub     \$`128-32`, %rax        # size of bit-sliced key schedule
        sub     %rax, %rsp
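        # %rax is now 128*rounds-96 = 128*(rounds-1)+32 bytes: each inner
        # round key occupies eight bit-sliced xmm words (128 bytes), while
        # the first and last round keys are kept as single 16-byte vectors
        # (see the _bsaes_key_convert fix-up right below).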

        mov     %rsp, %rax              # pass key schedule
        mov     $key, %rcx              # pass key
        mov     %edx, %r10d             # pass rounds
        call    _bsaes_key_convert
        pxor    (%rsp),%xmm7            # fix up 0 round key
        movdqa  %xmm6,(%rax)            # save last round key
        movdqa  %xmm7,(%rsp)

        movdqu  (%rbx), @XMM[15]        # load IV
        sub     \$8,$len
.Lcbc_dec_loop:
        movdqu  0x00($inp), @XMM[0]     # load input
        movdqu  0x10($inp), @XMM[1]
        movdqu  0x20($inp), @XMM[2]
        movdqu  0x30($inp), @XMM[3]
        movdqu  0x40($inp), @XMM[4]
        movdqu  0x50($inp), @XMM[5]
        mov     %rsp, %rax              # pass key schedule
        movdqu  0x60($inp), @XMM[6]
        mov     %edx,%r10d              # pass rounds
        movdqu  0x70($inp), @XMM[7]
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV

        call    _bsaes_decrypt8

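        # _bsaes_decrypt8 returns blocks 0..7 in \@XMM[0,1,6,4,2,7,3,5], a
        # by-product of the un-bit-slicing shuffle, which is why the CBC
        # xors and stores below pair the registers in that order.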
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[9], @XMM[6]
        movdqu  0x30($inp), @XMM[11]
        pxor    @XMM[10], @XMM[4]
        movdqu  0x40($inp), @XMM[12]
        pxor    @XMM[11], @XMM[2]
        movdqu  0x50($inp), @XMM[13]
        pxor    @XMM[12], @XMM[7]
        movdqu  0x60($inp), @XMM[14]
        pxor    @XMM[13], @XMM[3]
        movdqu  0x70($inp), @XMM[15]    # IV
        pxor    @XMM[14], @XMM[5]
        movdqu  @XMM[0], 0x00($out)     # write output
        lea     0x80($inp), $inp
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        movdqu  @XMM[2], 0x40($out)
        movdqu  @XMM[7], 0x50($out)
        movdqu  @XMM[3], 0x60($out)
        movdqu  @XMM[5], 0x70($out)
        lea     0x80($out), $out
        sub     \$8,$len
        jnc     .Lcbc_dec_loop

        add     \$8,$len
        jz      .Lcbc_dec_done

        movdqu  0x00($inp), @XMM[0]     # load input
        mov     %rsp, %rax              # pass key schedule
        mov     %edx, %r10d             # pass rounds
        cmp     \$2,$len
        jb      .Lcbc_dec_one
        movdqu  0x10($inp), @XMM[1]
        je      .Lcbc_dec_two
        movdqu  0x20($inp), @XMM[2]
        cmp     \$4,$len
        jb      .Lcbc_dec_three
        movdqu  0x30($inp), @XMM[3]
        je      .Lcbc_dec_four
        movdqu  0x40($inp), @XMM[4]
        cmp     \$6,$len
        jb      .Lcbc_dec_five
        movdqu  0x50($inp), @XMM[5]
        je      .Lcbc_dec_six
        movdqu  0x60($inp), @XMM[6]
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[9], @XMM[6]
        movdqu  0x30($inp), @XMM[11]
        pxor    @XMM[10], @XMM[4]
        movdqu  0x40($inp), @XMM[12]
        pxor    @XMM[11], @XMM[2]
        movdqu  0x50($inp), @XMM[13]
        pxor    @XMM[12], @XMM[7]
        movdqu  0x60($inp), @XMM[15]    # IV
        pxor    @XMM[13], @XMM[3]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        movdqu  @XMM[2], 0x40($out)
        movdqu  @XMM[7], 0x50($out)
        movdqu  @XMM[3], 0x60($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_six:
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[9], @XMM[6]
        movdqu  0x30($inp), @XMM[11]
        pxor    @XMM[10], @XMM[4]
        movdqu  0x40($inp), @XMM[12]
        pxor    @XMM[11], @XMM[2]
        movdqu  0x50($inp), @XMM[15]    # IV
        pxor    @XMM[12], @XMM[7]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        movdqu  @XMM[2], 0x40($out)
        movdqu  @XMM[7], 0x50($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_five:
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[9], @XMM[6]
        movdqu  0x30($inp), @XMM[11]
        pxor    @XMM[10], @XMM[4]
        movdqu  0x40($inp), @XMM[15]    # IV
        pxor    @XMM[11], @XMM[2]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        movdqu  @XMM[2], 0x40($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_four:
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[9], @XMM[6]
        movdqu  0x30($inp), @XMM[15]    # IV
        pxor    @XMM[10], @XMM[4]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_three:
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[8], @XMM[1]
        movdqu  0x20($inp), @XMM[15]    # IV
        pxor    @XMM[9], @XMM[6]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_two:
        movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
        call    _bsaes_decrypt8
        pxor    0x20(%rbp), @XMM[0]     # ^= IV
        movdqu  0x00($inp), @XMM[8]     # re-load input
        movdqu  0x10($inp), @XMM[15]    # IV
        pxor    @XMM[8], @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        jmp     .Lcbc_dec_done
.align  16
.Lcbc_dec_one:
        lea     ($inp), $arg1
        lea     0x20(%rbp), $arg2       # buffer output
        lea     ($key), $arg3
        call    asm_AES_decrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[15]    # ^= IV
        movdqu  @XMM[15], ($out)        # write output
        movdqa  @XMM[0], @XMM[15]       # IV

.Lcbc_dec_done:
        movdqu  @XMM[15], (%rbx)        # return IV
        lea     (%rsp), %rax
        pxor    %xmm0, %xmm0
.Lcbc_dec_bzero:                        # wipe key schedule [if any]
        movdqa  %xmm0, 0x00(%rax)
        movdqa  %xmm0, 0x10(%rax)
        lea     0x20(%rax), %rax
        cmp     %rax, %rbp
        ja      .Lcbc_dec_bzero

        lea     (%rbp),%rsp             # restore %rsp
___
$code.=<<___ if ($win64);
        movaps  0x40(%rbp), %xmm6
        movaps  0x50(%rbp), %xmm7
        movaps  0x60(%rbp), %xmm8
        movaps  0x70(%rbp), %xmm9
        movaps  0x80(%rbp), %xmm10
        movaps  0x90(%rbp), %xmm11
        movaps  0xa0(%rbp), %xmm12
        movaps  0xb0(%rbp), %xmm13
        movaps  0xc0(%rbp), %xmm14
        movaps  0xd0(%rbp), %xmm15
        lea     0xa0(%rbp), %rsp
___
$code.=<<___;
        mov     0x48(%rsp), %r15
        mov     0x50(%rsp), %r14
        mov     0x58(%rsp), %r13
        mov     0x60(%rsp), %r12
        mov     0x68(%rsp), %rbx
        mov     0x70(%rsp), %rax
        lea     0x78(%rsp), %rsp
        mov     %rax, %rbp
.Lcbc_dec_epilogue:
        ret
.size   bsaes_cbc_encrypt,.-bsaes_cbc_encrypt

.globl  bsaes_ctr32_encrypt_blocks
.type   bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align  16
bsaes_ctr32_encrypt_blocks:
        mov     %rsp, %rax
.Lctr_enc_prologue:
        push    %rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
        mov     0xa0(%rsp),$arg5        # pull ivp
        lea     -0xa0(%rsp), %rsp
        movaps  %xmm6, 0x40(%rsp)
        movaps  %xmm7, 0x50(%rsp)
        movaps  %xmm8, 0x60(%rsp)
        movaps  %xmm9, 0x70(%rsp)
        movaps  %xmm10, 0x80(%rsp)
        movaps  %xmm11, 0x90(%rsp)
        movaps  %xmm12, 0xa0(%rsp)
        movaps  %xmm13, 0xb0(%rsp)
        movaps  %xmm14, 0xc0(%rsp)
        movaps  %xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
        mov     %rsp, %rbp              # backup %rsp
        movdqu  ($arg5), %xmm0          # load counter
        mov     240($arg4), %eax        # rounds
        mov     $arg1, $inp             # backup arguments
        mov     $arg2, $out
        mov     $arg3, $len
        mov     $arg4, $key
        movdqa  %xmm0, 0x20(%rbp)       # copy counter
        cmp     \$8, $arg3
        jb      .Lctr_enc_short

        mov     %eax, %ebx              # rounds
        shl     \$7, %rax               # 128 bytes per inner round key
        sub     \$`128-32`, %rax        # size of bit-sliced key schedule
        sub     %rax, %rsp

        mov     %rsp, %rax              # pass key schedule
        mov     $key, %rcx              # pass key
        mov     %ebx, %r10d             # pass rounds
        call    _bsaes_key_convert
        pxor    %xmm6,%xmm7             # fix up last round key
        movdqa  %xmm7,(%rax)            # save last round key

        movdqa  (%rsp), @XMM[9]         # load round0 key
        lea     .LADD1(%rip), %r11
        movdqa  0x20(%rbp), @XMM[0]     # counter copy
        movdqa  -0x20(%r11), @XMM[8]    # .LSWPUP
        pshufb  @XMM[8], @XMM[9]        # byte swap upper part
        pshufb  @XMM[8], @XMM[0]
        movdqa  @XMM[9], (%rsp)         # save adjusted round0 key
        jmp     .Lctr_enc_loop
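        # The counter block is kept with its 32-bit big-endian counter
        # byte-swapped (courtesy of .LSWPUP above), so the loop below can
        # derive the 8 per-block counter values with plain 32-bit lane
        # additions against .LADD1..LADD7; in effect, for i=1..7:
        #
        #       ctr[i] = ctr[0] + i;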
.align  16
.Lctr_enc_loop:
        movdqa  @XMM[0], 0x20(%rbp)     # save counter
        movdqa  @XMM[0], @XMM[1]        # prepare 8 counter values
        movdqa  @XMM[0], @XMM[2]
        paddd   0x00(%r11), @XMM[1]     # .LADD1
        movdqa  @XMM[0], @XMM[3]
        paddd   0x10(%r11), @XMM[2]     # .LADD2
        movdqa  @XMM[0], @XMM[4]
        paddd   0x20(%r11), @XMM[3]     # .LADD3
        movdqa  @XMM[0], @XMM[5]
        paddd   0x30(%r11), @XMM[4]     # .LADD4
        movdqa  @XMM[0], @XMM[6]
        paddd   0x40(%r11), @XMM[5]     # .LADD5
        movdqa  @XMM[0], @XMM[7]
        paddd   0x50(%r11), @XMM[6]     # .LADD6
        paddd   0x60(%r11), @XMM[7]     # .LADD7

        # Borrow the prologue from _bsaes_encrypt8 and use the opportunity
        # to flip the byte order in the 32-bit counter
        movdqa  (%rsp), @XMM[9]         # round 0 key
        lea     0x10(%rsp), %rax        # pass key schedule
        movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
        pxor    @XMM[9], @XMM[0]        # xor with round0 key
        pxor    @XMM[9], @XMM[1]
         pshufb @XMM[8], @XMM[0]
        pxor    @XMM[9], @XMM[2]
         pshufb @XMM[8], @XMM[1]
        pxor    @XMM[9], @XMM[3]
         pshufb @XMM[8], @XMM[2]
        pxor    @XMM[9], @XMM[4]
         pshufb @XMM[8], @XMM[3]
        pxor    @XMM[9], @XMM[5]
         pshufb @XMM[8], @XMM[4]
        pxor    @XMM[9], @XMM[6]
         pshufb @XMM[8], @XMM[5]
        pxor    @XMM[9], @XMM[7]
         pshufb @XMM[8], @XMM[6]
        lea     .LBS0(%rip), %r11       # constants table
         pshufb @XMM[8], @XMM[7]
        mov     %ebx,%r10d              # pass rounds

        call    _bsaes_encrypt8_bitslice

        sub     \$8,$len
        jc      .Lctr_enc_loop_done

        movdqu  0x00($inp), @XMM[8]     # load input
        movdqu  0x10($inp), @XMM[9]
        movdqu  0x20($inp), @XMM[10]
        movdqu  0x30($inp), @XMM[11]
        movdqu  0x40($inp), @XMM[12]
        movdqu  0x50($inp), @XMM[13]
        movdqu  0x60($inp), @XMM[14]
        movdqu  0x70($inp), @XMM[15]
        lea     0x80($inp),$inp
        pxor    @XMM[0], @XMM[8]
        movdqa  0x20(%rbp), @XMM[0]     # load counter
        pxor    @XMM[9], @XMM[1]
        movdqu  @XMM[8], 0x00($out)     # write output
        pxor    @XMM[10], @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    @XMM[11], @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        pxor    @XMM[12], @XMM[3]
        movdqu  @XMM[6], 0x30($out)
        pxor    @XMM[13], @XMM[7]
        movdqu  @XMM[3], 0x40($out)
        pxor    @XMM[14], @XMM[2]
        movdqu  @XMM[7], 0x50($out)
        pxor    @XMM[15], @XMM[5]
        movdqu  @XMM[2], 0x60($out)
        lea     .LADD1(%rip), %r11
        movdqu  @XMM[5], 0x70($out)
        lea     0x80($out), $out
        paddd   0x70(%r11), @XMM[0]     # .LADD8
        jnz     .Lctr_enc_loop

        jmp     .Lctr_enc_done
.align  16
.Lctr_enc_loop_done:
        add     \$8, $len
        movdqu  0x00($inp), @XMM[8]     # load input
        pxor    @XMM[8], @XMM[0]
        movdqu  @XMM[0], 0x00($out)     # write output
        cmp     \$2,$len
        jb      .Lctr_enc_done
        movdqu  0x10($inp), @XMM[9]
        pxor    @XMM[9], @XMM[1]
        movdqu  @XMM[1], 0x10($out)
        je      .Lctr_enc_done
        movdqu  0x20($inp), @XMM[10]
        pxor    @XMM[10], @XMM[4]
        movdqu  @XMM[4], 0x20($out)
        cmp     \$4,$len
        jb      .Lctr_enc_done
        movdqu  0x30($inp), @XMM[11]
        pxor    @XMM[11], @XMM[6]
        movdqu  @XMM[6], 0x30($out)
        je      .Lctr_enc_done
        movdqu  0x40($inp), @XMM[12]
        pxor    @XMM[12], @XMM[3]
        movdqu  @XMM[3], 0x40($out)
        cmp     \$6,$len
        jb      .Lctr_enc_done
        movdqu  0x50($inp), @XMM[13]
        pxor    @XMM[13], @XMM[7]
        movdqu  @XMM[7], 0x50($out)
        je      .Lctr_enc_done
        movdqu  0x60($inp), @XMM[14]
        pxor    @XMM[14], @XMM[2]
        movdqu  @XMM[2], 0x60($out)
        jmp     .Lctr_enc_done

.align  16
.Lctr_enc_short:
        lea     0x20(%rbp), $arg1
        lea     0x30(%rbp), $arg2
        lea     ($key), $arg3
        call    asm_AES_encrypt
        movdqu  ($inp), @XMM[1]
        lea     16($inp), $inp
        mov     0x2c(%rbp), %eax        # load 32-bit counter
        bswap   %eax
        pxor    0x30(%rbp), @XMM[1]
        inc     %eax                    # increment
        movdqu  @XMM[1], ($out)
        bswap   %eax
        lea     16($out), $out
        mov     %eax, 0x2c(%rbp)        # save 32-bit counter
        dec     $len
        jnz     .Lctr_enc_short

.Lctr_enc_done:
        lea     (%rsp), %rax
        pxor    %xmm0, %xmm0
.Lctr_enc_bzero:                        # wipe key schedule [if any]
        movdqa  %xmm0, 0x00(%rax)
        movdqa  %xmm0, 0x10(%rax)
        lea     0x20(%rax), %rax
        cmp     %rax, %rbp
        ja      .Lctr_enc_bzero

        lea     (%rbp),%rsp             # restore %rsp
___
$code.=<<___ if ($win64);
        movaps  0x40(%rbp), %xmm6
        movaps  0x50(%rbp), %xmm7
        movaps  0x60(%rbp), %xmm8
        movaps  0x70(%rbp), %xmm9
        movaps  0x80(%rbp), %xmm10
        movaps  0x90(%rbp), %xmm11
        movaps  0xa0(%rbp), %xmm12
        movaps  0xb0(%rbp), %xmm13
        movaps  0xc0(%rbp), %xmm14
        movaps  0xd0(%rbp), %xmm15
        lea     0xa0(%rbp), %rsp
___
$code.=<<___;
        mov     0x48(%rsp), %r15
        mov     0x50(%rsp), %r14
        mov     0x58(%rsp), %r13
        mov     0x60(%rsp), %r12
        mov     0x68(%rsp), %rbx
        mov     0x70(%rsp), %rax
        lea     0x78(%rsp), %rsp
        mov     %rax, %rbp
.Lctr_enc_epilogue:
        ret
.size   bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#       const AES_KEY *key1, const AES_KEY *key2,
#       const unsigned char iv[16]);
#
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;

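# Consecutive blocks use tweak[i+1] = tweak[i]*x in GF(2^128) with the XTS
# polynomial x^128+x^7+x^2+x+1. The recurring pcmpgtd/pshufd/pand/paddq/pxor
# sequence below computes that doubling branchlessly; a C-style sketch of
# what it amounts to (the SSE version additionally has to ferry the bit-63
# carry between the two 64-bit halves that paddq doubles independently):
#
#       carry = tweak >> 127;           # bit shifted out at the top
#       tweak = tweak << 1;
#       tweak ^= carry ? 0x87 : 0;      # reduce modulo the polynomial
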
$code.=<<___;
.globl  bsaes_xts_encrypt
.type   bsaes_xts_encrypt,\@abi-omnipotent
.align  16
bsaes_xts_encrypt:
        mov     %rsp, %rax
.Lxts_enc_prologue:
        push    %rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
        mov     0xa0(%rsp),$arg5        # pull key2
        mov     0xa8(%rsp),$arg6        # pull ivp
        lea     -0xa0(%rsp), %rsp
        movaps  %xmm6, 0x40(%rsp)
        movaps  %xmm7, 0x50(%rsp)
        movaps  %xmm8, 0x60(%rsp)
        movaps  %xmm9, 0x70(%rsp)
        movaps  %xmm10, 0x80(%rsp)
        movaps  %xmm11, 0x90(%rsp)
        movaps  %xmm12, 0xa0(%rsp)
        movaps  %xmm13, 0xb0(%rsp)
        movaps  %xmm14, 0xc0(%rsp)
        movaps  %xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
        mov     %rsp, %rbp              # backup %rsp
        mov     $arg1, $inp             # backup arguments
        mov     $arg2, $out
        mov     $arg3, $len
        mov     $arg4, $key

        lea     ($arg6), $arg1
        lea     0x20(%rbp), $arg2
        lea     ($arg5), $arg3
        call    asm_AES_encrypt         # generate initial tweak

        mov     240($key), %eax         # rounds
        mov     $len, %rbx              # backup $len

        mov     %eax, %edx              # rounds
        shl     \$7, %rax               # 128 bytes per inner round key
        sub     \$`128-32`, %rax        # size of bit-sliced key schedule
        sub     %rax, %rsp

        mov     %rsp, %rax              # pass key schedule
        mov     $key, %rcx              # pass key
        mov     %edx, %r10d             # pass rounds
        call    _bsaes_key_convert
        pxor    %xmm6, %xmm7            # fix up last round key
        movdqa  %xmm7, (%rax)           # save last round key

        and     \$-16, $len
        sub     \$0x80, %rsp            # place for tweak[8]
        movdqa  0x20(%rbp), @XMM[7]     # initial tweak

        pxor    $twtmp, $twtmp
        movdqa  .Lxts_magic(%rip), $twmask
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits

        sub     \$0x80, $len
        jc      .Lxts_enc_short
        jmp     .Lxts_enc_loop

.align  16
.Lxts_enc_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        movdqa  @XMM[7], @XMM[$i]
        movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
        movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
        pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
        movdqu  0x60($inp), @XMM[8+6]
        pxor    @XMM[8+5], @XMM[5]
        movdqu  0x70($inp), @XMM[8+7]
        lea     0x80($inp), $inp
        movdqa  @XMM[7], 0x70(%rsp)
        pxor    @XMM[8+6], @XMM[6]
        lea     0x80(%rsp), %rax        # pass key schedule
        pxor    @XMM[8+7], @XMM[7]
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        pxor    0x40(%rsp), @XMM[3]
        movdqu  @XMM[6], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[3], 0x40($out)
        pxor    0x60(%rsp), @XMM[2]
        movdqu  @XMM[7], 0x50($out)
        pxor    0x70(%rsp), @XMM[5]
        movdqu  @XMM[2], 0x60($out)
        movdqu  @XMM[5], 0x70($out)
        lea     0x80($out), $out

        movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
        pxor    $twtmp, $twtmp
        movdqa  .Lxts_magic(%rip), $twmask
        pcmpgtd @XMM[7], $twtmp
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]

        sub     \$0x80,$len
        jnc     .Lxts_enc_loop

.Lxts_enc_short:
        add     \$0x80, $len
        jz      .Lxts_enc_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        movdqa  @XMM[7], @XMM[$i]
        movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
        movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
        cmp     \$`0x10*$i`,$len
        je      .Lxts_enc_$i
___
    $code.=<<___ if ($i>=2);
        pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
        movdqu  0x60($inp), @XMM[8+6]
        pxor    @XMM[8+5], @XMM[5]
        movdqa  @XMM[7], 0x70(%rsp)
        lea     0x70($inp), $inp
        pxor    @XMM[8+6], @XMM[6]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        pxor    0x40(%rsp), @XMM[3]
        movdqu  @XMM[6], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[3], 0x40($out)
        pxor    0x60(%rsp), @XMM[2]
        movdqu  @XMM[7], 0x50($out)
        movdqu  @XMM[2], 0x60($out)
        lea     0x70($out), $out

        movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_6:
        pxor    @XMM[8+4], @XMM[4]
        lea     0x60($inp), $inp
        pxor    @XMM[8+5], @XMM[5]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        pxor    0x40(%rsp), @XMM[3]
        movdqu  @XMM[6], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[3], 0x40($out)
        movdqu  @XMM[7], 0x50($out)
        lea     0x60($out), $out

        movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_5:
        pxor    @XMM[8+3], @XMM[3]
        lea     0x50($inp), $inp
        pxor    @XMM[8+4], @XMM[4]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        pxor    0x40(%rsp), @XMM[3]
        movdqu  @XMM[6], 0x30($out)
        movdqu  @XMM[3], 0x40($out)
        lea     0x50($out), $out

        movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_4:
        pxor    @XMM[8+2], @XMM[2]
        lea     0x40($inp), $inp
        pxor    @XMM[8+3], @XMM[3]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[6]
        movdqu  @XMM[4], 0x20($out)
        movdqu  @XMM[6], 0x30($out)
        lea     0x40($out), $out

        movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_3:
        pxor    @XMM[8+1], @XMM[1]
        lea     0x30($inp), $inp
        pxor    @XMM[8+2], @XMM[2]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[4]
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[4], 0x20($out)
        lea     0x30($out), $out

        movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_2:
        pxor    @XMM[8+0], @XMM[0]
        lea     0x20($inp), $inp
        pxor    @XMM[8+1], @XMM[1]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_encrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        lea     0x20($out), $out

        movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_enc_done
.align  16
.Lxts_enc_1:
        pxor    @XMM[0], @XMM[8]
        lea     0x10($inp), $inp
        movdqa  @XMM[8], 0x20(%rbp)
        lea     0x20(%rbp), $arg1
        lea     0x20(%rbp), $arg2
        lea     ($key), $arg3
        call    asm_AES_encrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
        #pxor   @XMM[8], @XMM[0]
        #lea    0x80(%rsp), %rax        # pass key schedule
        #mov    %edx, %r10d             # pass rounds
        #call   _bsaes_encrypt8
        #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
        movdqu  @XMM[0], 0x00($out)     # write output
        lea     0x10($out), $out

        movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak

.Lxts_enc_done:
        and     \$15, %ebx
        jz      .Lxts_enc_ret
        mov     $out, %rdx

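        # Ciphertext stealing for the 1..15 trailing bytes: swap the
        # plaintext tail with the leading bytes of the last complete
        # ciphertext block, then re-encrypt that block in place. With
        # %rdx pointing just past the last complete output block, the
        # loop below amounts to:
        #
        #       c = out[-16+i];         # stolen ciphertext byte
        #       out[-16+i] = in[i];     # plaintext tail moves in
        #       out[i] = c;             # stolen byte appended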
.Lxts_enc_steal:
        movzb   ($inp), %eax
        movzb   -16(%rdx), %ecx
        lea     1($inp), $inp
        mov     %al, -16(%rdx)
        mov     %cl, 0(%rdx)
        lea     1(%rdx), %rdx
        sub     \$1,%ebx
        jnz     .Lxts_enc_steal

        movdqu  -16($out), @XMM[0]
        lea     0x20(%rbp), $arg1
        pxor    @XMM[7], @XMM[0]
        lea     0x20(%rbp), $arg2
        movdqa  @XMM[0], 0x20(%rbp)
        lea     ($key), $arg3
        call    asm_AES_encrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[7]
        movdqu  @XMM[7], -16($out)

.Lxts_enc_ret:
        lea     (%rsp), %rax
        pxor    %xmm0, %xmm0
.Lxts_enc_bzero:                        # wipe key schedule [if any]
        movdqa  %xmm0, 0x00(%rax)
        movdqa  %xmm0, 0x10(%rax)
        lea     0x20(%rax), %rax
        cmp     %rax, %rbp
        ja      .Lxts_enc_bzero

        lea     (%rbp),%rsp             # restore %rsp
___
$code.=<<___ if ($win64);
        movaps  0x40(%rbp), %xmm6
        movaps  0x50(%rbp), %xmm7
        movaps  0x60(%rbp), %xmm8
        movaps  0x70(%rbp), %xmm9
        movaps  0x80(%rbp), %xmm10
        movaps  0x90(%rbp), %xmm11
        movaps  0xa0(%rbp), %xmm12
        movaps  0xb0(%rbp), %xmm13
        movaps  0xc0(%rbp), %xmm14
        movaps  0xd0(%rbp), %xmm15
        lea     0xa0(%rbp), %rsp
___
$code.=<<___;
        mov     0x48(%rsp), %r15
        mov     0x50(%rsp), %r14
        mov     0x58(%rsp), %r13
        mov     0x60(%rsp), %r12
        mov     0x68(%rsp), %rbx
        mov     0x70(%rsp), %rax
        lea     0x78(%rsp), %rsp
        mov     %rax, %rbp
.Lxts_enc_epilogue:
        ret
.size   bsaes_xts_encrypt,.-bsaes_xts_encrypt

.globl  bsaes_xts_decrypt
.type   bsaes_xts_decrypt,\@abi-omnipotent
.align  16
bsaes_xts_decrypt:
        mov     %rsp, %rax
.Lxts_dec_prologue:
        push    %rbp
        push    %rbx
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        lea     -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
        mov     0xa0(%rsp),$arg5        # pull key2
        mov     0xa8(%rsp),$arg6        # pull ivp
        lea     -0xa0(%rsp), %rsp
        movaps  %xmm6, 0x40(%rsp)
        movaps  %xmm7, 0x50(%rsp)
        movaps  %xmm8, 0x60(%rsp)
        movaps  %xmm9, 0x70(%rsp)
        movaps  %xmm10, 0x80(%rsp)
        movaps  %xmm11, 0x90(%rsp)
        movaps  %xmm12, 0xa0(%rsp)
        movaps  %xmm13, 0xb0(%rsp)
        movaps  %xmm14, 0xc0(%rsp)
        movaps  %xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
        mov     %rsp, %rbp              # backup %rsp
        mov     $arg1, $inp             # backup arguments
        mov     $arg2, $out
        mov     $arg3, $len
        mov     $arg4, $key

        lea     ($arg6), $arg1
        lea     0x20(%rbp), $arg2
        lea     ($arg5), $arg3
        call    asm_AES_encrypt         # generate initial tweak

        mov     240($key), %eax         # rounds
        mov     $len, %rbx              # backup $len

        mov     %eax, %edx              # rounds
        shl     \$7, %rax               # 128 bytes per inner round key
        sub     \$`128-32`, %rax        # size of bit-sliced key schedule
        sub     %rax, %rsp

        mov     %rsp, %rax              # pass key schedule
        mov     $key, %rcx              # pass key
        mov     %edx, %r10d             # pass rounds
        call    _bsaes_key_convert
        pxor    (%rsp), %xmm7           # fix up round 0 key
        movdqa  %xmm6, (%rax)           # save last round key
        movdqa  %xmm7, (%rsp)

        xor     %eax, %eax              # if ($len%16) len-=16;
        and     \$-16, $len
        test    \$15, %ebx
        setnz   %al
        shl     \$4, %rax
        sub     %rax, $len

        sub     \$0x80, %rsp            # place for tweak[8]
        movdqa  0x20(%rbp), @XMM[7]     # initial tweak

        pxor    $twtmp, $twtmp
        movdqa  .Lxts_magic(%rip), $twmask
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits

        sub     \$0x80, $len
        jc      .Lxts_dec_short
        jmp     .Lxts_dec_loop

.align  16
.Lxts_dec_loop:
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        movdqa  @XMM[7], @XMM[$i]
        movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
        movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
    $code.=<<___ if ($i>=2);
        pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
        movdqu  0x60($inp), @XMM[8+6]
        pxor    @XMM[8+5], @XMM[5]
        movdqu  0x70($inp), @XMM[8+7]
        lea     0x80($inp), $inp
        movdqa  @XMM[7], 0x70(%rsp)
        pxor    @XMM[8+6], @XMM[6]
        lea     0x80(%rsp), %rax        # pass key schedule
        pxor    @XMM[8+7], @XMM[7]
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[4]
        movdqu  @XMM[6], 0x20($out)
        pxor    0x40(%rsp), @XMM[2]
        movdqu  @XMM[4], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[2], 0x40($out)
        pxor    0x60(%rsp), @XMM[3]
        movdqu  @XMM[7], 0x50($out)
        pxor    0x70(%rsp), @XMM[5]
        movdqu  @XMM[3], 0x60($out)
        movdqu  @XMM[5], 0x70($out)
        lea     0x80($out), $out

        movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
        pxor    $twtmp, $twtmp
        movdqa  .Lxts_magic(%rip), $twmask
        pcmpgtd @XMM[7], $twtmp
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]

        sub     \$0x80,$len
        jnc     .Lxts_dec_loop

.Lxts_dec_short:
        add     \$0x80, $len
        jz      .Lxts_dec_done
___
    for ($i=0;$i<7;$i++) {
    $code.=<<___;
        pshufd  \$0x13, $twtmp, $twres
        pxor    $twtmp, $twtmp
        movdqa  @XMM[7], @XMM[$i]
        movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
        pxor    $twres, @XMM[7]
___
    $code.=<<___ if ($i>=1);
        movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
        cmp     \$`0x10*$i`,$len
        je      .Lxts_dec_$i
___
    $code.=<<___ if ($i>=2);
        pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
    }
$code.=<<___;
        movdqu  0x60($inp), @XMM[8+6]
        pxor    @XMM[8+5], @XMM[5]
        movdqa  @XMM[7], 0x70(%rsp)
        lea     0x70($inp), $inp
        pxor    @XMM[8+6], @XMM[6]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[4]
        movdqu  @XMM[6], 0x20($out)
        pxor    0x40(%rsp), @XMM[2]
        movdqu  @XMM[4], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[2], 0x40($out)
        pxor    0x60(%rsp), @XMM[3]
        movdqu  @XMM[7], 0x50($out)
        movdqu  @XMM[3], 0x60($out)
        lea     0x70($out), $out

        movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_6:
        pxor    @XMM[8+4], @XMM[4]
        lea     0x60($inp), $inp
        pxor    @XMM[8+5], @XMM[5]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[4]
        movdqu  @XMM[6], 0x20($out)
        pxor    0x40(%rsp), @XMM[2]
        movdqu  @XMM[4], 0x30($out)
        pxor    0x50(%rsp), @XMM[7]
        movdqu  @XMM[2], 0x40($out)
        movdqu  @XMM[7], 0x50($out)
        lea     0x60($out), $out

        movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_5:
        pxor    @XMM[8+3], @XMM[3]
        lea     0x50($inp), $inp
        pxor    @XMM[8+4], @XMM[4]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[4]
        movdqu  @XMM[6], 0x20($out)
        pxor    0x40(%rsp), @XMM[2]
        movdqu  @XMM[4], 0x30($out)
        movdqu  @XMM[2], 0x40($out)
        lea     0x50($out), $out

        movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_4:
        pxor    @XMM[8+2], @XMM[2]
        lea     0x40($inp), $inp
        pxor    @XMM[8+3], @XMM[3]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        pxor    0x30(%rsp), @XMM[4]
        movdqu  @XMM[6], 0x20($out)
        movdqu  @XMM[4], 0x30($out)
        lea     0x40($out), $out

        movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_3:
        pxor    @XMM[8+1], @XMM[1]
        lea     0x30($inp), $inp
        pxor    @XMM[8+2], @XMM[2]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        pxor    0x20(%rsp), @XMM[6]
        movdqu  @XMM[1], 0x10($out)
        movdqu  @XMM[6], 0x20($out)
        lea     0x30($out), $out

        movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_2:
        pxor    @XMM[8+0], @XMM[0]
        lea     0x20($inp), $inp
        pxor    @XMM[8+1], @XMM[1]
        lea     0x80(%rsp), %rax        # pass key schedule
        mov     %edx, %r10d             # pass rounds

        call    _bsaes_decrypt8

        pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
        pxor    0x10(%rsp), @XMM[1]
        movdqu  @XMM[0], 0x00($out)     # write output
        movdqu  @XMM[1], 0x10($out)
        lea     0x20($out), $out

        movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
        jmp     .Lxts_dec_done
.align  16
.Lxts_dec_1:
        pxor    @XMM[0], @XMM[8]
        lea     0x10($inp), $inp
        movdqa  @XMM[8], 0x20(%rbp)
        lea     0x20(%rbp), $arg1
        lea     0x20(%rbp), $arg2
        lea     ($key), $arg3
        call    asm_AES_decrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
        #pxor   @XMM[8], @XMM[0]
        #lea    0x80(%rsp), %rax        # pass key schedule
        #mov    %edx, %r10d             # pass rounds
        #call   _bsaes_decrypt8
        #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
        movdqu  @XMM[0], 0x00($out)     # write output
        lea     0x10($out), $out

        movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak

.Lxts_dec_done:
        and     \$15, %ebx
        jz      .Lxts_dec_ret

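        # Decryption-side stealing consumes the tweaks out of order: the
        # last complete ciphertext block is decrypted with the *next*
        # tweak (computed below into \@XMM[7]), while the current tweak
        # is parked in \@XMM[6] for the final partial block.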
        pxor    $twtmp, $twtmp
        movdqa  .Lxts_magic(%rip), $twmask
        pcmpgtd @XMM[7], $twtmp
        pshufd  \$0x13, $twtmp, $twres
        movdqa  @XMM[7], @XMM[6]
        paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
        pand    $twmask, $twres         # isolate carry and residue
        movdqu  ($inp), @XMM[0]
        pxor    $twres, @XMM[7]

        lea     0x20(%rbp), $arg1
        pxor    @XMM[7], @XMM[0]
        lea     0x20(%rbp), $arg2
        movdqa  @XMM[0], 0x20(%rbp)
        lea     ($key), $arg3
        call    asm_AES_decrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[7]
        mov     $out, %rdx
        movdqu  @XMM[7], ($out)

.Lxts_dec_steal:
        movzb   16($inp), %eax
        movzb   (%rdx), %ecx
        lea     1($inp), $inp
        mov     %al, (%rdx)
        mov     %cl, 16(%rdx)
        lea     1(%rdx), %rdx
        sub     \$1,%ebx
        jnz     .Lxts_dec_steal

        movdqu  ($out), @XMM[0]
        lea     0x20(%rbp), $arg1
        pxor    @XMM[6], @XMM[0]
        lea     0x20(%rbp), $arg2
        movdqa  @XMM[0], 0x20(%rbp)
        lea     ($key), $arg3
        call    asm_AES_decrypt         # doesn't touch %xmm
        pxor    0x20(%rbp), @XMM[6]
        movdqu  @XMM[6], ($out)

.Lxts_dec_ret:
        lea     (%rsp), %rax
        pxor    %xmm0, %xmm0
.Lxts_dec_bzero:                        # wipe key schedule [if any]
        movdqa  %xmm0, 0x00(%rax)
        movdqa  %xmm0, 0x10(%rax)
        lea     0x20(%rax), %rax
        cmp     %rax, %rbp
        ja      .Lxts_dec_bzero

        lea     (%rbp),%rsp             # restore %rsp
___
$code.=<<___ if ($win64);
        movaps  0x40(%rbp), %xmm6
        movaps  0x50(%rbp), %xmm7
        movaps  0x60(%rbp), %xmm8
        movaps  0x70(%rbp), %xmm9
        movaps  0x80(%rbp), %xmm10
        movaps  0x90(%rbp), %xmm11
        movaps  0xa0(%rbp), %xmm12
        movaps  0xb0(%rbp), %xmm13
        movaps  0xc0(%rbp), %xmm14
        movaps  0xd0(%rbp), %xmm15
        lea     0xa0(%rbp), %rsp
___
$code.=<<___;
        mov     0x48(%rsp), %r15
        mov     0x50(%rsp), %r14
        mov     0x58(%rsp), %r13
        mov     0x60(%rsp), %r12
        mov     0x68(%rsp), %rbx
        mov     0x70(%rsp), %rax
        lea     0x78(%rsp), %rsp
        mov     %rax, %rbp
.Lxts_dec_epilogue:
        ret
.size   bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type   _bsaes_const,\@object
.align  64
_bsaes_const:
.LM0ISR:        # InvShiftRows constants
        .quad   0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
        .quad   0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
        .quad   0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:          # bit-slice constants
        .quad   0x5555555555555555, 0x5555555555555555
.LBS1:
        .quad   0x3333333333333333, 0x3333333333333333
.LBS2:
        .quad   0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:           # shiftrows constants
        .quad   0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
        .quad   0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
        .quad   0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:        # byte-swap upper dword
        .quad   0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
        .quad   0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:         # counter increment constants
        .quad   0x0000000000000000, 0x0000000100000000
.LADD2:
        .quad   0x0000000000000000, 0x0000000200000000
.LADD3:
        .quad   0x0000000000000000, 0x0000000300000000
.LADD4:
        .quad   0x0000000000000000, 0x0000000400000000
.LADD5:
        .quad   0x0000000000000000, 0x0000000500000000
.LADD6:
        .quad   0x0000000000000000, 0x0000000600000000
.LADD7:
        .quad   0x0000000000000000, 0x0000000700000000
.LADD8:
        .quad   0x0000000000000000, 0x0000000800000000
.Lxts_magic:
        .long   0x87,0,1,0
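        # 0x87 is the reduction constant of the XTS polynomial; the lone 1
        # in the third dword, picked out by the pshufd/pand pair, re-inserts
        # the bit-63 carry that paddq drops between the two 64-bit halves.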
.Lmasks:
        .quad   0x0101010101010101, 0x0101010101010101
        .quad   0x0202020202020202, 0x0202020202020202
        .quad   0x0404040404040404, 0x0404040404040404
        .quad   0x0808080808080808, 0x0808080808080808
.LM0:
        .quad   0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
        .quad   0x6363636363636363, 0x6363636363636363
.asciz  "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align  64
.size   _bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
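#
# HandlerData[] for each function below carries two image-relative labels,
# <body> and <epilogue>. If the faulting Rip lies between them, the frame
# is fully set up, and se_handler recovers the saved %xmm area and the
# general-purpose registers through the stashed %rbp; outside that window
# no register fix-ups are required.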
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        mov     160($context),%rax      # pull context->Rbp

        lea     0x40(%rax),%rsi         # %xmm save area
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
        .long   0xa548f3fc              # cld; rep movsq
        lea     0xa0(%rax),%rax         # adjust stack pointer

        mov     0x70(%rax),%rbp
        mov     0x68(%rax),%rbx
        mov     0x60(%rax),%r12
        mov     0x58(%rax),%r13
        mov     0x50(%rax),%r14
        mov     0x48(%rax),%r15
        lea     0x78(%rax),%rax         # adjust stack pointer
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

.Lin_prologue:
        mov     %rax,152($context)      # restore context->Rsp

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler

.section        .pdata
.align  4
___
$code.=<<___ if ($ecb);
        .rva    .Lecb_enc_prologue
        .rva    .Lecb_enc_epilogue
        .rva    .Lecb_enc_info

        .rva    .Lecb_dec_prologue
        .rva    .Lecb_dec_epilogue
        .rva    .Lecb_dec_info
___
$code.=<<___;
        .rva    .Lcbc_dec_prologue
        .rva    .Lcbc_dec_epilogue
        .rva    .Lcbc_dec_info

        .rva    .Lctr_enc_prologue
        .rva    .Lctr_enc_epilogue
        .rva    .Lctr_enc_info

        .rva    .Lxts_enc_prologue
        .rva    .Lxts_enc_epilogue
        .rva    .Lxts_enc_info

        .rva    .Lxts_dec_prologue
        .rva    .Lxts_dec_epilogue
        .rva    .Lxts_dec_info

.section        .xdata
.align  8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lecb_enc_body,.Lecb_enc_epilogue       # HandlerData[]
.Lecb_dec_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lecb_dec_body,.Lecb_dec_epilogue       # HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lcbc_dec_body,.Lcbc_dec_epilogue       # HandlerData[]
.Lctr_enc_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lctr_enc_body,.Lctr_enc_epilogue       # HandlerData[]
.Lxts_enc_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lxts_enc_body,.Lxts_enc_epilogue       # HandlerData[]
.Lxts_dec_info:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;