#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors       ###
### requires support of SSE extensions up to SSSE3             ###
### Author: Emilia Käsper and Peter Schwabe                    ###
### Date: 2009-03-19                                           ###
### Public domain                                              ###
###                                                            ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for   ###
### further information.                                       ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key
#   schedule and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#               Emilia's        this(*)         difference
#
# Core 2        9.30            8.69            +7%
# Nehalem(**)   7.63            6.98            +9%
# Atom          17.1            17.4            -2%(***)
#
# (*)   Comparison is not completely fair, because "this" is ECB,
#       i.e. no extra processing such as counter values calculation
#       and xor-ing input as in Emilia's CTR implementation is
#       performed. However, the CTR calculations account for no more
#       than 1% of total time, so the comparison is *rather* fair.
#
# (**)  Results were collected on Westmere, which is considered to
#       be equivalent to Nehalem for this code.
#
# (***) The slowdown on Atom is rather strange per se, because the
#       original implementation has a number of 9+-byte instructions,
#       which are bad for the Atom front-end, and which I eliminated
#       completely. In an attempt to address the deterioration, sbox()
#       was tested in the FP SIMD "domain" (movaps instead of movdqa,
#       xorps instead of pxor, etc.). While that resulted in a nominal
#       4% improvement on Atom, it hurt Westmere by more than a 2x
#       factor.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles and its ratio to the CPU cycles spent
# in the 8x block function is:
#
#               conversion      conversion/8x block
# Core 2        240             0.22
# Nehalem       180             0.20
# Atom          430             0.19
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc.
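#
# As a rough worked example (a reader's sanity check of the table
# above, not an additional measurement): on Core 2 the 0.22 ratio
# implies the 8x block function itself costs about 240/0.22 ~= 1090
# cycles, so a single 128-byte call pays 240 extra cycles, i.e. ~22%
# on top of the block function alone - or the quoted 16-18% once the
# per-call load/store and loop overhead is presumably counted in the
# total. A 256-byte call amortizes the same 240 cycles over two block
# function invocations, hence roughly half the penalty, and so on.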
# Yet, despite all these "shortcomings" it's still faster than
# ["hyper-threading-safe" code path in] aes-x86_64.pl on all lengths
# above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2        9.83
# Nehalem       7.74
# Atom          19.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#                                               <appro@openssl.org>

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));       # best on Atom, +10% over (0..15)
my $ecb=0;      # suppress unreferenced ECB subroutines, spare some space...

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");

sub Sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
        &InBasisChange  (@b);
        &Inv_GF256      (@b[6,5,0,3,7,1,4,2],@t,@s);
        &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
        pxor    @b[6], @b[5]
        pxor    @b[1], @b[2]
        pxor    @b[0], @b[3]
        pxor    @b[2], @b[6]
        pxor    @b[0], @b[5]

        pxor    @b[3], @b[6]
        pxor    @b[7], @b[3]
        pxor    @b[5], @b[7]
        pxor    @b[4], @b[3]
        pxor    @b[5], @b[4]
        pxor    @b[1], @b[3]

        pxor    @b[7], @b[2]
        pxor    @b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
        pxor    @b[6], @b[0]
        pxor    @b[4], @b[1]
        pxor    @b[0], @b[2]
        pxor    @b[6], @b[4]
        pxor    @b[1], @b[6]

        pxor    @b[5], @b[1]
        pxor    @b[3], @b[5]
        pxor    @b[7], @b[3]
        pxor    @b[5], @b[7]
        pxor    @b[5], @b[2]

        pxor    @b[7], @b[4]
___
}

sub InvSbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
        &InvInBasisChange       (@b);
        &Inv_GF256              (@b[5,1,2,6,3,7,0,4],@t,@s);
        &InvOutBasisChange      (@b[3,7,0,4,5,1,2,6]);
}

sub InvInBasisChange {          # OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
        pxor    @b[7], @b[4]

        pxor    @b[5], @b[7]
        pxor    @b[5], @b[2]
        pxor    @b[7], @b[3]
        pxor    @b[3], @b[5]
        pxor    @b[5], @b[1]

        pxor    @b[1], @b[6]
        pxor    @b[0], @b[2]
        pxor    @b[6], @b[4]
        pxor    @b[6], @b[0]
        pxor    @b[4], @b[1]
___
}

sub InvOutBasisChange {         # InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
        pxor    @b[5], @b[1]
        pxor    @b[7], @b[2]

        pxor    @b[1], @b[3]
        pxor    @b[5], @b[4]
        pxor    @b[5], @b[7]
        pxor    @b[4], @b[3]
        pxor
@b[0], @b[5] 214 pxor @b[7], @b[3] 215 pxor @b[2], @b[6] 216 pxor @b[1], @b[2] 217 pxor @b[3], @b[6] 218 219 pxor @b[0], @b[3] 220 pxor @b[6], @b[5] 221 ___ 222 } 223 224 sub Mul_GF4 { 225 #;************************************************************* 226 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * 227 #;************************************************************* 228 my ($x0,$x1,$y0,$y1,$t0)=@_; 229 $code.=<<___; 230 movdqa $y0, $t0 231 pxor $y1, $t0 232 pand $x0, $t0 233 pxor $x1, $x0 234 pand $y0, $x1 235 pand $y1, $x0 236 pxor $x1, $x0 237 pxor $t0, $x1 238 ___ 239 } 240 241 sub Mul_GF4_N { # not used, see next subroutine 242 # multiply and scale by N 243 my ($x0,$x1,$y0,$y1,$t0)=@_; 244 $code.=<<___; 245 movdqa $y0, $t0 246 pxor $y1, $t0 247 pand $x0, $t0 248 pxor $x1, $x0 249 pand $y0, $x1 250 pand $y1, $x0 251 pxor $x0, $x1 252 pxor $t0, $x0 253 ___ 254 } 255 256 sub Mul_GF4_N_GF4 { 257 # interleaved Mul_GF4_N and Mul_GF4 258 my ($x0,$x1,$y0,$y1,$t0, 259 $x2,$x3,$y2,$y3,$t1)=@_; 260 $code.=<<___; 261 movdqa $y0, $t0 262 movdqa $y2, $t1 263 pxor $y1, $t0 264 pxor $y3, $t1 265 pand $x0, $t0 266 pand $x2, $t1 267 pxor $x1, $x0 268 pxor $x3, $x2 269 pand $y0, $x1 270 pand $y2, $x3 271 pand $y1, $x0 272 pand $y3, $x2 273 pxor $x0, $x1 274 pxor $x3, $x2 275 pxor $t0, $x0 276 pxor $t1, $x3 277 ___ 278 } 279 sub Mul_GF16_2 { 280 my @x=@_[0..7]; 281 my @y=@_[8..11]; 282 my @t=@_[12..15]; 283 $code.=<<___; 284 movdqa @x[0], @t[0] 285 movdqa @x[1], @t[1] 286 ___ 287 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); 288 $code.=<<___; 289 pxor @x[2], @t[0] 290 pxor @x[3], @t[1] 291 pxor @y[2], @y[0] 292 pxor @y[3], @y[1] 293 ___ 294 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 295 @x[2], @x[3], @y[2], @y[3], @t[2]); 296 $code.=<<___; 297 pxor @t[0], @x[0] 298 pxor @t[0], @x[2] 299 pxor @t[1], @x[1] 300 pxor @t[1], @x[3] 301 302 movdqa @x[4], @t[0] 303 movdqa @x[5], @t[1] 304 pxor @x[6], @t[0] 305 pxor @x[7], @t[1] 306 ___ 307 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], 308 @x[6], @x[7], @y[2], @y[3], @t[2]); 309 $code.=<<___; 310 pxor @y[2], @y[0] 311 pxor @y[3], @y[1] 312 ___ 313 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); 314 $code.=<<___; 315 pxor @t[0], @x[4] 316 pxor @t[0], @x[6] 317 pxor @t[1], @x[5] 318 pxor @t[1], @x[7] 319 ___ 320 } 321 sub Inv_GF256 { 322 #;******************************************************************** 323 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * 324 #;******************************************************************** 325 my @x=@_[0..7]; 326 my @t=@_[8..11]; 327 my @s=@_[12..15]; 328 # direct optimizations from hardware 329 $code.=<<___; 330 movdqa @x[4], @t[3] 331 movdqa @x[5], @t[2] 332 movdqa @x[1], @t[1] 333 movdqa @x[7], @s[1] 334 movdqa @x[0], @s[0] 335 336 pxor @x[6], @t[3] 337 pxor @x[7], @t[2] 338 pxor @x[3], @t[1] 339 movdqa @t[3], @s[2] 340 pxor @x[6], @s[1] 341 movdqa @t[2], @t[0] 342 pxor @x[2], @s[0] 343 movdqa @t[3], @s[3] 344 345 por @t[1], @t[2] 346 por @s[0], @t[3] 347 pxor @t[0], @s[3] 348 pand @s[0], @s[2] 349 pxor @t[1], @s[0] 350 pand @t[1], @t[0] 351 pand @s[0], @s[3] 352 movdqa @x[3], @s[0] 353 pxor @x[2], @s[0] 354 pand @s[0], @s[1] 355 pxor @s[1], @t[3] 356 pxor @s[1], @t[2] 357 movdqa @x[4], @s[1] 358 movdqa @x[1], @s[0] 359 pxor @x[5], @s[1] 360 pxor @x[0], @s[0] 361 movdqa @s[1], @t[1] 362 pand @s[0], @s[1] 363 por @s[0], @t[1] 364 pxor @s[1], @t[0] 365 pxor @s[3], @t[3] 366 pxor @s[2], @t[2] 367 pxor @s[3], @t[1] 368 movdqa @x[7], @s[0] 369 pxor @s[2], @t[0] 370 movdqa @x[6], @s[1] 
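# Inv_GF256 computes the multiplicative inverse in GF(2^8) of the
# bit-sliced state using the tower-field GF(((2^2)^2)^2) decomposition
# from the Käsper-Schwabe implementation this module is derived from,
# with the basis changes handled by In/OutBasisChange. Each of the eight
# x registers, like the t/s temporaries, is an XMM register holding bit i
# of all 128 state bytes, i.e. of 8 AES blocks at once, so every
# pand/pxor/por here acts as 128 parallel one-bit gates of the
# hardware-style S-box circuit.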
371 pxor @s[2], @t[1] 372 movdqa @x[5], @s[2] 373 pand @x[3], @s[0] 374 movdqa @x[4], @s[3] 375 pand @x[2], @s[1] 376 pand @x[1], @s[2] 377 por @x[0], @s[3] 378 pxor @s[0], @t[3] 379 pxor @s[1], @t[2] 380 pxor @s[2], @t[1] 381 pxor @s[3], @t[0] 382 383 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 384 385 # new smaller inversion 386 387 movdqa @t[3], @s[0] 388 pand @t[1], @t[3] 389 pxor @t[2], @s[0] 390 391 movdqa @t[0], @s[2] 392 movdqa @s[0], @s[3] 393 pxor @t[3], @s[2] 394 pand @s[2], @s[3] 395 396 movdqa @t[1], @s[1] 397 pxor @t[2], @s[3] 398 pxor @t[0], @s[1] 399 400 pxor @t[2], @t[3] 401 402 pand @t[3], @s[1] 403 404 movdqa @s[2], @t[2] 405 pxor @t[0], @s[1] 406 407 pxor @s[1], @t[2] 408 pxor @s[1], @t[1] 409 410 pand @t[0], @t[2] 411 412 pxor @t[2], @s[2] 413 pxor @t[2], @t[1] 414 415 pand @s[3], @s[2] 416 417 pxor @s[0], @s[2] 418 ___ 419 # output in s3, s2, s1, t1 420 421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 422 423 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 424 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); 425 426 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb 427 } 428 429 # AES linear components 430 431 sub ShiftRows { 432 my @x=@_[0..7]; 433 my $mask=pop; 434 $code.=<<___; 435 pxor 0x00($key),@x[0] 436 pxor 0x10($key),@x[1] 437 pshufb $mask,@x[0] 438 pxor 0x20($key),@x[2] 439 pshufb $mask,@x[1] 440 pxor 0x30($key),@x[3] 441 pshufb $mask,@x[2] 442 pxor 0x40($key),@x[4] 443 pshufb $mask,@x[3] 444 pxor 0x50($key),@x[5] 445 pshufb $mask,@x[4] 446 pxor 0x60($key),@x[6] 447 pshufb $mask,@x[5] 448 pxor 0x70($key),@x[7] 449 pshufb $mask,@x[6] 450 lea 0x80($key),$key 451 pshufb $mask,@x[7] 452 ___ 453 } 454 455 sub MixColumns { 456 # modified to emit output in order suitable for feeding back to aesenc[last] 457 my @x=@_[0..7]; 458 my @t=@_[8..15]; 459 my $inv=@_[16]; # optional 460 $code.=<<___; 461 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 462 pshufd \$0x93, @x[1], @t[1] 463 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) 464 pshufd \$0x93, @x[2], @t[2] 465 pxor @t[1], @x[1] 466 pshufd \$0x93, @x[3], @t[3] 467 pxor @t[2], @x[2] 468 pshufd \$0x93, @x[4], @t[4] 469 pxor @t[3], @x[3] 470 pshufd \$0x93, @x[5], @t[5] 471 pxor @t[4], @x[4] 472 pshufd \$0x93, @x[6], @t[6] 473 pxor @t[5], @x[5] 474 pshufd \$0x93, @x[7], @t[7] 475 pxor @t[6], @x[6] 476 pxor @t[7], @x[7] 477 478 pxor @x[0], @t[1] 479 pxor @x[7], @t[0] 480 pxor @x[7], @t[1] 481 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) 482 pxor @x[1], @t[2] 483 pshufd \$0x4E, @x[1], @x[1] 484 pxor @x[4], @t[5] 485 pxor @t[0], @x[0] 486 pxor @x[5], @t[6] 487 pxor @t[1], @x[1] 488 pxor @x[3], @t[4] 489 pshufd \$0x4E, @x[4], @t[0] 490 pxor @x[6], @t[7] 491 pshufd \$0x4E, @x[5], @t[1] 492 pxor @x[2], @t[3] 493 pshufd \$0x4E, @x[3], @x[4] 494 pxor @x[7], @t[3] 495 pshufd \$0x4E, @x[7], @x[5] 496 pxor @x[7], @t[4] 497 pshufd \$0x4E, @x[6], @x[3] 498 pxor @t[4], @t[0] 499 pshufd \$0x4E, @x[2], @x[6] 500 pxor @t[5], @t[1] 501 ___ 502 $code.=<<___ if (!$inv); 503 pxor @t[3], @x[4] 504 pxor @t[7], @x[5] 505 pxor @t[6], @x[3] 506 movdqa @t[0], @x[2] 507 pxor @t[2], @x[6] 508 movdqa @t[1], @x[7] 509 ___ 510 $code.=<<___ if ($inv); 511 pxor @x[4], @t[3] 512 pxor @t[7], @x[5] 513 pxor @x[3], @t[6] 514 movdqa @t[0], @x[3] 515 pxor @t[2], @x[6] 516 movdqa @t[6], @x[2] 517 movdqa @t[1], @x[7] 518 movdqa @x[6], @x[4] 519 movdqa @t[3], @x[6] 520 ___ 521 } 522 523 sub InvMixColumns_orig { 524 my @x=@_[0..7]; 525 my @t=@_[8..15]; 526 527 $code.=<<___; 
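# What follows is the direct form of InvMixColumns: the state is
# multiplied by the coefficients 0e, 0b, 0d, 09 one after another, with
# pshufd 0x93 playing the same "rotate by 32 bits" role as in MixColumns
# above. It appears to be kept for reference only; the decryption path
# uses the factored InvMixColumns further down.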
528 # multiplication by 0x0e 529 pshufd \$0x93, @x[7], @t[7] 530 movdqa @x[2], @t[2] 531 pxor @x[5], @x[7] # 7 5 532 pxor @x[5], @x[2] # 2 5 533 pshufd \$0x93, @x[0], @t[0] 534 movdqa @x[5], @t[5] 535 pxor @x[0], @x[5] # 5 0 [1] 536 pxor @x[1], @x[0] # 0 1 537 pshufd \$0x93, @x[1], @t[1] 538 pxor @x[2], @x[1] # 1 25 539 pxor @x[6], @x[0] # 01 6 [2] 540 pxor @x[3], @x[1] # 125 3 [4] 541 pshufd \$0x93, @x[3], @t[3] 542 pxor @x[0], @x[2] # 25 016 [3] 543 pxor @x[7], @x[3] # 3 75 544 pxor @x[6], @x[7] # 75 6 [0] 545 pshufd \$0x93, @x[6], @t[6] 546 movdqa @x[4], @t[4] 547 pxor @x[4], @x[6] # 6 4 548 pxor @x[3], @x[4] # 4 375 [6] 549 pxor @x[7], @x[3] # 375 756=36 550 pxor @t[5], @x[6] # 64 5 [7] 551 pxor @t[2], @x[3] # 36 2 552 pxor @t[4], @x[3] # 362 4 [5] 553 pshufd \$0x93, @t[5], @t[5] 554 ___ 555 my @y = @x[7,5,0,2,1,3,4,6]; 556 $code.=<<___; 557 # multiplication by 0x0b 558 pxor @y[0], @y[1] 559 pxor @t[0], @y[0] 560 pxor @t[1], @y[1] 561 pshufd \$0x93, @t[2], @t[2] 562 pxor @t[5], @y[0] 563 pxor @t[6], @y[1] 564 pxor @t[7], @y[0] 565 pshufd \$0x93, @t[4], @t[4] 566 pxor @t[6], @t[7] # clobber t[7] 567 pxor @y[0], @y[1] 568 569 pxor @t[0], @y[3] 570 pshufd \$0x93, @t[0], @t[0] 571 pxor @t[1], @y[2] 572 pxor @t[1], @y[4] 573 pxor @t[2], @y[2] 574 pshufd \$0x93, @t[1], @t[1] 575 pxor @t[2], @y[3] 576 pxor @t[2], @y[5] 577 pxor @t[7], @y[2] 578 pshufd \$0x93, @t[2], @t[2] 579 pxor @t[3], @y[3] 580 pxor @t[3], @y[6] 581 pxor @t[3], @y[4] 582 pshufd \$0x93, @t[3], @t[3] 583 pxor @t[4], @y[7] 584 pxor @t[4], @y[5] 585 pxor @t[7], @y[7] 586 pxor @t[5], @y[3] 587 pxor @t[4], @y[4] 588 pxor @t[5], @t[7] # clobber t[7] even more 589 590 pxor @t[7], @y[5] 591 pshufd \$0x93, @t[4], @t[4] 592 pxor @t[7], @y[6] 593 pxor @t[7], @y[4] 594 595 pxor @t[5], @t[7] 596 pshufd \$0x93, @t[5], @t[5] 597 pxor @t[6], @t[7] # restore t[7] 598 599 # multiplication by 0x0d 600 pxor @y[7], @y[4] 601 pxor @t[4], @y[7] 602 pshufd \$0x93, @t[6], @t[6] 603 pxor @t[0], @y[2] 604 pxor @t[5], @y[7] 605 pxor @t[2], @y[2] 606 pshufd \$0x93, @t[7], @t[7] 607 608 pxor @y[1], @y[3] 609 pxor @t[1], @y[1] 610 pxor @t[0], @y[0] 611 pxor @t[0], @y[3] 612 pxor @t[5], @y[1] 613 pxor @t[5], @y[0] 614 pxor @t[7], @y[1] 615 pshufd \$0x93, @t[0], @t[0] 616 pxor @t[6], @y[0] 617 pxor @y[1], @y[3] 618 pxor @t[1], @y[4] 619 pshufd \$0x93, @t[1], @t[1] 620 621 pxor @t[7], @y[7] 622 pxor @t[2], @y[4] 623 pxor @t[2], @y[5] 624 pshufd \$0x93, @t[2], @t[2] 625 pxor @t[6], @y[2] 626 pxor @t[3], @t[6] # clobber t[6] 627 pxor @y[7], @y[4] 628 pxor @t[6], @y[3] 629 630 pxor @t[6], @y[6] 631 pxor @t[5], @y[5] 632 pxor @t[4], @y[6] 633 pshufd \$0x93, @t[4], @t[4] 634 pxor @t[6], @y[5] 635 pxor @t[7], @y[6] 636 pxor @t[3], @t[6] # restore t[6] 637 638 pshufd \$0x93, @t[5], @t[5] 639 pshufd \$0x93, @t[6], @t[6] 640 pshufd \$0x93, @t[7], @t[7] 641 pshufd \$0x93, @t[3], @t[3] 642 643 # multiplication by 0x09 644 pxor @y[1], @y[4] 645 pxor @y[1], @t[1] # t[1]=y[1] 646 pxor @t[5], @t[0] # clobber t[0] 647 pxor @t[5], @t[1] 648 pxor @t[0], @y[3] 649 pxor @y[0], @t[0] # t[0]=y[0] 650 pxor @t[6], @t[1] 651 pxor @t[7], @t[6] # clobber t[6] 652 pxor @t[1], @y[4] 653 pxor @t[4], @y[7] 654 pxor @y[4], @t[4] # t[4]=y[4] 655 pxor @t[3], @y[6] 656 pxor @y[3], @t[3] # t[3]=y[3] 657 pxor @t[2], @y[5] 658 pxor @y[2], @t[2] # t[2]=y[2] 659 pxor @t[7], @t[3] 660 pxor @y[5], @t[5] # t[5]=y[5] 661 pxor @t[6], @t[2] 662 pxor @t[6], @t[5] 663 pxor @y[6], @t[6] # t[6]=y[6] 664 pxor @y[7], @t[7] # t[7]=y[7] 665 666 movdqa @t[0],@XMM[0] 667 movdqa @t[1],@XMM[1] 668 movdqa 
@t[2],@XMM[2] 669 movdqa @t[3],@XMM[3] 670 movdqa @t[4],@XMM[4] 671 movdqa @t[5],@XMM[5] 672 movdqa @t[6],@XMM[6] 673 movdqa @t[7],@XMM[7] 674 ___ 675 } 676 677 sub InvMixColumns { 678 my @x=@_[0..7]; 679 my @t=@_[8..15]; 680 681 # Thanks to Jussi Kivilinna for providing pointer to 682 # 683 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | 684 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | 685 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | 686 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | 687 688 $code.=<<___; 689 # multiplication by 0x05-0x00-0x04-0x00 690 pshufd \$0x4E, @x[0], @t[0] 691 pshufd \$0x4E, @x[6], @t[6] 692 pxor @x[0], @t[0] 693 pshufd \$0x4E, @x[7], @t[7] 694 pxor @x[6], @t[6] 695 pshufd \$0x4E, @x[1], @t[1] 696 pxor @x[7], @t[7] 697 pshufd \$0x4E, @x[2], @t[2] 698 pxor @x[1], @t[1] 699 pshufd \$0x4E, @x[3], @t[3] 700 pxor @x[2], @t[2] 701 pxor @t[6], @x[0] 702 pxor @t[6], @x[1] 703 pshufd \$0x4E, @x[4], @t[4] 704 pxor @x[3], @t[3] 705 pxor @t[0], @x[2] 706 pxor @t[1], @x[3] 707 pshufd \$0x4E, @x[5], @t[5] 708 pxor @x[4], @t[4] 709 pxor @t[7], @x[1] 710 pxor @t[2], @x[4] 711 pxor @x[5], @t[5] 712 713 pxor @t[7], @x[2] 714 pxor @t[6], @x[3] 715 pxor @t[6], @x[4] 716 pxor @t[3], @x[5] 717 pxor @t[4], @x[6] 718 pxor @t[7], @x[4] 719 pxor @t[7], @x[5] 720 pxor @t[5], @x[7] 721 ___ 722 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6 723 } 724 725 sub aesenc { # not used 726 my @b=@_[0..7]; 727 my @t=@_[8..15]; 728 $code.=<<___; 729 movdqa 0x30($const),@t[0] # .LSR 730 ___ 731 &ShiftRows (@b,@t[0]); 732 &Sbox (@b,@t); 733 &MixColumns (@b[0,1,4,6,3,7,2,5],@t); 734 } 735 736 sub aesenclast { # not used 737 my @b=@_[0..7]; 738 my @t=@_[8..15]; 739 $code.=<<___; 740 movdqa 0x40($const),@t[0] # .LSRM0 741 ___ 742 &ShiftRows (@b,@t[0]); 743 &Sbox (@b,@t); 744 $code.=<<___ 745 pxor 0x00($key),@b[0] 746 pxor 0x10($key),@b[1] 747 pxor 0x20($key),@b[4] 748 pxor 0x30($key),@b[6] 749 pxor 0x40($key),@b[3] 750 pxor 0x50($key),@b[7] 751 pxor 0x60($key),@b[2] 752 pxor 0x70($key),@b[5] 753 ___ 754 } 755 756 sub swapmove { 757 my ($a,$b,$n,$mask,$t)=@_; 758 $code.=<<___; 759 movdqa $b,$t 760 psrlq \$$n,$b 761 pxor $a,$b 762 pand $mask,$b 763 pxor $b,$a 764 psllq \$$n,$b 765 pxor $t,$b 766 ___ 767 } 768 sub swapmove2x { 769 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; 770 $code.=<<___; 771 movdqa $b0,$t0 772 psrlq \$$n,$b0 773 movdqa $b1,$t1 774 psrlq \$$n,$b1 775 pxor $a0,$b0 776 pxor $a1,$b1 777 pand $mask,$b0 778 pand $mask,$b1 779 pxor $b0,$a0 780 psllq \$$n,$b0 781 pxor $b1,$a1 782 psllq \$$n,$b1 783 pxor $t0,$b0 784 pxor $t1,$b1 785 ___ 786 } 787 788 sub bitslice { 789 my @x=reverse(@_[0..7]); 790 my ($t0,$t1,$t2,$t3)=@_[8..11]; 791 $code.=<<___; 792 movdqa 0x00($const),$t0 # .LBS0 793 movdqa 0x10($const),$t1 # .LBS1 794 ___ 795 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); 796 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 797 $code.=<<___; 798 movdqa 0x20($const),$t0 # .LBS2 799 ___ 800 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); 801 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 802 803 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); 804 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); 805 } 806 807 $code.=<<___; 808 .text 809 810 .extern asm_AES_encrypt 811 .extern asm_AES_decrypt 812 813 .type _bsaes_encrypt8,\@abi-omnipotent 814 .align 64 815 _bsaes_encrypt8: 816 lea .LBS0(%rip), $const # constants table 817 818 movdqa ($key), @XMM[9] # round 0 key 819 lea 0x10($key), $key 820 movdqa 0x50($const), @XMM[8] # .LM0SR 821 pxor @XMM[9], @XMM[0] # xor with round0 key 822 pxor @XMM[9], @XMM[1] 823 pshufb @XMM[8], @XMM[0] 
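# The .LM0SR shuffle applied here appears to fold the bit-slice loading
# permutation (.LM0) together with the first ShiftRows, which is how one
# shiftrows() per block is saved and why the round loop is entered at
# .Lenc_sbox below rather than at the ShiftRows step (see the
# September 2011 notes at the top of this file).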
824 pxor @XMM[9], @XMM[2] 825 pshufb @XMM[8], @XMM[1] 826 pxor @XMM[9], @XMM[3] 827 pshufb @XMM[8], @XMM[2] 828 pxor @XMM[9], @XMM[4] 829 pshufb @XMM[8], @XMM[3] 830 pxor @XMM[9], @XMM[5] 831 pshufb @XMM[8], @XMM[4] 832 pxor @XMM[9], @XMM[6] 833 pshufb @XMM[8], @XMM[5] 834 pxor @XMM[9], @XMM[7] 835 pshufb @XMM[8], @XMM[6] 836 pshufb @XMM[8], @XMM[7] 837 _bsaes_encrypt8_bitslice: 838 ___ 839 &bitslice (@XMM[0..7, 8..11]); 840 $code.=<<___; 841 dec $rounds 842 jmp .Lenc_sbox 843 .align 16 844 .Lenc_loop: 845 ___ 846 &ShiftRows (@XMM[0..7, 8]); 847 $code.=".Lenc_sbox:\n"; 848 &Sbox (@XMM[0..7, 8..15]); 849 $code.=<<___; 850 dec $rounds 851 jl .Lenc_done 852 ___ 853 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); 854 $code.=<<___; 855 movdqa 0x30($const), @XMM[8] # .LSR 856 jnz .Lenc_loop 857 movdqa 0x40($const), @XMM[8] # .LSRM0 858 jmp .Lenc_loop 859 .align 16 860 .Lenc_done: 861 ___ 862 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb 863 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); 864 $code.=<<___; 865 movdqa ($key), @XMM[8] # last round key 866 pxor @XMM[8], @XMM[4] 867 pxor @XMM[8], @XMM[6] 868 pxor @XMM[8], @XMM[3] 869 pxor @XMM[8], @XMM[7] 870 pxor @XMM[8], @XMM[2] 871 pxor @XMM[8], @XMM[5] 872 pxor @XMM[8], @XMM[0] 873 pxor @XMM[8], @XMM[1] 874 ret 875 .size _bsaes_encrypt8,.-_bsaes_encrypt8 876 877 .type _bsaes_decrypt8,\@abi-omnipotent 878 .align 64 879 _bsaes_decrypt8: 880 lea .LBS0(%rip), $const # constants table 881 882 movdqa ($key), @XMM[9] # round 0 key 883 lea 0x10($key), $key 884 movdqa -0x30($const), @XMM[8] # .LM0ISR 885 pxor @XMM[9], @XMM[0] # xor with round0 key 886 pxor @XMM[9], @XMM[1] 887 pshufb @XMM[8], @XMM[0] 888 pxor @XMM[9], @XMM[2] 889 pshufb @XMM[8], @XMM[1] 890 pxor @XMM[9], @XMM[3] 891 pshufb @XMM[8], @XMM[2] 892 pxor @XMM[9], @XMM[4] 893 pshufb @XMM[8], @XMM[3] 894 pxor @XMM[9], @XMM[5] 895 pshufb @XMM[8], @XMM[4] 896 pxor @XMM[9], @XMM[6] 897 pshufb @XMM[8], @XMM[5] 898 pxor @XMM[9], @XMM[7] 899 pshufb @XMM[8], @XMM[6] 900 pshufb @XMM[8], @XMM[7] 901 ___ 902 &bitslice (@XMM[0..7, 8..11]); 903 $code.=<<___; 904 dec $rounds 905 jmp .Ldec_sbox 906 .align 16 907 .Ldec_loop: 908 ___ 909 &ShiftRows (@XMM[0..7, 8]); 910 $code.=".Ldec_sbox:\n"; 911 &InvSbox (@XMM[0..7, 8..15]); 912 $code.=<<___; 913 dec $rounds 914 jl .Ldec_done 915 ___ 916 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); 917 $code.=<<___; 918 movdqa -0x10($const), @XMM[8] # .LISR 919 jnz .Ldec_loop 920 movdqa -0x20($const), @XMM[8] # .LISRM0 921 jmp .Ldec_loop 922 .align 16 923 .Ldec_done: 924 ___ 925 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); 926 $code.=<<___; 927 movdqa ($key), @XMM[8] # last round key 928 pxor @XMM[8], @XMM[6] 929 pxor @XMM[8], @XMM[4] 930 pxor @XMM[8], @XMM[2] 931 pxor @XMM[8], @XMM[7] 932 pxor @XMM[8], @XMM[3] 933 pxor @XMM[8], @XMM[5] 934 pxor @XMM[8], @XMM[0] 935 pxor @XMM[8], @XMM[1] 936 ret 937 .size _bsaes_decrypt8,.-_bsaes_decrypt8 938 ___ 939 } 940 { 941 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); 942 943 sub bitslice_key { 944 my @x=reverse(@_[0..7]); 945 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; 946 947 &swapmove (@x[0,1],1,$bs0,$t2,$t3); 948 $code.=<<___; 949 #&swapmove(@x[2,3],1,$t0,$t2,$t3); 950 movdqa @x[0], @x[2] 951 movdqa @x[1], @x[3] 952 ___ 953 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); 954 955 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); 956 $code.=<<___; 957 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); 958 movdqa @x[0], @x[4] 959 movdqa @x[2], @x[6] 960 movdqa @x[1], @x[5] 961 movdqa @x[3], @x[7] 962 ___ 963 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); 
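# swapmove/swapmove2x implement the classic "delta swap" used for this
# bit-matrix transposition: t = ((b >> n) ^ a) & mask, a ^= t,
# b ^= t << n, i.e. the bits of a selected by mask are exchanged with
# the bits of b sitting n positions higher. Applying it with n = 1, 2
# and 4 converts between the byte-sliced and bit-sliced representations.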
964 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); 965 } 966 967 $code.=<<___; 968 .type _bsaes_key_convert,\@abi-omnipotent 969 .align 16 970 _bsaes_key_convert: 971 lea .Lmasks(%rip), $const 972 movdqu ($inp), %xmm7 # load round 0 key 973 lea 0x10($inp), $inp 974 movdqa 0x00($const), %xmm0 # 0x01... 975 movdqa 0x10($const), %xmm1 # 0x02... 976 movdqa 0x20($const), %xmm2 # 0x04... 977 movdqa 0x30($const), %xmm3 # 0x08... 978 movdqa 0x40($const), %xmm4 # .LM0 979 pcmpeqd %xmm5, %xmm5 # .LNOT 980 981 movdqu ($inp), %xmm6 # load round 1 key 982 movdqa %xmm7, ($out) # save round 0 key 983 lea 0x10($out), $out 984 dec $rounds 985 jmp .Lkey_loop 986 .align 16 987 .Lkey_loop: 988 pshufb %xmm4, %xmm6 # .LM0 989 990 movdqa %xmm0, %xmm8 991 movdqa %xmm1, %xmm9 992 993 pand %xmm6, %xmm8 994 pand %xmm6, %xmm9 995 movdqa %xmm2, %xmm10 996 pcmpeqb %xmm0, %xmm8 997 psllq \$4, %xmm0 # 0x10... 998 movdqa %xmm3, %xmm11 999 pcmpeqb %xmm1, %xmm9 1000 psllq \$4, %xmm1 # 0x20... 1001 1002 pand %xmm6, %xmm10 1003 pand %xmm6, %xmm11 1004 movdqa %xmm0, %xmm12 1005 pcmpeqb %xmm2, %xmm10 1006 psllq \$4, %xmm2 # 0x40... 1007 movdqa %xmm1, %xmm13 1008 pcmpeqb %xmm3, %xmm11 1009 psllq \$4, %xmm3 # 0x80... 1010 1011 movdqa %xmm2, %xmm14 1012 movdqa %xmm3, %xmm15 1013 pxor %xmm5, %xmm8 # "pnot" 1014 pxor %xmm5, %xmm9 1015 1016 pand %xmm6, %xmm12 1017 pand %xmm6, %xmm13 1018 movdqa %xmm8, 0x00($out) # write bit-sliced round key 1019 pcmpeqb %xmm0, %xmm12 1020 psrlq \$4, %xmm0 # 0x01... 1021 movdqa %xmm9, 0x10($out) 1022 pcmpeqb %xmm1, %xmm13 1023 psrlq \$4, %xmm1 # 0x02... 1024 lea 0x10($inp), $inp 1025 1026 pand %xmm6, %xmm14 1027 pand %xmm6, %xmm15 1028 movdqa %xmm10, 0x20($out) 1029 pcmpeqb %xmm2, %xmm14 1030 psrlq \$4, %xmm2 # 0x04... 1031 movdqa %xmm11, 0x30($out) 1032 pcmpeqb %xmm3, %xmm15 1033 psrlq \$4, %xmm3 # 0x08... 1034 movdqu ($inp), %xmm6 # load next round key 1035 1036 pxor %xmm5, %xmm13 # "pnot" 1037 pxor %xmm5, %xmm14 1038 movdqa %xmm12, 0x40($out) 1039 movdqa %xmm13, 0x50($out) 1040 movdqa %xmm14, 0x60($out) 1041 movdqa %xmm15, 0x70($out) 1042 lea 0x80($out),$out 1043 dec $rounds 1044 jnz .Lkey_loop 1045 1046 movdqa 0x50($const), %xmm7 # .L63 1047 #movdqa %xmm6, ($out) # don't save last round key 1048 ret 1049 .size _bsaes_key_convert,.-_bsaes_key_convert 1050 ___ 1051 } 1052 1053 if (0 && !$win64) { # following four functions are unsupported interface 1054 # used for benchmarking... 
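# A reader's summary of what _bsaes_key_convert above produces (derived
# from the code, not a formal interface): every 16-byte round key from
# AES_set_encrypt_key is expanded into 8 xmm "slices" (128 bytes per
# round); the pcmpeqb-against-mask trick broadcasts each key bit into a
# full byte, and slices 0, 1, 5 and 6 are complemented, which is the same
# as XORing 0x63 into every key byte - the S-box affine constant is thus
# carried by the round keys instead of by Sbox() itself. The last round
# key is left unconverted in %xmm6, and callers fold the 0x63 pattern
# returned in %xmm7 into either that last round key (encryption) or the
# round 0 key (decryption), as the wrappers below and in the OpenSSL
# interface section do.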
1055 $code.=<<___; 1056 .globl bsaes_enc_key_convert 1057 .type bsaes_enc_key_convert,\@function,2 1058 .align 16 1059 bsaes_enc_key_convert: 1060 mov 240($inp),%r10d # pass rounds 1061 mov $inp,%rcx # pass key 1062 mov $out,%rax # pass key schedule 1063 call _bsaes_key_convert 1064 pxor %xmm6,%xmm7 # fix up last round key 1065 movdqa %xmm7,(%rax) # save last round key 1066 ret 1067 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert 1068 1069 .globl bsaes_encrypt_128 1070 .type bsaes_encrypt_128,\@function,4 1071 .align 16 1072 bsaes_encrypt_128: 1073 .Lenc128_loop: 1074 movdqu 0x00($inp), @XMM[0] # load input 1075 movdqu 0x10($inp), @XMM[1] 1076 movdqu 0x20($inp), @XMM[2] 1077 movdqu 0x30($inp), @XMM[3] 1078 movdqu 0x40($inp), @XMM[4] 1079 movdqu 0x50($inp), @XMM[5] 1080 movdqu 0x60($inp), @XMM[6] 1081 movdqu 0x70($inp), @XMM[7] 1082 mov $key, %rax # pass the $key 1083 lea 0x80($inp), $inp 1084 mov \$10,%r10d 1085 1086 call _bsaes_encrypt8 1087 1088 movdqu @XMM[0], 0x00($out) # write output 1089 movdqu @XMM[1], 0x10($out) 1090 movdqu @XMM[4], 0x20($out) 1091 movdqu @XMM[6], 0x30($out) 1092 movdqu @XMM[3], 0x40($out) 1093 movdqu @XMM[7], 0x50($out) 1094 movdqu @XMM[2], 0x60($out) 1095 movdqu @XMM[5], 0x70($out) 1096 lea 0x80($out), $out 1097 sub \$0x80,$len 1098 ja .Lenc128_loop 1099 ret 1100 .size bsaes_encrypt_128,.-bsaes_encrypt_128 1101 1102 .globl bsaes_dec_key_convert 1103 .type bsaes_dec_key_convert,\@function,2 1104 .align 16 1105 bsaes_dec_key_convert: 1106 mov 240($inp),%r10d # pass rounds 1107 mov $inp,%rcx # pass key 1108 mov $out,%rax # pass key schedule 1109 call _bsaes_key_convert 1110 pxor ($out),%xmm7 # fix up round 0 key 1111 movdqa %xmm6,(%rax) # save last round key 1112 movdqa %xmm7,($out) 1113 ret 1114 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert 1115 1116 .globl bsaes_decrypt_128 1117 .type bsaes_decrypt_128,\@function,4 1118 .align 16 1119 bsaes_decrypt_128: 1120 .Ldec128_loop: 1121 movdqu 0x00($inp), @XMM[0] # load input 1122 movdqu 0x10($inp), @XMM[1] 1123 movdqu 0x20($inp), @XMM[2] 1124 movdqu 0x30($inp), @XMM[3] 1125 movdqu 0x40($inp), @XMM[4] 1126 movdqu 0x50($inp), @XMM[5] 1127 movdqu 0x60($inp), @XMM[6] 1128 movdqu 0x70($inp), @XMM[7] 1129 mov $key, %rax # pass the $key 1130 lea 0x80($inp), $inp 1131 mov \$10,%r10d 1132 1133 call _bsaes_decrypt8 1134 1135 movdqu @XMM[0], 0x00($out) # write output 1136 movdqu @XMM[1], 0x10($out) 1137 movdqu @XMM[6], 0x20($out) 1138 movdqu @XMM[4], 0x30($out) 1139 movdqu @XMM[2], 0x40($out) 1140 movdqu @XMM[7], 0x50($out) 1141 movdqu @XMM[3], 0x60($out) 1142 movdqu @XMM[5], 0x70($out) 1143 lea 0x80($out), $out 1144 sub \$0x80,$len 1145 ja .Ldec128_loop 1146 ret 1147 .size bsaes_decrypt_128,.-bsaes_decrypt_128 1148 ___ 1149 } 1150 { 1151 ###################################################################### 1152 # 1153 # OpenSSL interface 1154 # 1155 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? 
("%rcx","%rdx","%r8","%r9","%r10","%r11d") 1156 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1157 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); 1158 1159 if ($ecb) { 1160 $code.=<<___; 1161 .globl bsaes_ecb_encrypt_blocks 1162 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent 1163 .align 16 1164 bsaes_ecb_encrypt_blocks: 1165 mov %rsp, %rax 1166 .Lecb_enc_prologue: 1167 push %rbp 1168 push %rbx 1169 push %r12 1170 push %r13 1171 push %r14 1172 push %r15 1173 lea -0x48(%rsp),%rsp 1174 ___ 1175 $code.=<<___ if ($win64); 1176 lea -0xa0(%rsp), %rsp 1177 movaps %xmm6, 0x40(%rsp) 1178 movaps %xmm7, 0x50(%rsp) 1179 movaps %xmm8, 0x60(%rsp) 1180 movaps %xmm9, 0x70(%rsp) 1181 movaps %xmm10, 0x80(%rsp) 1182 movaps %xmm11, 0x90(%rsp) 1183 movaps %xmm12, 0xa0(%rsp) 1184 movaps %xmm13, 0xb0(%rsp) 1185 movaps %xmm14, 0xc0(%rsp) 1186 movaps %xmm15, 0xd0(%rsp) 1187 .Lecb_enc_body: 1188 ___ 1189 $code.=<<___; 1190 mov %rsp,%rbp # backup %rsp 1191 mov 240($arg4),%eax # rounds 1192 mov $arg1,$inp # backup arguments 1193 mov $arg2,$out 1194 mov $arg3,$len 1195 mov $arg4,$key 1196 cmp \$8,$arg3 1197 jb .Lecb_enc_short 1198 1199 mov %eax,%ebx # backup rounds 1200 shl \$7,%rax # 128 bytes per inner round key 1201 sub \$`128-32`,%rax # size of bit-sliced key schedule 1202 sub %rax,%rsp 1203 mov %rsp,%rax # pass key schedule 1204 mov $key,%rcx # pass key 1205 mov %ebx,%r10d # pass rounds 1206 call _bsaes_key_convert 1207 pxor %xmm6,%xmm7 # fix up last round key 1208 movdqa %xmm7,(%rax) # save last round key 1209 1210 sub \$8,$len 1211 .Lecb_enc_loop: 1212 movdqu 0x00($inp), @XMM[0] # load input 1213 movdqu 0x10($inp), @XMM[1] 1214 movdqu 0x20($inp), @XMM[2] 1215 movdqu 0x30($inp), @XMM[3] 1216 movdqu 0x40($inp), @XMM[4] 1217 movdqu 0x50($inp), @XMM[5] 1218 mov %rsp, %rax # pass key schedule 1219 movdqu 0x60($inp), @XMM[6] 1220 mov %ebx,%r10d # pass rounds 1221 movdqu 0x70($inp), @XMM[7] 1222 lea 0x80($inp), $inp 1223 1224 call _bsaes_encrypt8 1225 1226 movdqu @XMM[0], 0x00($out) # write output 1227 movdqu @XMM[1], 0x10($out) 1228 movdqu @XMM[4], 0x20($out) 1229 movdqu @XMM[6], 0x30($out) 1230 movdqu @XMM[3], 0x40($out) 1231 movdqu @XMM[7], 0x50($out) 1232 movdqu @XMM[2], 0x60($out) 1233 movdqu @XMM[5], 0x70($out) 1234 lea 0x80($out), $out 1235 sub \$8,$len 1236 jnc .Lecb_enc_loop 1237 1238 add \$8,$len 1239 jz .Lecb_enc_done 1240 1241 movdqu 0x00($inp), @XMM[0] # load input 1242 mov %rsp, %rax # pass key schedule 1243 mov %ebx,%r10d # pass rounds 1244 cmp \$2,$len 1245 jb .Lecb_enc_one 1246 movdqu 0x10($inp), @XMM[1] 1247 je .Lecb_enc_two 1248 movdqu 0x20($inp), @XMM[2] 1249 cmp \$4,$len 1250 jb .Lecb_enc_three 1251 movdqu 0x30($inp), @XMM[3] 1252 je .Lecb_enc_four 1253 movdqu 0x40($inp), @XMM[4] 1254 cmp \$6,$len 1255 jb .Lecb_enc_five 1256 movdqu 0x50($inp), @XMM[5] 1257 je .Lecb_enc_six 1258 movdqu 0x60($inp), @XMM[6] 1259 call _bsaes_encrypt8 1260 movdqu @XMM[0], 0x00($out) # write output 1261 movdqu @XMM[1], 0x10($out) 1262 movdqu @XMM[4], 0x20($out) 1263 movdqu @XMM[6], 0x30($out) 1264 movdqu @XMM[3], 0x40($out) 1265 movdqu @XMM[7], 0x50($out) 1266 movdqu @XMM[2], 0x60($out) 1267 jmp .Lecb_enc_done 1268 .align 16 1269 .Lecb_enc_six: 1270 call _bsaes_encrypt8 1271 movdqu @XMM[0], 0x00($out) # write output 1272 movdqu @XMM[1], 0x10($out) 1273 movdqu @XMM[4], 0x20($out) 1274 movdqu @XMM[6], 0x30($out) 1275 movdqu @XMM[3], 0x40($out) 1276 movdqu @XMM[7], 0x50($out) 1277 jmp .Lecb_enc_done 1278 .align 16 1279 .Lecb_enc_five: 1280 call _bsaes_encrypt8 1281 movdqu @XMM[0], 0x00($out) # write output 1282 
movdqu @XMM[1], 0x10($out) 1283 movdqu @XMM[4], 0x20($out) 1284 movdqu @XMM[6], 0x30($out) 1285 movdqu @XMM[3], 0x40($out) 1286 jmp .Lecb_enc_done 1287 .align 16 1288 .Lecb_enc_four: 1289 call _bsaes_encrypt8 1290 movdqu @XMM[0], 0x00($out) # write output 1291 movdqu @XMM[1], 0x10($out) 1292 movdqu @XMM[4], 0x20($out) 1293 movdqu @XMM[6], 0x30($out) 1294 jmp .Lecb_enc_done 1295 .align 16 1296 .Lecb_enc_three: 1297 call _bsaes_encrypt8 1298 movdqu @XMM[0], 0x00($out) # write output 1299 movdqu @XMM[1], 0x10($out) 1300 movdqu @XMM[4], 0x20($out) 1301 jmp .Lecb_enc_done 1302 .align 16 1303 .Lecb_enc_two: 1304 call _bsaes_encrypt8 1305 movdqu @XMM[0], 0x00($out) # write output 1306 movdqu @XMM[1], 0x10($out) 1307 jmp .Lecb_enc_done 1308 .align 16 1309 .Lecb_enc_one: 1310 call _bsaes_encrypt8 1311 movdqu @XMM[0], 0x00($out) # write output 1312 jmp .Lecb_enc_done 1313 .align 16 1314 .Lecb_enc_short: 1315 lea ($inp), $arg1 1316 lea ($out), $arg2 1317 lea ($key), $arg3 1318 call asm_AES_encrypt 1319 lea 16($inp), $inp 1320 lea 16($out), $out 1321 dec $len 1322 jnz .Lecb_enc_short 1323 1324 .Lecb_enc_done: 1325 lea (%rsp),%rax 1326 pxor %xmm0, %xmm0 1327 .Lecb_enc_bzero: # wipe key schedule [if any] 1328 movdqa %xmm0, 0x00(%rax) 1329 movdqa %xmm0, 0x10(%rax) 1330 lea 0x20(%rax), %rax 1331 cmp %rax, %rbp 1332 jb .Lecb_enc_bzero 1333 1334 lea (%rbp),%rsp # restore %rsp 1335 ___ 1336 $code.=<<___ if ($win64); 1337 movaps 0x40(%rbp), %xmm6 1338 movaps 0x50(%rbp), %xmm7 1339 movaps 0x60(%rbp), %xmm8 1340 movaps 0x70(%rbp), %xmm9 1341 movaps 0x80(%rbp), %xmm10 1342 movaps 0x90(%rbp), %xmm11 1343 movaps 0xa0(%rbp), %xmm12 1344 movaps 0xb0(%rbp), %xmm13 1345 movaps 0xc0(%rbp), %xmm14 1346 movaps 0xd0(%rbp), %xmm15 1347 lea 0xa0(%rbp), %rsp 1348 ___ 1349 $code.=<<___; 1350 mov 0x48(%rsp), %r15 1351 mov 0x50(%rsp), %r14 1352 mov 0x58(%rsp), %r13 1353 mov 0x60(%rsp), %r12 1354 mov 0x68(%rsp), %rbx 1355 mov 0x70(%rsp), %rax 1356 lea 0x78(%rsp), %rsp 1357 mov %rax, %rbp 1358 .Lecb_enc_epilogue: 1359 ret 1360 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks 1361 1362 .globl bsaes_ecb_decrypt_blocks 1363 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent 1364 .align 16 1365 bsaes_ecb_decrypt_blocks: 1366 mov %rsp, %rax 1367 .Lecb_dec_prologue: 1368 push %rbp 1369 push %rbx 1370 push %r12 1371 push %r13 1372 push %r14 1373 push %r15 1374 lea -0x48(%rsp),%rsp 1375 ___ 1376 $code.=<<___ if ($win64); 1377 lea -0xa0(%rsp), %rsp 1378 movaps %xmm6, 0x40(%rsp) 1379 movaps %xmm7, 0x50(%rsp) 1380 movaps %xmm8, 0x60(%rsp) 1381 movaps %xmm9, 0x70(%rsp) 1382 movaps %xmm10, 0x80(%rsp) 1383 movaps %xmm11, 0x90(%rsp) 1384 movaps %xmm12, 0xa0(%rsp) 1385 movaps %xmm13, 0xb0(%rsp) 1386 movaps %xmm14, 0xc0(%rsp) 1387 movaps %xmm15, 0xd0(%rsp) 1388 .Lecb_dec_body: 1389 ___ 1390 $code.=<<___; 1391 mov %rsp,%rbp # backup %rsp 1392 mov 240($arg4),%eax # rounds 1393 mov $arg1,$inp # backup arguments 1394 mov $arg2,$out 1395 mov $arg3,$len 1396 mov $arg4,$key 1397 cmp \$8,$arg3 1398 jb .Lecb_dec_short 1399 1400 mov %eax,%ebx # backup rounds 1401 shl \$7,%rax # 128 bytes per inner round key 1402 sub \$`128-32`,%rax # size of bit-sliced key schedule 1403 sub %rax,%rsp 1404 mov %rsp,%rax # pass key schedule 1405 mov $key,%rcx # pass key 1406 mov %ebx,%r10d # pass rounds 1407 call _bsaes_key_convert 1408 pxor (%rsp),%xmm7 # fix up 0 round key 1409 movdqa %xmm6,(%rax) # save last round key 1410 movdqa %xmm7,(%rsp) 1411 1412 sub \$8,$len 1413 .Lecb_dec_loop: 1414 movdqu 0x00($inp), @XMM[0] # load input 1415 movdqu 0x10($inp), @XMM[1] 1416 
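# _bsaes_decrypt8 expects the key schedule pointer in %rax and the round
# count in %r10d (set a few lines below) and returns the eight plaintext
# blocks permuted as 0,1,6,4,2,7,3,5, which is the order the stores
# after the call follow.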
movdqu 0x20($inp), @XMM[2] 1417 movdqu 0x30($inp), @XMM[3] 1418 movdqu 0x40($inp), @XMM[4] 1419 movdqu 0x50($inp), @XMM[5] 1420 mov %rsp, %rax # pass key schedule 1421 movdqu 0x60($inp), @XMM[6] 1422 mov %ebx,%r10d # pass rounds 1423 movdqu 0x70($inp), @XMM[7] 1424 lea 0x80($inp), $inp 1425 1426 call _bsaes_decrypt8 1427 1428 movdqu @XMM[0], 0x00($out) # write output 1429 movdqu @XMM[1], 0x10($out) 1430 movdqu @XMM[6], 0x20($out) 1431 movdqu @XMM[4], 0x30($out) 1432 movdqu @XMM[2], 0x40($out) 1433 movdqu @XMM[7], 0x50($out) 1434 movdqu @XMM[3], 0x60($out) 1435 movdqu @XMM[5], 0x70($out) 1436 lea 0x80($out), $out 1437 sub \$8,$len 1438 jnc .Lecb_dec_loop 1439 1440 add \$8,$len 1441 jz .Lecb_dec_done 1442 1443 movdqu 0x00($inp), @XMM[0] # load input 1444 mov %rsp, %rax # pass key schedule 1445 mov %ebx,%r10d # pass rounds 1446 cmp \$2,$len 1447 jb .Lecb_dec_one 1448 movdqu 0x10($inp), @XMM[1] 1449 je .Lecb_dec_two 1450 movdqu 0x20($inp), @XMM[2] 1451 cmp \$4,$len 1452 jb .Lecb_dec_three 1453 movdqu 0x30($inp), @XMM[3] 1454 je .Lecb_dec_four 1455 movdqu 0x40($inp), @XMM[4] 1456 cmp \$6,$len 1457 jb .Lecb_dec_five 1458 movdqu 0x50($inp), @XMM[5] 1459 je .Lecb_dec_six 1460 movdqu 0x60($inp), @XMM[6] 1461 call _bsaes_decrypt8 1462 movdqu @XMM[0], 0x00($out) # write output 1463 movdqu @XMM[1], 0x10($out) 1464 movdqu @XMM[6], 0x20($out) 1465 movdqu @XMM[4], 0x30($out) 1466 movdqu @XMM[2], 0x40($out) 1467 movdqu @XMM[7], 0x50($out) 1468 movdqu @XMM[3], 0x60($out) 1469 jmp .Lecb_dec_done 1470 .align 16 1471 .Lecb_dec_six: 1472 call _bsaes_decrypt8 1473 movdqu @XMM[0], 0x00($out) # write output 1474 movdqu @XMM[1], 0x10($out) 1475 movdqu @XMM[6], 0x20($out) 1476 movdqu @XMM[4], 0x30($out) 1477 movdqu @XMM[2], 0x40($out) 1478 movdqu @XMM[7], 0x50($out) 1479 jmp .Lecb_dec_done 1480 .align 16 1481 .Lecb_dec_five: 1482 call _bsaes_decrypt8 1483 movdqu @XMM[0], 0x00($out) # write output 1484 movdqu @XMM[1], 0x10($out) 1485 movdqu @XMM[6], 0x20($out) 1486 movdqu @XMM[4], 0x30($out) 1487 movdqu @XMM[2], 0x40($out) 1488 jmp .Lecb_dec_done 1489 .align 16 1490 .Lecb_dec_four: 1491 call _bsaes_decrypt8 1492 movdqu @XMM[0], 0x00($out) # write output 1493 movdqu @XMM[1], 0x10($out) 1494 movdqu @XMM[6], 0x20($out) 1495 movdqu @XMM[4], 0x30($out) 1496 jmp .Lecb_dec_done 1497 .align 16 1498 .Lecb_dec_three: 1499 call _bsaes_decrypt8 1500 movdqu @XMM[0], 0x00($out) # write output 1501 movdqu @XMM[1], 0x10($out) 1502 movdqu @XMM[6], 0x20($out) 1503 jmp .Lecb_dec_done 1504 .align 16 1505 .Lecb_dec_two: 1506 call _bsaes_decrypt8 1507 movdqu @XMM[0], 0x00($out) # write output 1508 movdqu @XMM[1], 0x10($out) 1509 jmp .Lecb_dec_done 1510 .align 16 1511 .Lecb_dec_one: 1512 call _bsaes_decrypt8 1513 movdqu @XMM[0], 0x00($out) # write output 1514 jmp .Lecb_dec_done 1515 .align 16 1516 .Lecb_dec_short: 1517 lea ($inp), $arg1 1518 lea ($out), $arg2 1519 lea ($key), $arg3 1520 call asm_AES_decrypt 1521 lea 16($inp), $inp 1522 lea 16($out), $out 1523 dec $len 1524 jnz .Lecb_dec_short 1525 1526 .Lecb_dec_done: 1527 lea (%rsp),%rax 1528 pxor %xmm0, %xmm0 1529 .Lecb_dec_bzero: # wipe key schedule [if any] 1530 movdqa %xmm0, 0x00(%rax) 1531 movdqa %xmm0, 0x10(%rax) 1532 lea 0x20(%rax), %rax 1533 cmp %rax, %rbp 1534 jb .Lecb_dec_bzero 1535 1536 lea (%rbp),%rsp # restore %rsp 1537 ___ 1538 $code.=<<___ if ($win64); 1539 movaps 0x40(%rbp), %xmm6 1540 movaps 0x50(%rbp), %xmm7 1541 movaps 0x60(%rbp), %xmm8 1542 movaps 0x70(%rbp), %xmm9 1543 movaps 0x80(%rbp), %xmm10 1544 movaps 0x90(%rbp), %xmm11 1545 movaps 0xa0(%rbp), %xmm12 1546 movaps 
0xb0(%rbp), %xmm13 1547 movaps 0xc0(%rbp), %xmm14 1548 movaps 0xd0(%rbp), %xmm15 1549 lea 0xa0(%rbp), %rsp 1550 ___ 1551 $code.=<<___; 1552 mov 0x48(%rsp), %r15 1553 mov 0x50(%rsp), %r14 1554 mov 0x58(%rsp), %r13 1555 mov 0x60(%rsp), %r12 1556 mov 0x68(%rsp), %rbx 1557 mov 0x70(%rsp), %rax 1558 lea 0x78(%rsp), %rsp 1559 mov %rax, %rbp 1560 .Lecb_dec_epilogue: 1561 ret 1562 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks 1563 ___ 1564 } 1565 $code.=<<___; 1566 .extern asm_AES_cbc_encrypt 1567 .globl bsaes_cbc_encrypt 1568 .type bsaes_cbc_encrypt,\@abi-omnipotent 1569 .align 16 1570 bsaes_cbc_encrypt: 1571 ___ 1572 $code.=<<___ if ($win64); 1573 mov 48(%rsp),$arg6 # pull direction flag 1574 ___ 1575 $code.=<<___; 1576 cmp \$0,$arg6 1577 jne asm_AES_cbc_encrypt 1578 cmp \$128,$arg3 1579 jb asm_AES_cbc_encrypt 1580 1581 mov %rsp, %rax 1582 .Lcbc_dec_prologue: 1583 push %rbp 1584 push %rbx 1585 push %r12 1586 push %r13 1587 push %r14 1588 push %r15 1589 lea -0x48(%rsp), %rsp 1590 ___ 1591 $code.=<<___ if ($win64); 1592 mov 0xa0(%rsp),$arg5 # pull ivp 1593 lea -0xa0(%rsp), %rsp 1594 movaps %xmm6, 0x40(%rsp) 1595 movaps %xmm7, 0x50(%rsp) 1596 movaps %xmm8, 0x60(%rsp) 1597 movaps %xmm9, 0x70(%rsp) 1598 movaps %xmm10, 0x80(%rsp) 1599 movaps %xmm11, 0x90(%rsp) 1600 movaps %xmm12, 0xa0(%rsp) 1601 movaps %xmm13, 0xb0(%rsp) 1602 movaps %xmm14, 0xc0(%rsp) 1603 movaps %xmm15, 0xd0(%rsp) 1604 .Lcbc_dec_body: 1605 ___ 1606 $code.=<<___; 1607 mov %rsp, %rbp # backup %rsp 1608 mov 240($arg4), %eax # rounds 1609 mov $arg1, $inp # backup arguments 1610 mov $arg2, $out 1611 mov $arg3, $len 1612 mov $arg4, $key 1613 mov $arg5, %rbx 1614 shr \$4, $len # bytes to blocks 1615 1616 mov %eax, %edx # rounds 1617 shl \$7, %rax # 128 bytes per inner round key 1618 sub \$`128-32`, %rax # size of bit-sliced key schedule 1619 sub %rax, %rsp 1620 1621 mov %rsp, %rax # pass key schedule 1622 mov $key, %rcx # pass key 1623 mov %edx, %r10d # pass rounds 1624 call _bsaes_key_convert 1625 pxor (%rsp),%xmm7 # fix up 0 round key 1626 movdqa %xmm6,(%rax) # save last round key 1627 movdqa %xmm7,(%rsp) 1628 1629 movdqu (%rbx), @XMM[15] # load IV 1630 sub \$8,$len 1631 .Lcbc_dec_loop: 1632 movdqu 0x00($inp), @XMM[0] # load input 1633 movdqu 0x10($inp), @XMM[1] 1634 movdqu 0x20($inp), @XMM[2] 1635 movdqu 0x30($inp), @XMM[3] 1636 movdqu 0x40($inp), @XMM[4] 1637 movdqu 0x50($inp), @XMM[5] 1638 mov %rsp, %rax # pass key schedule 1639 movdqu 0x60($inp), @XMM[6] 1640 mov %edx,%r10d # pass rounds 1641 movdqu 0x70($inp), @XMM[7] 1642 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1643 1644 call _bsaes_decrypt8 1645 1646 pxor 0x20(%rbp), @XMM[0] # ^= IV 1647 movdqu 0x00($inp), @XMM[8] # re-load input 1648 movdqu 0x10($inp), @XMM[9] 1649 pxor @XMM[8], @XMM[1] 1650 movdqu 0x20($inp), @XMM[10] 1651 pxor @XMM[9], @XMM[6] 1652 movdqu 0x30($inp), @XMM[11] 1653 pxor @XMM[10], @XMM[4] 1654 movdqu 0x40($inp), @XMM[12] 1655 pxor @XMM[11], @XMM[2] 1656 movdqu 0x50($inp), @XMM[13] 1657 pxor @XMM[12], @XMM[7] 1658 movdqu 0x60($inp), @XMM[14] 1659 pxor @XMM[13], @XMM[3] 1660 movdqu 0x70($inp), @XMM[15] # IV 1661 pxor @XMM[14], @XMM[5] 1662 movdqu @XMM[0], 0x00($out) # write output 1663 lea 0x80($inp), $inp 1664 movdqu @XMM[1], 0x10($out) 1665 movdqu @XMM[6], 0x20($out) 1666 movdqu @XMM[4], 0x30($out) 1667 movdqu @XMM[2], 0x40($out) 1668 movdqu @XMM[7], 0x50($out) 1669 movdqu @XMM[3], 0x60($out) 1670 movdqu @XMM[5], 0x70($out) 1671 lea 0x80($out), $out 1672 sub \$8,$len 1673 jnc .Lcbc_dec_loop 1674 1675 add \$8,$len 1676 jz .Lcbc_dec_done 1677 1678 
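# Tail handling: fewer than eight blocks are left, so the code dispatches
# to the .Lcbc_dec_one..six cases below. CBC decryption XORs each
# plaintext with the previous ciphertext block, hence the ciphertext is
# re-loaded after _bsaes_decrypt8 (which clobbers the input registers)
# and the last ciphertext block is kept to become the IV returned at
# .Lcbc_dec_done.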
movdqu 0x00($inp), @XMM[0] # load input 1679 mov %rsp, %rax # pass key schedule 1680 mov %edx, %r10d # pass rounds 1681 cmp \$2,$len 1682 jb .Lcbc_dec_one 1683 movdqu 0x10($inp), @XMM[1] 1684 je .Lcbc_dec_two 1685 movdqu 0x20($inp), @XMM[2] 1686 cmp \$4,$len 1687 jb .Lcbc_dec_three 1688 movdqu 0x30($inp), @XMM[3] 1689 je .Lcbc_dec_four 1690 movdqu 0x40($inp), @XMM[4] 1691 cmp \$6,$len 1692 jb .Lcbc_dec_five 1693 movdqu 0x50($inp), @XMM[5] 1694 je .Lcbc_dec_six 1695 movdqu 0x60($inp), @XMM[6] 1696 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1697 call _bsaes_decrypt8 1698 pxor 0x20(%rbp), @XMM[0] # ^= IV 1699 movdqu 0x00($inp), @XMM[8] # re-load input 1700 movdqu 0x10($inp), @XMM[9] 1701 pxor @XMM[8], @XMM[1] 1702 movdqu 0x20($inp), @XMM[10] 1703 pxor @XMM[9], @XMM[6] 1704 movdqu 0x30($inp), @XMM[11] 1705 pxor @XMM[10], @XMM[4] 1706 movdqu 0x40($inp), @XMM[12] 1707 pxor @XMM[11], @XMM[2] 1708 movdqu 0x50($inp), @XMM[13] 1709 pxor @XMM[12], @XMM[7] 1710 movdqu 0x60($inp), @XMM[15] # IV 1711 pxor @XMM[13], @XMM[3] 1712 movdqu @XMM[0], 0x00($out) # write output 1713 movdqu @XMM[1], 0x10($out) 1714 movdqu @XMM[6], 0x20($out) 1715 movdqu @XMM[4], 0x30($out) 1716 movdqu @XMM[2], 0x40($out) 1717 movdqu @XMM[7], 0x50($out) 1718 movdqu @XMM[3], 0x60($out) 1719 jmp .Lcbc_dec_done 1720 .align 16 1721 .Lcbc_dec_six: 1722 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1723 call _bsaes_decrypt8 1724 pxor 0x20(%rbp), @XMM[0] # ^= IV 1725 movdqu 0x00($inp), @XMM[8] # re-load input 1726 movdqu 0x10($inp), @XMM[9] 1727 pxor @XMM[8], @XMM[1] 1728 movdqu 0x20($inp), @XMM[10] 1729 pxor @XMM[9], @XMM[6] 1730 movdqu 0x30($inp), @XMM[11] 1731 pxor @XMM[10], @XMM[4] 1732 movdqu 0x40($inp), @XMM[12] 1733 pxor @XMM[11], @XMM[2] 1734 movdqu 0x50($inp), @XMM[15] # IV 1735 pxor @XMM[12], @XMM[7] 1736 movdqu @XMM[0], 0x00($out) # write output 1737 movdqu @XMM[1], 0x10($out) 1738 movdqu @XMM[6], 0x20($out) 1739 movdqu @XMM[4], 0x30($out) 1740 movdqu @XMM[2], 0x40($out) 1741 movdqu @XMM[7], 0x50($out) 1742 jmp .Lcbc_dec_done 1743 .align 16 1744 .Lcbc_dec_five: 1745 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1746 call _bsaes_decrypt8 1747 pxor 0x20(%rbp), @XMM[0] # ^= IV 1748 movdqu 0x00($inp), @XMM[8] # re-load input 1749 movdqu 0x10($inp), @XMM[9] 1750 pxor @XMM[8], @XMM[1] 1751 movdqu 0x20($inp), @XMM[10] 1752 pxor @XMM[9], @XMM[6] 1753 movdqu 0x30($inp), @XMM[11] 1754 pxor @XMM[10], @XMM[4] 1755 movdqu 0x40($inp), @XMM[15] # IV 1756 pxor @XMM[11], @XMM[2] 1757 movdqu @XMM[0], 0x00($out) # write output 1758 movdqu @XMM[1], 0x10($out) 1759 movdqu @XMM[6], 0x20($out) 1760 movdqu @XMM[4], 0x30($out) 1761 movdqu @XMM[2], 0x40($out) 1762 jmp .Lcbc_dec_done 1763 .align 16 1764 .Lcbc_dec_four: 1765 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1766 call _bsaes_decrypt8 1767 pxor 0x20(%rbp), @XMM[0] # ^= IV 1768 movdqu 0x00($inp), @XMM[8] # re-load input 1769 movdqu 0x10($inp), @XMM[9] 1770 pxor @XMM[8], @XMM[1] 1771 movdqu 0x20($inp), @XMM[10] 1772 pxor @XMM[9], @XMM[6] 1773 movdqu 0x30($inp), @XMM[15] # IV 1774 pxor @XMM[10], @XMM[4] 1775 movdqu @XMM[0], 0x00($out) # write output 1776 movdqu @XMM[1], 0x10($out) 1777 movdqu @XMM[6], 0x20($out) 1778 movdqu @XMM[4], 0x30($out) 1779 jmp .Lcbc_dec_done 1780 .align 16 1781 .Lcbc_dec_three: 1782 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1783 call _bsaes_decrypt8 1784 pxor 0x20(%rbp), @XMM[0] # ^= IV 1785 movdqu 0x00($inp), @XMM[8] # re-load input 1786 movdqu 0x10($inp), @XMM[9] 1787 pxor @XMM[8], @XMM[1] 1788 movdqu 0x20($inp), @XMM[15] # IV 1789 pxor @XMM[9], @XMM[6] 1790 movdqu @XMM[0], 
0x00($out) # write output 1791 movdqu @XMM[1], 0x10($out) 1792 movdqu @XMM[6], 0x20($out) 1793 jmp .Lcbc_dec_done 1794 .align 16 1795 .Lcbc_dec_two: 1796 movdqa @XMM[15], 0x20(%rbp) # put aside IV 1797 call _bsaes_decrypt8 1798 pxor 0x20(%rbp), @XMM[0] # ^= IV 1799 movdqu 0x00($inp), @XMM[8] # re-load input 1800 movdqu 0x10($inp), @XMM[15] # IV 1801 pxor @XMM[8], @XMM[1] 1802 movdqu @XMM[0], 0x00($out) # write output 1803 movdqu @XMM[1], 0x10($out) 1804 jmp .Lcbc_dec_done 1805 .align 16 1806 .Lcbc_dec_one: 1807 lea ($inp), $arg1 1808 lea 0x20(%rbp), $arg2 # buffer output 1809 lea ($key), $arg3 1810 call asm_AES_decrypt # doesn't touch %xmm 1811 pxor 0x20(%rbp), @XMM[15] # ^= IV 1812 movdqu @XMM[15], ($out) # write output 1813 movdqa @XMM[0], @XMM[15] # IV 1814 1815 .Lcbc_dec_done: 1816 movdqu @XMM[15], (%rbx) # return IV 1817 lea (%rsp), %rax 1818 pxor %xmm0, %xmm0 1819 .Lcbc_dec_bzero: # wipe key schedule [if any] 1820 movdqa %xmm0, 0x00(%rax) 1821 movdqa %xmm0, 0x10(%rax) 1822 lea 0x20(%rax), %rax 1823 cmp %rax, %rbp 1824 ja .Lcbc_dec_bzero 1825 1826 lea (%rbp),%rsp # restore %rsp 1827 ___ 1828 $code.=<<___ if ($win64); 1829 movaps 0x40(%rbp), %xmm6 1830 movaps 0x50(%rbp), %xmm7 1831 movaps 0x60(%rbp), %xmm8 1832 movaps 0x70(%rbp), %xmm9 1833 movaps 0x80(%rbp), %xmm10 1834 movaps 0x90(%rbp), %xmm11 1835 movaps 0xa0(%rbp), %xmm12 1836 movaps 0xb0(%rbp), %xmm13 1837 movaps 0xc0(%rbp), %xmm14 1838 movaps 0xd0(%rbp), %xmm15 1839 lea 0xa0(%rbp), %rsp 1840 ___ 1841 $code.=<<___; 1842 mov 0x48(%rsp), %r15 1843 mov 0x50(%rsp), %r14 1844 mov 0x58(%rsp), %r13 1845 mov 0x60(%rsp), %r12 1846 mov 0x68(%rsp), %rbx 1847 mov 0x70(%rsp), %rax 1848 lea 0x78(%rsp), %rsp 1849 mov %rax, %rbp 1850 .Lcbc_dec_epilogue: 1851 ret 1852 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt 1853 1854 .globl bsaes_ctr32_encrypt_blocks 1855 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent 1856 .align 16 1857 bsaes_ctr32_encrypt_blocks: 1858 mov %rsp, %rax 1859 .Lctr_enc_prologue: 1860 push %rbp 1861 push %rbx 1862 push %r12 1863 push %r13 1864 push %r14 1865 push %r15 1866 lea -0x48(%rsp), %rsp 1867 ___ 1868 $code.=<<___ if ($win64); 1869 mov 0xa0(%rsp),$arg5 # pull ivp 1870 lea -0xa0(%rsp), %rsp 1871 movaps %xmm6, 0x40(%rsp) 1872 movaps %xmm7, 0x50(%rsp) 1873 movaps %xmm8, 0x60(%rsp) 1874 movaps %xmm9, 0x70(%rsp) 1875 movaps %xmm10, 0x80(%rsp) 1876 movaps %xmm11, 0x90(%rsp) 1877 movaps %xmm12, 0xa0(%rsp) 1878 movaps %xmm13, 0xb0(%rsp) 1879 movaps %xmm14, 0xc0(%rsp) 1880 movaps %xmm15, 0xd0(%rsp) 1881 .Lctr_enc_body: 1882 ___ 1883 $code.=<<___; 1884 mov %rsp, %rbp # backup %rsp 1885 movdqu ($arg5), %xmm0 # load counter 1886 mov 240($arg4), %eax # rounds 1887 mov $arg1, $inp # backup arguments 1888 mov $arg2, $out 1889 mov $arg3, $len 1890 mov $arg4, $key 1891 movdqa %xmm0, 0x20(%rbp) # copy counter 1892 cmp \$8, $arg3 1893 jb .Lctr_enc_short 1894 1895 mov %eax, %ebx # rounds 1896 shl \$7, %rax # 128 bytes per inner round key 1897 sub \$`128-32`, %rax # size of bit-sliced key schedule 1898 sub %rax, %rsp 1899 1900 mov %rsp, %rax # pass key schedule 1901 mov $key, %rcx # pass key 1902 mov %ebx, %r10d # pass rounds 1903 call _bsaes_key_convert 1904 pxor %xmm6,%xmm7 # fix up last round key 1905 movdqa %xmm7,(%rax) # save last round key 1906 1907 movdqa (%rsp), @XMM[9] # load round0 key 1908 lea .LADD1(%rip), %r11 1909 movdqa 0x20(%rbp), @XMM[0] # counter copy 1910 movdqa -0x20(%r11), @XMM[8] # .LSWPUP 1911 pshufb @XMM[8], @XMM[9] # byte swap upper part 1912 pshufb @XMM[8], @XMM[0] 1913 movdqa @XMM[9], (%rsp) # save adjusted round0 
key 1914 jmp .Lctr_enc_loop 1915 .align 16 1916 .Lctr_enc_loop: 1917 movdqa @XMM[0], 0x20(%rbp) # save counter 1918 movdqa @XMM[0], @XMM[1] # prepare 8 counter values 1919 movdqa @XMM[0], @XMM[2] 1920 paddd 0x00(%r11), @XMM[1] # .LADD1 1921 movdqa @XMM[0], @XMM[3] 1922 paddd 0x10(%r11), @XMM[2] # .LADD2 1923 movdqa @XMM[0], @XMM[4] 1924 paddd 0x20(%r11), @XMM[3] # .LADD3 1925 movdqa @XMM[0], @XMM[5] 1926 paddd 0x30(%r11), @XMM[4] # .LADD4 1927 movdqa @XMM[0], @XMM[6] 1928 paddd 0x40(%r11), @XMM[5] # .LADD5 1929 movdqa @XMM[0], @XMM[7] 1930 paddd 0x50(%r11), @XMM[6] # .LADD6 1931 paddd 0x60(%r11), @XMM[7] # .LADD7 1932 1933 # Borrow prologue from _bsaes_encrypt8 to use the opportunity 1934 # to flip byte order in 32-bit counter 1935 movdqa (%rsp), @XMM[9] # round 0 key 1936 lea 0x10(%rsp), %rax # pass key schedule 1937 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR 1938 pxor @XMM[9], @XMM[0] # xor with round0 key 1939 pxor @XMM[9], @XMM[1] 1940 pshufb @XMM[8], @XMM[0] 1941 pxor @XMM[9], @XMM[2] 1942 pshufb @XMM[8], @XMM[1] 1943 pxor @XMM[9], @XMM[3] 1944 pshufb @XMM[8], @XMM[2] 1945 pxor @XMM[9], @XMM[4] 1946 pshufb @XMM[8], @XMM[3] 1947 pxor @XMM[9], @XMM[5] 1948 pshufb @XMM[8], @XMM[4] 1949 pxor @XMM[9], @XMM[6] 1950 pshufb @XMM[8], @XMM[5] 1951 pxor @XMM[9], @XMM[7] 1952 pshufb @XMM[8], @XMM[6] 1953 lea .LBS0(%rip), %r11 # constants table 1954 pshufb @XMM[8], @XMM[7] 1955 mov %ebx,%r10d # pass rounds 1956 1957 call _bsaes_encrypt8_bitslice 1958 1959 sub \$8,$len 1960 jc .Lctr_enc_loop_done 1961 1962 movdqu 0x00($inp), @XMM[8] # load input 1963 movdqu 0x10($inp), @XMM[9] 1964 movdqu 0x20($inp), @XMM[10] 1965 movdqu 0x30($inp), @XMM[11] 1966 movdqu 0x40($inp), @XMM[12] 1967 movdqu 0x50($inp), @XMM[13] 1968 movdqu 0x60($inp), @XMM[14] 1969 movdqu 0x70($inp), @XMM[15] 1970 lea 0x80($inp),$inp 1971 pxor @XMM[0], @XMM[8] 1972 movdqa 0x20(%rbp), @XMM[0] # load counter 1973 pxor @XMM[9], @XMM[1] 1974 movdqu @XMM[8], 0x00($out) # write output 1975 pxor @XMM[10], @XMM[4] 1976 movdqu @XMM[1], 0x10($out) 1977 pxor @XMM[11], @XMM[6] 1978 movdqu @XMM[4], 0x20($out) 1979 pxor @XMM[12], @XMM[3] 1980 movdqu @XMM[6], 0x30($out) 1981 pxor @XMM[13], @XMM[7] 1982 movdqu @XMM[3], 0x40($out) 1983 pxor @XMM[14], @XMM[2] 1984 movdqu @XMM[7], 0x50($out) 1985 pxor @XMM[15], @XMM[5] 1986 movdqu @XMM[2], 0x60($out) 1987 lea .LADD1(%rip), %r11 1988 movdqu @XMM[5], 0x70($out) 1989 lea 0x80($out), $out 1990 paddd 0x70(%r11), @XMM[0] # .LADD8 1991 jnz .Lctr_enc_loop 1992 1993 jmp .Lctr_enc_done 1994 .align 16 1995 .Lctr_enc_loop_done: 1996 add \$8, $len 1997 movdqu 0x00($inp), @XMM[8] # load input 1998 pxor @XMM[8], @XMM[0] 1999 movdqu @XMM[0], 0x00($out) # write output 2000 cmp \$2,$len 2001 jb .Lctr_enc_done 2002 movdqu 0x10($inp), @XMM[9] 2003 pxor @XMM[9], @XMM[1] 2004 movdqu @XMM[1], 0x10($out) 2005 je .Lctr_enc_done 2006 movdqu 0x20($inp), @XMM[10] 2007 pxor @XMM[10], @XMM[4] 2008 movdqu @XMM[4], 0x20($out) 2009 cmp \$4,$len 2010 jb .Lctr_enc_done 2011 movdqu 0x30($inp), @XMM[11] 2012 pxor @XMM[11], @XMM[6] 2013 movdqu @XMM[6], 0x30($out) 2014 je .Lctr_enc_done 2015 movdqu 0x40($inp), @XMM[12] 2016 pxor @XMM[12], @XMM[3] 2017 movdqu @XMM[3], 0x40($out) 2018 cmp \$6,$len 2019 jb .Lctr_enc_done 2020 movdqu 0x50($inp), @XMM[13] 2021 pxor @XMM[13], @XMM[7] 2022 movdqu @XMM[7], 0x50($out) 2023 je .Lctr_enc_done 2024 movdqu 0x60($inp), @XMM[14] 2025 pxor @XMM[14], @XMM[2] 2026 movdqu @XMM[2], 0x60($out) 2027 jmp .Lctr_enc_done 2028 2029 .align 16 2030 .Lctr_enc_short: 2031 lea 0x20(%rbp), $arg1 2032 lea 0x30(%rbp), $arg2 
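# Short path: with fewer than eight blocks left, each counter block
# (kept at 0x20(%rbp)) is encrypted one at a time with the non-bit-sliced
# asm_AES_encrypt into 0x30(%rbp), XORed into the input, and only the low
# 32 bits of the counter are incremented, in big-endian fashion, via the
# bswap/inc/bswap sequence below.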
2033 lea ($key), $arg3 2034 call asm_AES_encrypt 2035 movdqu ($inp), @XMM[1] 2036 lea 16($inp), $inp 2037 mov 0x2c(%rbp), %eax # load 32-bit counter 2038 bswap %eax 2039 pxor 0x30(%rbp), @XMM[1] 2040 inc %eax # increment 2041 movdqu @XMM[1], ($out) 2042 bswap %eax 2043 lea 16($out), $out 2044 mov %eax, 0x2c(%rsp) # save 32-bit counter 2045 dec $len 2046 jnz .Lctr_enc_short 2047 2048 .Lctr_enc_done: 2049 lea (%rsp), %rax 2050 pxor %xmm0, %xmm0 2051 .Lctr_enc_bzero: # wipe key schedule [if any] 2052 movdqa %xmm0, 0x00(%rax) 2053 movdqa %xmm0, 0x10(%rax) 2054 lea 0x20(%rax), %rax 2055 cmp %rax, %rbp 2056 ja .Lctr_enc_bzero 2057 2058 lea (%rbp),%rsp # restore %rsp 2059 ___ 2060 $code.=<<___ if ($win64); 2061 movaps 0x40(%rbp), %xmm6 2062 movaps 0x50(%rbp), %xmm7 2063 movaps 0x60(%rbp), %xmm8 2064 movaps 0x70(%rbp), %xmm9 2065 movaps 0x80(%rbp), %xmm10 2066 movaps 0x90(%rbp), %xmm11 2067 movaps 0xa0(%rbp), %xmm12 2068 movaps 0xb0(%rbp), %xmm13 2069 movaps 0xc0(%rbp), %xmm14 2070 movaps 0xd0(%rbp), %xmm15 2071 lea 0xa0(%rbp), %rsp 2072 ___ 2073 $code.=<<___; 2074 mov 0x48(%rsp), %r15 2075 mov 0x50(%rsp), %r14 2076 mov 0x58(%rsp), %r13 2077 mov 0x60(%rsp), %r12 2078 mov 0x68(%rsp), %rbx 2079 mov 0x70(%rsp), %rax 2080 lea 0x78(%rsp), %rsp 2081 mov %rax, %rbp 2082 .Lctr_enc_epilogue: 2083 ret 2084 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks 2085 ___ 2086 ###################################################################### 2087 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, 2088 # const AES_KEY *key1, const AES_KEY *key2, 2089 # const unsigned char iv[16]); 2090 # 2091 my ($twmask,$twres,$twtmp)=@XMM[13..15]; 2092 $arg6=~s/d$//; 2093 2094 $code.=<<___; 2095 .globl bsaes_xts_encrypt 2096 .type bsaes_xts_encrypt,\@abi-omnipotent 2097 .align 16 2098 bsaes_xts_encrypt: 2099 mov %rsp, %rax 2100 .Lxts_enc_prologue: 2101 push %rbp 2102 push %rbx 2103 push %r12 2104 push %r13 2105 push %r14 2106 push %r15 2107 lea -0x48(%rsp), %rsp 2108 ___ 2109 $code.=<<___ if ($win64); 2110 mov 0xa0(%rsp),$arg5 # pull key2 2111 mov 0xa8(%rsp),$arg6 # pull ivp 2112 lea -0xa0(%rsp), %rsp 2113 movaps %xmm6, 0x40(%rsp) 2114 movaps %xmm7, 0x50(%rsp) 2115 movaps %xmm8, 0x60(%rsp) 2116 movaps %xmm9, 0x70(%rsp) 2117 movaps %xmm10, 0x80(%rsp) 2118 movaps %xmm11, 0x90(%rsp) 2119 movaps %xmm12, 0xa0(%rsp) 2120 movaps %xmm13, 0xb0(%rsp) 2121 movaps %xmm14, 0xc0(%rsp) 2122 movaps %xmm15, 0xd0(%rsp) 2123 .Lxts_enc_body: 2124 ___ 2125 $code.=<<___; 2126 mov %rsp, %rbp # backup %rsp 2127 mov $arg1, $inp # backup arguments 2128 mov $arg2, $out 2129 mov $arg3, $len 2130 mov $arg4, $key 2131 2132 lea ($arg6), $arg1 2133 lea 0x20(%rbp), $arg2 2134 lea ($arg5), $arg3 2135 call asm_AES_encrypt # generate initial tweak 2136 2137 mov 240($key), %eax # rounds 2138 mov $len, %rbx # backup $len 2139 2140 mov %eax, %edx # rounds 2141 shl \$7, %rax # 128 bytes per inner round key 2142 sub \$`128-32`, %rax # size of bit-sliced key schedule 2143 sub %rax, %rsp 2144 2145 mov %rsp, %rax # pass key schedule 2146 mov $key, %rcx # pass key 2147 mov %edx, %r10d # pass rounds 2148 call _bsaes_key_convert 2149 pxor %xmm6, %xmm7 # fix up last round key 2150 movdqa %xmm7, (%rax) # save last round key 2151 2152 and \$-16, $len 2153 sub \$0x80, %rsp # place for tweak[8] 2154 movdqa 0x20(%rbp), @XMM[7] # initial tweak 2155 2156 pxor $twtmp, $twtmp 2157 movdqa .Lxts_magic(%rip), $twmask 2158 pcmpgtd @XMM[7], $twtmp # broadcast upper bits 2159 2160 sub \$0x80, $len 2161 jc .Lxts_enc_short 2162 jmp .Lxts_enc_loop 2163 2164 
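# In the loop below the tweak for the next block is the current one
# multiplied by x in GF(2^128) modulo x^128+x^7+x^2+x+1: paddq doubles
# both 64-bit halves, pcmpgtd broadcasts the top bit of each half before
# the doubling, and pshufd 0x13 plus the .Lxts_magic mask turn the top
# bit of the high half into the 0x87 reduction constant and carry the
# top bit of the low half into bit 64.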
.align	16
.Lxts_enc_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80, $len
	jnc	.Lxts_enc_loop

.Lxts_enc_short:
	add	\$0x80, $len
	jz	.Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`, $len
	je	.Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	0x60(%rsp), @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	0x40(%rsp), @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_encrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_enc_done
.align	16
.Lxts_enc_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_encrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

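# .Lxts_enc_done below handles an input length that is not a multiple
# of 16 with standard XTS ciphertext stealing: the tail bytes of
# plaintext are swapped into the front of the last full ciphertext
# block, the stolen ciphertext bytes become the final partial output
# block, and the rebuilt block is encrypted once more with the next
# tweak prepared above. Byte-wise, with p pointing just past the last
# full output block [reference sketch only]:
#
#	for (i = 0; i < tail; i++) {
#	    c = p[i - 16];	/* byte of last full ciphertext block */
#	    p[i - 16] = in[i];	/* splice in the remaining plaintext  */
#	    p[i] = c;		/* emit stolen byte as partial block  */
#	}
#	/* ...then the block at p-16 is XTS-encrypted again */
#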
.Lxts_enc_done:
	and	\$15, %ebx
	jz	.Lxts_enc_ret
	mov	$out, %rdx

.Lxts_enc_steal:
	movzb	($inp), %eax
	movzb	-16(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, -16(%rdx)
	mov	%cl, 0(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1, %ebx
	jnz	.Lxts_enc_steal

	movdqu	-16($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_encrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	movdqu	@XMM[7], -16($out)

.Lxts_enc_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_enc_bzero

	lea	(%rbp), %rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_enc_epilogue:
	ret
.size	bsaes_xts_encrypt,.-bsaes_xts_encrypt

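# bsaes_xts_decrypt below mirrors the encrypt path; the structural
# differences are the _bsaes_decrypt8/asm_AES_decrypt calls, the
# round-0 fix-up of the converted key schedule, and the tail handling:
# when the length is not a multiple of 16 the last full block is
# withheld from the bulk loops and decrypted with the following tweak
# first, so that the stolen partial block can then be decrypted with
# the preceding tweak.
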
.globl	bsaes_xts_decrypt
.type	bsaes_xts_decrypt,\@abi-omnipotent
.align	16
bsaes_xts_decrypt:
	mov	%rsp, %rax
.Lxts_dec_prologue:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp), $arg5	# pull key2
	mov	0xa8(%rsp), $arg6	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key

	lea	($arg6), $arg1
	lea	0x20(%rbp), $arg2
	lea	($arg5), $arg3
	call	asm_AES_encrypt		# generate initial tweak

	mov	240($key), %eax		# rounds
	mov	$len, %rbx		# backup $len

	mov	%eax, %edx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp

	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%edx, %r10d		# pass rounds
	call	_bsaes_key_convert
	pxor	(%rsp), %xmm7		# fix up round 0 key
	movdqa	%xmm6, (%rax)		# save last round key
	movdqa	%xmm7, (%rsp)

	xor	%eax, %eax		# if ($len%16) len-=16;
	and	\$-16, $len
	test	\$15, %ebx
	setnz	%al
	shl	\$4, %rax
	sub	%rax, $len

	sub	\$0x80, %rsp		# place for tweak[8]
	movdqa	0x20(%rbp), @XMM[7]	# initial tweak

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits

	sub	\$0x80, $len
	jc	.Lxts_dec_short
	jmp	.Lxts_dec_loop

.align	16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqu	0x70($inp), @XMM[8+7]
	lea	0x80($inp), $inp
	movdqa	@XMM[7], 0x70(%rsp)
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	pxor	@XMM[8+7], @XMM[7]
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	pxor	0x70(%rsp), @XMM[5]
	movdqu	@XMM[3], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]

	sub	\$0x80, $len
	jnc	.Lxts_dec_loop

.Lxts_dec_short:
	add	\$0x80, $len
	jz	.Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd	\$0x13, $twtmp, $twres
	pxor	$twtmp, $twtmp
	movdqa	@XMM[7], @XMM[$i]
	movdqa	@XMM[7], `0x10*$i`(%rsp)	# save tweak[$i]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
	pxor	$twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp	\$`0x10*$i`, $len
	je	.Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor	@XMM[8+$i-2], @XMM[$i-2]	# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu	0x60($inp), @XMM[8+6]
	pxor	@XMM[8+5], @XMM[5]
	movdqa	@XMM[7], 0x70(%rsp)
	lea	0x70($inp), $inp
	pxor	@XMM[8+6], @XMM[6]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	pxor	0x60(%rsp), @XMM[3]
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[3], 0x60($out)
	lea	0x70($out), $out

	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_6:
	pxor	@XMM[8+4], @XMM[4]
	lea	0x60($inp), $inp
	pxor	@XMM[8+5], @XMM[5]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	pxor	0x50(%rsp), @XMM[7]
	movdqu	@XMM[2], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	lea	0x60($out), $out

	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_5:
	pxor	@XMM[8+3], @XMM[3]
	lea	0x50($inp), $inp
	pxor	@XMM[8+4], @XMM[4]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	pxor	0x40(%rsp), @XMM[2]
	movdqu	@XMM[4], 0x30($out)
	movdqu	@XMM[2], 0x40($out)
	lea	0x50($out), $out

	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_4:
	pxor	@XMM[8+2], @XMM[2]
	lea	0x40($inp), $inp
	pxor	@XMM[8+3], @XMM[3]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	pxor	0x30(%rsp), @XMM[4]
	movdqu	@XMM[6], 0x20($out)
	movdqu	@XMM[4], 0x30($out)
	lea	0x40($out), $out

	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_3:
	pxor	@XMM[8+1], @XMM[1]
	lea	0x30($inp), $inp
	pxor	@XMM[8+2], @XMM[2]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	pxor	0x20(%rsp), @XMM[6]
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[6], 0x20($out)
	lea	0x30($out), $out

	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_2:
	pxor	@XMM[8+0], @XMM[0]
	lea	0x20($inp), $inp
	pxor	@XMM[8+1], @XMM[1]
	lea	0x80(%rsp), %rax	# pass key schedule
	mov	%edx, %r10d		# pass rounds

	call	_bsaes_decrypt8

	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	pxor	0x10(%rsp), @XMM[1]
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	lea	0x20($out), $out

	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
	jmp	.Lxts_dec_done
.align	16
.Lxts_dec_1:
	pxor	@XMM[0], @XMM[8]
	lea	0x10($inp), $inp
	movdqa	@XMM[8], 0x20(%rbp)
	lea	0x20(%rbp), $arg1
	lea	0x20(%rbp), $arg2
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
	#pxor	@XMM[8], @XMM[0]
	#lea	0x80(%rsp), %rax	# pass key schedule
	#mov	%edx, %r10d		# pass rounds
	#call	_bsaes_decrypt8
	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
	movdqu	@XMM[0], 0x00($out)	# write output
	lea	0x10($out), $out

	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak

.Lxts_dec_done:
	and	\$15, %ebx
	jz	.Lxts_dec_ret

	pxor	$twtmp, $twtmp
	movdqa	.Lxts_magic(%rip), $twmask
	pcmpgtd	@XMM[7], $twtmp
	pshufd	\$0x13, $twtmp, $twres
	movdqa	@XMM[7], @XMM[6]
	paddq	@XMM[7], @XMM[7]	# psllq 1,$tweak
	pand	$twmask, $twres		# isolate carry and residue
	movdqu	($inp), @XMM[0]
	pxor	$twres, @XMM[7]

	lea	0x20(%rbp), $arg1
	pxor	@XMM[7], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[7]
	mov	$out, %rdx
	movdqu	@XMM[7], ($out)

.Lxts_dec_steal:
	movzb	16($inp), %eax
	movzb	(%rdx), %ecx
	lea	1($inp), $inp
	mov	%al, (%rdx)
	mov	%cl, 16(%rdx)
	lea	1(%rdx), %rdx
	sub	\$1, %ebx
	jnz	.Lxts_dec_steal

	movdqu	($out), @XMM[0]
	lea	0x20(%rbp), $arg1
	pxor	@XMM[6], @XMM[0]
	lea	0x20(%rbp), $arg2
	movdqa	@XMM[0], 0x20(%rbp)
	lea	($key), $arg3
	call	asm_AES_decrypt		# doesn't touch %xmm
	pxor	0x20(%rbp), @XMM[6]
	movdqu	@XMM[6], ($out)

.Lxts_dec_ret:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lxts_dec_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lxts_dec_bzero

	lea	(%rbp), %rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rax
	lea	0x78(%rsp), %rsp
	mov	%rax, %rbp
.Lxts_dec_epilogue:
	ret
.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
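
######################################################################
# Typical use of the XTS entry points, as a hedged C sketch [names,
# buffers and key sizes are illustrative only; per the prototype above,
# key1 keys the data blocks, key2 keys the tweak, and iv carries the
# 16-byte tweak value, e.g. the sector number]:
#
#	AES_KEY data_key, tweak_key;
#
#	AES_set_encrypt_key(k2, 256, &tweak_key);  /* key2 always encrypts the tweak */
#
#	AES_set_encrypt_key(k1, 256, &data_key);
#	bsaes_xts_encrypt(in, out, len, &data_key, &tweak_key, iv);
#
#	AES_set_decrypt_key(k1, 256, &data_key);   /* decrypt schedule for key1 */
#	bsaes_xts_decrypt(out, back, len, &data_key, &tweak_key, iv);
#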
$code.=<<___;
.type	_bsaes_const,\@object
.align	64
_bsaes_const:
.LM0ISR:	# InvShiftRows constants
	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0:		# bit-slice constants
	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:		# shiftrows constants
	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP:	# byte-swap upper dword
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:		# counter increment constants
	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad	0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long	0x87,0,1,0
.Lmasks:
	.quad	0x0101010101010101, 0x0101010101010101
	.quad	0x0202020202020202, 0x0202020202020202
	.quad	0x0404040404040404, 0x0404040404040404
	.quad	0x0808080808080808, 0x0808080808080808
.LM0:
	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad	0x6363636363636363, 0x6363636363636363
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align	64
.size	_bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	160($context),%rax	# pull context->Rbp

	lea	0x40(%rax),%rsi		# %xmm save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq
	lea	0xa0(%rax),%rax		# adjust stack pointer

	mov	0x70(%rax),%rbp
	mov	0x68(%rax),%rbx
	mov	0x60(%rax),%r12
	mov	0x58(%rax),%r13
	mov	0x50(%rax),%r14
	mov	0x48(%rax),%r15
	lea	0x78(%rax),%rax		# adjust stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	%rax,152($context)	# restore context->Rsp

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
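
# What follows registers the [prologue,epilogue) range of every exported
# routine in .pdata, and the matching .xdata records point each range at
# se_handler together with the body/epilogue label pair that se_handler
# reads back as HandlerData[] above.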

.section	.pdata
.align	4
___
$code.=<<___ if ($ecb);
	.rva	.Lecb_enc_prologue
	.rva	.Lecb_enc_epilogue
	.rva	.Lecb_enc_info

	.rva	.Lecb_dec_prologue
	.rva	.Lecb_dec_epilogue
	.rva	.Lecb_dec_info
___
$code.=<<___;
	.rva	.Lcbc_dec_prologue
	.rva	.Lcbc_dec_epilogue
	.rva	.Lcbc_dec_info

	.rva	.Lctr_enc_prologue
	.rva	.Lctr_enc_epilogue
	.rva	.Lctr_enc_info

	.rva	.Lxts_enc_prologue
	.rva	.Lxts_enc_epilogue
	.rva	.Lxts_enc_info

	.rva	.Lxts_dec_prologue
	.rva	.Lxts_dec_epilogue
	.rva	.Lxts_dec_info

.section	.xdata
.align	8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
.Lecb_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
.Lctr_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
.Lxts_enc_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
.Lxts_dec_info:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;