1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5 # project. The module is, however, dual licensed under OpenSSL and 6 # CRYPTOGAMS licenses depending on where you obtain it. For further 7 # details see http://www.openssl.org/~appro/cryptogams/. 8 # ==================================================================== 9 # 10 # This module implements support for Intel AES-NI extension. In 11 # OpenSSL context it's used with Intel engine, but can also be used as 12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for 13 # details]. 14 # 15 # Performance. 16 # 17 # To start with see corresponding paragraph in aesni-x86_64.pl... 18 # Instead of filling table similar to one found there I've chosen to 19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks. 20 # The simplified table below represents 32-bit performance relative 21 # to 64-bit one in every given point. Ratios vary for different 22 # encryption modes, therefore interval values. 23 # 24 # 16-byte 64-byte 256-byte 1-KB 8-KB 25 # 53-67% 67-84% 91-94% 95-98% 97-99.5% 26 # 27 # Lower ratios for smaller block sizes are perfectly understandable, 28 # because function call overhead is higher in 32-bit mode. Largest 29 # 8-KB block performance is virtually same: 32-bit code is less than 30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. 31 32 # January 2011 33 # 34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module 35 # interleaves at most 6 aes[enc|dec] instructions, because there are 36 # not enough registers for 8x interleave [which should be optimal for 37 # Sandy Bridge]. Actually, performance results for 6x interleave 38 # factor presented in aesni-x86_64.pl (except for CTR) are for this 39 # module. 40 41 # April 2011 42 # 43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing 44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. 45 46 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script 47 # generates drop-in replacement for 48 # crypto/aes/asm/aes-586.pl:-) 49 $inline=1; # inline _aesni_[en|de]crypt 50 51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 52 push(@INC,"${dir}","${dir}../../perlasm"); 53 require "x86asm.pl"; 54 55 &asm_init($ARGV[0],$0); 56 57 if ($PREFIX eq "aesni") { $movekey=*movups; } 58 else { $movekey=*movups; } 59 60 $len="eax"; 61 $rounds="ecx"; 62 $key="edx"; 63 $inp="esi"; 64 $out="edi"; 65 $rounds_="ebx"; # backup copy for $rounds 66 $key_="ebp"; # backup copy for $key 67 68 $rndkey0="xmm0"; 69 $rndkey1="xmm1"; 70 $inout0="xmm2"; 71 $inout1="xmm3"; 72 $inout2="xmm4"; 73 $inout3="xmm5"; $in1="xmm5"; 74 $inout4="xmm6"; $in0="xmm6"; 75 $inout5="xmm7"; $ivec="xmm7"; 76 77 # AESNI extenstion 78 sub aeskeygenassist 79 { my($dst,$src,$imm)=@_; 80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } 82 } 83 sub aescommon 84 { my($opcodelet,$dst,$src)=@_; 85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) 86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} 87 } 88 sub aesimc { aescommon(0xdb,@_); } 89 sub aesenc { aescommon(0xdc,@_); } 90 sub aesenclast { aescommon(0xdd,@_); } 91 sub aesdec { aescommon(0xde,@_); } 92 sub aesdeclast { aescommon(0xdf,@_); } 93 94 # Inline version of internal aesni_[en|de]crypt1 95 { my $sn; 96 sub aesni_inline_generate1 97 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); 98 $sn++; 99 100 &$movekey ($rndkey0,&QWP(0,$key)); 101 &$movekey ($rndkey1,&QWP(16,$key)); 102 &xorps ($ivec,$rndkey0) if (defined($ivec)); 103 &lea ($key,&DWP(32,$key)); 104 &xorps ($inout,$ivec) if (defined($ivec)); 105 &xorps ($inout,$rndkey0) if (!defined($ivec)); 106 &set_label("${p}1_loop_$sn"); 107 eval"&aes${p} ($inout,$rndkey1)"; 108 &dec ($rounds); 109 &$movekey ($rndkey1,&QWP(0,$key)); 110 &lea ($key,&DWP(16,$key)); 111 &jnz (&label("${p}1_loop_$sn")); 112 eval"&aes${p}last ($inout,$rndkey1)"; 113 }} 114 115 sub aesni_generate1 # fully unrolled loop 116 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); 117 118 &function_begin_B("_aesni_${p}rypt1"); 119 &movups ($rndkey0,&QWP(0,$key)); 120 &$movekey ($rndkey1,&QWP(0x10,$key)); 121 &xorps ($inout,$rndkey0); 122 &$movekey ($rndkey0,&QWP(0x20,$key)); 123 &lea ($key,&DWP(0x30,$key)); 124 &cmp ($rounds,11); 125 &jb (&label("${p}128")); 126 &lea ($key,&DWP(0x20,$key)); 127 &je (&label("${p}192")); 128 &lea ($key,&DWP(0x20,$key)); 129 eval"&aes${p} ($inout,$rndkey1)"; 130 &$movekey ($rndkey1,&QWP(-0x40,$key)); 131 eval"&aes${p} ($inout,$rndkey0)"; 132 &$movekey ($rndkey0,&QWP(-0x30,$key)); 133 &set_label("${p}192"); 134 eval"&aes${p} ($inout,$rndkey1)"; 135 &$movekey ($rndkey1,&QWP(-0x20,$key)); 136 eval"&aes${p} ($inout,$rndkey0)"; 137 &$movekey ($rndkey0,&QWP(-0x10,$key)); 138 &set_label("${p}128"); 139 eval"&aes${p} ($inout,$rndkey1)"; 140 &$movekey ($rndkey1,&QWP(0,$key)); 141 eval"&aes${p} ($inout,$rndkey0)"; 142 &$movekey ($rndkey0,&QWP(0x10,$key)); 143 eval"&aes${p} ($inout,$rndkey1)"; 144 &$movekey ($rndkey1,&QWP(0x20,$key)); 145 eval"&aes${p} ($inout,$rndkey0)"; 146 &$movekey ($rndkey0,&QWP(0x30,$key)); 147 eval"&aes${p} ($inout,$rndkey1)"; 148 &$movekey ($rndkey1,&QWP(0x40,$key)); 149 eval"&aes${p} ($inout,$rndkey0)"; 150 &$movekey ($rndkey0,&QWP(0x50,$key)); 151 eval"&aes${p} ($inout,$rndkey1)"; 152 &$movekey ($rndkey1,&QWP(0x60,$key)); 153 eval"&aes${p} ($inout,$rndkey0)"; 154 &$movekey ($rndkey0,&QWP(0x70,$key)); 155 eval"&aes${p} ($inout,$rndkey1)"; 156 eval"&aes${p}last ($inout,$rndkey0)"; 157 &ret(); 158 &function_end_B("_aesni_${p}rypt1"); 159 } 160 161 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); 162 &aesni_generate1("enc") if (!$inline); 163 &function_begin_B("${PREFIX}_encrypt"); 164 &mov ("eax",&wparam(0)); 165 &mov ($key,&wparam(2)); 166 &movups ($inout0,&QWP(0,"eax")); 167 &mov ($rounds,&DWP(240,$key)); 168 &mov ("eax",&wparam(1)); 169 if ($inline) 170 { &aesni_inline_generate1("enc"); } 171 else 172 { &call ("_aesni_encrypt1"); } 173 &movups (&QWP(0,"eax"),$inout0); 174 &ret (); 175 &function_end_B("${PREFIX}_encrypt"); 176 177 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); 178 &aesni_generate1("dec") if(!$inline); 179 &function_begin_B("${PREFIX}_decrypt"); 180 &mov ("eax",&wparam(0)); 181 &mov ($key,&wparam(2)); 182 &movups ($inout0,&QWP(0,"eax")); 183 &mov ($rounds,&DWP(240,$key)); 184 &mov ("eax",&wparam(1)); 185 if ($inline) 186 { &aesni_inline_generate1("dec"); } 187 else 188 { &call ("_aesni_decrypt1"); } 189 &movups (&QWP(0,"eax"),$inout0); 190 &ret (); 191 &function_end_B("${PREFIX}_decrypt"); 192 193 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave 194 # factor. Why 3x subroutine were originally used in loops? Even though 195 # aes[enc|dec] latency was originally 6, it could be scheduled only 196 # every *2nd* cycle. Thus 3x interleave was the one providing optimal 197 # utilization, i.e. when subroutine's throughput is virtually same as 198 # of non-interleaved subroutine [for number of input blocks up to 3]. 199 # This is why it makes no sense to implement 2x subroutine. 200 # aes[enc|dec] latency in next processor generation is 8, but the 201 # instructions can be scheduled every cycle. Optimal interleave for 202 # new processor is therefore 8x, but it's unfeasible to accommodate it 203 # in XMM registers addreassable in 32-bit mode and therefore 6x is 204 # used instead... 205 206 sub aesni_generate3 207 { my $p=shift; 208 209 &function_begin_B("_aesni_${p}rypt3"); 210 &$movekey ($rndkey0,&QWP(0,$key)); 211 &shr ($rounds,1); 212 &$movekey ($rndkey1,&QWP(16,$key)); 213 &lea ($key,&DWP(32,$key)); 214 &xorps ($inout0,$rndkey0); 215 &pxor ($inout1,$rndkey0); 216 &pxor ($inout2,$rndkey0); 217 &$movekey ($rndkey0,&QWP(0,$key)); 218 219 &set_label("${p}3_loop"); 220 eval"&aes${p} ($inout0,$rndkey1)"; 221 eval"&aes${p} ($inout1,$rndkey1)"; 222 &dec ($rounds); 223 eval"&aes${p} ($inout2,$rndkey1)"; 224 &$movekey ($rndkey1,&QWP(16,$key)); 225 eval"&aes${p} ($inout0,$rndkey0)"; 226 eval"&aes${p} ($inout1,$rndkey0)"; 227 &lea ($key,&DWP(32,$key)); 228 eval"&aes${p} ($inout2,$rndkey0)"; 229 &$movekey ($rndkey0,&QWP(0,$key)); 230 &jnz (&label("${p}3_loop")); 231 eval"&aes${p} ($inout0,$rndkey1)"; 232 eval"&aes${p} ($inout1,$rndkey1)"; 233 eval"&aes${p} ($inout2,$rndkey1)"; 234 eval"&aes${p}last ($inout0,$rndkey0)"; 235 eval"&aes${p}last ($inout1,$rndkey0)"; 236 eval"&aes${p}last ($inout2,$rndkey0)"; 237 &ret(); 238 &function_end_B("_aesni_${p}rypt3"); 239 } 240 241 # 4x interleave is implemented to improve small block performance, 242 # most notably [and naturally] 4 block by ~30%. One can argue that one 243 # should have implemented 5x as well, but improvement would be <20%, 244 # so it's not worth it... 245 sub aesni_generate4 246 { my $p=shift; 247 248 &function_begin_B("_aesni_${p}rypt4"); 249 &$movekey ($rndkey0,&QWP(0,$key)); 250 &$movekey ($rndkey1,&QWP(16,$key)); 251 &shr ($rounds,1); 252 &lea ($key,&DWP(32,$key)); 253 &xorps ($inout0,$rndkey0); 254 &pxor ($inout1,$rndkey0); 255 &pxor ($inout2,$rndkey0); 256 &pxor ($inout3,$rndkey0); 257 &$movekey ($rndkey0,&QWP(0,$key)); 258 259 &set_label("${p}4_loop"); 260 eval"&aes${p} ($inout0,$rndkey1)"; 261 eval"&aes${p} ($inout1,$rndkey1)"; 262 &dec ($rounds); 263 eval"&aes${p} ($inout2,$rndkey1)"; 264 eval"&aes${p} ($inout3,$rndkey1)"; 265 &$movekey ($rndkey1,&QWP(16,$key)); 266 eval"&aes${p} ($inout0,$rndkey0)"; 267 eval"&aes${p} ($inout1,$rndkey0)"; 268 &lea ($key,&DWP(32,$key)); 269 eval"&aes${p} ($inout2,$rndkey0)"; 270 eval"&aes${p} ($inout3,$rndkey0)"; 271 &$movekey ($rndkey0,&QWP(0,$key)); 272 &jnz (&label("${p}4_loop")); 273 274 eval"&aes${p} ($inout0,$rndkey1)"; 275 eval"&aes${p} ($inout1,$rndkey1)"; 276 eval"&aes${p} ($inout2,$rndkey1)"; 277 eval"&aes${p} ($inout3,$rndkey1)"; 278 eval"&aes${p}last ($inout0,$rndkey0)"; 279 eval"&aes${p}last ($inout1,$rndkey0)"; 280 eval"&aes${p}last ($inout2,$rndkey0)"; 281 eval"&aes${p}last ($inout3,$rndkey0)"; 282 &ret(); 283 &function_end_B("_aesni_${p}rypt4"); 284 } 285 286 sub aesni_generate6 287 { my $p=shift; 288 289 &function_begin_B("_aesni_${p}rypt6"); 290 &static_label("_aesni_${p}rypt6_enter"); 291 &$movekey ($rndkey0,&QWP(0,$key)); 292 &shr ($rounds,1); 293 &$movekey ($rndkey1,&QWP(16,$key)); 294 &lea ($key,&DWP(32,$key)); 295 &xorps ($inout0,$rndkey0); 296 &pxor ($inout1,$rndkey0); # pxor does better here 297 eval"&aes${p} ($inout0,$rndkey1)"; 298 &pxor ($inout2,$rndkey0); 299 eval"&aes${p} ($inout1,$rndkey1)"; 300 &pxor ($inout3,$rndkey0); 301 &dec ($rounds); 302 eval"&aes${p} ($inout2,$rndkey1)"; 303 &pxor ($inout4,$rndkey0); 304 eval"&aes${p} ($inout3,$rndkey1)"; 305 &pxor ($inout5,$rndkey0); 306 eval"&aes${p} ($inout4,$rndkey1)"; 307 &$movekey ($rndkey0,&QWP(0,$key)); 308 eval"&aes${p} ($inout5,$rndkey1)"; 309 &jmp (&label("_aesni_${p}rypt6_enter")); 310 311 &set_label("${p}6_loop",16); 312 eval"&aes${p} ($inout0,$rndkey1)"; 313 eval"&aes${p} ($inout1,$rndkey1)"; 314 &dec ($rounds); 315 eval"&aes${p} ($inout2,$rndkey1)"; 316 eval"&aes${p} ($inout3,$rndkey1)"; 317 eval"&aes${p} ($inout4,$rndkey1)"; 318 eval"&aes${p} ($inout5,$rndkey1)"; 319 &set_label("_aesni_${p}rypt6_enter",16); 320 &$movekey ($rndkey1,&QWP(16,$key)); 321 eval"&aes${p} ($inout0,$rndkey0)"; 322 eval"&aes${p} ($inout1,$rndkey0)"; 323 &lea ($key,&DWP(32,$key)); 324 eval"&aes${p} ($inout2,$rndkey0)"; 325 eval"&aes${p} ($inout3,$rndkey0)"; 326 eval"&aes${p} ($inout4,$rndkey0)"; 327 eval"&aes${p} ($inout5,$rndkey0)"; 328 &$movekey ($rndkey0,&QWP(0,$key)); 329 &jnz (&label("${p}6_loop")); 330 331 eval"&aes${p} ($inout0,$rndkey1)"; 332 eval"&aes${p} ($inout1,$rndkey1)"; 333 eval"&aes${p} ($inout2,$rndkey1)"; 334 eval"&aes${p} ($inout3,$rndkey1)"; 335 eval"&aes${p} ($inout4,$rndkey1)"; 336 eval"&aes${p} ($inout5,$rndkey1)"; 337 eval"&aes${p}last ($inout0,$rndkey0)"; 338 eval"&aes${p}last ($inout1,$rndkey0)"; 339 eval"&aes${p}last ($inout2,$rndkey0)"; 340 eval"&aes${p}last ($inout3,$rndkey0)"; 341 eval"&aes${p}last ($inout4,$rndkey0)"; 342 eval"&aes${p}last ($inout5,$rndkey0)"; 343 &ret(); 344 &function_end_B("_aesni_${p}rypt6"); 345 } 346 &aesni_generate3("enc") if ($PREFIX eq "aesni"); 347 &aesni_generate3("dec"); 348 &aesni_generate4("enc") if ($PREFIX eq "aesni"); 349 &aesni_generate4("dec"); 350 &aesni_generate6("enc") if ($PREFIX eq "aesni"); 351 &aesni_generate6("dec"); 352 353 if ($PREFIX eq "aesni") { 354 ###################################################################### 355 # void aesni_ecb_encrypt (const void *in, void *out, 356 # size_t length, const AES_KEY *key, 357 # int enc); 358 &function_begin("aesni_ecb_encrypt"); 359 &mov ($inp,&wparam(0)); 360 &mov ($out,&wparam(1)); 361 &mov ($len,&wparam(2)); 362 &mov ($key,&wparam(3)); 363 &mov ($rounds_,&wparam(4)); 364 &and ($len,-16); 365 &jz (&label("ecb_ret")); 366 &mov ($rounds,&DWP(240,$key)); 367 &test ($rounds_,$rounds_); 368 &jz (&label("ecb_decrypt")); 369 370 &mov ($key_,$key); # backup $key 371 &mov ($rounds_,$rounds); # backup $rounds 372 &cmp ($len,0x60); 373 &jb (&label("ecb_enc_tail")); 374 375 &movdqu ($inout0,&QWP(0,$inp)); 376 &movdqu ($inout1,&QWP(0x10,$inp)); 377 &movdqu ($inout2,&QWP(0x20,$inp)); 378 &movdqu ($inout3,&QWP(0x30,$inp)); 379 &movdqu ($inout4,&QWP(0x40,$inp)); 380 &movdqu ($inout5,&QWP(0x50,$inp)); 381 &lea ($inp,&DWP(0x60,$inp)); 382 &sub ($len,0x60); 383 &jmp (&label("ecb_enc_loop6_enter")); 384 385 &set_label("ecb_enc_loop6",16); 386 &movups (&QWP(0,$out),$inout0); 387 &movdqu ($inout0,&QWP(0,$inp)); 388 &movups (&QWP(0x10,$out),$inout1); 389 &movdqu ($inout1,&QWP(0x10,$inp)); 390 &movups (&QWP(0x20,$out),$inout2); 391 &movdqu ($inout2,&QWP(0x20,$inp)); 392 &movups (&QWP(0x30,$out),$inout3); 393 &movdqu ($inout3,&QWP(0x30,$inp)); 394 &movups (&QWP(0x40,$out),$inout4); 395 &movdqu ($inout4,&QWP(0x40,$inp)); 396 &movups (&QWP(0x50,$out),$inout5); 397 &lea ($out,&DWP(0x60,$out)); 398 &movdqu ($inout5,&QWP(0x50,$inp)); 399 &lea ($inp,&DWP(0x60,$inp)); 400 &set_label("ecb_enc_loop6_enter"); 401 402 &call ("_aesni_encrypt6"); 403 404 &mov ($key,$key_); # restore $key 405 &mov ($rounds,$rounds_); # restore $rounds 406 &sub ($len,0x60); 407 &jnc (&label("ecb_enc_loop6")); 408 409 &movups (&QWP(0,$out),$inout0); 410 &movups (&QWP(0x10,$out),$inout1); 411 &movups (&QWP(0x20,$out),$inout2); 412 &movups (&QWP(0x30,$out),$inout3); 413 &movups (&QWP(0x40,$out),$inout4); 414 &movups (&QWP(0x50,$out),$inout5); 415 &lea ($out,&DWP(0x60,$out)); 416 &add ($len,0x60); 417 &jz (&label("ecb_ret")); 418 419 &set_label("ecb_enc_tail"); 420 &movups ($inout0,&QWP(0,$inp)); 421 &cmp ($len,0x20); 422 &jb (&label("ecb_enc_one")); 423 &movups ($inout1,&QWP(0x10,$inp)); 424 &je (&label("ecb_enc_two")); 425 &movups ($inout2,&QWP(0x20,$inp)); 426 &cmp ($len,0x40); 427 &jb (&label("ecb_enc_three")); 428 &movups ($inout3,&QWP(0x30,$inp)); 429 &je (&label("ecb_enc_four")); 430 &movups ($inout4,&QWP(0x40,$inp)); 431 &xorps ($inout5,$inout5); 432 &call ("_aesni_encrypt6"); 433 &movups (&QWP(0,$out),$inout0); 434 &movups (&QWP(0x10,$out),$inout1); 435 &movups (&QWP(0x20,$out),$inout2); 436 &movups (&QWP(0x30,$out),$inout3); 437 &movups (&QWP(0x40,$out),$inout4); 438 jmp (&label("ecb_ret")); 439 440 &set_label("ecb_enc_one",16); 441 if ($inline) 442 { &aesni_inline_generate1("enc"); } 443 else 444 { &call ("_aesni_encrypt1"); } 445 &movups (&QWP(0,$out),$inout0); 446 &jmp (&label("ecb_ret")); 447 448 &set_label("ecb_enc_two",16); 449 &xorps ($inout2,$inout2); 450 &call ("_aesni_encrypt3"); 451 &movups (&QWP(0,$out),$inout0); 452 &movups (&QWP(0x10,$out),$inout1); 453 &jmp (&label("ecb_ret")); 454 455 &set_label("ecb_enc_three",16); 456 &call ("_aesni_encrypt3"); 457 &movups (&QWP(0,$out),$inout0); 458 &movups (&QWP(0x10,$out),$inout1); 459 &movups (&QWP(0x20,$out),$inout2); 460 &jmp (&label("ecb_ret")); 461 462 &set_label("ecb_enc_four",16); 463 &call ("_aesni_encrypt4"); 464 &movups (&QWP(0,$out),$inout0); 465 &movups (&QWP(0x10,$out),$inout1); 466 &movups (&QWP(0x20,$out),$inout2); 467 &movups (&QWP(0x30,$out),$inout3); 468 &jmp (&label("ecb_ret")); 469 ###################################################################### 470 &set_label("ecb_decrypt",16); 471 &mov ($key_,$key); # backup $key 472 &mov ($rounds_,$rounds); # backup $rounds 473 &cmp ($len,0x60); 474 &jb (&label("ecb_dec_tail")); 475 476 &movdqu ($inout0,&QWP(0,$inp)); 477 &movdqu ($inout1,&QWP(0x10,$inp)); 478 &movdqu ($inout2,&QWP(0x20,$inp)); 479 &movdqu ($inout3,&QWP(0x30,$inp)); 480 &movdqu ($inout4,&QWP(0x40,$inp)); 481 &movdqu ($inout5,&QWP(0x50,$inp)); 482 &lea ($inp,&DWP(0x60,$inp)); 483 &sub ($len,0x60); 484 &jmp (&label("ecb_dec_loop6_enter")); 485 486 &set_label("ecb_dec_loop6",16); 487 &movups (&QWP(0,$out),$inout0); 488 &movdqu ($inout0,&QWP(0,$inp)); 489 &movups (&QWP(0x10,$out),$inout1); 490 &movdqu ($inout1,&QWP(0x10,$inp)); 491 &movups (&QWP(0x20,$out),$inout2); 492 &movdqu ($inout2,&QWP(0x20,$inp)); 493 &movups (&QWP(0x30,$out),$inout3); 494 &movdqu ($inout3,&QWP(0x30,$inp)); 495 &movups (&QWP(0x40,$out),$inout4); 496 &movdqu ($inout4,&QWP(0x40,$inp)); 497 &movups (&QWP(0x50,$out),$inout5); 498 &lea ($out,&DWP(0x60,$out)); 499 &movdqu ($inout5,&QWP(0x50,$inp)); 500 &lea ($inp,&DWP(0x60,$inp)); 501 &set_label("ecb_dec_loop6_enter"); 502 503 &call ("_aesni_decrypt6"); 504 505 &mov ($key,$key_); # restore $key 506 &mov ($rounds,$rounds_); # restore $rounds 507 &sub ($len,0x60); 508 &jnc (&label("ecb_dec_loop6")); 509 510 &movups (&QWP(0,$out),$inout0); 511 &movups (&QWP(0x10,$out),$inout1); 512 &movups (&QWP(0x20,$out),$inout2); 513 &movups (&QWP(0x30,$out),$inout3); 514 &movups (&QWP(0x40,$out),$inout4); 515 &movups (&QWP(0x50,$out),$inout5); 516 &lea ($out,&DWP(0x60,$out)); 517 &add ($len,0x60); 518 &jz (&label("ecb_ret")); 519 520 &set_label("ecb_dec_tail"); 521 &movups ($inout0,&QWP(0,$inp)); 522 &cmp ($len,0x20); 523 &jb (&label("ecb_dec_one")); 524 &movups ($inout1,&QWP(0x10,$inp)); 525 &je (&label("ecb_dec_two")); 526 &movups ($inout2,&QWP(0x20,$inp)); 527 &cmp ($len,0x40); 528 &jb (&label("ecb_dec_three")); 529 &movups ($inout3,&QWP(0x30,$inp)); 530 &je (&label("ecb_dec_four")); 531 &movups ($inout4,&QWP(0x40,$inp)); 532 &xorps ($inout5,$inout5); 533 &call ("_aesni_decrypt6"); 534 &movups (&QWP(0,$out),$inout0); 535 &movups (&QWP(0x10,$out),$inout1); 536 &movups (&QWP(0x20,$out),$inout2); 537 &movups (&QWP(0x30,$out),$inout3); 538 &movups (&QWP(0x40,$out),$inout4); 539 &jmp (&label("ecb_ret")); 540 541 &set_label("ecb_dec_one",16); 542 if ($inline) 543 { &aesni_inline_generate1("dec"); } 544 else 545 { &call ("_aesni_decrypt1"); } 546 &movups (&QWP(0,$out),$inout0); 547 &jmp (&label("ecb_ret")); 548 549 &set_label("ecb_dec_two",16); 550 &xorps ($inout2,$inout2); 551 &call ("_aesni_decrypt3"); 552 &movups (&QWP(0,$out),$inout0); 553 &movups (&QWP(0x10,$out),$inout1); 554 &jmp (&label("ecb_ret")); 555 556 &set_label("ecb_dec_three",16); 557 &call ("_aesni_decrypt3"); 558 &movups (&QWP(0,$out),$inout0); 559 &movups (&QWP(0x10,$out),$inout1); 560 &movups (&QWP(0x20,$out),$inout2); 561 &jmp (&label("ecb_ret")); 562 563 &set_label("ecb_dec_four",16); 564 &call ("_aesni_decrypt4"); 565 &movups (&QWP(0,$out),$inout0); 566 &movups (&QWP(0x10,$out),$inout1); 567 &movups (&QWP(0x20,$out),$inout2); 568 &movups (&QWP(0x30,$out),$inout3); 569 570 &set_label("ecb_ret"); 571 &function_end("aesni_ecb_encrypt"); 572 573 ###################################################################### 574 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, 575 # size_t blocks, const AES_KEY *key, 576 # const char *ivec,char *cmac); 577 # 578 # Handles only complete blocks, operates on 64-bit counter and 579 # does not update *ivec! Nor does it finalize CMAC value 580 # (see engine/eng_aesni.c for details) 581 # 582 { my $cmac=$inout1; 583 &function_begin("aesni_ccm64_encrypt_blocks"); 584 &mov ($inp,&wparam(0)); 585 &mov ($out,&wparam(1)); 586 &mov ($len,&wparam(2)); 587 &mov ($key,&wparam(3)); 588 &mov ($rounds_,&wparam(4)); 589 &mov ($rounds,&wparam(5)); 590 &mov ($key_,"esp"); 591 &sub ("esp",60); 592 &and ("esp",-16); # align stack 593 &mov (&DWP(48,"esp"),$key_); 594 595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 597 &mov ($rounds,&DWP(240,$key)); 598 599 # compose byte-swap control mask for pshufb on stack 600 &mov (&DWP(0,"esp"),0x0c0d0e0f); 601 &mov (&DWP(4,"esp"),0x08090a0b); 602 &mov (&DWP(8,"esp"),0x04050607); 603 &mov (&DWP(12,"esp"),0x00010203); 604 605 # compose counter increment vector on stack 606 &mov ($rounds_,1); 607 &xor ($key_,$key_); 608 &mov (&DWP(16,"esp"),$rounds_); 609 &mov (&DWP(20,"esp"),$key_); 610 &mov (&DWP(24,"esp"),$key_); 611 &mov (&DWP(28,"esp"),$key_); 612 613 &shr ($rounds,1); 614 &lea ($key_,&DWP(0,$key)); 615 &movdqa ($inout3,&QWP(0,"esp")); 616 &movdqa ($inout0,$ivec); 617 &mov ($rounds_,$rounds); 618 &pshufb ($ivec,$inout3); 619 620 &set_label("ccm64_enc_outer"); 621 &$movekey ($rndkey0,&QWP(0,$key_)); 622 &mov ($rounds,$rounds_); 623 &movups ($in0,&QWP(0,$inp)); 624 625 &xorps ($inout0,$rndkey0); 626 &$movekey ($rndkey1,&QWP(16,$key_)); 627 &xorps ($rndkey0,$in0); 628 &lea ($key,&DWP(32,$key_)); 629 &xorps ($cmac,$rndkey0); # cmac^=inp 630 &$movekey ($rndkey0,&QWP(0,$key)); 631 632 &set_label("ccm64_enc2_loop"); 633 &aesenc ($inout0,$rndkey1); 634 &dec ($rounds); 635 &aesenc ($cmac,$rndkey1); 636 &$movekey ($rndkey1,&QWP(16,$key)); 637 &aesenc ($inout0,$rndkey0); 638 &lea ($key,&DWP(32,$key)); 639 &aesenc ($cmac,$rndkey0); 640 &$movekey ($rndkey0,&QWP(0,$key)); 641 &jnz (&label("ccm64_enc2_loop")); 642 &aesenc ($inout0,$rndkey1); 643 &aesenc ($cmac,$rndkey1); 644 &paddq ($ivec,&QWP(16,"esp")); 645 &aesenclast ($inout0,$rndkey0); 646 &aesenclast ($cmac,$rndkey0); 647 648 &dec ($len); 649 &lea ($inp,&DWP(16,$inp)); 650 &xorps ($in0,$inout0); # inp^=E(ivec) 651 &movdqa ($inout0,$ivec); 652 &movups (&QWP(0,$out),$in0); # save output 653 &lea ($out,&DWP(16,$out)); 654 &pshufb ($inout0,$inout3); 655 &jnz (&label("ccm64_enc_outer")); 656 657 &mov ("esp",&DWP(48,"esp")); 658 &mov ($out,&wparam(5)); 659 &movups (&QWP(0,$out),$cmac); 660 &function_end("aesni_ccm64_encrypt_blocks"); 661 662 &function_begin("aesni_ccm64_decrypt_blocks"); 663 &mov ($inp,&wparam(0)); 664 &mov ($out,&wparam(1)); 665 &mov ($len,&wparam(2)); 666 &mov ($key,&wparam(3)); 667 &mov ($rounds_,&wparam(4)); 668 &mov ($rounds,&wparam(5)); 669 &mov ($key_,"esp"); 670 &sub ("esp",60); 671 &and ("esp",-16); # align stack 672 &mov (&DWP(48,"esp"),$key_); 673 674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec 675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac 676 &mov ($rounds,&DWP(240,$key)); 677 678 # compose byte-swap control mask for pshufb on stack 679 &mov (&DWP(0,"esp"),0x0c0d0e0f); 680 &mov (&DWP(4,"esp"),0x08090a0b); 681 &mov (&DWP(8,"esp"),0x04050607); 682 &mov (&DWP(12,"esp"),0x00010203); 683 684 # compose counter increment vector on stack 685 &mov ($rounds_,1); 686 &xor ($key_,$key_); 687 &mov (&DWP(16,"esp"),$rounds_); 688 &mov (&DWP(20,"esp"),$key_); 689 &mov (&DWP(24,"esp"),$key_); 690 &mov (&DWP(28,"esp"),$key_); 691 692 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask 693 &movdqa ($inout0,$ivec); 694 695 &mov ($key_,$key); 696 &mov ($rounds_,$rounds); 697 698 &pshufb ($ivec,$inout3); 699 if ($inline) 700 { &aesni_inline_generate1("enc"); } 701 else 702 { &call ("_aesni_encrypt1"); } 703 &movups ($in0,&QWP(0,$inp)); # load inp 704 &paddq ($ivec,&QWP(16,"esp")); 705 &lea ($inp,&QWP(16,$inp)); 706 &jmp (&label("ccm64_dec_outer")); 707 708 &set_label("ccm64_dec_outer",16); 709 &xorps ($in0,$inout0); # inp ^= E(ivec) 710 &movdqa ($inout0,$ivec); 711 &mov ($rounds,$rounds_); 712 &movups (&QWP(0,$out),$in0); # save output 713 &lea ($out,&DWP(16,$out)); 714 &pshufb ($inout0,$inout3); 715 716 &sub ($len,1); 717 &jz (&label("ccm64_dec_break")); 718 719 &$movekey ($rndkey0,&QWP(0,$key_)); 720 &shr ($rounds,1); 721 &$movekey ($rndkey1,&QWP(16,$key_)); 722 &xorps ($in0,$rndkey0); 723 &lea ($key,&DWP(32,$key_)); 724 &xorps ($inout0,$rndkey0); 725 &xorps ($cmac,$in0); # cmac^=out 726 &$movekey ($rndkey0,&QWP(0,$key)); 727 728 &set_label("ccm64_dec2_loop"); 729 &aesenc ($inout0,$rndkey1); 730 &dec ($rounds); 731 &aesenc ($cmac,$rndkey1); 732 &$movekey ($rndkey1,&QWP(16,$key)); 733 &aesenc ($inout0,$rndkey0); 734 &lea ($key,&DWP(32,$key)); 735 &aesenc ($cmac,$rndkey0); 736 &$movekey ($rndkey0,&QWP(0,$key)); 737 &jnz (&label("ccm64_dec2_loop")); 738 &movups ($in0,&QWP(0,$inp)); # load inp 739 &paddq ($ivec,&QWP(16,"esp")); 740 &aesenc ($inout0,$rndkey1); 741 &aesenc ($cmac,$rndkey1); 742 &lea ($inp,&QWP(16,$inp)); 743 &aesenclast ($inout0,$rndkey0); 744 &aesenclast ($cmac,$rndkey0); 745 &jmp (&label("ccm64_dec_outer")); 746 747 &set_label("ccm64_dec_break",16); 748 &mov ($key,$key_); 749 if ($inline) 750 { &aesni_inline_generate1("enc",$cmac,$in0); } 751 else 752 { &call ("_aesni_encrypt1",$cmac); } 753 754 &mov ("esp",&DWP(48,"esp")); 755 &mov ($out,&wparam(5)); 756 &movups (&QWP(0,$out),$cmac); 757 &function_end("aesni_ccm64_decrypt_blocks"); 758 } 759 760 ###################################################################### 761 # void aesni_ctr32_encrypt_blocks (const void *in, void *out, 762 # size_t blocks, const AES_KEY *key, 763 # const char *ivec); 764 # 765 # Handles only complete blocks, operates on 32-bit counter and 766 # does not update *ivec! (see engine/eng_aesni.c for details) 767 # 768 # stack layout: 769 # 0 pshufb mask 770 # 16 vector addend: 0,6,6,6 771 # 32 counter-less ivec 772 # 48 1st triplet of counter vector 773 # 64 2nd triplet of counter vector 774 # 80 saved %esp 775 776 &function_begin("aesni_ctr32_encrypt_blocks"); 777 &mov ($inp,&wparam(0)); 778 &mov ($out,&wparam(1)); 779 &mov ($len,&wparam(2)); 780 &mov ($key,&wparam(3)); 781 &mov ($rounds_,&wparam(4)); 782 &mov ($key_,"esp"); 783 &sub ("esp",88); 784 &and ("esp",-16); # align stack 785 &mov (&DWP(80,"esp"),$key_); 786 787 &cmp ($len,1); 788 &je (&label("ctr32_one_shortcut")); 789 790 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec 791 792 # compose byte-swap control mask for pshufb on stack 793 &mov (&DWP(0,"esp"),0x0c0d0e0f); 794 &mov (&DWP(4,"esp"),0x08090a0b); 795 &mov (&DWP(8,"esp"),0x04050607); 796 &mov (&DWP(12,"esp"),0x00010203); 797 798 # compose counter increment vector on stack 799 &mov ($rounds,6); 800 &xor ($key_,$key_); 801 &mov (&DWP(16,"esp"),$rounds); 802 &mov (&DWP(20,"esp"),$rounds); 803 &mov (&DWP(24,"esp"),$rounds); 804 &mov (&DWP(28,"esp"),$key_); 805 806 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter 807 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter 808 809 &mov ($rounds,&DWP(240,$key)); # key->rounds 810 811 # compose 2 vectors of 3x32-bit counters 812 &bswap ($rounds_); 813 &pxor ($rndkey1,$rndkey1); 814 &pxor ($rndkey0,$rndkey0); 815 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask 816 &pinsrd ($rndkey1,$rounds_,0); 817 &lea ($key_,&DWP(3,$rounds_)); 818 &pinsrd ($rndkey0,$key_,0); 819 &inc ($rounds_); 820 &pinsrd ($rndkey1,$rounds_,1); 821 &inc ($key_); 822 &pinsrd ($rndkey0,$key_,1); 823 &inc ($rounds_); 824 &pinsrd ($rndkey1,$rounds_,2); 825 &inc ($key_); 826 &pinsrd ($rndkey0,$key_,2); 827 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 828 &pshufb ($rndkey1,$inout0); # byte swap 829 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 830 &pshufb ($rndkey0,$inout0); # byte swap 831 832 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword 833 &pshufd ($inout1,$rndkey1,2<<6); 834 &cmp ($len,6); 835 &jb (&label("ctr32_tail")); 836 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec 837 &shr ($rounds,1); 838 &mov ($key_,$key); # backup $key 839 &mov ($rounds_,$rounds); # backup $rounds 840 &sub ($len,6); 841 &jmp (&label("ctr32_loop6")); 842 843 &set_label("ctr32_loop6",16); 844 &pshufd ($inout2,$rndkey1,1<<6); 845 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec 846 &pshufd ($inout3,$rndkey0,3<<6); 847 &por ($inout0,$rndkey1); # merge counter-less ivec 848 &pshufd ($inout4,$rndkey0,2<<6); 849 &por ($inout1,$rndkey1); 850 &pshufd ($inout5,$rndkey0,1<<6); 851 &por ($inout2,$rndkey1); 852 &por ($inout3,$rndkey1); 853 &por ($inout4,$rndkey1); 854 &por ($inout5,$rndkey1); 855 856 # inlining _aesni_encrypt6's prologue gives ~4% improvement... 857 &$movekey ($rndkey0,&QWP(0,$key_)); 858 &$movekey ($rndkey1,&QWP(16,$key_)); 859 &lea ($key,&DWP(32,$key_)); 860 &dec ($rounds); 861 &pxor ($inout0,$rndkey0); 862 &pxor ($inout1,$rndkey0); 863 &aesenc ($inout0,$rndkey1); 864 &pxor ($inout2,$rndkey0); 865 &aesenc ($inout1,$rndkey1); 866 &pxor ($inout3,$rndkey0); 867 &aesenc ($inout2,$rndkey1); 868 &pxor ($inout4,$rndkey0); 869 &aesenc ($inout3,$rndkey1); 870 &pxor ($inout5,$rndkey0); 871 &aesenc ($inout4,$rndkey1); 872 &$movekey ($rndkey0,&QWP(0,$key)); 873 &aesenc ($inout5,$rndkey1); 874 875 &call (&label("_aesni_encrypt6_enter")); 876 877 &movups ($rndkey1,&QWP(0,$inp)); 878 &movups ($rndkey0,&QWP(0x10,$inp)); 879 &xorps ($inout0,$rndkey1); 880 &movups ($rndkey1,&QWP(0x20,$inp)); 881 &xorps ($inout1,$rndkey0); 882 &movups (&QWP(0,$out),$inout0); 883 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment 884 &xorps ($inout2,$rndkey1); 885 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet 886 &movups (&QWP(0x10,$out),$inout1); 887 &movups (&QWP(0x20,$out),$inout2); 888 889 &paddd ($rndkey1,$rndkey0); # 1st triplet increment 890 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment 891 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask 892 893 &movups ($inout1,&QWP(0x30,$inp)); 894 &movups ($inout2,&QWP(0x40,$inp)); 895 &xorps ($inout3,$inout1); 896 &movups ($inout1,&QWP(0x50,$inp)); 897 &lea ($inp,&DWP(0x60,$inp)); 898 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet 899 &pshufb ($rndkey1,$inout0); # byte swap 900 &xorps ($inout4,$inout2); 901 &movups (&QWP(0x30,$out),$inout3); 902 &xorps ($inout5,$inout1); 903 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet 904 &pshufb ($rndkey0,$inout0); # byte swap 905 &movups (&QWP(0x40,$out),$inout4); 906 &pshufd ($inout0,$rndkey1,3<<6); 907 &movups (&QWP(0x50,$out),$inout5); 908 &lea ($out,&DWP(0x60,$out)); 909 910 &mov ($rounds,$rounds_); 911 &pshufd ($inout1,$rndkey1,2<<6); 912 &sub ($len,6); 913 &jnc (&label("ctr32_loop6")); 914 915 &add ($len,6); 916 &jz (&label("ctr32_ret")); 917 &mov ($key,$key_); 918 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 919 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec 920 921 &set_label("ctr32_tail"); 922 &por ($inout0,$inout5); 923 &cmp ($len,2); 924 &jb (&label("ctr32_one")); 925 926 &pshufd ($inout2,$rndkey1,1<<6); 927 &por ($inout1,$inout5); 928 &je (&label("ctr32_two")); 929 930 &pshufd ($inout3,$rndkey0,3<<6); 931 &por ($inout2,$inout5); 932 &cmp ($len,4); 933 &jb (&label("ctr32_three")); 934 935 &pshufd ($inout4,$rndkey0,2<<6); 936 &por ($inout3,$inout5); 937 &je (&label("ctr32_four")); 938 939 &por ($inout4,$inout5); 940 &call ("_aesni_encrypt6"); 941 &movups ($rndkey1,&QWP(0,$inp)); 942 &movups ($rndkey0,&QWP(0x10,$inp)); 943 &xorps ($inout0,$rndkey1); 944 &movups ($rndkey1,&QWP(0x20,$inp)); 945 &xorps ($inout1,$rndkey0); 946 &movups ($rndkey0,&QWP(0x30,$inp)); 947 &xorps ($inout2,$rndkey1); 948 &movups ($rndkey1,&QWP(0x40,$inp)); 949 &xorps ($inout3,$rndkey0); 950 &movups (&QWP(0,$out),$inout0); 951 &xorps ($inout4,$rndkey1); 952 &movups (&QWP(0x10,$out),$inout1); 953 &movups (&QWP(0x20,$out),$inout2); 954 &movups (&QWP(0x30,$out),$inout3); 955 &movups (&QWP(0x40,$out),$inout4); 956 &jmp (&label("ctr32_ret")); 957 958 &set_label("ctr32_one_shortcut",16); 959 &movups ($inout0,&QWP(0,$rounds_)); # load ivec 960 &mov ($rounds,&DWP(240,$key)); 961 962 &set_label("ctr32_one"); 963 if ($inline) 964 { &aesni_inline_generate1("enc"); } 965 else 966 { &call ("_aesni_encrypt1"); } 967 &movups ($in0,&QWP(0,$inp)); 968 &xorps ($in0,$inout0); 969 &movups (&QWP(0,$out),$in0); 970 &jmp (&label("ctr32_ret")); 971 972 &set_label("ctr32_two",16); 973 &call ("_aesni_encrypt3"); 974 &movups ($inout3,&QWP(0,$inp)); 975 &movups ($inout4,&QWP(0x10,$inp)); 976 &xorps ($inout0,$inout3); 977 &xorps ($inout1,$inout4); 978 &movups (&QWP(0,$out),$inout0); 979 &movups (&QWP(0x10,$out),$inout1); 980 &jmp (&label("ctr32_ret")); 981 982 &set_label("ctr32_three",16); 983 &call ("_aesni_encrypt3"); 984 &movups ($inout3,&QWP(0,$inp)); 985 &movups ($inout4,&QWP(0x10,$inp)); 986 &xorps ($inout0,$inout3); 987 &movups ($inout5,&QWP(0x20,$inp)); 988 &xorps ($inout1,$inout4); 989 &movups (&QWP(0,$out),$inout0); 990 &xorps ($inout2,$inout5); 991 &movups (&QWP(0x10,$out),$inout1); 992 &movups (&QWP(0x20,$out),$inout2); 993 &jmp (&label("ctr32_ret")); 994 995 &set_label("ctr32_four",16); 996 &call ("_aesni_encrypt4"); 997 &movups ($inout4,&QWP(0,$inp)); 998 &movups ($inout5,&QWP(0x10,$inp)); 999 &movups ($rndkey1,&QWP(0x20,$inp)); 1000 &xorps ($inout0,$inout4); 1001 &movups ($rndkey0,&QWP(0x30,$inp)); 1002 &xorps ($inout1,$inout5); 1003 &movups (&QWP(0,$out),$inout0); 1004 &xorps ($inout2,$rndkey1); 1005 &movups (&QWP(0x10,$out),$inout1); 1006 &xorps ($inout3,$rndkey0); 1007 &movups (&QWP(0x20,$out),$inout2); 1008 &movups (&QWP(0x30,$out),$inout3); 1009 1010 &set_label("ctr32_ret"); 1011 &mov ("esp",&DWP(80,"esp")); 1012 &function_end("aesni_ctr32_encrypt_blocks"); 1013 1014 ###################################################################### 1015 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, 1016 # const AES_KEY *key1, const AES_KEY *key2 1017 # const unsigned char iv[16]); 1018 # 1019 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); 1020 1021 &function_begin("aesni_xts_encrypt"); 1022 &mov ($key,&wparam(4)); # key2 1023 &mov ($inp,&wparam(5)); # clear-text tweak 1024 1025 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1026 &movups ($inout0,&QWP(0,$inp)); 1027 if ($inline) 1028 { &aesni_inline_generate1("enc"); } 1029 else 1030 { &call ("_aesni_encrypt1"); } 1031 1032 &mov ($inp,&wparam(0)); 1033 &mov ($out,&wparam(1)); 1034 &mov ($len,&wparam(2)); 1035 &mov ($key,&wparam(3)); # key1 1036 1037 &mov ($key_,"esp"); 1038 &sub ("esp",16*7+8); 1039 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1040 &and ("esp",-16); # align stack 1041 1042 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1043 &mov (&DWP(16*6+4,"esp"),0); 1044 &mov (&DWP(16*6+8,"esp"),1); 1045 &mov (&DWP(16*6+12,"esp"),0); 1046 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1047 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1048 1049 &movdqa ($tweak,$inout0); 1050 &pxor ($twtmp,$twtmp); 1051 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1052 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1053 1054 &and ($len,-16); 1055 &mov ($key_,$key); # backup $key 1056 &mov ($rounds_,$rounds); # backup $rounds 1057 &sub ($len,16*6); 1058 &jc (&label("xts_enc_short")); 1059 1060 &shr ($rounds,1); 1061 &mov ($rounds_,$rounds); 1062 &jmp (&label("xts_enc_loop6")); 1063 1064 &set_label("xts_enc_loop6",16); 1065 for ($i=0;$i<4;$i++) { 1066 &pshufd ($twres,$twtmp,0x13); 1067 &pxor ($twtmp,$twtmp); 1068 &movdqa (&QWP(16*$i,"esp"),$tweak); 1069 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1070 &pand ($twres,$twmask); # isolate carry and residue 1071 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1072 &pxor ($tweak,$twres); 1073 } 1074 &pshufd ($inout5,$twtmp,0x13); 1075 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1076 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1077 &$movekey ($rndkey0,&QWP(0,$key_)); 1078 &pand ($inout5,$twmask); # isolate carry and residue 1079 &movups ($inout0,&QWP(0,$inp)); # load input 1080 &pxor ($inout5,$tweak); 1081 1082 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1083 &movdqu ($inout1,&QWP(16*1,$inp)); 1084 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1085 &movdqu ($inout2,&QWP(16*2,$inp)); 1086 &pxor ($inout1,$rndkey0); 1087 &movdqu ($inout3,&QWP(16*3,$inp)); 1088 &pxor ($inout2,$rndkey0); 1089 &movdqu ($inout4,&QWP(16*4,$inp)); 1090 &pxor ($inout3,$rndkey0); 1091 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1092 &pxor ($inout4,$rndkey0); 1093 &lea ($inp,&DWP(16*6,$inp)); 1094 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1095 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1096 &pxor ($inout5,$rndkey1); 1097 1098 &$movekey ($rndkey1,&QWP(16,$key_)); 1099 &lea ($key,&DWP(32,$key_)); 1100 &pxor ($inout1,&QWP(16*1,"esp")); 1101 &aesenc ($inout0,$rndkey1); 1102 &pxor ($inout2,&QWP(16*2,"esp")); 1103 &aesenc ($inout1,$rndkey1); 1104 &pxor ($inout3,&QWP(16*3,"esp")); 1105 &dec ($rounds); 1106 &aesenc ($inout2,$rndkey1); 1107 &pxor ($inout4,&QWP(16*4,"esp")); 1108 &aesenc ($inout3,$rndkey1); 1109 &pxor ($inout5,$rndkey0); 1110 &aesenc ($inout4,$rndkey1); 1111 &$movekey ($rndkey0,&QWP(0,$key)); 1112 &aesenc ($inout5,$rndkey1); 1113 &call (&label("_aesni_encrypt6_enter")); 1114 1115 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1116 &pxor ($twtmp,$twtmp); 1117 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1118 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1119 &xorps ($inout1,&QWP(16*1,"esp")); 1120 &movups (&QWP(16*0,$out),$inout0); # write output 1121 &xorps ($inout2,&QWP(16*2,"esp")); 1122 &movups (&QWP(16*1,$out),$inout1); 1123 &xorps ($inout3,&QWP(16*3,"esp")); 1124 &movups (&QWP(16*2,$out),$inout2); 1125 &xorps ($inout4,&QWP(16*4,"esp")); 1126 &movups (&QWP(16*3,$out),$inout3); 1127 &xorps ($inout5,$tweak); 1128 &movups (&QWP(16*4,$out),$inout4); 1129 &pshufd ($twres,$twtmp,0x13); 1130 &movups (&QWP(16*5,$out),$inout5); 1131 &lea ($out,&DWP(16*6,$out)); 1132 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1133 1134 &pxor ($twtmp,$twtmp); 1135 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1136 &pand ($twres,$twmask); # isolate carry and residue 1137 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1138 &mov ($rounds,$rounds_); # restore $rounds 1139 &pxor ($tweak,$twres); 1140 1141 &sub ($len,16*6); 1142 &jnc (&label("xts_enc_loop6")); 1143 1144 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1145 &mov ($key,$key_); # restore $key 1146 &mov ($rounds_,$rounds); 1147 1148 &set_label("xts_enc_short"); 1149 &add ($len,16*6); 1150 &jz (&label("xts_enc_done6x")); 1151 1152 &movdqa ($inout3,$tweak); # put aside previous tweak 1153 &cmp ($len,0x20); 1154 &jb (&label("xts_enc_one")); 1155 1156 &pshufd ($twres,$twtmp,0x13); 1157 &pxor ($twtmp,$twtmp); 1158 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1159 &pand ($twres,$twmask); # isolate carry and residue 1160 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1161 &pxor ($tweak,$twres); 1162 &je (&label("xts_enc_two")); 1163 1164 &pshufd ($twres,$twtmp,0x13); 1165 &pxor ($twtmp,$twtmp); 1166 &movdqa ($inout4,$tweak); # put aside previous tweak 1167 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1168 &pand ($twres,$twmask); # isolate carry and residue 1169 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1170 &pxor ($tweak,$twres); 1171 &cmp ($len,0x40); 1172 &jb (&label("xts_enc_three")); 1173 1174 &pshufd ($twres,$twtmp,0x13); 1175 &pxor ($twtmp,$twtmp); 1176 &movdqa ($inout5,$tweak); # put aside previous tweak 1177 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1178 &pand ($twres,$twmask); # isolate carry and residue 1179 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1180 &pxor ($tweak,$twres); 1181 &movdqa (&QWP(16*0,"esp"),$inout3); 1182 &movdqa (&QWP(16*1,"esp"),$inout4); 1183 &je (&label("xts_enc_four")); 1184 1185 &movdqa (&QWP(16*2,"esp"),$inout5); 1186 &pshufd ($inout5,$twtmp,0x13); 1187 &movdqa (&QWP(16*3,"esp"),$tweak); 1188 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1189 &pand ($inout5,$twmask); # isolate carry and residue 1190 &pxor ($inout5,$tweak); 1191 1192 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1193 &movdqu ($inout1,&QWP(16*1,$inp)); 1194 &movdqu ($inout2,&QWP(16*2,$inp)); 1195 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1196 &movdqu ($inout3,&QWP(16*3,$inp)); 1197 &pxor ($inout1,&QWP(16*1,"esp")); 1198 &movdqu ($inout4,&QWP(16*4,$inp)); 1199 &pxor ($inout2,&QWP(16*2,"esp")); 1200 &lea ($inp,&DWP(16*5,$inp)); 1201 &pxor ($inout3,&QWP(16*3,"esp")); 1202 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1203 &pxor ($inout4,$inout5); 1204 1205 &call ("_aesni_encrypt6"); 1206 1207 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1208 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1209 &xorps ($inout1,&QWP(16*1,"esp")); 1210 &xorps ($inout2,&QWP(16*2,"esp")); 1211 &movups (&QWP(16*0,$out),$inout0); # write output 1212 &xorps ($inout3,&QWP(16*3,"esp")); 1213 &movups (&QWP(16*1,$out),$inout1); 1214 &xorps ($inout4,$tweak); 1215 &movups (&QWP(16*2,$out),$inout2); 1216 &movups (&QWP(16*3,$out),$inout3); 1217 &movups (&QWP(16*4,$out),$inout4); 1218 &lea ($out,&DWP(16*5,$out)); 1219 &jmp (&label("xts_enc_done")); 1220 1221 &set_label("xts_enc_one",16); 1222 &movups ($inout0,&QWP(16*0,$inp)); # load input 1223 &lea ($inp,&DWP(16*1,$inp)); 1224 &xorps ($inout0,$inout3); # input^=tweak 1225 if ($inline) 1226 { &aesni_inline_generate1("enc"); } 1227 else 1228 { &call ("_aesni_encrypt1"); } 1229 &xorps ($inout0,$inout3); # output^=tweak 1230 &movups (&QWP(16*0,$out),$inout0); # write output 1231 &lea ($out,&DWP(16*1,$out)); 1232 1233 &movdqa ($tweak,$inout3); # last tweak 1234 &jmp (&label("xts_enc_done")); 1235 1236 &set_label("xts_enc_two",16); 1237 &movaps ($inout4,$tweak); # put aside last tweak 1238 1239 &movups ($inout0,&QWP(16*0,$inp)); # load input 1240 &movups ($inout1,&QWP(16*1,$inp)); 1241 &lea ($inp,&DWP(16*2,$inp)); 1242 &xorps ($inout0,$inout3); # input^=tweak 1243 &xorps ($inout1,$inout4); 1244 &xorps ($inout2,$inout2); 1245 1246 &call ("_aesni_encrypt3"); 1247 1248 &xorps ($inout0,$inout3); # output^=tweak 1249 &xorps ($inout1,$inout4); 1250 &movups (&QWP(16*0,$out),$inout0); # write output 1251 &movups (&QWP(16*1,$out),$inout1); 1252 &lea ($out,&DWP(16*2,$out)); 1253 1254 &movdqa ($tweak,$inout4); # last tweak 1255 &jmp (&label("xts_enc_done")); 1256 1257 &set_label("xts_enc_three",16); 1258 &movaps ($inout5,$tweak); # put aside last tweak 1259 &movups ($inout0,&QWP(16*0,$inp)); # load input 1260 &movups ($inout1,&QWP(16*1,$inp)); 1261 &movups ($inout2,&QWP(16*2,$inp)); 1262 &lea ($inp,&DWP(16*3,$inp)); 1263 &xorps ($inout0,$inout3); # input^=tweak 1264 &xorps ($inout1,$inout4); 1265 &xorps ($inout2,$inout5); 1266 1267 &call ("_aesni_encrypt3"); 1268 1269 &xorps ($inout0,$inout3); # output^=tweak 1270 &xorps ($inout1,$inout4); 1271 &xorps ($inout2,$inout5); 1272 &movups (&QWP(16*0,$out),$inout0); # write output 1273 &movups (&QWP(16*1,$out),$inout1); 1274 &movups (&QWP(16*2,$out),$inout2); 1275 &lea ($out,&DWP(16*3,$out)); 1276 1277 &movdqa ($tweak,$inout5); # last tweak 1278 &jmp (&label("xts_enc_done")); 1279 1280 &set_label("xts_enc_four",16); 1281 &movaps ($inout4,$tweak); # put aside last tweak 1282 1283 &movups ($inout0,&QWP(16*0,$inp)); # load input 1284 &movups ($inout1,&QWP(16*1,$inp)); 1285 &movups ($inout2,&QWP(16*2,$inp)); 1286 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1287 &movups ($inout3,&QWP(16*3,$inp)); 1288 &lea ($inp,&DWP(16*4,$inp)); 1289 &xorps ($inout1,&QWP(16*1,"esp")); 1290 &xorps ($inout2,$inout5); 1291 &xorps ($inout3,$inout4); 1292 1293 &call ("_aesni_encrypt4"); 1294 1295 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1296 &xorps ($inout1,&QWP(16*1,"esp")); 1297 &xorps ($inout2,$inout5); 1298 &movups (&QWP(16*0,$out),$inout0); # write output 1299 &xorps ($inout3,$inout4); 1300 &movups (&QWP(16*1,$out),$inout1); 1301 &movups (&QWP(16*2,$out),$inout2); 1302 &movups (&QWP(16*3,$out),$inout3); 1303 &lea ($out,&DWP(16*4,$out)); 1304 1305 &movdqa ($tweak,$inout4); # last tweak 1306 &jmp (&label("xts_enc_done")); 1307 1308 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated 1309 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1310 &and ($len,15); 1311 &jz (&label("xts_enc_ret")); 1312 &movdqa ($inout3,$tweak); 1313 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1314 &jmp (&label("xts_enc_steal")); 1315 1316 &set_label("xts_enc_done",16); 1317 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1318 &pxor ($twtmp,$twtmp); 1319 &and ($len,15); 1320 &jz (&label("xts_enc_ret")); 1321 1322 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1323 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1324 &pshufd ($inout3,$twtmp,0x13); 1325 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1326 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue 1327 &pxor ($inout3,$tweak); 1328 1329 &set_label("xts_enc_steal"); 1330 &movz ($rounds,&BP(0,$inp)); 1331 &movz ($key,&BP(-16,$out)); 1332 &lea ($inp,&DWP(1,$inp)); 1333 &mov (&BP(-16,$out),&LB($rounds)); 1334 &mov (&BP(0,$out),&LB($key)); 1335 &lea ($out,&DWP(1,$out)); 1336 &sub ($len,1); 1337 &jnz (&label("xts_enc_steal")); 1338 1339 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1340 &mov ($key,$key_); # restore $key 1341 &mov ($rounds,$rounds_); # restore $rounds 1342 1343 &movups ($inout0,&QWP(-16,$out)); # load input 1344 &xorps ($inout0,$inout3); # input^=tweak 1345 if ($inline) 1346 { &aesni_inline_generate1("enc"); } 1347 else 1348 { &call ("_aesni_encrypt1"); } 1349 &xorps ($inout0,$inout3); # output^=tweak 1350 &movups (&QWP(-16,$out),$inout0); # write output 1351 1352 &set_label("xts_enc_ret"); 1353 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1354 &function_end("aesni_xts_encrypt"); 1355 1356 &function_begin("aesni_xts_decrypt"); 1357 &mov ($key,&wparam(4)); # key2 1358 &mov ($inp,&wparam(5)); # clear-text tweak 1359 1360 &mov ($rounds,&DWP(240,$key)); # key2->rounds 1361 &movups ($inout0,&QWP(0,$inp)); 1362 if ($inline) 1363 { &aesni_inline_generate1("enc"); } 1364 else 1365 { &call ("_aesni_encrypt1"); } 1366 1367 &mov ($inp,&wparam(0)); 1368 &mov ($out,&wparam(1)); 1369 &mov ($len,&wparam(2)); 1370 &mov ($key,&wparam(3)); # key1 1371 1372 &mov ($key_,"esp"); 1373 &sub ("esp",16*7+8); 1374 &and ("esp",-16); # align stack 1375 1376 &xor ($rounds_,$rounds_); # if(len%16) len-=16; 1377 &test ($len,15); 1378 &setnz (&LB($rounds_)); 1379 &shl ($rounds_,4); 1380 &sub ($len,$rounds_); 1381 1382 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant 1383 &mov (&DWP(16*6+4,"esp"),0); 1384 &mov (&DWP(16*6+8,"esp"),1); 1385 &mov (&DWP(16*6+12,"esp"),0); 1386 &mov (&DWP(16*7+0,"esp"),$len); # save original $len 1387 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp 1388 1389 &mov ($rounds,&DWP(240,$key)); # key1->rounds 1390 &mov ($key_,$key); # backup $key 1391 &mov ($rounds_,$rounds); # backup $rounds 1392 1393 &movdqa ($tweak,$inout0); 1394 &pxor ($twtmp,$twtmp); 1395 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 1396 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1397 1398 &and ($len,-16); 1399 &sub ($len,16*6); 1400 &jc (&label("xts_dec_short")); 1401 1402 &shr ($rounds,1); 1403 &mov ($rounds_,$rounds); 1404 &jmp (&label("xts_dec_loop6")); 1405 1406 &set_label("xts_dec_loop6",16); 1407 for ($i=0;$i<4;$i++) { 1408 &pshufd ($twres,$twtmp,0x13); 1409 &pxor ($twtmp,$twtmp); 1410 &movdqa (&QWP(16*$i,"esp"),$tweak); 1411 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1412 &pand ($twres,$twmask); # isolate carry and residue 1413 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1414 &pxor ($tweak,$twres); 1415 } 1416 &pshufd ($inout5,$twtmp,0x13); 1417 &movdqa (&QWP(16*$i++,"esp"),$tweak); 1418 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1419 &$movekey ($rndkey0,&QWP(0,$key_)); 1420 &pand ($inout5,$twmask); # isolate carry and residue 1421 &movups ($inout0,&QWP(0,$inp)); # load input 1422 &pxor ($inout5,$tweak); 1423 1424 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] 1425 &movdqu ($inout1,&QWP(16*1,$inp)); 1426 &xorps ($inout0,$rndkey0); # input^=rndkey[0] 1427 &movdqu ($inout2,&QWP(16*2,$inp)); 1428 &pxor ($inout1,$rndkey0); 1429 &movdqu ($inout3,&QWP(16*3,$inp)); 1430 &pxor ($inout2,$rndkey0); 1431 &movdqu ($inout4,&QWP(16*4,$inp)); 1432 &pxor ($inout3,$rndkey0); 1433 &movdqu ($rndkey1,&QWP(16*5,$inp)); 1434 &pxor ($inout4,$rndkey0); 1435 &lea ($inp,&DWP(16*6,$inp)); 1436 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1437 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak 1438 &pxor ($inout5,$rndkey1); 1439 1440 &$movekey ($rndkey1,&QWP(16,$key_)); 1441 &lea ($key,&DWP(32,$key_)); 1442 &pxor ($inout1,&QWP(16*1,"esp")); 1443 &aesdec ($inout0,$rndkey1); 1444 &pxor ($inout2,&QWP(16*2,"esp")); 1445 &aesdec ($inout1,$rndkey1); 1446 &pxor ($inout3,&QWP(16*3,"esp")); 1447 &dec ($rounds); 1448 &aesdec ($inout2,$rndkey1); 1449 &pxor ($inout4,&QWP(16*4,"esp")); 1450 &aesdec ($inout3,$rndkey1); 1451 &pxor ($inout5,$rndkey0); 1452 &aesdec ($inout4,$rndkey1); 1453 &$movekey ($rndkey0,&QWP(0,$key)); 1454 &aesdec ($inout5,$rndkey1); 1455 &call (&label("_aesni_decrypt6_enter")); 1456 1457 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak 1458 &pxor ($twtmp,$twtmp); 1459 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1460 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits 1461 &xorps ($inout1,&QWP(16*1,"esp")); 1462 &movups (&QWP(16*0,$out),$inout0); # write output 1463 &xorps ($inout2,&QWP(16*2,"esp")); 1464 &movups (&QWP(16*1,$out),$inout1); 1465 &xorps ($inout3,&QWP(16*3,"esp")); 1466 &movups (&QWP(16*2,$out),$inout2); 1467 &xorps ($inout4,&QWP(16*4,"esp")); 1468 &movups (&QWP(16*3,$out),$inout3); 1469 &xorps ($inout5,$tweak); 1470 &movups (&QWP(16*4,$out),$inout4); 1471 &pshufd ($twres,$twtmp,0x13); 1472 &movups (&QWP(16*5,$out),$inout5); 1473 &lea ($out,&DWP(16*6,$out)); 1474 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 1475 1476 &pxor ($twtmp,$twtmp); 1477 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1478 &pand ($twres,$twmask); # isolate carry and residue 1479 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1480 &mov ($rounds,$rounds_); # restore $rounds 1481 &pxor ($tweak,$twres); 1482 1483 &sub ($len,16*6); 1484 &jnc (&label("xts_dec_loop6")); 1485 1486 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds 1487 &mov ($key,$key_); # restore $key 1488 &mov ($rounds_,$rounds); 1489 1490 &set_label("xts_dec_short"); 1491 &add ($len,16*6); 1492 &jz (&label("xts_dec_done6x")); 1493 1494 &movdqa ($inout3,$tweak); # put aside previous tweak 1495 &cmp ($len,0x20); 1496 &jb (&label("xts_dec_one")); 1497 1498 &pshufd ($twres,$twtmp,0x13); 1499 &pxor ($twtmp,$twtmp); 1500 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1501 &pand ($twres,$twmask); # isolate carry and residue 1502 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1503 &pxor ($tweak,$twres); 1504 &je (&label("xts_dec_two")); 1505 1506 &pshufd ($twres,$twtmp,0x13); 1507 &pxor ($twtmp,$twtmp); 1508 &movdqa ($inout4,$tweak); # put aside previous tweak 1509 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1510 &pand ($twres,$twmask); # isolate carry and residue 1511 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1512 &pxor ($tweak,$twres); 1513 &cmp ($len,0x40); 1514 &jb (&label("xts_dec_three")); 1515 1516 &pshufd ($twres,$twtmp,0x13); 1517 &pxor ($twtmp,$twtmp); 1518 &movdqa ($inout5,$tweak); # put aside previous tweak 1519 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1520 &pand ($twres,$twmask); # isolate carry and residue 1521 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1522 &pxor ($tweak,$twres); 1523 &movdqa (&QWP(16*0,"esp"),$inout3); 1524 &movdqa (&QWP(16*1,"esp"),$inout4); 1525 &je (&label("xts_dec_four")); 1526 1527 &movdqa (&QWP(16*2,"esp"),$inout5); 1528 &pshufd ($inout5,$twtmp,0x13); 1529 &movdqa (&QWP(16*3,"esp"),$tweak); 1530 &paddq ($tweak,$tweak); # &psllq($inout0,1); 1531 &pand ($inout5,$twmask); # isolate carry and residue 1532 &pxor ($inout5,$tweak); 1533 1534 &movdqu ($inout0,&QWP(16*0,$inp)); # load input 1535 &movdqu ($inout1,&QWP(16*1,$inp)); 1536 &movdqu ($inout2,&QWP(16*2,$inp)); 1537 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak 1538 &movdqu ($inout3,&QWP(16*3,$inp)); 1539 &pxor ($inout1,&QWP(16*1,"esp")); 1540 &movdqu ($inout4,&QWP(16*4,$inp)); 1541 &pxor ($inout2,&QWP(16*2,"esp")); 1542 &lea ($inp,&DWP(16*5,$inp)); 1543 &pxor ($inout3,&QWP(16*3,"esp")); 1544 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak 1545 &pxor ($inout4,$inout5); 1546 1547 &call ("_aesni_decrypt6"); 1548 1549 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak 1550 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1551 &xorps ($inout1,&QWP(16*1,"esp")); 1552 &xorps ($inout2,&QWP(16*2,"esp")); 1553 &movups (&QWP(16*0,$out),$inout0); # write output 1554 &xorps ($inout3,&QWP(16*3,"esp")); 1555 &movups (&QWP(16*1,$out),$inout1); 1556 &xorps ($inout4,$tweak); 1557 &movups (&QWP(16*2,$out),$inout2); 1558 &movups (&QWP(16*3,$out),$inout3); 1559 &movups (&QWP(16*4,$out),$inout4); 1560 &lea ($out,&DWP(16*5,$out)); 1561 &jmp (&label("xts_dec_done")); 1562 1563 &set_label("xts_dec_one",16); 1564 &movups ($inout0,&QWP(16*0,$inp)); # load input 1565 &lea ($inp,&DWP(16*1,$inp)); 1566 &xorps ($inout0,$inout3); # input^=tweak 1567 if ($inline) 1568 { &aesni_inline_generate1("dec"); } 1569 else 1570 { &call ("_aesni_decrypt1"); } 1571 &xorps ($inout0,$inout3); # output^=tweak 1572 &movups (&QWP(16*0,$out),$inout0); # write output 1573 &lea ($out,&DWP(16*1,$out)); 1574 1575 &movdqa ($tweak,$inout3); # last tweak 1576 &jmp (&label("xts_dec_done")); 1577 1578 &set_label("xts_dec_two",16); 1579 &movaps ($inout4,$tweak); # put aside last tweak 1580 1581 &movups ($inout0,&QWP(16*0,$inp)); # load input 1582 &movups ($inout1,&QWP(16*1,$inp)); 1583 &lea ($inp,&DWP(16*2,$inp)); 1584 &xorps ($inout0,$inout3); # input^=tweak 1585 &xorps ($inout1,$inout4); 1586 1587 &call ("_aesni_decrypt3"); 1588 1589 &xorps ($inout0,$inout3); # output^=tweak 1590 &xorps ($inout1,$inout4); 1591 &movups (&QWP(16*0,$out),$inout0); # write output 1592 &movups (&QWP(16*1,$out),$inout1); 1593 &lea ($out,&DWP(16*2,$out)); 1594 1595 &movdqa ($tweak,$inout4); # last tweak 1596 &jmp (&label("xts_dec_done")); 1597 1598 &set_label("xts_dec_three",16); 1599 &movaps ($inout5,$tweak); # put aside last tweak 1600 &movups ($inout0,&QWP(16*0,$inp)); # load input 1601 &movups ($inout1,&QWP(16*1,$inp)); 1602 &movups ($inout2,&QWP(16*2,$inp)); 1603 &lea ($inp,&DWP(16*3,$inp)); 1604 &xorps ($inout0,$inout3); # input^=tweak 1605 &xorps ($inout1,$inout4); 1606 &xorps ($inout2,$inout5); 1607 1608 &call ("_aesni_decrypt3"); 1609 1610 &xorps ($inout0,$inout3); # output^=tweak 1611 &xorps ($inout1,$inout4); 1612 &xorps ($inout2,$inout5); 1613 &movups (&QWP(16*0,$out),$inout0); # write output 1614 &movups (&QWP(16*1,$out),$inout1); 1615 &movups (&QWP(16*2,$out),$inout2); 1616 &lea ($out,&DWP(16*3,$out)); 1617 1618 &movdqa ($tweak,$inout5); # last tweak 1619 &jmp (&label("xts_dec_done")); 1620 1621 &set_label("xts_dec_four",16); 1622 &movaps ($inout4,$tweak); # put aside last tweak 1623 1624 &movups ($inout0,&QWP(16*0,$inp)); # load input 1625 &movups ($inout1,&QWP(16*1,$inp)); 1626 &movups ($inout2,&QWP(16*2,$inp)); 1627 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak 1628 &movups ($inout3,&QWP(16*3,$inp)); 1629 &lea ($inp,&DWP(16*4,$inp)); 1630 &xorps ($inout1,&QWP(16*1,"esp")); 1631 &xorps ($inout2,$inout5); 1632 &xorps ($inout3,$inout4); 1633 1634 &call ("_aesni_decrypt4"); 1635 1636 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak 1637 &xorps ($inout1,&QWP(16*1,"esp")); 1638 &xorps ($inout2,$inout5); 1639 &movups (&QWP(16*0,$out),$inout0); # write output 1640 &xorps ($inout3,$inout4); 1641 &movups (&QWP(16*1,$out),$inout1); 1642 &movups (&QWP(16*2,$out),$inout2); 1643 &movups (&QWP(16*3,$out),$inout3); 1644 &lea ($out,&DWP(16*4,$out)); 1645 1646 &movdqa ($tweak,$inout4); # last tweak 1647 &jmp (&label("xts_dec_done")); 1648 1649 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated 1650 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1651 &and ($len,15); 1652 &jz (&label("xts_dec_ret")); 1653 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1654 &jmp (&label("xts_dec_only_one_more")); 1655 1656 &set_label("xts_dec_done",16); 1657 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len 1658 &pxor ($twtmp,$twtmp); 1659 &and ($len,15); 1660 &jz (&label("xts_dec_ret")); 1661 1662 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1663 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 1664 &pshufd ($twres,$twtmp,0x13); 1665 &pxor ($twtmp,$twtmp); 1666 &movdqa ($twmask,&QWP(16*6,"esp")); 1667 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1668 &pand ($twres,$twmask); # isolate carry and residue 1669 &pcmpgtd($twtmp,$tweak); # broadcast upper bits 1670 &pxor ($tweak,$twres); 1671 1672 &set_label("xts_dec_only_one_more"); 1673 &pshufd ($inout3,$twtmp,0x13); 1674 &movdqa ($inout4,$tweak); # put aside previous tweak 1675 &paddq ($tweak,$tweak); # &psllq($tweak,1); 1676 &pand ($inout3,$twmask); # isolate carry and residue 1677 &pxor ($inout3,$tweak); 1678 1679 &mov ($key,$key_); # restore $key 1680 &mov ($rounds,$rounds_); # restore $rounds 1681 1682 &movups ($inout0,&QWP(0,$inp)); # load input 1683 &xorps ($inout0,$inout3); # input^=tweak 1684 if ($inline) 1685 { &aesni_inline_generate1("dec"); } 1686 else 1687 { &call ("_aesni_decrypt1"); } 1688 &xorps ($inout0,$inout3); # output^=tweak 1689 &movups (&QWP(0,$out),$inout0); # write output 1690 1691 &set_label("xts_dec_steal"); 1692 &movz ($rounds,&BP(16,$inp)); 1693 &movz ($key,&BP(0,$out)); 1694 &lea ($inp,&DWP(1,$inp)); 1695 &mov (&BP(0,$out),&LB($rounds)); 1696 &mov (&BP(16,$out),&LB($key)); 1697 &lea ($out,&DWP(1,$out)); 1698 &sub ($len,1); 1699 &jnz (&label("xts_dec_steal")); 1700 1701 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out 1702 &mov ($key,$key_); # restore $key 1703 &mov ($rounds,$rounds_); # restore $rounds 1704 1705 &movups ($inout0,&QWP(0,$out)); # load input 1706 &xorps ($inout0,$inout4); # input^=tweak 1707 if ($inline) 1708 { &aesni_inline_generate1("dec"); } 1709 else 1710 { &call ("_aesni_decrypt1"); } 1711 &xorps ($inout0,$inout4); # output^=tweak 1712 &movups (&QWP(0,$out),$inout0); # write output 1713 1714 &set_label("xts_dec_ret"); 1715 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp 1716 &function_end("aesni_xts_decrypt"); 1717 } 1718 } 1719 1720 ###################################################################### 1721 # void $PREFIX_cbc_encrypt (const void *inp, void *out, 1722 # size_t length, const AES_KEY *key, 1723 # unsigned char *ivp,const int enc); 1724 &function_begin("${PREFIX}_cbc_encrypt"); 1725 &mov ($inp,&wparam(0)); 1726 &mov ($rounds_,"esp"); 1727 &mov ($out,&wparam(1)); 1728 &sub ($rounds_,24); 1729 &mov ($len,&wparam(2)); 1730 &and ($rounds_,-16); 1731 &mov ($key,&wparam(3)); 1732 &mov ($key_,&wparam(4)); 1733 &test ($len,$len); 1734 &jz (&label("cbc_abort")); 1735 1736 &cmp (&wparam(5),0); 1737 &xchg ($rounds_,"esp"); # alloca 1738 &movups ($ivec,&QWP(0,$key_)); # load IV 1739 &mov ($rounds,&DWP(240,$key)); 1740 &mov ($key_,$key); # backup $key 1741 &mov (&DWP(16,"esp"),$rounds_); # save original %esp 1742 &mov ($rounds_,$rounds); # backup $rounds 1743 &je (&label("cbc_decrypt")); 1744 1745 &movaps ($inout0,$ivec); 1746 &cmp ($len,16); 1747 &jb (&label("cbc_enc_tail")); 1748 &sub ($len,16); 1749 &jmp (&label("cbc_enc_loop")); 1750 1751 &set_label("cbc_enc_loop",16); 1752 &movups ($ivec,&QWP(0,$inp)); # input actually 1753 &lea ($inp,&DWP(16,$inp)); 1754 if ($inline) 1755 { &aesni_inline_generate1("enc",$inout0,$ivec); } 1756 else 1757 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } 1758 &mov ($rounds,$rounds_); # restore $rounds 1759 &mov ($key,$key_); # restore $key 1760 &movups (&QWP(0,$out),$inout0); # store output 1761 &lea ($out,&DWP(16,$out)); 1762 &sub ($len,16); 1763 &jnc (&label("cbc_enc_loop")); 1764 &add ($len,16); 1765 &jnz (&label("cbc_enc_tail")); 1766 &movaps ($ivec,$inout0); 1767 &jmp (&label("cbc_ret")); 1768 1769 &set_label("cbc_enc_tail"); 1770 &mov ("ecx",$len); # zaps $rounds 1771 &data_word(0xA4F3F689); # rep movsb 1772 &mov ("ecx",16); # zero tail 1773 &sub ("ecx",$len); 1774 &xor ("eax","eax"); # zaps $len 1775 &data_word(0xAAF3F689); # rep stosb 1776 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block 1777 &mov ($rounds,$rounds_); # restore $rounds 1778 &mov ($inp,$out); # $inp and $out are the same 1779 &mov ($key,$key_); # restore $key 1780 &jmp (&label("cbc_enc_loop")); 1781 ###################################################################### 1782 &set_label("cbc_decrypt",16); 1783 &cmp ($len,0x50); 1784 &jbe (&label("cbc_dec_tail")); 1785 &movaps (&QWP(0,"esp"),$ivec); # save IV 1786 &sub ($len,0x50); 1787 &jmp (&label("cbc_dec_loop6_enter")); 1788 1789 &set_label("cbc_dec_loop6",16); 1790 &movaps (&QWP(0,"esp"),$rndkey0); # save IV 1791 &movups (&QWP(0,$out),$inout5); 1792 &lea ($out,&DWP(0x10,$out)); 1793 &set_label("cbc_dec_loop6_enter"); 1794 &movdqu ($inout0,&QWP(0,$inp)); 1795 &movdqu ($inout1,&QWP(0x10,$inp)); 1796 &movdqu ($inout2,&QWP(0x20,$inp)); 1797 &movdqu ($inout3,&QWP(0x30,$inp)); 1798 &movdqu ($inout4,&QWP(0x40,$inp)); 1799 &movdqu ($inout5,&QWP(0x50,$inp)); 1800 1801 &call ("_aesni_decrypt6"); 1802 1803 &movups ($rndkey1,&QWP(0,$inp)); 1804 &movups ($rndkey0,&QWP(0x10,$inp)); 1805 &xorps ($inout0,&QWP(0,"esp")); # ^=IV 1806 &xorps ($inout1,$rndkey1); 1807 &movups ($rndkey1,&QWP(0x20,$inp)); 1808 &xorps ($inout2,$rndkey0); 1809 &movups ($rndkey0,&QWP(0x30,$inp)); 1810 &xorps ($inout3,$rndkey1); 1811 &movups ($rndkey1,&QWP(0x40,$inp)); 1812 &xorps ($inout4,$rndkey0); 1813 &movups ($rndkey0,&QWP(0x50,$inp)); # IV 1814 &xorps ($inout5,$rndkey1); 1815 &movups (&QWP(0,$out),$inout0); 1816 &movups (&QWP(0x10,$out),$inout1); 1817 &lea ($inp,&DWP(0x60,$inp)); 1818 &movups (&QWP(0x20,$out),$inout2); 1819 &mov ($rounds,$rounds_) # restore $rounds 1820 &movups (&QWP(0x30,$out),$inout3); 1821 &mov ($key,$key_); # restore $key 1822 &movups (&QWP(0x40,$out),$inout4); 1823 &lea ($out,&DWP(0x50,$out)); 1824 &sub ($len,0x60); 1825 &ja (&label("cbc_dec_loop6")); 1826 1827 &movaps ($inout0,$inout5); 1828 &movaps ($ivec,$rndkey0); 1829 &add ($len,0x50); 1830 &jle (&label("cbc_dec_tail_collected")); 1831 &movups (&QWP(0,$out),$inout0); 1832 &lea ($out,&DWP(0x10,$out)); 1833 &set_label("cbc_dec_tail"); 1834 &movups ($inout0,&QWP(0,$inp)); 1835 &movaps ($in0,$inout0); 1836 &cmp ($len,0x10); 1837 &jbe (&label("cbc_dec_one")); 1838 1839 &movups ($inout1,&QWP(0x10,$inp)); 1840 &movaps ($in1,$inout1); 1841 &cmp ($len,0x20); 1842 &jbe (&label("cbc_dec_two")); 1843 1844 &movups ($inout2,&QWP(0x20,$inp)); 1845 &cmp ($len,0x30); 1846 &jbe (&label("cbc_dec_three")); 1847 1848 &movups ($inout3,&QWP(0x30,$inp)); 1849 &cmp ($len,0x40); 1850 &jbe (&label("cbc_dec_four")); 1851 1852 &movups ($inout4,&QWP(0x40,$inp)); 1853 &movaps (&QWP(0,"esp"),$ivec); # save IV 1854 &movups ($inout0,&QWP(0,$inp)); 1855 &xorps ($inout5,$inout5); 1856 &call ("_aesni_decrypt6"); 1857 &movups ($rndkey1,&QWP(0,$inp)); 1858 &movups ($rndkey0,&QWP(0x10,$inp)); 1859 &xorps ($inout0,&QWP(0,"esp")); # ^= IV 1860 &xorps ($inout1,$rndkey1); 1861 &movups ($rndkey1,&QWP(0x20,$inp)); 1862 &xorps ($inout2,$rndkey0); 1863 &movups ($rndkey0,&QWP(0x30,$inp)); 1864 &xorps ($inout3,$rndkey1); 1865 &movups ($ivec,&QWP(0x40,$inp)); # IV 1866 &xorps ($inout4,$rndkey0); 1867 &movups (&QWP(0,$out),$inout0); 1868 &movups (&QWP(0x10,$out),$inout1); 1869 &movups (&QWP(0x20,$out),$inout2); 1870 &movups (&QWP(0x30,$out),$inout3); 1871 &lea ($out,&DWP(0x40,$out)); 1872 &movaps ($inout0,$inout4); 1873 &sub ($len,0x50); 1874 &jmp (&label("cbc_dec_tail_collected")); 1875 1876 &set_label("cbc_dec_one",16); 1877 if ($inline) 1878 { &aesni_inline_generate1("dec"); } 1879 else 1880 { &call ("_aesni_decrypt1"); } 1881 &xorps ($inout0,$ivec); 1882 &movaps ($ivec,$in0); 1883 &sub ($len,0x10); 1884 &jmp (&label("cbc_dec_tail_collected")); 1885 1886 &set_label("cbc_dec_two",16); 1887 &xorps ($inout2,$inout2); 1888 &call ("_aesni_decrypt3"); 1889 &xorps ($inout0,$ivec); 1890 &xorps ($inout1,$in0); 1891 &movups (&QWP(0,$out),$inout0); 1892 &movaps ($inout0,$inout1); 1893 &lea ($out,&DWP(0x10,$out)); 1894 &movaps ($ivec,$in1); 1895 &sub ($len,0x20); 1896 &jmp (&label("cbc_dec_tail_collected")); 1897 1898 &set_label("cbc_dec_three",16); 1899 &call ("_aesni_decrypt3"); 1900 &xorps ($inout0,$ivec); 1901 &xorps ($inout1,$in0); 1902 &xorps ($inout2,$in1); 1903 &movups (&QWP(0,$out),$inout0); 1904 &movaps ($inout0,$inout2); 1905 &movups (&QWP(0x10,$out),$inout1); 1906 &lea ($out,&DWP(0x20,$out)); 1907 &movups ($ivec,&QWP(0x20,$inp)); 1908 &sub ($len,0x30); 1909 &jmp (&label("cbc_dec_tail_collected")); 1910 1911 &set_label("cbc_dec_four",16); 1912 &call ("_aesni_decrypt4"); 1913 &movups ($rndkey1,&QWP(0x10,$inp)); 1914 &movups ($rndkey0,&QWP(0x20,$inp)); 1915 &xorps ($inout0,$ivec); 1916 &movups ($ivec,&QWP(0x30,$inp)); 1917 &xorps ($inout1,$in0); 1918 &movups (&QWP(0,$out),$inout0); 1919 &xorps ($inout2,$rndkey1); 1920 &movups (&QWP(0x10,$out),$inout1); 1921 &xorps ($inout3,$rndkey0); 1922 &movups (&QWP(0x20,$out),$inout2); 1923 &lea ($out,&DWP(0x30,$out)); 1924 &movaps ($inout0,$inout3); 1925 &sub ($len,0x40); 1926 1927 &set_label("cbc_dec_tail_collected"); 1928 &and ($len,15); 1929 &jnz (&label("cbc_dec_tail_partial")); 1930 &movups (&QWP(0,$out),$inout0); 1931 &jmp (&label("cbc_ret")); 1932 1933 &set_label("cbc_dec_tail_partial",16); 1934 &movaps (&QWP(0,"esp"),$inout0); 1935 &mov ("ecx",16); 1936 &mov ($inp,"esp"); 1937 &sub ("ecx",$len); 1938 &data_word(0xA4F3F689); # rep movsb 1939 1940 &set_label("cbc_ret"); 1941 &mov ("esp",&DWP(16,"esp")); # pull original %esp 1942 &mov ($key_,&wparam(4)); 1943 &movups (&QWP(0,$key_),$ivec); # output IV 1944 &set_label("cbc_abort"); 1945 &function_end("${PREFIX}_cbc_encrypt"); 1946 1947 ###################################################################### 1948 # Mechanical port from aesni-x86_64.pl. 1949 # 1950 # _aesni_set_encrypt_key is private interface, 1951 # input: 1952 # "eax" const unsigned char *userKey 1953 # $rounds int bits 1954 # $key AES_KEY *key 1955 # output: 1956 # "eax" return code 1957 # $round rounds 1958 1959 &function_begin_B("_aesni_set_encrypt_key"); 1960 &test ("eax","eax"); 1961 &jz (&label("bad_pointer")); 1962 &test ($key,$key); 1963 &jz (&label("bad_pointer")); 1964 1965 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey 1966 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 1967 &lea ($key,&DWP(16,$key)); 1968 &cmp ($rounds,256); 1969 &je (&label("14rounds")); 1970 &cmp ($rounds,192); 1971 &je (&label("12rounds")); 1972 &cmp ($rounds,128); 1973 &jne (&label("bad_keybits")); 1974 1975 &set_label("10rounds",16); 1976 &mov ($rounds,9); 1977 &$movekey (&QWP(-16,$key),"xmm0"); # round 0 1978 &aeskeygenassist("xmm1","xmm0",0x01); # round 1 1979 &call (&label("key_128_cold")); 1980 &aeskeygenassist("xmm1","xmm0",0x2); # round 2 1981 &call (&label("key_128")); 1982 &aeskeygenassist("xmm1","xmm0",0x04); # round 3 1983 &call (&label("key_128")); 1984 &aeskeygenassist("xmm1","xmm0",0x08); # round 4 1985 &call (&label("key_128")); 1986 &aeskeygenassist("xmm1","xmm0",0x10); # round 5 1987 &call (&label("key_128")); 1988 &aeskeygenassist("xmm1","xmm0",0x20); # round 6 1989 &call (&label("key_128")); 1990 &aeskeygenassist("xmm1","xmm0",0x40); # round 7 1991 &call (&label("key_128")); 1992 &aeskeygenassist("xmm1","xmm0",0x80); # round 8 1993 &call (&label("key_128")); 1994 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 1995 &call (&label("key_128")); 1996 &aeskeygenassist("xmm1","xmm0",0x36); # round 10 1997 &call (&label("key_128")); 1998 &$movekey (&QWP(0,$key),"xmm0"); 1999 &mov (&DWP(80,$key),$rounds); 2000 &xor ("eax","eax"); 2001 &ret(); 2002 2003 &set_label("key_128",16); 2004 &$movekey (&QWP(0,$key),"xmm0"); 2005 &lea ($key,&DWP(16,$key)); 2006 &set_label("key_128_cold"); 2007 &shufps ("xmm4","xmm0",0b00010000); 2008 &xorps ("xmm0","xmm4"); 2009 &shufps ("xmm4","xmm0",0b10001100); 2010 &xorps ("xmm0","xmm4"); 2011 &shufps ("xmm1","xmm1",0b11111111); # critical path 2012 &xorps ("xmm0","xmm1"); 2013 &ret(); 2014 2015 &set_label("12rounds",16); 2016 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey 2017 &mov ($rounds,11); 2018 &$movekey (&QWP(-16,$key),"xmm0") # round 0 2019 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 2020 &call (&label("key_192a_cold")); 2021 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 2022 &call (&label("key_192b")); 2023 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 2024 &call (&label("key_192a")); 2025 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 2026 &call (&label("key_192b")); 2027 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 2028 &call (&label("key_192a")); 2029 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 2030 &call (&label("key_192b")); 2031 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 2032 &call (&label("key_192a")); 2033 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 2034 &call (&label("key_192b")); 2035 &$movekey (&QWP(0,$key),"xmm0"); 2036 &mov (&DWP(48,$key),$rounds); 2037 &xor ("eax","eax"); 2038 &ret(); 2039 2040 &set_label("key_192a",16); 2041 &$movekey (&QWP(0,$key),"xmm0"); 2042 &lea ($key,&DWP(16,$key)); 2043 &set_label("key_192a_cold",16); 2044 &movaps ("xmm5","xmm2"); 2045 &set_label("key_192b_warm"); 2046 &shufps ("xmm4","xmm0",0b00010000); 2047 &movdqa ("xmm3","xmm2"); 2048 &xorps ("xmm0","xmm4"); 2049 &shufps ("xmm4","xmm0",0b10001100); 2050 &pslldq ("xmm3",4); 2051 &xorps ("xmm0","xmm4"); 2052 &pshufd ("xmm1","xmm1",0b01010101); # critical path 2053 &pxor ("xmm2","xmm3"); 2054 &pxor ("xmm0","xmm1"); 2055 &pshufd ("xmm3","xmm0",0b11111111); 2056 &pxor ("xmm2","xmm3"); 2057 &ret(); 2058 2059 &set_label("key_192b",16); 2060 &movaps ("xmm3","xmm0"); 2061 &shufps ("xmm5","xmm0",0b01000100); 2062 &$movekey (&QWP(0,$key),"xmm5"); 2063 &shufps ("xmm3","xmm2",0b01001110); 2064 &$movekey (&QWP(16,$key),"xmm3"); 2065 &lea ($key,&DWP(32,$key)); 2066 &jmp (&label("key_192b_warm")); 2067 2068 &set_label("14rounds",16); 2069 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey 2070 &mov ($rounds,13); 2071 &lea ($key,&DWP(16,$key)); 2072 &$movekey (&QWP(-32,$key),"xmm0"); # round 0 2073 &$movekey (&QWP(-16,$key),"xmm2"); # round 1 2074 &aeskeygenassist("xmm1","xmm2",0x01); # round 2 2075 &call (&label("key_256a_cold")); 2076 &aeskeygenassist("xmm1","xmm0",0x01); # round 3 2077 &call (&label("key_256b")); 2078 &aeskeygenassist("xmm1","xmm2",0x02); # round 4 2079 &call (&label("key_256a")); 2080 &aeskeygenassist("xmm1","xmm0",0x02); # round 5 2081 &call (&label("key_256b")); 2082 &aeskeygenassist("xmm1","xmm2",0x04); # round 6 2083 &call (&label("key_256a")); 2084 &aeskeygenassist("xmm1","xmm0",0x04); # round 7 2085 &call (&label("key_256b")); 2086 &aeskeygenassist("xmm1","xmm2",0x08); # round 8 2087 &call (&label("key_256a")); 2088 &aeskeygenassist("xmm1","xmm0",0x08); # round 9 2089 &call (&label("key_256b")); 2090 &aeskeygenassist("xmm1","xmm2",0x10); # round 10 2091 &call (&label("key_256a")); 2092 &aeskeygenassist("xmm1","xmm0",0x10); # round 11 2093 &call (&label("key_256b")); 2094 &aeskeygenassist("xmm1","xmm2",0x20); # round 12 2095 &call (&label("key_256a")); 2096 &aeskeygenassist("xmm1","xmm0",0x20); # round 13 2097 &call (&label("key_256b")); 2098 &aeskeygenassist("xmm1","xmm2",0x40); # round 14 2099 &call (&label("key_256a")); 2100 &$movekey (&QWP(0,$key),"xmm0"); 2101 &mov (&DWP(16,$key),$rounds); 2102 &xor ("eax","eax"); 2103 &ret(); 2104 2105 &set_label("key_256a",16); 2106 &$movekey (&QWP(0,$key),"xmm2"); 2107 &lea ($key,&DWP(16,$key)); 2108 &set_label("key_256a_cold"); 2109 &shufps ("xmm4","xmm0",0b00010000); 2110 &xorps ("xmm0","xmm4"); 2111 &shufps ("xmm4","xmm0",0b10001100); 2112 &xorps ("xmm0","xmm4"); 2113 &shufps ("xmm1","xmm1",0b11111111); # critical path 2114 &xorps ("xmm0","xmm1"); 2115 &ret(); 2116 2117 &set_label("key_256b",16); 2118 &$movekey (&QWP(0,$key),"xmm0"); 2119 &lea ($key,&DWP(16,$key)); 2120 2121 &shufps ("xmm4","xmm2",0b00010000); 2122 &xorps ("xmm2","xmm4"); 2123 &shufps ("xmm4","xmm2",0b10001100); 2124 &xorps ("xmm2","xmm4"); 2125 &shufps ("xmm1","xmm1",0b10101010); # critical path 2126 &xorps ("xmm2","xmm1"); 2127 &ret(); 2128 2129 &set_label("bad_pointer",4); 2130 &mov ("eax",-1); 2131 &ret (); 2132 &set_label("bad_keybits",4); 2133 &mov ("eax",-2); 2134 &ret (); 2135 &function_end_B("_aesni_set_encrypt_key"); 2136 2137 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, 2138 # AES_KEY *key) 2139 &function_begin_B("${PREFIX}_set_encrypt_key"); 2140 &mov ("eax",&wparam(0)); 2141 &mov ($rounds,&wparam(1)); 2142 &mov ($key,&wparam(2)); 2143 &call ("_aesni_set_encrypt_key"); 2144 &ret (); 2145 &function_end_B("${PREFIX}_set_encrypt_key"); 2146 2147 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, 2148 # AES_KEY *key) 2149 &function_begin_B("${PREFIX}_set_decrypt_key"); 2150 &mov ("eax",&wparam(0)); 2151 &mov ($rounds,&wparam(1)); 2152 &mov ($key,&wparam(2)); 2153 &call ("_aesni_set_encrypt_key"); 2154 &mov ($key,&wparam(2)); 2155 &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key 2156 &test ("eax","eax"); 2157 &jnz (&label("dec_key_ret")); 2158 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule 2159 2160 &$movekey ("xmm0",&QWP(0,$key)); # just swap 2161 &$movekey ("xmm1",&QWP(0,"eax")); 2162 &$movekey (&QWP(0,"eax"),"xmm0"); 2163 &$movekey (&QWP(0,$key),"xmm1"); 2164 &lea ($key,&DWP(16,$key)); 2165 &lea ("eax",&DWP(-16,"eax")); 2166 2167 &set_label("dec_key_inverse"); 2168 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse 2169 &$movekey ("xmm1",&QWP(0,"eax")); 2170 &aesimc ("xmm0","xmm0"); 2171 &aesimc ("xmm1","xmm1"); 2172 &lea ($key,&DWP(16,$key)); 2173 &lea ("eax",&DWP(-16,"eax")); 2174 &$movekey (&QWP(16,"eax"),"xmm0"); 2175 &$movekey (&QWP(-16,$key),"xmm1"); 2176 &cmp ("eax",$key); 2177 &ja (&label("dec_key_inverse")); 2178 2179 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle 2180 &aesimc ("xmm0","xmm0"); 2181 &$movekey (&QWP(0,$key),"xmm0"); 2182 2183 &xor ("eax","eax"); # return success 2184 &set_label("dec_key_ret"); 2185 &ret (); 2186 &function_end_B("${PREFIX}_set_decrypt_key"); 2187 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"); 2188 2189 &asm_finish();