#!/usr/bin/env perl

# ====================================================================
# Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
#
# This module may be used under the terms of either the GNU General
# Public License version 2 or later, the GNU Lesser General Public
# License version 2.1 or later, the Mozilla Public License version
# 1.1 or the BSD License. The exact terms of either license are
# distributed along with this module. For further details see
# http://www.openssl.org/~appro/camellia/.
# ====================================================================

# Performance in cycles per processed byte (less is better) in
# 'openssl speed ...' benchmark:
#
#			AMD64	Core2	EM64T
# -evp camellia-128-ecb	16.7	21.0	22.7
# + over gcc 3.4.6	+25%	+5%	0%
#
# camellia-128-cbc	15.7	20.4	21.1
#
# 128-bit key setup	128	216	205	cycles/key
# + over gcc 3.4.6	+54%	+39%	+15%
#
# Numbers in "+" rows represent performance improvement over compiler
# generated code. Key setup timings are impressive on AMD and Core2
# thanks to 64-bit operations being covertly deployed. Improvement on
# EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
# apparently emulates some of the 64-bit operations in [32-bit] microcode.

# Command line is "[flavour] [output]".  A first argument that contains
# a dot is taken to be the output file name and the flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set for the nasm/masm/mingw64 flavours or when the output
# file name ends in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the x86_64-xlate.pl translator next to this script first,
# then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The interpreter and script paths are quoted so that
# directories containing blanks survive the shell, the output name is
# only appended (quoted) when one was actually given, and a failure to
# start the pipe is fatal instead of being silently ignored.
{
    my $cmd = "\"$^X\" \"$xlate\" $flavour";
    $cmd .= " \"$output\"" if (defined($output) && $output ne "");
    open OUT,"| $cmd" or die "can't call $xlate: $!";
}
*STDOUT=*OUT;

# hi(reg): name of the "high byte" sub-register (%ah..%dh) of
# %eax..%edx / %rax..%rdx; any other register name is returned
# unchanged.
sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/;	$r; }

# lo(reg): name of the low-byte sub-register of a general purpose
# register: %al..%dl, %sil/%dil, or %rNb for the numbered registers.
sub lo { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/;
			$r =~ s/%[er]([sd]i)/%${1}l/;
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;	$r; }

# Register allocation shared by all code generators below.
$t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
@S=("%r8d","%r9d","%r10d","%r11d");
$i0="%esi";
$i1="%edi";
$Tbl="%rbp";	# size optimization
$inp="%r12";
$out="%r13";
$key="%r14";
$keyend="%r15";
$arg0d=$win64?"%ecx":"%edi";	# 32-bit view of the first argument register

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to minimize code size.
# Byte offsets of the four logical tables inside .LCamellia_SBOX.
# Entries of two tables are interleaved, which is why lookups below
# index with a scale of 8.
$SBOX1_1110=0;		# Camellia_SBOX[0]
$SBOX4_4404=4;		# Camellia_SBOX[1]
$SBOX2_0222=2048;	# Camellia_SBOX[2]
$SBOX3_3033=2052;	# Camellia_SBOX[3]

# Camellia_Feistel($i[,$seed])
#
# Appends the code for one Feistel round to $code.
#   $i    - round index.  Even rounds take @S[0],@S[1] as input and
#           update @S[2],@S[3]; odd rounds do the opposite.  It also
#           selects which subkey is prefetched for the next round.
#   $seed - optional byte bias of the subkey prefetch relative to $key
#           (default 0).  A negative seed makes the prefetch step
#           backwards through the key schedule (scale -8 instead of 8).
# On entry to the generated code $t0/$t1 hold the current subkey; on
# exit they hold the prefetched one.
sub Camellia_Feistel {
my $i=$_[0];
my $seed=defined($_[1])?$_[1]:0;
my $scale=$seed<0?-8:8;
my $j=($i&1)*2;
# all four names must be lexical; a single "my $s0=...,$s1=..." would
# declare only the first one and leak the others into package scope
my ($s0,$s1,$s2,$s3)=@S[$j%4,($j+1)%4,($j+2)%4,($j+3)%4];

$code.=<<___;
	xor	$s0,$t0				# t0^=key[0]
	xor	$s1,$t1				# t1^=key[1]
	movz	`&hi("$t0")`,$i0		# (t0>>8)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>0)&0xff
	mov	$SBOX3_3033($Tbl,$i0,8),$t3	# t3=SBOX3_3033[0]
	mov	$SBOX1_1110($Tbl,$i1,8),$t2	# t2=SBOX1_1110[1]
	movz	`&lo("$t0")`,$i0		# (t0>>0)&0xff
	shr	\$16,$t0
	movz	`&hi("$t1")`,$i1		# (t1>>8)&0xff
	xor	$SBOX4_4404($Tbl,$i0,8),$t3	# t3^=SBOX4_4404[0]
	shr	\$16,$t1
	xor	$SBOX4_4404($Tbl,$i1,8),$t2	# t2^=SBOX4_4404[1]
	movz	`&hi("$t0")`,$i0		# (t0>>24)&0xff
	movz	`&lo("$t1")`,$i1		# (t1>>16)&0xff
	xor	$SBOX1_1110($Tbl,$i0,8),$t3	# t3^=SBOX1_1110[0]
	xor	$SBOX3_3033($Tbl,$i1,8),$t2	# t2^=SBOX3_3033[1]
	movz	`&lo("$t0")`,$i0		# (t0>>16)&0xff
	movz	`&hi("$t1")`,$i1		# (t1>>24)&0xff
	xor	$SBOX2_0222($Tbl,$i0,8),$t3	# t3^=SBOX2_0222[0]
	xor	$SBOX2_0222($Tbl,$i1,8),$t2	# t2^=SBOX2_0222[1]
	mov	`$seed+($i+1)*$scale`($key),$t1	# prefetch key[i+1]
	mov	`$seed+($i+1)*$scale+4`($key),$t0
	xor	$t3,$t2				# t2^=t3
	ror	\$8,$t3				# t3=RightRotate(t3,8)
	xor	$t2,$s2
	xor	$t2,$s3
	xor	$t3,$s3
___
}

# void Camellia_EncryptBlock_Rounds(
#		int grandRounds,
#		const Byte plaintext[],
#		const KEY_TABLE_TYPE keyTable,
#		Byte ciphertext[])
#
# Camellia_EncryptBlock is the V1.x entry point: it converts its first
# argument (keyBitLength) into a grandRounds count of 3 or 4 and then
# jumps to the V2 entry.  _x86_64_Camellia_encrypt is the internal
# block routine working on @S with $key/$keyend delimiting the
# schedule; each pass of .Leloop runs six Feistel rounds.
$code=<<___;
.text

# V1.x API
.globl	Camellia_EncryptBlock
.type	Camellia_EncryptBlock,\@abi-omnipotent
.align	16
Camellia_EncryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Lenc_rounds
.size	Camellia_EncryptBlock,.-Camellia_EncryptBlock
# V2
.globl	Camellia_EncryptBlock_Rounds
.type	Camellia_EncryptBlock_Rounds,\@function,4
.align	16
.Lenc_rounds:
Camellia_EncryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lenc_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$key

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($key,%rdi),$keyend

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lenc_epilogue:
	ret
.size	Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds

.type	_x86_64_Camellia_encrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_encrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Leloop:
	mov	16($key),$t1		# prefetch key[4-5]
	mov	20($key),$t0

___
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
$code.=<<___;
	lea	16*4($key),$key
	cmp	$keyend,$key
	mov	8($key),$t3		# prefetch key[2-3]
	mov	12($key),$t2
	je	.Ledone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);
	jmp	.Leloop

.align	16
.Ledone:
	xor	@S[2],$t0		# SwapHalf
	xor	@S[3],$t1
	xor	@S[0],$t2
	xor	@S[1],$t3

	mov	$t0,@S[0]
	mov	$t1,@S[1]
	mov	$t2,@S[2]
	mov	$t3,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt

# V1.x API
.globl	Camellia_DecryptBlock
.type	Camellia_DecryptBlock,\@abi-omnipotent
.align	16
Camellia_DecryptBlock:
	movl	\$128,%eax
	subl	$arg0d,%eax
	movl	\$3,$arg0d
	adcl	\$0,$arg0d	# keyBitLength==128?3:4
	jmp	.Ldec_rounds
.size	Camellia_DecryptBlock,.-Camellia_DecryptBlock
# V2
.globl	Camellia_DecryptBlock_Rounds
.type	Camellia_DecryptBlock_Rounds,\@function,4
.align	16
.Ldec_rounds:
Camellia_DecryptBlock_Rounds:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Ldec_prologue:

	#mov	%rsi,$inp		# put away arguments
	mov	%rcx,$out
	mov	%rdx,$keyend

	shl	\$6,%edi		# process grandRounds
	lea	.LCamellia_SBOX(%rip),$Tbl
	lea	($keyend,%rdi),$key

	mov	0(%rsi),@S[0]		# load plaintext
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	bswap	@S[0]
	mov	12(%rsi),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Ldec_epilogue:
	ret
.size	Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds

.type	_x86_64_Camellia_decrypt,\@abi-omnipotent
.align	16
_x86_64_Camellia_decrypt:
	xor	0($key),@S[1]
	xor	4($key),@S[0]		# ^=key[0-3]
	xor	8($key),@S[3]
	xor	12($key),@S[2]
.align	16
.Ldloop:
	mov	-8($key),$t1		# prefetch key[4-5]
	mov	-4($key),$t0

___
	for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
$code.=<<___;
	lea	-16*4($key),$key
	cmp	$keyend,$key
	mov	0($key),$t3		# prefetch key[2-3]
	mov	4($key),$t2
	je	.Lddone

	and	@S[0],$t0
	or	@S[3],$t3
	rol	\$1,$t0
	xor	$t3,@S[2]		# s2^=s3|key[3];
	xor	$t0,@S[1]		# s1^=LeftRotate(s0&key[0],1);
	and	@S[2],$t2
	or	@S[1],$t1
	rol	\$1,$t2
	xor	$t1,@S[0]		# s0^=s1|key[1];
	xor	$t2,@S[3]		# s3^=LeftRotate(s2&key[2],1);

	jmp	.Ldloop

.align	16
.Lddone:
	xor	@S[2],$t2
	xor	@S[3],$t3
	xor	@S[0],$t0
	xor	@S[1],$t1

	mov	$t2,@S[0]		# SwapHalf
	mov	$t3,@S[1]
	mov	$t0,@S[2]
	mov	$t1,@S[3]

	.byte	0xf3,0xc3		# rep ret
.size	_x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
___

# _saveround($rnd,$key[,$bias],@regs)
#
# Appends stores of @regs to the key table at `$bias+$rnd*8`($key).
# A leading element of @regs that is a non-zero number is taken as the
# byte bias (default 0).  With four registers they are written as two
# swapped pairs at +0,+4,+8,+12; with one or two registers they are
# written at +0 and +8.
sub _saveround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

    if ($#T==3) {
	$code.=<<___;
	mov	$T[1],`$bias+$rnd*8+0`($key)
	mov	$T[0],`$bias+$rnd*8+4`($key)
	mov	$T[3],`$bias+$rnd*8+8`($key)
	mov	$T[2],`$bias+$rnd*8+12`($key)
___
    } else {
	$code.="\tmov\t$T[0],`$bias+$rnd*8+0`($key)\n";
	$code.="\tmov\t$T[1],`$bias+$rnd*8+8`($key)\n"	if ($#T>=1);
    }
}

# _loadround($rnd,$key[,$bias],@regs)
#
# Counterpart of _saveround for one or two registers: appends loads
# from `$bias+$rnd*8`($key) (+0 and, if a second register is given, +8).
sub _loadround {
my ($rnd,$key,@T)=@_;
my $bias=int($T[0])?shift(@T):0;

	$code.="\tmov\t`$bias+$rnd*8+0`($key),$T[0]\n";
	$code.="\tmov\t`$bias+$rnd*8+8`($key),$T[1]\n"	if ($#T>=1);
}

# shld is very slow on Intel EM64T family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance...
#
# __rotl128($i0,$i1,$rot): shld-based 128-bit left rotate of the
# register pair $i0:$i1 by $rot bits, using %r11 as scratch.  Nothing
# is emitted when $rot is 0.
sub __rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shld	\$$rot,$i1,$i0
	shld	\$$rot,%r11,$i1
___
    }
}

# ... Implementing 128-bit rotate without shld gives 80% better
# performance on EM64T, +15% on AMD64 and only ~7% degradation on
# Core2. This is therefore preferred.
# _rotl128($i0,$i1,$rot)
#
# Appends a 128-bit left rotate of the 64-bit register pair $i0:$i1 by
# $rot bits built from plain shl/shr/or.  %r9 and %r11 are used as
# scratch registers.  Nothing is emitted when $rot is 0; the generated
# code shifts right by 64-$rot, so $rot is expected to be below 64.
sub _rotl128 {
my ($i0,$i1,$rot)=@_;

    if ($rot) {
	$code.=<<___;
	mov	$i0,%r11
	shl	\$$rot,$i0
	mov	$i1,%r9
	shr	\$`64-$rot`,%r9
	shr	\$`64-$rot`,%r11
	or	%r9,$i0
	shl	\$$rot,$i1
	or	%r11,$i1
___
    }
}

# Camellia_Ekeygen: key schedule generator.  Arguments as used by the
# generated code: keyBitLength (1st), raw key bytes (2nd), key table
# (3rd).  Returns 3 in %eax for 128-bit keys and 4 otherwise.
#
# keyBitLength is a 32-bit quantity, so only %edi is copied; the
# 32-bit move zero-extends into $keyend, which makes the later 64-bit
# compares against 128/192 independent of whatever the caller left in
# the upper half of %rdi.
{ my $step=0;

$code.=<<___;
.globl	Camellia_Ekeygen
.type	Camellia_Ekeygen,\@function,3
.align	16
Camellia_Ekeygen:
	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15
.Lkey_prologue:

	mov	%edi,${keyend}d		# put away arguments, keyBitLength
	mov	%rdx,$out		# keyTable

	mov	0(%rsi),@S[0]		# load 0-127 bits
	mov	4(%rsi),@S[1]
	mov	8(%rsi),@S[2]
	mov	12(%rsi),@S[3]

	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	_saveround	(0,$out,@S);	# KL<<<0
$code.=<<___;
	cmp	\$128,$keyend		# check keyBitLength
	je	.L1st128

	mov	16(%rsi),@S[0]		# load 128-191 bits
	mov	20(%rsi),@S[1]
	cmp	\$192,$keyend
	je	.L1st192
	mov	24(%rsi),@S[2]		# load 192-255 bits
	mov	28(%rsi),@S[3]
	jmp	.L1st256
.L1st192:
	mov	@S[0],@S[2]
	mov	@S[1],@S[3]
	not	@S[2]
	not	@S[3]
.L1st256:
	bswap	@S[0]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]
___
	_saveround	(4,$out,@S);	# temp storage for KR!
$code.=<<___;
	xor	0($out),@S[1]		# KR^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]

.L1st128:
	lea	.LCamellia_SIGMA(%rip),$key
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	0($key),$t1
	mov	4($key),$t0
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
	xor	0($out),@S[1]		# ^KL
	xor	4($out),@S[0]
	xor	8($out),@S[3]
	xor	12($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);
$code.=<<___;
	cmp	\$128,$keyend
	jne	.L2nd256

	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
    # 128-bit key: subkeys are rotations of KL (%rax:%rbx) and
    # KA (%r8:%r10); the running rotation total is noted per line.
    _loadround	(0,$out,-128,"%rax","%rbx");	# KL
    _saveround	(2,$out,-128,"%r8","%r10");	# KA<<<0
    _rotl128	("%rax","%rbx",15);
    _saveround	(4,$out,-128,"%rax","%rbx");	# KL<<<15
    _rotl128	("%r8","%r10",15);
    _saveround	(6,$out,-128,"%r8","%r10");	# KA<<<15
    _rotl128	("%r8","%r10",15);		# 15+15=30
    _saveround	(8,$out,-128,"%r8","%r10");	# KA<<<30
    _rotl128	("%rax","%rbx",30);		# 15+30=45
    _saveround	(10,$out,-128,"%rax","%rbx");	# KL<<<45
    _rotl128	("%r8","%r10",15);		# 30+15=45
    _saveround	(12,$out,-128,"%r8");		# KA<<<45
    _rotl128	("%rax","%rbx",15);		# 45+15=60
    _saveround	(13,$out,-128,"%rbx");		# KL<<<60
    _rotl128	("%r8","%r10",15);		# 45+15=60
    _saveround	(14,$out,-128,"%r8","%r10");	# KA<<<60
    _rotl128	("%rax","%rbx",17);		# 60+17=77
    _saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<77
    _rotl128	("%rax","%rbx",17);		# 77+17=94
    _saveround	(18,$out,-128,"%rax","%rbx");	# KL<<<94
    _rotl128	("%r8","%r10",34);		# 60+34=94
    _saveround	(20,$out,-128,"%r8","%r10");	# KA<<<94
    _rotl128	("%rax","%rbx",17);		# 94+17=111
    _saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<111
    _rotl128	("%r8","%r10",17);		# 94+17=111
    _saveround	(24,$out,-128,"%r8","%r10");	# KA<<<111
$code.=<<___;
	mov	\$3,%eax
	jmp	.Ldone
.align	16
.L2nd256:
___
	_saveround	(6,$out,@S);	# temp storage for KA!
$code.=<<___;
	xor	`4*8+0`($out),@S[1]	# KA^KR
	xor	`4*8+4`($out),@S[0]
	xor	`5*8+0`($out),@S[3]
	xor	`5*8+4`($out),@S[2]
___
	Camellia_Feistel($step++);
	Camellia_Feistel($step++);

    # 192/256-bit key: KL in %rax:%rbx, KR in %rcx:%rdx, KA in
    # %r14:%r15 and KB in %r8:%r10.
    _loadround	(0,$out,"%rax","%rbx");	# KL
    _loadround	(4,$out,"%rcx","%rdx");	# KR
    _loadround	(6,$out,"%r14","%r15");	# KA
$code.=<<___;
	lea	128($out),$out		# size optimization
	shl	\$32,%r8		# @S[0]||
	shl	\$32,%r10		# @S[2]||
	or	%r9,%r8			# ||@S[1]
	or	%r11,%r10		# ||@S[3]
___
    _saveround	(2,$out,-128,"%r8","%r10");	# KB<<<0
    _rotl128	("%rcx","%rdx",15);
    _saveround	(4,$out,-128,"%rcx","%rdx");	# KR<<<15
    _rotl128	("%r14","%r15",15);
    _saveround	(6,$out,-128,"%r14","%r15");	# KA<<<15
    _rotl128	("%rcx","%rdx",15);		# 15+15=30
    _saveround	(8,$out,-128,"%rcx","%rdx");	# KR<<<30
    _rotl128	("%r8","%r10",30);
    _saveround	(10,$out,-128,"%r8","%r10");	# KB<<<30
    _rotl128	("%rax","%rbx",45);
    _saveround	(12,$out,-128,"%rax","%rbx");	# KL<<<45
    _rotl128	("%r14","%r15",30);		# 15+30=45
    _saveround	(14,$out,-128,"%r14","%r15");	# KA<<<45
    _rotl128	("%rax","%rbx",15);		# 45+15=60
    _saveround	(16,$out,-128,"%rax","%rbx");	# KL<<<60
    _rotl128	("%rcx","%rdx",30);		# 30+30=60
    _saveround	(18,$out,-128,"%rcx","%rdx");	# KR<<<60
    _rotl128	("%r8","%r10",30);		# 30+30=60
    _saveround	(20,$out,-128,"%r8","%r10");	# KB<<<60
    _rotl128	("%rax","%rbx",17);		# 60+17=77
    _saveround	(22,$out,-128,"%rax","%rbx");	# KL<<<77
    _rotl128	("%r14","%r15",32);		# 45+32=77
    _saveround	(24,$out,-128,"%r14","%r15");	# KA<<<77
    _rotl128	("%rcx","%rdx",34);		# 60+34=94
    _saveround	(26,$out,-128,"%rcx","%rdx");	# KR<<<94
    _rotl128	("%r14","%r15",17);		# 77+17=94
    _saveround	(28,$out,-128,"%r14","%r15");	# KA<<<94
    _rotl128	("%rax","%rbx",34);		# 77+34=111
    _saveround	(30,$out,-128,"%rax","%rbx");	# KL<<<111
    _rotl128	("%r8","%r10",51);		# 60+51=111
    _saveround	(32,$out,-128,"%r8","%r10");	# KB<<<111
$code.=<<___;
	mov	\$4,%eax
.Ldone:
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%rbp
	mov	32(%rsp),%rbx
	lea	40(%rsp),%rsp
.Lkey_epilogue:
	ret
.size	Camellia_Ekeygen,.-Camellia_Ekeygen
___
}

# Base 8-bit substitution table; the four 32-bit lookup tables emitted
# into .LCamellia_SBOX are derived from it by the S* helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper returns the 32-bit table entry for index $i as a
# "0x%08x" string.  The digits in the name give the byte layout from
# most to least significant byte (0 = zero byte):
#   1 = SBOX[i], 2 = SBOX[i] rotated left by 1,
#   3 = SBOX[i] rotated right by 1, 4 = SBOX[i rotated left by 1].
sub S1110 { my $i=shift; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=$SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
sub S0222 { my $i=shift; $i=$SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
sub S3033 { my $i=shift; $i=$SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }

# Constant data: the SIGMA words read during key setup, followed by
# the lookup tables.
$code.=<<___;
.align	64
.LCamellia_SIGMA:
.long	0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
.long	0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
.long	0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
.long	0,          0,          0,          0
.LCamellia_SBOX:
___
# tables are interleaved, remember?
# data_word(@words): append one ".long" directive holding @words.
sub data_word { $code.=".long\t".join(',',@_)."\n"; }
for ($i=0;$i<256;$i++) { data_word(S1110($i),S4404($i)); }
for ($i=0;$i<256;$i++) { data_word(S0222($i),S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#			size_t length, const CAMELLIA_KEY *key,
#			unsigned char *ivp,const int enc);
{
# Stack frame layout of the generated function, relative to %rsp.
$_key="0(%rsp)";
$_end="8(%rsp)";	# inp+len&~15
$_res="16(%rsp)";	# len&15
$ivec="24(%rsp)";
$_ivp="40(%rsp)";
$_rsp="48(%rsp)";

$code.=<<___;
.globl	Camellia_cbc_encrypt
.type	Camellia_cbc_encrypt,\@function,6
.align	16
Camellia_cbc_encrypt:
	cmp	\$0,%rdx
	je	.Lcbc_abort
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lcbc_prologue:

	mov	%rsp,%rbp
	sub	\$64,%rsp
	and	\$-64,%rsp

	# place stack frame just "above mod 1024" the key schedule,
	# this ensures that cache associativity suffices
	lea	-64-63(%rcx),%r10
	sub	%rsp,%r10
	neg	%r10
	and	\$0x3C0,%r10
	sub	%r10,%rsp
	#add	\$8,%rsp		# 8 is reserved for callee's ra

	mov	%rdi,$inp		# inp argument
	mov	%rsi,$out		# out argument
	mov	%r8,%rbx		# ivp argument
	mov	%rcx,$key		# key argument
	mov	272(%rcx),${keyend}d	# grandRounds

	mov	%r8,$_ivp
	mov	%rbp,$_rsp

.Lcbc_body:
	lea	.LCamellia_SBOX(%rip),$Tbl

	mov	\$32,%ecx
.align	4
.Lcbc_prefetch_sbox:
	mov	0($Tbl),%rax
	mov	32($Tbl),%rsi
	mov	64($Tbl),%rdi
	mov	96($Tbl),%r11
	lea	128($Tbl),$Tbl
	loop	.Lcbc_prefetch_sbox
	sub	\$4096,$Tbl
	shl	\$6,$keyend
	mov	%rdx,%rcx		# len argument
	lea	($key,$keyend),$keyend

	cmp	\$0,%r9d		# enc argument
	je	.LCBC_DECRYPT

	and	\$-16,%rdx
	and	\$15,%rcx		# length residue
	lea	($inp,%rdx),%rdx
	mov	$key,$_key
	mov	%rdx,$_end
	mov	%rcx,$_res

	cmp	$inp,%rdx
	mov	0(%rbx),@S[0]		# load IV
	mov	4(%rbx),@S[1]
	mov	8(%rbx),@S[2]
	mov	12(%rbx),@S[3]
	je	.Lcbc_enc_tail
	jmp	.Lcbc_eloop

.align	16
.Lcbc_eloop:
	xor	0($inp),@S[0]
	xor	4($inp),@S[1]
	xor	8($inp),@S[2]
	bswap	@S[0]
	xor	12($inp),@S[3]
	bswap	@S[1]
	bswap	@S[2]
	bswap	@S[3]

	call	_x86_64_Camellia_encrypt

	mov	$_key,$key		# "rewind" the key
	bswap	@S[0]
	mov	$_end,%rdx
	bswap	@S[1]
	mov	$_res,%rcx
	bswap	@S[2]
	mov	@S[0],0($out)
	bswap	@S[3]
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	lea	16($inp),$inp
	mov	@S[3],12($out)
	cmp	%rdx,$inp
	lea	16($out),$out
	jne	.Lcbc_eloop

	cmp	\$0,%rcx
	jne	.Lcbc_enc_tail

	mov	$_ivp,$out
	mov	@S[0],0($out)		# write out IV residue
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)
	jmp	.Lcbc_done

.align	16
.Lcbc_enc_tail:
	xor	%rax,%rax
	mov	%rax,0+$ivec
	mov	%rax,8+$ivec
	mov	%rax,$_res

.Lcbc_enc_pushf:
	pushfq
	cld
	mov	$inp,%rsi
	lea	8+$ivec,%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_enc_popf:

	lea	$ivec,$inp
	lea	16+$ivec,%rax
	mov	%rax,$_end
	jmp	.Lcbc_eloop		# one more time

.align	16
.LCBC_DECRYPT:
	xchg	$key,$keyend
	add	\$15,%rdx
	and	\$15,%rcx		# length residue
	and	\$-16,%rdx
	mov	$key,$_key
	lea	($inp,%rdx),%rdx
	mov	%rdx,$_end
	mov	%rcx,$_res

	mov	(%rbx),%rax		# load IV
	mov	8(%rbx),%rbx
	jmp	.Lcbc_dloop
.align	16
.Lcbc_dloop:
	mov	0($inp),@S[0]
	mov	4($inp),@S[1]
	mov	8($inp),@S[2]
	bswap	@S[0]
	mov	12($inp),@S[3]
	bswap	@S[1]
	mov	%rax,0+$ivec		# save IV to temporary storage
	bswap	@S[2]
	mov	%rbx,8+$ivec
	bswap	@S[3]

	call	_x86_64_Camellia_decrypt

	mov	$_key,$key		# "rewind" the key
	mov	$_end,%rdx
	mov	$_res,%rcx

	bswap	@S[0]
	mov	($inp),%rax		# load IV for next iteration
	bswap	@S[1]
	mov	8($inp),%rbx
	bswap	@S[2]
	xor	0+$ivec,@S[0]
	bswap	@S[3]
	xor	4+$ivec,@S[1]
	xor	8+$ivec,@S[2]
	lea	16($inp),$inp
	xor	12+$ivec,@S[3]
	cmp	%rdx,$inp
	je	.Lcbc_ddone

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	lea	16($out),$out
	jmp	.Lcbc_dloop

.align	16
.Lcbc_ddone:
	mov	$_ivp,%rdx
	cmp	\$0,%rcx
	jne	.Lcbc_dec_tail

	mov	@S[0],0($out)
	mov	@S[1],4($out)
	mov	@S[2],8($out)
	mov	@S[3],12($out)

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done
.align	16
.Lcbc_dec_tail:
	mov	@S[0],0+$ivec
	mov	@S[1],4+$ivec
	mov	@S[2],8+$ivec
	mov	@S[3],12+$ivec

.Lcbc_dec_pushf:
	pushfq
	cld
	lea	8+$ivec,%rsi
	lea	($out),%rdi
	.long	0x9066A4F3		# rep movsb
	popfq
.Lcbc_dec_popf:

	mov	%rax,(%rdx)		# write out IV residue
	mov	%rbx,8(%rdx)
	jmp	.Lcbc_done

.align	16
.Lcbc_done:
	mov	$_rsp,%rcx
	mov	0(%rcx),%r15
	mov	8(%rcx),%r14
	mov	16(%rcx),%r13
	mov	24(%rcx),%r12
	mov	32(%rcx),%rbp
	mov	40(%rcx),%rbx
	lea	48(%rcx),%rsp
.Lcbc_abort:
	ret
.size	Camellia_cbc_encrypt,.-Camellia_cbc_encrypt

.asciz	"Camellia for x86_64 by <appro\@openssl.org>"
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: structured exception handlers plus the .pdata/.xdata
# records that bind them to the functions above.  common_se_handler
# serves the three functions with the five-register prologue and gets
# its prologue/epilogue labels through HandlerData[]; cbc_se_handler
# knows the labels of Camellia_cbc_encrypt directly.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	common_se_handler,\@abi-omnipotent
.align	16
common_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	40(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r13
	mov	-32(%rax),%r14
	mov	-40(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	jmp	.Lcommon_seh_exit
.size	common_se_handler,.-common_se_handler

.type	cbc_se_handler,\@abi-omnipotent
.align	16
cbc_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	lea	-64(%rsp),%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lcbc_prologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_prologue
	jb	.Lin_cbc_prologue

	lea	.Lcbc_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_body
	jb	.Lin_cbc_frame_setup

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lcbc_abort(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lcbc_abort
	jae	.Lin_cbc_prologue

	# handle pushf/popf in Camellia_cbc_encrypt
	lea	.Lcbc_enc_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_enc_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_enc_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_enc_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax
	lea	.Lcbc_dec_pushf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<=.Lcbc_dec_pushf
	jbe	.Lin_cbc_no_flag
	lea	8(%rax),%rax
	lea	.Lcbc_dec_popf(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lcbc_dec_popf
	jb	.Lin_cbc_no_flag
	lea	-8(%rax),%rax

.Lin_cbc_no_flag:
	mov	48(%rax),%rax		# $_rsp
	lea	48(%rax),%rax

.Lin_cbc_frame_setup:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_cbc_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

.align	4
.Lcommon_seh_exit:

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	lea	64(%rsp),%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	cbc_se_handler,.-cbc_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_end_Camellia_EncryptBlock_Rounds
	.rva	.LSEH_info_Camellia_EncryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_end_Camellia_DecryptBlock_Rounds
	.rva	.LSEH_info_Camellia_DecryptBlock_Rounds

	.rva	.LSEH_begin_Camellia_Ekeygen
	.rva	.LSEH_end_Camellia_Ekeygen
	.rva	.LSEH_info_Camellia_Ekeygen

	.rva	.LSEH_begin_Camellia_cbc_encrypt
	.rva	.LSEH_end_Camellia_cbc_encrypt
	.rva	.LSEH_info_Camellia_cbc_encrypt

.section	.xdata
.align	8
.LSEH_info_Camellia_EncryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lenc_prologue,.Lenc_epilogue	# HandlerData[]
.LSEH_info_Camellia_DecryptBlock_Rounds:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Ldec_prologue,.Ldec_epilogue	# HandlerData[]
.LSEH_info_Camellia_Ekeygen:
	.byte	9,0,0,0
	.rva	common_se_handler
	.rva	.Lkey_prologue,.Lkey_epilogue	# HandlerData[]
.LSEH_info_Camellia_cbc_encrypt:
	.byte	9,0,0,0
	.rva	cbc_se_handler
___
}

# Evaluate every `...` expression embedded in the generated text
# (constant arithmetic and the hi()/lo() register-name helpers), then
# hand the result to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe to x86_64-xlate.pl: close() is where a failed
# write or a failing translator becomes visible, so do not ignore it.
close STDOUT or die "error closing STDOUT: $!";