1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
   5 #
   6 # This module may be used under the terms of either the GNU General
   7 # Public License version 2 or later, the GNU Lesser General Public
   8 # License version 2.1 or later, the Mozilla Public License version
   9 # 1.1 or the BSD License. The exact terms of either license are
  10 # distributed along with this module. For further details see
  11 # http://www.openssl.org/~appro/camellia/.
  12 # ====================================================================
  13 
  14 # Performance in cycles per processed byte (less is better) in
  15 # 'openssl speed ...' benchmark:
  16 #
  17 #                       AMD64   Core2   EM64T
  18 # -evp camellia-128-ecb 16.7    21.0    22.7
  19 # + over gcc 3.4.6      +25%    +5%     0%
  20 #
  21 # camellia-128-cbc      15.7    20.4    21.1
  22 #
  23 # 128-bit key setup     128     216     205     cycles/key
  24 # + over gcc 3.4.6      +54%    +39%    +15%
  25 #
  26 # Numbers in "+" rows represent performance improvement over compiler
  27 # generated code. Key setup timings are impressive on AMD and Core2
  28 # thanks to 64-bit operations being covertly deployed. Improvement on
  29 # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
  30 # apparently emulates some of 64-bit operations in [32-bit] microcode.
  31 
  32 $flavour = shift;
  33 $output  = shift;
  34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  35 
  36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  37 
  38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  41 die "can't locate x86_64-xlate.pl";
  42 
  43 open OUT,"| \"$^X\" $xlate $flavour $output";
  44 *STDOUT=*OUT;
  45 
  46 sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/;    $r; }
  47 sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
  48                         $r =~ s/%[er]([sd]i)/%\1l/;
  49                         $r =~ s/%(r[0-9]+)[d]?/%\1b/;   $r; }
  50 
  51 $t0="%eax";$t1="%ebx";$t2="%ecx";$t3="%edx";
  52 @S=("%r8d","%r9d","%r10d","%r11d");
  53 $i0="%esi";
  54 $i1="%edi";
  55 $Tbl="%rbp";    # size optimization
  56 $inp="%r12";
  57 $out="%r13";
  58 $key="%r14";
  59 $keyend="%r15";
  60 $arg0d=$win64?"%ecx":"%edi";
  61 
  62 # const unsigned int Camellia_SBOX[4][256];
  63 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
  64 # and [2][] - with [3][]. This is done to minimize code size.
  65 $SBOX1_1110=0;          # Camellia_SBOX[0]
  66 $SBOX4_4404=4;          # Camellia_SBOX[1]
  67 $SBOX2_0222=2048;       # Camellia_SBOX[2]
  68 $SBOX3_3033=2052;       # Camellia_SBOX[3]
  69 
  70 sub Camellia_Feistel {
  71 my $i=@_[0];
  72 my $seed=defined(@_[1])?@_[1]:0;
  73 my $scale=$seed<0?-8:8;
  74 my $j=($i&1)*2;
  75 my $s0=@S[($j)%4],$s1=@S[($j+1)%4],$s2=@S[($j+2)%4],$s3=@S[($j+3)%4];
  76 
  77 $code.=<<___;
  78         xor     $s0,$t0                         # t0^=key[0]
  79         xor     $s1,$t1                         # t1^=key[1]
  80         movz    `&hi("$t0")`,$i0            # (t0>>8)&0xff
  81         movz    `&lo("$t1")`,$i1            # (t1>>0)&0xff
  82         mov     $SBOX3_3033($Tbl,$i0,8),$t3     # t3=SBOX3_3033[0]
  83         mov     $SBOX1_1110($Tbl,$i1,8),$t2     # t2=SBOX1_1110[1]
  84         movz    `&lo("$t0")`,$i0            # (t0>>0)&0xff
  85         shr     \$16,$t0
  86         movz    `&hi("$t1")`,$i1            # (t1>>8)&0xff
  87         xor     $SBOX4_4404($Tbl,$i0,8),$t3     # t3^=SBOX4_4404[0]
  88         shr     \$16,$t1
  89         xor     $SBOX4_4404($Tbl,$i1,8),$t2     # t2^=SBOX4_4404[1]
  90         movz    `&hi("$t0")`,$i0            # (t0>>24)&0xff
  91         movz    `&lo("$t1")`,$i1            # (t1>>16)&0xff
  92         xor     $SBOX1_1110($Tbl,$i0,8),$t3     # t3^=SBOX1_1110[0]
  93         xor     $SBOX3_3033($Tbl,$i1,8),$t2     # t2^=SBOX3_3033[1]
  94         movz    `&lo("$t0")`,$i0            # (t0>>16)&0xff
  95         movz    `&hi("$t1")`,$i1            # (t1>>24)&0xff
  96         xor     $SBOX2_0222($Tbl,$i0,8),$t3     # t3^=SBOX2_0222[0]
  97         xor     $SBOX2_0222($Tbl,$i1,8),$t2     # t2^=SBOX2_0222[1]
  98         mov     `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
  99         mov     `$seed+($i+1)*$scale+4`($key),$t0
 100         xor     $t3,$t2                         # t2^=t3
 101         ror     \$8,$t3                         # t3=RightRotate(t3,8)
 102         xor     $t2,$s2
 103         xor     $t2,$s3
 104         xor     $t3,$s3
 105 ___
 106 }
 107 
 108 # void Camellia_EncryptBlock_Rounds(
 109 #               int grandRounds,
 110 #               const Byte plaintext[],
 111 #               const KEY_TABLE_TYPE keyTable,
 112 #               Byte ciphertext[])
     #
     # The heredoc below starts the $code buffer (plain "=", every later
     # piece is appended with ".=") and emits:
     #  - Camellia_EncryptBlock, the V1.x entry point. It takes
     #    keyBitLength in $arg0d, converts it to a grandRounds count
     #    (3 for 128 bits, otherwise 4, via the borrow of 128-keyBitLength)
     #    and jumps to .Lenc_rounds inside the V2 routine;
     #  - Camellia_EncryptBlock_Rounds, which saves rbx/rbp/r13-r15,
     #    computes $keyend = $key + grandRounds*64, loads the 16-byte block
     #    into @S with a bswap per word, calls _x86_64_Camellia_encrypt and
     #    stores the byte-swapped result to $out;
     #  - the head of _x86_64_Camellia_encrypt: the initial key xor and
     #    the first key prefetch at .Leloop. The round bodies are appended
     #    by the loop that follows this heredoc.
 113 $code=<<___;
 114 .text
 115 
 116 # V1.x API
 117 .globl  Camellia_EncryptBlock
 118 .type   Camellia_EncryptBlock,\@abi-omnipotent
 119 .align  16
 120 Camellia_EncryptBlock:
 121         movl    \$128,%eax
 122         subl    $arg0d,%eax
 123         movl    \$3,$arg0d
 124         adcl    \$0,$arg0d      # keyBitLength==128?3:4
 125         jmp     .Lenc_rounds
 126 .size   Camellia_EncryptBlock,.-Camellia_EncryptBlock
 127 # V2
 128 .globl  Camellia_EncryptBlock_Rounds
 129 .type   Camellia_EncryptBlock_Rounds,\@function,4
 130 .align  16
 131 .Lenc_rounds:
 132 Camellia_EncryptBlock_Rounds:
 133         push    %rbx
 134         push    %rbp
 135         push    %r13
 136         push    %r14
 137         push    %r15
 138 .Lenc_prologue:
 139 
 140         #mov    %rsi,$inp               # put away arguments
 141         mov     %rcx,$out
 142         mov     %rdx,$key
 143 
 144         shl     \$6,%edi                # process grandRounds
 145         lea     .LCamellia_SBOX(%rip),$Tbl
 146         lea     ($key,%rdi),$keyend
 147 
 148         mov     0(%rsi),@S[0]           # load plaintext
 149         mov     4(%rsi),@S[1]
 150         mov     8(%rsi),@S[2]
 151         bswap   @S[0]
 152         mov     12(%rsi),@S[3]
 153         bswap   @S[1]
 154         bswap   @S[2]
 155         bswap   @S[3]
 156 
 157         call    _x86_64_Camellia_encrypt
 158 
 159         bswap   @S[0]
 160         bswap   @S[1]
 161         bswap   @S[2]
 162         mov     @S[0],0($out)
 163         bswap   @S[3]
 164         mov     @S[1],4($out)
 165         mov     @S[2],8($out)
 166         mov     @S[3],12($out)
 167 
 168         mov     0(%rsp),%r15
 169         mov     8(%rsp),%r14
 170         mov     16(%rsp),%r13
 171         mov     24(%rsp),%rbp
 172         mov     32(%rsp),%rbx
 173         lea     40(%rsp),%rsp
 174 .Lenc_epilogue:
 175         ret
 176 .size   Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
 177 
 178 .type   _x86_64_Camellia_encrypt,\@abi-omnipotent
 179 .align  16
 180 _x86_64_Camellia_encrypt:
 181         xor     0($key),@S[1]
 182         xor     4($key),@S[0]           # ^=key[0-3]
 183         xor     8($key),@S[3]
 184         xor     12($key),@S[2]
 185 .align  16
 186 .Leloop:
 187         mov     16($key),$t1            # prefetch key[4-5]
 188         mov     20($key),$t0
 189 
 190 ___
 191         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }   # six rounds per pass; key offsets start at 16
     #
     # The heredoc below finishes _x86_64_Camellia_encrypt and opens the
     # decrypt side:
     #  - after the six rounds $key advances by 64 bytes and is compared
     #    with $keyend; when they match control goes to .Ledone, otherwise
     #    the and/or/rotate mixing step shown in the inline comments is
     #    applied and the loop repeats at .Leloop (this looks like
     #    Camellia's FL/FL^-1 layer -- confirm against the cipher spec);
     #  - .Ledone xors the last prefetched key words into the state while
     #    swapping its two halves;
     #  - Camellia_DecryptBlock / Camellia_DecryptBlock_Rounds mirror the
     #    encrypt entry points, except that the key table base goes into
     #    $keyend and $key starts at base + grandRounds*64;
     #  - the head of _x86_64_Camellia_decrypt up to the first key
     #    prefetch at .Ldloop, which reads at negative offsets from $key.
 192 $code.=<<___;
 193         lea     16*4($key),$key
 194         cmp     $keyend,$key
 195         mov     8($key),$t3             # prefetch key[2-3]
 196         mov     12($key),$t2
 197         je      .Ledone
 198 
 199         and     @S[0],$t0
 200         or      @S[3],$t3
 201         rol     \$1,$t0
 202         xor     $t3,@S[2]               # s2^=s3|key[3];
 203         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
 204         and     @S[2],$t2
 205         or      @S[1],$t1
 206         rol     \$1,$t2
 207         xor     $t1,@S[0]               # s0^=s1|key[1];
 208         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
 209         jmp     .Leloop
 210 
 211 .align  16
 212 .Ledone:
 213         xor     @S[2],$t0               # SwapHalf
 214         xor     @S[3],$t1
 215         xor     @S[0],$t2
 216         xor     @S[1],$t3
 217 
 218         mov     $t0,@S[0]
 219         mov     $t1,@S[1]
 220         mov     $t2,@S[2]
 221         mov     $t3,@S[3]
 222 
 223         .byte   0xf3,0xc3               # rep ret
 224 .size   _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
 225 
 226 # V1.x API
 227 .globl  Camellia_DecryptBlock
 228 .type   Camellia_DecryptBlock,\@abi-omnipotent
 229 .align  16
 230 Camellia_DecryptBlock:
 231         movl    \$128,%eax
 232         subl    $arg0d,%eax
 233         movl    \$3,$arg0d
 234         adcl    \$0,$arg0d      # keyBitLength==128?3:4
 235         jmp     .Ldec_rounds
 236 .size   Camellia_DecryptBlock,.-Camellia_DecryptBlock
 237 # V2
 238 .globl  Camellia_DecryptBlock_Rounds
 239 .type   Camellia_DecryptBlock_Rounds,\@function,4
 240 .align  16
 241 .Ldec_rounds:
 242 Camellia_DecryptBlock_Rounds:
 243         push    %rbx
 244         push    %rbp
 245         push    %r13
 246         push    %r14
 247         push    %r15
 248 .Ldec_prologue:
 249 
 250         #mov    %rsi,$inp               # put away arguments
 251         mov     %rcx,$out
 252         mov     %rdx,$keyend
 253 
 254         shl     \$6,%edi                # process grandRounds
 255         lea     .LCamellia_SBOX(%rip),$Tbl
 256         lea     ($keyend,%rdi),$key
 257 
 258         mov     0(%rsi),@S[0]           # load plaintext
 259         mov     4(%rsi),@S[1]
 260         mov     8(%rsi),@S[2]
 261         bswap   @S[0]
 262         mov     12(%rsi),@S[3]
 263         bswap   @S[1]
 264         bswap   @S[2]
 265         bswap   @S[3]
 266 
 267         call    _x86_64_Camellia_decrypt
 268 
 269         bswap   @S[0]
 270         bswap   @S[1]
 271         bswap   @S[2]
 272         mov     @S[0],0($out)
 273         bswap   @S[3]
 274         mov     @S[1],4($out)
 275         mov     @S[2],8($out)
 276         mov     @S[3],12($out)
 277 
 278         mov     0(%rsp),%r15
 279         mov     8(%rsp),%r14
 280         mov     16(%rsp),%r13
 281         mov     24(%rsp),%rbp
 282         mov     32(%rsp),%rbx
 283         lea     40(%rsp),%rsp
 284 .Ldec_epilogue:
 285         ret
 286 .size   Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
 287 
 288 .type   _x86_64_Camellia_decrypt,\@abi-omnipotent
 289 .align  16
 290 _x86_64_Camellia_decrypt:
 291         xor     0($key),@S[1]
 292         xor     4($key),@S[0]           # ^=key[0-3]
 293         xor     8($key),@S[3]
 294         xor     12($key),@S[2]
 295 .align  16
 296 .Ldloop:
 297         mov     -8($key),$t1            # prefetch key[4-5]
 298         mov     -4($key),$t0
 299 
 300 ___
 301         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }   # six rounds per pass; negative seed walks the key table downwards
     #
     # The heredoc below finishes _x86_64_Camellia_decrypt: $key moves
     # back by 64 bytes and is compared with $keyend (the table base);
     # on a match .Lddone xors in the last prefetched key words and swaps
     # the state halves, otherwise the same and/or/rotate mixing step as
     # in the encrypt loop is applied and the loop repeats at .Ldloop.
 302 $code.=<<___;
 303         lea     -16*4($key),$key
 304         cmp     $keyend,$key
 305         mov     0($key),$t3             # prefetch key[2-3]
 306         mov     4($key),$t2
 307         je      .Lddone
 308 
 309         and     @S[0],$t0
 310         or      @S[3],$t3
 311         rol     \$1,$t0
 312         xor     $t3,@S[2]               # s2^=s3|key[3];
 313         xor     $t0,@S[1]               # s1^=LeftRotate(s0&key[0],1);
 314         and     @S[2],$t2
 315         or      @S[1],$t1
 316         rol     \$1,$t2
 317         xor     $t1,@S[0]               # s0^=s1|key[1];
 318         xor     $t2,@S[3]               # s3^=LeftRotate(s2&key[2],1);
 319 
 320         jmp     .Ldloop
 321 
 322 .align  16
 323 .Lddone:
 324         xor     @S[2],$t2
 325         xor     @S[3],$t3
 326         xor     @S[0],$t0
 327         xor     @S[1],$t1
 328 
 329         mov     $t2,@S[0]               # SwapHalf
 330         mov     $t3,@S[1]
 331         mov     $t0,@S[2]
 332         mov     $t1,@S[3]
 333 
 334         .byte   0xf3,0xc3               # rep ret
 335 .size   _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
 336 ___
 337 
 338 sub _saveround {
 339 my ($rnd,$key,@T)=@_;
 340 my $bias=int(@T[0])?shift(@T):0;
 341 
 342     if ($#T==3) {
 343         $code.=<<___;
 344         mov     @T[1],`$bias+$rnd*8+0`($key)
 345         mov     @T[0],`$bias+$rnd*8+4`($key)
 346         mov     @T[3],`$bias+$rnd*8+8`($key)
 347         mov     @T[2],`$bias+$rnd*8+12`($key)
 348 ___
 349     } else {
 350         $code.="        mov     @T[0],`$bias+$rnd*8+0`($key)\n";
 351         $code.="        mov     @T[1],`$bias+$rnd*8+8`($key)\n" if ($#T>=1);
 352     }
 353 }
 354 
 355 sub _loadround {
 356 my ($rnd,$key,@T)=@_;
 357 my $bias=int(@T[0])?shift(@T):0;
 358 
 359 $code.="        mov     `$bias+$rnd*8+0`($key),@T[0]\n";
 360 $code.="        mov     `$bias+$rnd*8+8`($key),@T[1]\n" if ($#T>=1);
 361 }
 362 
 363 # shld is very slow on Intel EM64T family. Even on AMD it limits
 364 # instruction decode rate [because it's VectorPath] and consequently
 365 # performance...
 366 sub __rotl128 {
 367 my ($i0,$i1,$rot)=@_;
 368 
 369     if ($rot) {
 370         $code.=<<___;
 371         mov     $i0,%r11
 372         shld    \$$rot,$i1,$i0
 373         shld    \$$rot,%r11,$i1
 374 ___
 375     }
 376 }
 377 
 378 # ... Implementing 128-bit rotate without shld gives 80% better
 379 # performance EM64T, +15% on AMD64 and only ~7% degradation on
 380 # Core2. This is therefore preferred.
 381 sub _rotl128 {
 382 my ($i0,$i1,$rot)=@_;
 383 
 384     if ($rot) {
 385         $code.=<<___;
 386         mov     $i0,%r11
 387         shl     \$$rot,$i0
 388         mov     $i1,%r9
 389         shr     \$`64-$rot`,%r9
 390         shr     \$`64-$rot`,%r11
 391         or      %r9,$i0
 392         shl     \$$rot,$i1
 393         or      %r11,$i1
 394 ___
 395     }
 396 }
 397 
 398 { my $step=0;
 399 
 400 $code.=<<___;
 401 .globl  Camellia_Ekeygen
 402 .type   Camellia_Ekeygen,\@function,3
 403 .align  16
 404 Camellia_Ekeygen:
 405         push    %rbx
 406         push    %rbp
 407         push    %r13
 408         push    %r14
 409         push    %r15
 410 .Lkey_prologue:
 411 
 412         mov     %rdi,$keyend            # put away arguments, keyBitLength
 413         mov     %rdx,$out               # keyTable
 414 
 415         mov     0(%rsi),@S[0]           # load 0-127 bits
 416         mov     4(%rsi),@S[1]
 417         mov     8(%rsi),@S[2]
 418         mov     12(%rsi),@S[3]
 419 
 420         bswap   @S[0]
 421         bswap   @S[1]
 422         bswap   @S[2]
 423         bswap   @S[3]
 424 ___
 425         &_saveround (0,$out,@S);    # KL<<<0
 426 $code.=<<___;
 427         cmp     \$128,$keyend           # check keyBitLength
 428         je      .L1st128
 429 
 430         mov     16(%rsi),@S[0]          # load 128-191 bits
 431         mov     20(%rsi),@S[1]
 432         cmp     \$192,$keyend
 433         je      .L1st192
 434         mov     24(%rsi),@S[2]          # load 192-255 bits
 435         mov     28(%rsi),@S[3]
 436         jmp     .L1st256
 437 .L1st192:
 438         mov     @S[0],@S[2]
 439         mov     @S[1],@S[3]
 440         not     @S[2]
 441         not     @S[3]
 442 .L1st256:
 443         bswap   @S[0]
 444         bswap   @S[1]
 445         bswap   @S[2]
 446         bswap   @S[3]
 447 ___
 448         &_saveround (4,$out,@S);    # temp storage for KR!
 449 $code.=<<___;
 450         xor     0($out),@S[1]           # KR^KL
 451         xor     4($out),@S[0]
 452         xor     8($out),@S[3]
 453         xor     12($out),@S[2]
 454 
 455 .L1st128:
 456         lea     .LCamellia_SIGMA(%rip),$key
 457         lea     .LCamellia_SBOX(%rip),$Tbl
 458 
 459         mov     0($key),$t1
 460         mov     4($key),$t0
 461 ___
 462         &Camellia_Feistel($step++);
 463         &Camellia_Feistel($step++);
 464 $code.=<<___;
 465         xor     0($out),@S[1]           # ^KL
 466         xor     4($out),@S[0]
 467         xor     8($out),@S[3]
 468         xor     12($out),@S[2]
 469 ___
 470         &Camellia_Feistel($step++);
 471         &Camellia_Feistel($step++);
 472 $code.=<<___;
 473         cmp     \$128,$keyend
 474         jne     .L2nd256
 475 
 476         lea     128($out),$out          # size optimization
 477         shl     \$32,%r8                # @S[0]||
 478         shl     \$32,%r10               # @S[2]||
 479         or      %r9,%r8                 # ||@S[1]
 480         or      %r11,%r10               # ||@S[3]
 481 ___
 482         &_loadround (0,$out,-128,"%rax","%rbx");    # KL
 483         &_saveround (2,$out,-128,"%r8","%r10");     # KA<<<0
 484         &_rotl128   ("%rax","%rbx",15);
 485         &_saveround (4,$out,-128,"%rax","%rbx");    # KL<<<15
 486         &_rotl128   ("%r8","%r10",15);
 487         &_saveround (6,$out,-128,"%r8","%r10");     # KA<<<15
 488         &_rotl128   ("%r8","%r10",15);              # 15+15=30
 489         &_saveround (8,$out,-128,"%r8","%r10");     # KA<<<30
 490         &_rotl128   ("%rax","%rbx",30);             # 15+30=45
 491         &_saveround (10,$out,-128,"%rax","%rbx");   # KL<<<45
 492         &_rotl128   ("%r8","%r10",15);              # 30+15=45
 493         &_saveround (12,$out,-128,"%r8");           # KA<<<45
 494         &_rotl128   ("%rax","%rbx",15);             # 45+15=60
 495         &_saveround (13,$out,-128,"%rbx");          # KL<<<60
 496         &_rotl128   ("%r8","%r10",15);              # 45+15=60
 497         &_saveround (14,$out,-128,"%r8","%r10");    # KA<<<60
 498         &_rotl128   ("%rax","%rbx",17);             # 60+17=77
 499         &_saveround (16,$out,-128,"%rax","%rbx");   # KL<<<77
 500         &_rotl128   ("%rax","%rbx",17);             # 77+17=94
 501         &_saveround (18,$out,-128,"%rax","%rbx");   # KL<<<94
 502         &_rotl128   ("%r8","%r10",34);              # 60+34=94
 503         &_saveround (20,$out,-128,"%r8","%r10");    # KA<<<94
 504         &_rotl128   ("%rax","%rbx",17);             # 94+17=111
 505         &_saveround (22,$out,-128,"%rax","%rbx");   # KL<<<111
 506         &_rotl128   ("%r8","%r10",17);              # 94+17=111
 507         &_saveround (24,$out,-128,"%r8","%r10");    # KA<<<111
 508 $code.=<<___;
 509         mov     \$3,%eax
 510         jmp     .Ldone
 511 .align  16
 512 .L2nd256:
 513 ___
 514         &_saveround (6,$out,@S);    # temp storage for KA!
 515 $code.=<<___;
 516         xor     `4*8+0`($out),@S[1]     # KA^KR
 517         xor     `4*8+4`($out),@S[0]
 518         xor     `5*8+0`($out),@S[3]
 519         xor     `5*8+4`($out),@S[2]
 520 ___
 521         &Camellia_Feistel($step++);
 522         &Camellia_Feistel($step++);
 523 
 524         &_loadround (0,$out,"%rax","%rbx"); # KL
 525         &_loadround (4,$out,"%rcx","%rdx"); # KR
 526         &_loadround (6,$out,"%r14","%r15"); # KA
 527 $code.=<<___;
 528         lea     128($out),$out          # size optimization
 529         shl     \$32,%r8                # @S[0]||
 530         shl     \$32,%r10               # @S[2]||
 531         or      %r9,%r8                 # ||@S[1]
 532         or      %r11,%r10               # ||@S[3]
 533 ___
 534         &_saveround (2,$out,-128,"%r8","%r10");     # KB<<<0
 535         &_rotl128   ("%rcx","%rdx",15);
 536         &_saveround (4,$out,-128,"%rcx","%rdx");    # KR<<<15
 537         &_rotl128   ("%r14","%r15",15);
 538         &_saveround (6,$out,-128,"%r14","%r15");    # KA<<<15
 539         &_rotl128   ("%rcx","%rdx",15);             # 15+15=30
 540         &_saveround (8,$out,-128,"%rcx","%rdx");    # KR<<<30
 541         &_rotl128   ("%r8","%r10",30);
 542         &_saveround (10,$out,-128,"%r8","%r10");    # KB<<<30
 543         &_rotl128   ("%rax","%rbx",45);
 544         &_saveround (12,$out,-128,"%rax","%rbx");   # KL<<<45
 545         &_rotl128   ("%r14","%r15",30);             # 15+30=45
 546         &_saveround (14,$out,-128,"%r14","%r15");   # KA<<<45
 547         &_rotl128   ("%rax","%rbx",15);             # 45+15=60
 548         &_saveround (16,$out,-128,"%rax","%rbx");   # KL<<<60
 549         &_rotl128   ("%rcx","%rdx",30);             # 30+30=60
 550         &_saveround (18,$out,-128,"%rcx","%rdx");   # KR<<<60
 551         &_rotl128   ("%r8","%r10",30);              # 30+30=60
 552         &_saveround (20,$out,-128,"%r8","%r10");    # KB<<<60
 553         &_rotl128   ("%rax","%rbx",17);             # 60+17=77
 554         &_saveround (22,$out,-128,"%rax","%rbx");   # KL<<<77
 555         &_rotl128   ("%r14","%r15",32);             # 45+32=77
 556         &_saveround (24,$out,-128,"%r14","%r15");   # KA<<<77
 557         &_rotl128   ("%rcx","%rdx",34);             # 60+34=94
 558         &_saveround (26,$out,-128,"%rcx","%rdx");   # KR<<<94
 559         &_rotl128   ("%r14","%r15",17);             # 77+17=94
 560         &_saveround (28,$out,-128,"%r14","%r15");   # KA<<<77
 561         &_rotl128   ("%rax","%rbx",34);             # 77+34=111
 562         &_saveround (30,$out,-128,"%rax","%rbx");   # KL<<<111
 563         &_rotl128   ("%r8","%r10",51);              # 60+51=111
 564         &_saveround (32,$out,-128,"%r8","%r10");    # KB<<<111
 565 $code.=<<___;
 566         mov     \$4,%eax
 567 .Ldone:
 568         mov     0(%rsp),%r15
 569         mov     8(%rsp),%r14
 570         mov     16(%rsp),%r13
 571         mov     24(%rsp),%rbp
 572         mov     32(%rsp),%rbx
 573         lea     40(%rsp),%rsp
 574 .Lkey_epilogue:
 575         ret
 576 .size   Camellia_Ekeygen,.-Camellia_Ekeygen
 577 ___
 578 }
 579 
 580 @SBOX=(
     # 256-entry byte substitution table, indexed by input byte. It is the
     # only source table: S1110, S4404, S0222 and S3033 below derive the
     # four expanded 32-bit tables from it.
 581 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 582  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
 583 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
 584 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
 585 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
 586 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 587  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
 588 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
 589 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 590  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
 591 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 592  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
 593 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
 594 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
 595 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 596  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
 597 
 598 sub S1110 { my $i=shift; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i<<8; sprintf("0x%08x",$i); }
 599 sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; $i=$i<<24|$i<<16|$i; sprintf("0x%08x",$i); }
 600 sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; $i=$i<<16|$i<<8|$i; sprintf("0x%08x",$i); }
 601 sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; $i=$i<<24|$i<<8|$i; sprintf("0x%08x",$i); }
 602 
 603 $code.=<<___;
 604 .align  64
 605 .LCamellia_SIGMA:
 606 .long   0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
 607 .long   0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
 608 .long   0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
 609 .long   0,          0,          0,          0
 610 .LCamellia_SBOX:
 611 ___
 612 # tables are interleaved, remember?
 613 sub data_word { $code.=".long\t".join(',',@_)."\n"; }
 614 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
 615 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
 616 
 617 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
 618 #                       size_t length, const CAMELLIA_KEY *key,
 619 #                       unsigned char *ivp,const int enc);
 620 {
     # Generator for Camellia_cbc_encrypt (\@function,6). As the inline
     # comments show, the arguments arrive as inp=%rdi, out=%rsi,
     # len=%rdx, key=%rcx, ivp=%r8 and enc=%r9d; a zero length returns
     # immediately through .Lcbc_abort.
     #
     # The variables below name slots in the local stack frame, relative
     # to %rsp after the frame has been aligned and positioned:
     #   $_key  saved key pointer, reloaded after every block ("rewind")
     #   $_end  address at which the block loop stops
     #   $_res  number of trailing bytes that do not fill a whole block
     #   $ivec  16 bytes of scratch, used for the IV during decryption and
     #          for the zero-padded final block / the last plaintext
     #   $_ivp  saved ivp argument
     #   $_rsp  saved stack pointer, taken right after the register pushes
     # Inside the pushfq/popfq sections the scratch block is addressed as
     # 8+$ivec, presumably because pushfq has moved %rsp down by 8.
     # The ".long 0x9066A4F3" words are annotated as "rep movsb" and copy
     # the %rcx residue bytes.
 621 $_key="0(%rsp)";
 622 $_end="8(%rsp)";        # inp+len&~15
 623 $_res="16(%rsp)";       # len&15
 624 $ivec="24(%rsp)";
 625 $_ivp="40(%rsp)";
 626 $_rsp="48(%rsp)";
 627 
 628 $code.=<<___;
 629 .globl  Camellia_cbc_encrypt
 630 .type   Camellia_cbc_encrypt,\@function,6
 631 .align  16
 632 Camellia_cbc_encrypt:
 633         cmp     \$0,%rdx
 634         je      .Lcbc_abort
 635         push    %rbx
 636         push    %rbp
 637         push    %r12
 638         push    %r13
 639         push    %r14
 640         push    %r15
 641 .Lcbc_prologue:
 642 
 643         mov     %rsp,%rbp
 644         sub     \$64,%rsp
 645         and     \$-64,%rsp
 646 
 647         # place stack frame just "above mod 1024" the key schedule,
 648         # this ensures that cache associativity suffices
 649         lea     -64-63(%rcx),%r10
 650         sub     %rsp,%r10
 651         neg     %r10
 652         and     \$0x3C0,%r10
 653         sub     %r10,%rsp
 654         #add    \$8,%rsp                # 8 is reserved for callee's ra
 655 
 656         mov     %rdi,$inp               # inp argument
 657         mov     %rsi,$out               # out argument
 658         mov     %r8,%rbx                # ivp argument
 659         mov     %rcx,$key               # key argument
 660         mov     272(%rcx),${keyend}d    # grandRounds
 661 
 662         mov     %r8,$_ivp
 663         mov     %rbp,$_rsp
 664 
 665 .Lcbc_body:
 666         lea     .LCamellia_SBOX(%rip),$Tbl
 667 
 668         mov     \$32,%ecx
 669 .align  4
 670 .Lcbc_prefetch_sbox:
 671         mov     0($Tbl),%rax
 672         mov     32($Tbl),%rsi
 673         mov     64($Tbl),%rdi
 674         mov     96($Tbl),%r11
 675         lea     128($Tbl),$Tbl
 676         loop    .Lcbc_prefetch_sbox
 677         sub     \$4096,$Tbl
 678         shl     \$6,$keyend
 679         mov     %rdx,%rcx               # len argument
 680         lea     ($key,$keyend),$keyend
 681 
 682         cmp     \$0,%r9d                # enc argument
 683         je      .LCBC_DECRYPT
 684 
 685         and     \$-16,%rdx
 686         and     \$15,%rcx               # length residue
 687         lea     ($inp,%rdx),%rdx
 688         mov     $key,$_key
 689         mov     %rdx,$_end
 690         mov     %rcx,$_res
 691 
 692         cmp     $inp,%rdx
 693         mov     0(%rbx),@S[0]           # load IV
 694         mov     4(%rbx),@S[1]
 695         mov     8(%rbx),@S[2]
 696         mov     12(%rbx),@S[3]
 697         je      .Lcbc_enc_tail
 698         jmp     .Lcbc_eloop
 699 
 700 .align  16
 701 .Lcbc_eloop:
 702         xor     0($inp),@S[0]
 703         xor     4($inp),@S[1]
 704         xor     8($inp),@S[2]
 705         bswap   @S[0]
 706         xor     12($inp),@S[3]
 707         bswap   @S[1]
 708         bswap   @S[2]
 709         bswap   @S[3]
 710 
 711         call    _x86_64_Camellia_encrypt
 712 
 713         mov     $_key,$key              # "rewind" the key
 714         bswap   @S[0]
 715         mov     $_end,%rdx
 716         bswap   @S[1]
 717         mov     $_res,%rcx
 718         bswap   @S[2]
 719         mov     @S[0],0($out)
 720         bswap   @S[3]
 721         mov     @S[1],4($out)
 722         mov     @S[2],8($out)
 723         lea     16($inp),$inp
 724         mov     @S[3],12($out)
 725         cmp     %rdx,$inp
 726         lea     16($out),$out
 727         jne     .Lcbc_eloop
 728 
 729         cmp     \$0,%rcx
 730         jne     .Lcbc_enc_tail
 731 
 732         mov     $_ivp,$out
 733         mov     @S[0],0($out)           # write out IV residue
 734         mov     @S[1],4($out)
 735         mov     @S[2],8($out)
 736         mov     @S[3],12($out)
 737         jmp     .Lcbc_done
 738 
 739 .align  16
 740 .Lcbc_enc_tail:
 741         xor     %rax,%rax
 742         mov     %rax,0+$ivec
 743         mov     %rax,8+$ivec
 744         mov     %rax,$_res
 745 
 746 .Lcbc_enc_pushf:
 747         pushfq
 748         cld
 749         mov     $inp,%rsi
 750         lea     8+$ivec,%rdi
 751         .long   0x9066A4F3              # rep movsb
 752         popfq
 753 .Lcbc_enc_popf:
 754 
 755         lea     $ivec,$inp
 756         lea     16+$ivec,%rax
 757         mov     %rax,$_end
 758         jmp     .Lcbc_eloop             # one more time
 759 
 760 .align  16
 761 .LCBC_DECRYPT:
 762         xchg    $key,$keyend
 763         add     \$15,%rdx
 764         and     \$15,%rcx               # length residue
 765         and     \$-16,%rdx
 766         mov     $key,$_key
 767         lea     ($inp,%rdx),%rdx
 768         mov     %rdx,$_end
 769         mov     %rcx,$_res
 770 
 771         mov     (%rbx),%rax             # load IV
 772         mov     8(%rbx),%rbx
 773         jmp     .Lcbc_dloop
 774 .align  16
 775 .Lcbc_dloop:
 776         mov     0($inp),@S[0]
 777         mov     4($inp),@S[1]
 778         mov     8($inp),@S[2]
 779         bswap   @S[0]
 780         mov     12($inp),@S[3]
 781         bswap   @S[1]
 782         mov     %rax,0+$ivec            # save IV to temporary storage
 783         bswap   @S[2]
 784         mov     %rbx,8+$ivec
 785         bswap   @S[3]
 786 
 787         call    _x86_64_Camellia_decrypt
 788 
 789         mov     $_key,$key              # "rewind" the key
 790         mov     $_end,%rdx
 791         mov     $_res,%rcx
 792 
 793         bswap   @S[0]
 794         mov     ($inp),%rax             # load IV for next iteration
 795         bswap   @S[1]
 796         mov     8($inp),%rbx
 797         bswap   @S[2]
 798         xor     0+$ivec,@S[0]
 799         bswap   @S[3]
 800         xor     4+$ivec,@S[1]
 801         xor     8+$ivec,@S[2]
 802         lea     16($inp),$inp
 803         xor     12+$ivec,@S[3]
 804         cmp     %rdx,$inp
 805         je      .Lcbc_ddone
 806 
 807         mov     @S[0],0($out)
 808         mov     @S[1],4($out)
 809         mov     @S[2],8($out)
 810         mov     @S[3],12($out)
 811 
 812         lea     16($out),$out
 813         jmp     .Lcbc_dloop
 814 
 815 .align  16
 816 .Lcbc_ddone:
 817         mov     $_ivp,%rdx
 818         cmp     \$0,%rcx
 819         jne     .Lcbc_dec_tail
 820 
 821         mov     @S[0],0($out)
 822         mov     @S[1],4($out)
 823         mov     @S[2],8($out)
 824         mov     @S[3],12($out)
 825 
 826         mov     %rax,(%rdx)             # write out IV residue
 827         mov     %rbx,8(%rdx)
 828         jmp     .Lcbc_done
 829 .align  16
 830 .Lcbc_dec_tail:
 831         mov     @S[0],0+$ivec
 832         mov     @S[1],4+$ivec
 833         mov     @S[2],8+$ivec
 834         mov     @S[3],12+$ivec
 835 
 836 .Lcbc_dec_pushf:
 837         pushfq
 838         cld
 839         lea     8+$ivec,%rsi
 840         lea     ($out),%rdi
 841         .long   0x9066A4F3              # rep movsb
 842         popfq
 843 .Lcbc_dec_popf:
 844 
 845         mov     %rax,(%rdx)             # write out IV residue
 846         mov     %rbx,8(%rdx)
 847         jmp     .Lcbc_done
 848 
 849 .align  16
 850 .Lcbc_done:
 851         mov     $_rsp,%rcx
 852         mov     0(%rcx),%r15
 853         mov     8(%rcx),%r14
 854         mov     16(%rcx),%r13
 855         mov     24(%rcx),%r12
 856         mov     32(%rcx),%rbp
 857         mov     40(%rcx),%rbx
 858         lea     48(%rcx),%rsp
 859 .Lcbc_abort:
 860         ret
 861 .size   Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
 862 
 863 .asciz  "Camellia for x86_64 by <appro\@openssl.org>"
 864 ___
 865 }
 866 
 867 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
 868 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
     #
     # Win64-only tail of the module: two exception handlers with the
     # prototype above, plus the .pdata/.xdata tables that attach them to
     # the functions generated by this file.
     #
     #   common_se_handler - referenced from the .xdata entries of
     #       Camellia_EncryptBlock_Rounds, Camellia_DecryptBlock_Rounds and
     #       Camellia_Ekeygen.  It reads two RVAs (prologue and epilogue
     #       labels) from HandlerData[] and, when context->Rip lies between
     #       them, reloads rbx/rbp/r13/r14/r15 from the 40 bytes above
     #       context->Rsp into the CONTEXT record.
     #   cbc_se_handler - referenced from the .xdata entry of
     #       Camellia_cbc_encrypt.  It compares context->Rip against the
     #       .Lcbc_* labels directly, skips 8 extra bytes while Rip lies
     #       between a pushf/popf label pair, then follows the saved stack
     #       pointer slot to reload rbx/rbp/r12-r15.
     #
     # Both handlers finish in the shared .Lcommon_seh_exit tail (emitted
     # inside cbc_se_handler): it copies the patched CONTEXT to
     # disp->ContextRecord, calls RtlVirtualUnwind and returns 1
     # (ExceptionContinueSearch).
 869 if ($win64) {
     # Registers carrying the four handler arguments, in prototype order.
     # Only $context and $disp are interpolated into the text below.
 870 $rec="%rcx";
 871 $frame="%rdx";
 872 $context="%r8";
 873 $disp="%r9";
 874 
     # Everything up to the closing ___ is appended verbatim to $code, apart
     # from Perl variable interpolation.  The `1232/8` expression is left in
     # backticks here and evaluated by the substitution at the end of the
     # script.
 875 $code.=<<___;
 876 .extern __imp_RtlVirtualUnwind
 877 .type   common_se_handler,\@abi-omnipotent
 878 .align  16
 879 common_se_handler:
 880         push    %rsi
 881         push    %rdi
 882         push    %rbx
 883         push    %rbp
 884         push    %r12
 885         push    %r13
 886         push    %r14
 887         push    %r15
 888         pushfq
 889         lea     -64(%rsp),%rsp
 890 
 891         mov     120($context),%rax      # pull context->Rax
 892         mov     248($context),%rbx      # pull context->Rip
 893 
 894         mov     8($disp),%rsi           # disp->ImageBase
 895         mov     56($disp),%r11          # disp->HandlerData
 896 
 897         mov     0(%r11),%r10d           # HandlerData[0]
 898         lea     (%rsi,%r10),%r10        # prologue label
 899         cmp     %r10,%rbx               # context->Rip<prologue label
 900         jb      .Lin_prologue
 901 
 902         mov     152($context),%rax      # pull context->Rsp
 903 
 904         mov     4(%r11),%r10d           # HandlerData[1]
 905         lea     (%rsi,%r10),%r10        # epilogue label
 906         cmp     %r10,%rbx               # context->Rip>=epilogue label
 907         jae     .Lin_prologue
 908 
 909         lea     40(%rax),%rax
 910         mov     -8(%rax),%rbx
 911         mov     -16(%rax),%rbp
 912         mov     -24(%rax),%r13
 913         mov     -32(%rax),%r14
 914         mov     -40(%rax),%r15
 915         mov     %rbx,144($context)      # restore context->Rbx
 916         mov     %rbp,160($context)      # restore context->Rbp
 917         mov     %r13,224($context)      # restore context->R13
 918         mov     %r14,232($context)      # restore context->R14
 919         mov     %r15,240($context)      # restore context->R15
 920 
 921 .Lin_prologue:
 922         mov     8(%rax),%rdi
 923         mov     16(%rax),%rsi
 924         mov     %rax,152($context)      # restore context->Rsp
 925         mov     %rsi,168($context)      # restore context->Rsi
 926         mov     %rdi,176($context)      # restore context->Rdi
 927 
 928         jmp     .Lcommon_seh_exit
 929 .size   common_se_handler,.-common_se_handler
 930 
 931 .type   cbc_se_handler,\@abi-omnipotent
 932 .align  16
 933 cbc_se_handler:
 934         push    %rsi
 935         push    %rdi
 936         push    %rbx
 937         push    %rbp
 938         push    %r12
 939         push    %r13
 940         push    %r14
 941         push    %r15
 942         pushfq
 943         lea     -64(%rsp),%rsp
 944 
 945         mov     120($context),%rax      # pull context->Rax
 946         mov     248($context),%rbx      # pull context->Rip
 947 
 948         lea     .Lcbc_prologue(%rip),%r10
 949         cmp     %r10,%rbx               # context->Rip<.Lcbc_prologue
 950         jb      .Lin_cbc_prologue
 951 
 952         lea     .Lcbc_body(%rip),%r10
 953         cmp     %r10,%rbx               # context->Rip<.Lcbc_body
 954         jb      .Lin_cbc_frame_setup
 955 
 956         mov     152($context),%rax      # pull context->Rsp
 957 
 958         lea     .Lcbc_abort(%rip),%r10
 959         cmp     %r10,%rbx               # context->Rip>=.Lcbc_abort
 960         jae     .Lin_cbc_prologue
 961 
 962         # handle pushf/popf in Camellia_cbc_encrypt
 963         lea     .Lcbc_enc_pushf(%rip),%r10
 964         cmp     %r10,%rbx               # context->Rip<=.Lcbc_enc_pushf
 965         jbe     .Lin_cbc_no_flag
 966         lea     8(%rax),%rax
 967         lea     .Lcbc_enc_popf(%rip),%r10
 968         cmp     %r10,%rbx               # context->Rip<.Lcbc_enc_popf
 969         jb      .Lin_cbc_no_flag
 970         lea     -8(%rax),%rax
 971         lea     .Lcbc_dec_pushf(%rip),%r10
 972         cmp     %r10,%rbx               # context->Rip<=.Lcbc_dec_pushf
 973         jbe     .Lin_cbc_no_flag
 974         lea     8(%rax),%rax
 975         lea     .Lcbc_dec_popf(%rip),%r10
 976         cmp     %r10,%rbx               # context->Rip<.Lcbc_dec_popf
 977         jb      .Lin_cbc_no_flag
 978         lea     -8(%rax),%rax
 979 
 980 .Lin_cbc_no_flag:
 981         mov     48(%rax),%rax           # $_rsp
 982         lea     48(%rax),%rax
 983 
 984 .Lin_cbc_frame_setup:
 985         mov     -8(%rax),%rbx
 986         mov     -16(%rax),%rbp
 987         mov     -24(%rax),%r12
 988         mov     -32(%rax),%r13
 989         mov     -40(%rax),%r14
 990         mov     -48(%rax),%r15
 991         mov     %rbx,144($context)      # restore context->Rbx
 992         mov     %rbp,160($context)      # restore context->Rbp
 993         mov     %r12,216($context)      # restore context->R12
 994         mov     %r13,224($context)      # restore context->R13
 995         mov     %r14,232($context)      # restore context->R14
 996         mov     %r15,240($context)      # restore context->R15
 997 
 998 .Lin_cbc_prologue:
 999         mov     8(%rax),%rdi
1000         mov     16(%rax),%rsi
1001         mov     %rax,152($context)      # restore context->Rsp
1002         mov     %rsi,168($context)      # restore context->Rsi
1003         mov     %rdi,176($context)      # restore context->Rdi
1004 
1005 .align  4
1006 .Lcommon_seh_exit:
1007 
1008         mov     40($disp),%rdi          # disp->ContextRecord
1009         mov     $context,%rsi           # context
1010         mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
1011         .long   0xa548f3fc              # cld; rep movsq
1012 
1013         mov     $disp,%rsi
1014         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1015         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1016         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1017         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1018         mov     40(%rsi),%r10           # disp->ContextRecord
1019         lea     56(%rsi),%r11           # &disp->HandlerData
1020         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1021         mov     %r10,32(%rsp)           # arg5
1022         mov     %r11,40(%rsp)           # arg6
1023         mov     %r12,48(%rsp)           # arg7
1024         mov     %rcx,56(%rsp)           # arg8, (NULL)
1025         call    *__imp_RtlVirtualUnwind(%rip)
1026 
1027         mov     \$1,%eax                # ExceptionContinueSearch
1028         lea     64(%rsp),%rsp
1029         popfq
1030         pop     %r15
1031         pop     %r14
1032         pop     %r13
1033         pop     %r12
1034         pop     %rbp
1035         pop     %rbx
1036         pop     %rdi
1037         pop     %rsi
1038         ret
1039 .size   cbc_se_handler,.-cbc_se_handler
1040 
1041 .section        .pdata
1042 .align  4
1043         .rva    .LSEH_begin_Camellia_EncryptBlock_Rounds
1044         .rva    .LSEH_end_Camellia_EncryptBlock_Rounds
1045         .rva    .LSEH_info_Camellia_EncryptBlock_Rounds
1046 
1047         .rva    .LSEH_begin_Camellia_DecryptBlock_Rounds
1048         .rva    .LSEH_end_Camellia_DecryptBlock_Rounds
1049         .rva    .LSEH_info_Camellia_DecryptBlock_Rounds
1050 
1051         .rva    .LSEH_begin_Camellia_Ekeygen
1052         .rva    .LSEH_end_Camellia_Ekeygen
1053         .rva    .LSEH_info_Camellia_Ekeygen
1054 
1055         .rva    .LSEH_begin_Camellia_cbc_encrypt
1056         .rva    .LSEH_end_Camellia_cbc_encrypt
1057         .rva    .LSEH_info_Camellia_cbc_encrypt
1058 
1059 .section        .xdata
1060 .align  8
1061 .LSEH_info_Camellia_EncryptBlock_Rounds:
1062         .byte   9,0,0,0
1063         .rva    common_se_handler
1064         .rva    .Lenc_prologue,.Lenc_epilogue   # HandlerData[]
1065 .LSEH_info_Camellia_DecryptBlock_Rounds:
1066         .byte   9,0,0,0
1067         .rva    common_se_handler
1068         .rva    .Ldec_prologue,.Ldec_epilogue   # HandlerData[]
1069 .LSEH_info_Camellia_Ekeygen:
1070         .byte   9,0,0,0
1071         .rva    common_se_handler
1072         .rva    .Lkey_prologue,.Lkey_epilogue   # HandlerData[]
1073 .LSEH_info_Camellia_cbc_encrypt:
1074         .byte   9,0,0,0
1075         .rva    cbc_se_handler
1076 ___
1077 }
1078 
1079 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1080 print $code;
1081 close STDOUT;