1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
   5 #
   6 # This module may be used under the terms of either the GNU General
   7 # Public License version 2 or later, the GNU Lesser General Public
   8 # License version 2.1 or later, the Mozilla Public License version
   9 # 1.1 or the BSD License. The exact terms of either license are
  10 # distributed along with this module. For further details see
  11 # http://www.openssl.org/~appro/camellia/.
  12 # ====================================================================
  13 
  14 # Performance in cycles per processed byte (less is better) in
  15 # 'openssl speed ...' benchmark:
  16 #
  17 #                       AMD K8  Core2   PIII    P4
  18 # -evp camellia-128-ecb 21.5    22.8    27.0    28.9
  19 # + over gcc 3.4.6      +90/11% +70/10% +53/4%  +160/64%
  20 # + over icc 8.0        +48/19% +21/15% +21/17% +55/37%
  21 #
  22 # camellia-128-cbc      17.3    21.1    23.9    25.9
  23 #
  24 # 128-bit key setup     196     280     256     240     cycles/key
  25 # + over gcc 3.4.6      +30/0%  +17/11% +11/0%  +63/40%
  26 # + over icc 8.0        +18/3%  +10/0%  +10/3%  +21/10%
  27 #
  28 # Pairs of numbers in "+" rows represent performance improvement over
  29 # compiler generated position-independent code, PIC, and non-PIC
  30 # respectively. PIC results are of greater relevance, as this module
  31 # is position-independent, i.e. suitable for a shared library or PIE.
  32 # Position independence "costs" one register, which is why compilers
  33 # are so close with non-PIC results, they have an extra register to
  34 # spare. CBC results are better than ECB ones thanks to "zero-copy"
  35 # private _x86_* interface, and are ~30-40% better than with compiler
  36 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on
  37 # same CPU (where applicable).
  38 
  39 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  40 push(@INC,"${dir}","${dir}../../perlasm");
  41 require "x86asm.pl";
  42 
  43 $OPENSSL=1;
  44 
  45 &asm_init($ARGV[0],"cmll-586.pl",$ARGV[$#ARGV] eq "386");
  46 
  47 @T=("eax","ebx","ecx","edx");
  48 $idx="esi";
  49 $key="edi";
  50 $Tbl="ebp";
  51 
  52 # stack frame layout in _x86_Camellia_* routines, frame is allocated
  53 # by caller
  54 $__ra=&DWP(0,"esp");        # return address
  55 $__s0=&DWP(4,"esp");        # s0 backing store
  56 $__s1=&DWP(8,"esp");        # s1 backing store
  57 $__s2=&DWP(12,"esp");       # s2 backing store
  58 $__s3=&DWP(16,"esp");       # s3 backing store
  59 $__end=&DWP(20,"esp");      # pointer to end/start of key schedule
  60 
  61 # stack frame layout in Camellia_[en|crypt] routines, which differs from
  62 # above by 4 and overlaps by pointer to end/start of key schedule
  63 $_end=&DWP(16,"esp");
  64 $_esp=&DWP(20,"esp");
  65 
  66 # const unsigned int Camellia_SBOX[4][256];
  67 # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
  68 # and [2][] - with [3][]. This is done to optimize code size.
  69 $SBOX1_1110=0;          # Camellia_SBOX[0]
  70 $SBOX4_4404=4;          # Camellia_SBOX[1]
  71 $SBOX2_0222=2048;       # Camellia_SBOX[2]
  72 $SBOX3_3033=2052;       # Camellia_SBOX[3]
  73 &static_label("Camellia_SIGMA");
  74 &static_label("Camellia_SBOX");
  75 
  76 sub Camellia_Feistel {
  77 my $i=@_[0];
  78 my $seed=defined(@_[1])?@_[1]:0;
  79 my $scale=$seed<0?-8:8;
  80 my $frame=defined(@_[2])?@_[2]:0;
  81 my $j=($i&1)*2;
  82 my $t0=@T[($j)%4],$t1=@T[($j+1)%4],$t2=@T[($j+2)%4],$t3=@T[($j+3)%4];
  83 
  84         &xor        ($t0,$idx);                             # t0^=key[0]
  85         &xor        ($t1,&DWP($seed+$i*$scale+4,$key)); # t1^=key[1]
  86         &movz       ($idx,&HB($t0));                    # (t0>>8)&0xff
  87         &mov        ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));        # t3=SBOX3_3033[0]
  88         &movz       ($idx,&LB($t0));                    # (t0>>0)&0xff
  89         &xor        ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));        # t3^=SBOX4_4404[0]
  90         &shr        ($t0,16);
  91         &movz       ($idx,&LB($t1));                    # (t1>>0)&0xff
  92         &mov        ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));        # t2=SBOX1_1110[1]
  93         &movz       ($idx,&HB($t0));                    # (t0>>24)&0xff
  94         &xor        ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));        # t3^=SBOX1_1110[0]
  95         &movz       ($idx,&HB($t1));                    # (t1>>8)&0xff
  96         &xor        ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));        # t2^=SBOX4_4404[1]
  97         &shr        ($t1,16);
  98         &movz       ($t0,&LB($t0));                             # (t0>>16)&0xff
  99         &xor        ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8)); # t3^=SBOX2_0222[0]
 100         &movz       ($idx,&HB($t1));                    # (t1>>24)&0xff
 101         &mov        ($t0,&DWP($frame+4*(($j+3)%4),"esp"));      # prefetch "s3"
 102         &xor        ($t2,$t3);                              # t2^=t3
 103         &rotr       ($t3,8);                                # t3=RightRotate(t3,8)
 104         &xor        ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));        # t2^=SBOX2_0222[1]
 105         &movz       ($idx,&LB($t1));                    # (t1>>16)&0xff
 106         &mov        ($t1,&DWP($frame+4*(($j+2)%4),"esp"));      # prefetch "s2"
 107         &xor        ($t3,$t0);                              # t3^=s3
 108         &xor        ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));        # t2^=SBOX3_3033[1]
 109         &mov        ($idx,&DWP($seed+($i+1)*$scale,$key));      # prefetch key[i+1]
 110         &xor        ($t3,$t2);                              # t3^=t2
 111         &mov        (&DWP($frame+4*(($j+3)%4),"esp"),$t3);      # s3=t3
 112         &xor        ($t2,$t1);                              # t2^=s2
 113         &mov        (&DWP($frame+4*(($j+2)%4),"esp"),$t2);      # s2=t2
 114 }
 115 
 116 # void Camellia_EncryptBlock_Rounds(
 117 #               int grandRounds,
 118 #               const Byte plaintext[],
 119 #               const KEY_TABLE_TYPE keyTable,
 120 #               Byte ciphertext[])
 121 &function_begin("Camellia_EncryptBlock_Rounds");
 122         &mov        ("eax",&wparam(0)); # load grandRounds
 123         &mov        ($idx,&wparam(1));  # load plaintext pointer
 124         &mov        ($key,&wparam(2));  # load key schedule pointer
 125 
 126         &mov        ("ebx","esp");
 127         &sub        ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
 128         &and        ("esp",-64);
 129 
 130         # place stack frame just "above mod 1024" the key schedule
 131         # this ensures that cache associativity of 2 suffices
 132         &lea        ("ecx",&DWP(-64-63,$key));
 133         &sub        ("ecx","esp");
 134         &neg        ("ecx");
 135         &and        ("ecx",0x3C0);  # modulo 1024, but aligned to cache-line
 136         &sub        ("esp","ecx");
 137         &add        ("esp",4);      # 4 is reserved for callee's return address
 138 
 139         &shl        ("eax",6);
 140         &lea        ("eax",&DWP(0,$key,"eax"));
 141         &mov        ($_esp,"ebx");  # save %esp
 142         &mov        ($_end,"eax");  # save keyEnd
 143 
 144         &call       (&label("pic_point"));
 145         &set_label("pic_point");
 146         &blindpop($Tbl);
 147         &lea        ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 148 
 149         &mov        (@T[0],&DWP(0,$idx));       # load plaintext
 150         &mov        (@T[1],&DWP(4,$idx));
 151         &mov        (@T[2],&DWP(8,$idx));
 152         &bswap      (@T[0]);
 153         &mov        (@T[3],&DWP(12,$idx));
 154         &bswap      (@T[1]);
 155         &bswap      (@T[2]);
 156         &bswap      (@T[3]);
 157 
 158         &call       ("_x86_Camellia_encrypt");
 159 
 160         &mov        ("esp",$_esp);
 161         &bswap      (@T[0]);
 162         &mov        ($idx,&wparam(3));  # load ciphertext pointer
 163         &bswap      (@T[1]);
 164         &bswap      (@T[2]);
 165         &bswap      (@T[3]);
 166         &mov        (&DWP(0,$idx),@T[0]);       # write ciphertext
 167         &mov        (&DWP(4,$idx),@T[1]);
 168         &mov        (&DWP(8,$idx),@T[2]);
 169         &mov        (&DWP(12,$idx),@T[3]);
 170 &function_end("Camellia_EncryptBlock_Rounds");
 171 # V1.x API
 172 &function_begin_B("Camellia_EncryptBlock");
 173         &mov        ("eax",128);
 174         &sub        ("eax",&wparam(0)); # load keyBitLength
 175         &mov        ("eax",3);
 176         &adc        ("eax",0);              # keyBitLength==128?3:4
 177         &mov        (&wparam(0),"eax");
 178         &jmp        (&label("Camellia_EncryptBlock_Rounds"));
 179 &function_end_B("Camellia_EncryptBlock");
 180 
 181 if ($OPENSSL) {
 182 # void Camellia_encrypt(
 183 #               const unsigned char *in,
 184 #               unsigned char *out,
 185 #               const CAMELLIA_KEY *key)
 186 &function_begin("Camellia_encrypt");
 187         &mov        ($idx,&wparam(0));  # load plaintext pointer
 188         &mov        ($key,&wparam(2));  # load key schedule pointer
 189 
 190         &mov        ("ebx","esp");
 191         &sub        ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
 192         &and        ("esp",-64);
 193         &mov        ("eax",&DWP(272,$key));     # load grandRounds counter
 194 
 195         # place stack frame just "above mod 1024" the key schedule
 196         # this ensures that cache associativity of 2 suffices
 197         &lea        ("ecx",&DWP(-64-63,$key));
 198         &sub        ("ecx","esp");
 199         &neg        ("ecx");
 200         &and        ("ecx",0x3C0);  # modulo 1024, but aligned to cache-line
 201         &sub        ("esp","ecx");
 202         &add        ("esp",4);      # 4 is reserved for callee's return address
 203 
 204         &shl        ("eax",6);
 205         &lea        ("eax",&DWP(0,$key,"eax"));
 206         &mov        ($_esp,"ebx");  # save %esp
 207         &mov        ($_end,"eax");  # save keyEnd
 208 
 209         &call       (&label("pic_point"));
 210         &set_label("pic_point");
 211         &blindpop($Tbl);
 212         &lea        ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 213 
 214         &mov        (@T[0],&DWP(0,$idx));       # load plaintext
 215         &mov        (@T[1],&DWP(4,$idx));
 216         &mov        (@T[2],&DWP(8,$idx));
 217         &bswap      (@T[0]);
 218         &mov        (@T[3],&DWP(12,$idx));
 219         &bswap      (@T[1]);
 220         &bswap      (@T[2]);
 221         &bswap      (@T[3]);
 222 
 223         &call       ("_x86_Camellia_encrypt");
 224 
 225         &mov        ("esp",$_esp);
 226         &bswap      (@T[0]);
 227         &mov        ($idx,&wparam(1));  # load ciphertext pointer
 228         &bswap      (@T[1]);
 229         &bswap      (@T[2]);
 230         &bswap      (@T[3]);
 231         &mov        (&DWP(0,$idx),@T[0]);       # write ciphertext
 232         &mov        (&DWP(4,$idx),@T[1]);
 233         &mov        (&DWP(8,$idx),@T[2]);
 234         &mov        (&DWP(12,$idx),@T[3]);
 235 &function_end("Camellia_encrypt");
 236 }
 237 
 238 &function_begin_B("_x86_Camellia_encrypt");
 239         &xor        (@T[0],&DWP(0,$key));       # ^=key[0-3]
 240         &xor        (@T[1],&DWP(4,$key));
 241         &xor        (@T[2],&DWP(8,$key));
 242         &xor        (@T[3],&DWP(12,$key));
 243         &mov        ($idx,&DWP(16,$key));       # prefetch key[4]
 244 
 245         &mov        ($__s0,@T[0]);          # save s[0-3]
 246         &mov        ($__s1,@T[1]);
 247         &mov        ($__s2,@T[2]);
 248         &mov        ($__s3,@T[3]);
 249 
 250 &set_label("loop",16);
 251         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }
 252 
 253         &add        ($key,16*4);
 254         &cmp        ($key,$__end);
 255         &je (&label("done"));
 256 
 257         # @T[0-1] are preloaded, $idx is preloaded with key[0]
 258         &and        ($idx,@T[0]);
 259          &mov        (@T[3],$__s3);
 260         &rotl       ($idx,1);
 261          &mov        (@T[2],@T[3]);
 262         &xor        (@T[1],$idx);
 263          &or         (@T[2],&DWP(12,$key));
 264         &mov        ($__s1,@T[1]);          # s1^=LeftRotate(s0&key[0],1);
 265          &xor        (@T[2],$__s2);
 266 
 267         &mov        ($idx,&DWP(4,$key));
 268          &mov        ($__s2,@T[2]);         # s2^=s3|key[3];
 269         &or ($idx,@T[1]);
 270          &and        (@T[2],&DWP(8,$key));
 271         &xor        (@T[0],$idx);
 272          &rotl       (@T[2],1);
 273         &mov        ($__s0,@T[0]);          # s0^=s1|key[1];
 274          &xor        (@T[3],@T[2]);
 275         &mov        ($idx,&DWP(16,$key));               # prefetch key[4]
 276          &mov        ($__s3,@T[3]);         # s3^=LeftRotate(s2&key[2],1);
 277         &jmp        (&label("loop"));
 278 
 279 &set_label("done",8);
 280         &mov        (@T[2],@T[0]);          # SwapHalf
 281         &mov        (@T[3],@T[1]);
 282         &mov        (@T[0],$__s2);
 283         &mov        (@T[1],$__s3);
 284         &xor        (@T[0],$idx);           # $idx is preloaded with key[0]
 285         &xor        (@T[1],&DWP(4,$key));
 286         &xor        (@T[2],&DWP(8,$key));
 287         &xor        (@T[3],&DWP(12,$key));
 288         &ret        ();
 289 &function_end_B("_x86_Camellia_encrypt");
 290 
 291 # void Camellia_DecryptBlock_Rounds(
 292 #               int grandRounds,
 293 #               const Byte ciphertext[],
 294 #               const KEY_TABLE_TYPE keyTable,
 295 #               Byte plaintext[])
 296 &function_begin("Camellia_DecryptBlock_Rounds");
 297         &mov        ("eax",&wparam(0)); # load grandRounds
 298         &mov        ($idx,&wparam(1));  # load ciphertext pointer
 299         &mov        ($key,&wparam(2));  # load key schedule pointer
 300 
 301         &mov        ("ebx","esp");
 302         &sub        ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
 303         &and        ("esp",-64);
 304 
 305         # place stack frame just "above mod 1024" the key schedule
 306         # this ensures that cache associativity of 2 suffices
 307         &lea        ("ecx",&DWP(-64-63,$key));
 308         &sub        ("ecx","esp");
 309         &neg        ("ecx");
 310         &and        ("ecx",0x3C0);  # modulo 1024, but aligned to cache-line
 311         &sub        ("esp","ecx");
 312         &add        ("esp",4);      # 4 is reserved for callee's return address
 313 
 314         &shl        ("eax",6);
 315         &mov        (&DWP(4*4,"esp"),$key);     # save keyStart
 316         &lea        ($key,&DWP(0,$key,"eax"));
 317         &mov        (&DWP(5*4,"esp"),"ebx");# save %esp
 318 
 319         &call       (&label("pic_point"));
 320         &set_label("pic_point");
 321         &blindpop($Tbl);
 322         &lea        ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 323 
 324         &mov        (@T[0],&DWP(0,$idx));       # load ciphertext
 325         &mov        (@T[1],&DWP(4,$idx));
 326         &mov        (@T[2],&DWP(8,$idx));
 327         &bswap      (@T[0]);
 328         &mov        (@T[3],&DWP(12,$idx));
 329         &bswap      (@T[1]);
 330         &bswap      (@T[2]);
 331         &bswap      (@T[3]);
 332 
 333         &call       ("_x86_Camellia_decrypt");
 334 
 335         &mov        ("esp",&DWP(5*4,"esp"));
 336         &bswap      (@T[0]);
 337         &mov        ($idx,&wparam(3));  # load plaintext pointer
 338         &bswap      (@T[1]);
 339         &bswap      (@T[2]);
 340         &bswap      (@T[3]);
 341         &mov        (&DWP(0,$idx),@T[0]);       # write plaintext
 342         &mov        (&DWP(4,$idx),@T[1]);
 343         &mov        (&DWP(8,$idx),@T[2]);
 344         &mov        (&DWP(12,$idx),@T[3]);
 345 &function_end("Camellia_DecryptBlock_Rounds");
 346 # V1.x API
 347 &function_begin_B("Camellia_DecryptBlock");
 348         &mov        ("eax",128);
 349         &sub        ("eax",&wparam(0)); # load keyBitLength
 350         &mov        ("eax",3);
 351         &adc        ("eax",0);              # keyBitLength==128?3:4
 352         &mov        (&wparam(0),"eax");
 353         &jmp        (&label("Camellia_DecryptBlock_Rounds"));
 354 &function_end_B("Camellia_DecryptBlock");
 355 
 356 if ($OPENSSL) {
 357 # void Camellia_decrypt(
 358 #               const unsigned char *in,
 359 #               unsigned char *out,
 360 #               const CAMELLIA_KEY *key)
 361 &function_begin("Camellia_decrypt");
 362         &mov        ($idx,&wparam(0));  # load ciphertext pointer
 363         &mov        ($key,&wparam(2));  # load key schedule pointer
 364 
 365         &mov        ("ebx","esp");
 366         &sub        ("esp",7*4);            # place for s[0-3],keyEnd,esp and ra
 367         &and        ("esp",-64);
 368         &mov        ("eax",&DWP(272,$key));     # load grandRounds counter
 369 
 370         # place stack frame just "above mod 1024" the key schedule
 371         # this ensures that cache associativity of 2 suffices
 372         &lea        ("ecx",&DWP(-64-63,$key));
 373         &sub        ("ecx","esp");
 374         &neg        ("ecx");
 375         &and        ("ecx",0x3C0);  # modulo 1024, but aligned to cache-line
 376         &sub        ("esp","ecx");
 377         &add        ("esp",4);      # 4 is reserved for callee's return address
 378 
 379         &shl        ("eax",6);
 380         &mov        (&DWP(4*4,"esp"),$key);     # save keyStart
 381         &lea        ($key,&DWP(0,$key,"eax"));
 382         &mov        (&DWP(5*4,"esp"),"ebx");# save %esp
 383 
 384         &call       (&label("pic_point"));
 385         &set_label("pic_point");
 386         &blindpop($Tbl);
 387         &lea        ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 388 
 389         &mov        (@T[0],&DWP(0,$idx));       # load ciphertext
 390         &mov        (@T[1],&DWP(4,$idx));
 391         &mov        (@T[2],&DWP(8,$idx));
 392         &bswap      (@T[0]);
 393         &mov        (@T[3],&DWP(12,$idx));
 394         &bswap      (@T[1]);
 395         &bswap      (@T[2]);
 396         &bswap      (@T[3]);
 397 
 398         &call       ("_x86_Camellia_decrypt");
 399 
 400         &mov        ("esp",&DWP(5*4,"esp"));
 401         &bswap      (@T[0]);
 402         &mov        ($idx,&wparam(1));  # load plaintext pointer
 403         &bswap      (@T[1]);
 404         &bswap      (@T[2]);
 405         &bswap      (@T[3]);
 406         &mov        (&DWP(0,$idx),@T[0]);       # write plaintext
 407         &mov        (&DWP(4,$idx),@T[1]);
 408         &mov        (&DWP(8,$idx),@T[2]);
 409         &mov        (&DWP(12,$idx),@T[3]);
 410 &function_end("Camellia_decrypt");
 411 }
 412 
 413 &function_begin_B("_x86_Camellia_decrypt");
 414         &xor        (@T[0],&DWP(0,$key));       # ^=key[0-3]
 415         &xor        (@T[1],&DWP(4,$key));
 416         &xor        (@T[2],&DWP(8,$key));
 417         &xor        (@T[3],&DWP(12,$key));
 418         &mov        ($idx,&DWP(-8,$key));       # prefetch key[-2]
 419 
 420         &mov        ($__s0,@T[0]);          # save s[0-3]
 421         &mov        ($__s1,@T[1]);
 422         &mov        ($__s2,@T[2]);
 423         &mov        ($__s3,@T[3]);
 424 
 425 &set_label("loop",16);
 426         for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }
 427 
 428         &sub        ($key,16*4);
 429         &cmp        ($key,$__end);
 430         &je (&label("done"));
 431 
 432         # @T[0-1] are preloaded, $idx is preloaded with key[2]
 433         &and        ($idx,@T[0]);
 434          &mov        (@T[3],$__s3);
 435         &rotl       ($idx,1);
 436          &mov        (@T[2],@T[3]);
 437         &xor        (@T[1],$idx);
 438          &or         (@T[2],&DWP(4,$key));
 439         &mov        ($__s1,@T[1]);          # s1^=LeftRotate(s0&key[0],1);
 440          &xor        (@T[2],$__s2);
 441 
 442         &mov        ($idx,&DWP(12,$key));
 443          &mov        ($__s2,@T[2]);         # s2^=s3|key[3];
 444         &or ($idx,@T[1]);
 445          &and        (@T[2],&DWP(0,$key));
 446         &xor        (@T[0],$idx);
 447          &rotl       (@T[2],1);
 448         &mov        ($__s0,@T[0]);          # s0^=s1|key[1];
 449          &xor        (@T[3],@T[2]);
 450         &mov        ($idx,&DWP(-8,$key));       # prefetch key[4]
 451          &mov        ($__s3,@T[3]);         # s3^=LeftRotate(s2&key[2],1);
 452         &jmp        (&label("loop"));
 453 
 454 &set_label("done",8);
 455         &mov        (@T[2],@T[0]);          # SwapHalf
 456         &mov        (@T[3],@T[1]);
 457         &mov        (@T[0],$__s2);
 458         &mov        (@T[1],$__s3);
 459         &xor        (@T[2],$idx);           # $idx is preloaded with key[2]
 460         &xor        (@T[3],&DWP(12,$key));
 461         &xor        (@T[0],&DWP(0,$key));
 462         &xor        (@T[1],&DWP(4,$key));
 463         &ret        ();
 464 &function_end_B("_x86_Camellia_decrypt");
 465 
 466 # shld is very slow on Intel P4 family. Even on AMD it limits
 467 # instruction decode rate [because it's VectorPath] and consequently
 468 # performance. PIII, PM and Core[2] seem to be the only ones which
 469 # execute this code ~7% faster...
 470 sub __rotl128 {
 471   my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
 472 
 473     $rnd *= 2;
 474     if ($rot) {
 475         &mov        ($idx,$i0);
 476         &shld       ($i0,$i1,$rot);
 477         &shld       ($i1,$i2,$rot);
 478         &shld       ($i2,$i3,$rot);
 479         &shld       ($i3,$idx,$rot);
 480     }
 481     &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i0 eq @T[0]);
 482     &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i1 eq @T[0]);
 483     &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i2 eq @T[0]);
 484     &mov    (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i3 eq @T[0]);
 485 }
 486 
 487 # ... Implementing 128-bit rotate without shld gives >3x performance
 488 # improvement on P4, only ~7% degradation on other Intel CPUs and
 489 # not worse performance on AMD. This is therefore preferred.
 490 sub _rotl128 {
 491   my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;
 492 
 493     $rnd *= 2;
 494     if ($rot) {
 495         &mov        ($Tbl,$i0);
 496         &shl        ($i0,$rot);
 497         &mov        ($idx,$i1);
 498         &shr        ($idx,32-$rot);
 499         &shl        ($i1,$rot);
 500         &or ($i0,$idx);
 501         &mov        ($idx,$i2);
 502         &shl        ($i2,$rot);
 503         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i0 eq @T[0]);
 504         &shr        ($idx,32-$rot);
 505         &or ($i1,$idx);
 506         &shr        ($Tbl,32-$rot);
 507         &mov        ($idx,$i3);
 508         &shr        ($idx,32-$rot);
 509         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i1 eq @T[0]);
 510         &shl        ($i3,$rot);
 511         &or ($i2,$idx);
 512         &or ($i3,$Tbl);
 513         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i2 eq @T[0]);
 514         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i3 eq @T[0]);
 515     } else {
 516         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i0 eq @T[0]);
 517         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i1 eq @T[0]);
 518         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i2 eq @T[0]);
 519         &mov        (&DWP(-128+4*$rnd++,$key),shift(@T))        if ($i3 eq @T[0]);
 520     }
 521 }
 522 
 523 sub _saveround {
 524 my ($rnd,$key,@T)=@_;
 525 my $bias=int(@T[0])?shift(@T):0;
 526 
 527         &mov        (&DWP($bias+$rnd*8+0,$key),@T[0]);
 528         &mov        (&DWP($bias+$rnd*8+4,$key),@T[1])   if ($#T>=1);
 529         &mov        (&DWP($bias+$rnd*8+8,$key),@T[2])   if ($#T>=2);
 530         &mov        (&DWP($bias+$rnd*8+12,$key),@T[3])  if ($#T>=3);
 531 }
 532 
 533 sub _loadround {
 534 my ($rnd,$key,@T)=@_;
 535 my $bias=int(@T[0])?shift(@T):0;
 536 
 537         &mov        (@T[0],&DWP($bias+$rnd*8+0,$key));
 538         &mov        (@T[1],&DWP($bias+$rnd*8+4,$key))   if ($#T>=1);
 539         &mov        (@T[2],&DWP($bias+$rnd*8+8,$key))   if ($#T>=2);
 540         &mov        (@T[3],&DWP($bias+$rnd*8+12,$key))  if ($#T>=3);
 541 }
 542 
 543 # void Camellia_Ekeygen(
 544 #               const int keyBitLength,
 545 #               const Byte *rawKey,
 546 #               KEY_TABLE_TYPE keyTable)
 547 &function_begin("Camellia_Ekeygen");
 548 { my $step=0;
 549 
 550         &stack_push(4);                             # place for s[0-3]
 551 
 552         &mov        ($Tbl,&wparam(0));          # load arguments
 553         &mov        ($idx,&wparam(1));
 554         &mov        ($key,&wparam(2));
 555 
 556         &mov        (@T[0],&DWP(0,$idx));               # load 0-127 bits
 557         &mov        (@T[1],&DWP(4,$idx));
 558         &mov        (@T[2],&DWP(8,$idx));
 559         &mov        (@T[3],&DWP(12,$idx));
 560 
 561         &bswap      (@T[0]);
 562         &bswap      (@T[1]);
 563         &bswap      (@T[2]);
 564         &bswap      (@T[3]);
 565 
 566         &_saveround (0,$key,@T);            # KL<<<0
 567 
 568         &cmp        ($Tbl,128);
 569         &je (&label("1st128"));
 570 
 571         &mov        (@T[0],&DWP(16,$idx));              # load 128-191 bits
 572         &mov        (@T[1],&DWP(20,$idx));
 573         &cmp        ($Tbl,192);
 574         &je (&label("1st192"));
 575         &mov        (@T[2],&DWP(24,$idx));              # load 192-255 bits
 576         &mov        (@T[3],&DWP(28,$idx));
 577         &jmp        (&label("1st256"));
 578 &set_label("1st192",4);
 579         &mov        (@T[2],@T[0]);
 580         &mov        (@T[3],@T[1]);
 581         &not        (@T[2]);
 582         &not        (@T[3]);
 583 &set_label("1st256",4);
 584         &bswap      (@T[0]);
 585         &bswap      (@T[1]);
 586         &bswap      (@T[2]);
 587         &bswap      (@T[3]);
 588 
 589         &_saveround (4,$key,@T);            # temporary storage for KR!
 590 
 591         &xor        (@T[0],&DWP(0*8+0,$key));   # KR^KL
 592         &xor        (@T[1],&DWP(0*8+4,$key));
 593         &xor        (@T[2],&DWP(1*8+0,$key));
 594         &xor        (@T[3],&DWP(1*8+4,$key));
 595 
 596 &set_label("1st128",4);
 597         &call       (&label("pic_point"));
 598         &set_label("pic_point");
 599         &blindpop($Tbl);
 600         &lea        ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 601         &lea        ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));
 602 
 603         &mov        ($idx,&DWP($step*8,$key));  # prefetch SIGMA[0]
 604         &mov        (&swtmp(0),@T[0]);          # save s[0-3]
 605         &mov        (&swtmp(1),@T[1]);
 606         &mov        (&swtmp(2),@T[2]);
 607         &mov        (&swtmp(3),@T[3]);
 608         &Camellia_Feistel($step++);
 609         &Camellia_Feistel($step++);
 610         &mov        (@T[2],&swtmp(2));
 611         &mov        (@T[3],&swtmp(3));
 612 
 613         &mov        ($idx,&wparam(2));
 614         &xor        (@T[0],&DWP(0*8+0,$idx));   # ^KL
 615         &xor        (@T[1],&DWP(0*8+4,$idx));
 616         &xor        (@T[2],&DWP(1*8+0,$idx));
 617         &xor        (@T[3],&DWP(1*8+4,$idx));
 618 
 619         &mov        ($idx,&DWP($step*8,$key));  # prefetch SIGMA[4]
 620         &mov        (&swtmp(0),@T[0]);          # save s[0-3]
 621         &mov        (&swtmp(1),@T[1]);
 622         &mov        (&swtmp(2),@T[2]);
 623         &mov        (&swtmp(3),@T[3]);
 624         &Camellia_Feistel($step++);
 625         &Camellia_Feistel($step++);
 626         &mov        (@T[2],&swtmp(2));
 627         &mov        (@T[3],&swtmp(3));
 628 
 629         &mov        ($idx,&wparam(0));
 630         &cmp        ($idx,128);
 631         &jne        (&label("2nd256"));
 632 
 633         &mov        ($key,&wparam(2));
 634         &lea        ($key,&DWP(128,$key));              # size optimization
 635 
 636         ####### process KA
 637         &_saveround (2,$key,-128,@T);       # KA<<<0
 638         &_rotl128   (@T,15,6,@T);           # KA<<<15
 639         &_rotl128   (@T,15,8,@T);           # KA<<<(15+15=30)
 640         &_rotl128   (@T,15,12,@T[0],@T[1]); # KA<<<(30+15=45)
 641         &_rotl128   (@T,15,14,@T);          # KA<<<(45+15=60)
 642         push            (@T,shift(@T));         # rotl128(@T,32);
 643         &_rotl128   (@T,2,20,@T);           # KA<<<(60+32+2=94)
 644         &_rotl128   (@T,17,24,@T);          # KA<<<(94+17=111)
 645 
 646         ####### process KL
 647         &_loadround (0,$key,-128,@T);       # load KL
 648         &_rotl128   (@T,15,4,@T);           # KL<<<15
 649         &_rotl128   (@T,30,10,@T);          # KL<<<(15+30=45)
 650         &_rotl128   (@T,15,13,@T[2],@T[3]); # KL<<<(45+15=60)
 651         &_rotl128   (@T,17,16,@T);          # KL<<<(60+17=77)
 652         &_rotl128   (@T,17,18,@T);          # KL<<<(77+17=94)
 653         &_rotl128   (@T,17,22,@T);          # KL<<<(94+17=111)
 654 
 655         while (@T[0] ne "eax")                  # restore order
 656         {   unshift     (@T,pop(@T));   }
 657 
 658         &mov        ("eax",3);                      # 3 grandRounds
 659         &jmp        (&label("done"));
 660 
 661 &set_label("2nd256",16);
 662         &mov        ($idx,&wparam(2));
 663         &_saveround (6,$idx,@T);            # temporary storage for KA!
 664 
 665         &xor        (@T[0],&DWP(4*8+0,$idx));   # KA^KR
 666         &xor        (@T[1],&DWP(4*8+4,$idx));
 667         &xor        (@T[2],&DWP(5*8+0,$idx));
 668         &xor        (@T[3],&DWP(5*8+4,$idx));
 669 
 670         &mov        ($idx,&DWP($step*8,$key));  # prefetch SIGMA[8]
 671         &mov        (&swtmp(0),@T[0]);          # save s[0-3]
 672         &mov        (&swtmp(1),@T[1]);
 673         &mov        (&swtmp(2),@T[2]);
 674         &mov        (&swtmp(3),@T[3]);
 675         &Camellia_Feistel($step++);
 676         &Camellia_Feistel($step++);
 677         &mov        (@T[2],&swtmp(2));
 678         &mov        (@T[3],&swtmp(3));
 679 
 680         &mov        ($key,&wparam(2));
 681         &lea        ($key,&DWP(128,$key));              # size optimization
 682 
 683         ####### process KB
 684         &_saveround (2,$key,-128,@T);       # KB<<<0
 685         &_rotl128   (@T,30,10,@T);          # KB<<<30
 686         &_rotl128   (@T,30,20,@T);          # KB<<<(30+30=60)
 687         push            (@T,shift(@T));         # rotl128(@T,32);
 688         &_rotl128   (@T,19,32,@T);          # KB<<<(60+32+19=111)
 689 
 690         ####### process KR
 691         &_loadround (4,$key,-128,@T);       # load KR
 692         &_rotl128   (@T,15,4,@T);           # KR<<<15
 693         &_rotl128   (@T,15,8,@T);           # KR<<<(15+15=30)
 694         &_rotl128   (@T,30,18,@T);          # KR<<<(30+30=60)
 695         push            (@T,shift(@T));         # rotl128(@T,32);
 696         &_rotl128   (@T,2,26,@T);           # KR<<<(60+32+2=94)
 697 
 698         ####### process KA
 699         &_loadround (6,$key,-128,@T);       # load KA
 700         &_rotl128   (@T,15,6,@T);           # KA<<<15
 701         &_rotl128   (@T,30,14,@T);          # KA<<<(15+30=45)
 702         push            (@T,shift(@T));         # rotl128(@T,32);
 703         &_rotl128   (@T,0,24,@T);           # KA<<<(45+32+0=77)
 704         &_rotl128   (@T,17,28,@T);          # KA<<<(77+17=94)
 705 
 706         ####### process KL
 707         &_loadround (0,$key,-128,@T);       # load KL
 708         push            (@T,shift(@T));         # rotl128(@T,32);
 709         &_rotl128   (@T,13,12,@T);          # KL<<<(32+13=45)
 710         &_rotl128   (@T,15,16,@T);          # KL<<<(45+15=60)
 711         &_rotl128   (@T,17,22,@T);          # KL<<<(60+17=77)
 712         push            (@T,shift(@T));         # rotl128(@T,32);
 713         &_rotl128   (@T,2,30,@T);           # KL<<<(77+32+2=111)
 714 
 715         while (@T[0] ne "eax")                  # restore order
 716         {   unshift     (@T,pop(@T));   }
 717 
 718         &mov        ("eax",4);                      # 4 grandRounds
 719 &set_label("done");
 720         &lea        ("edx",&DWP(272-128,$key)); # end of key schedule
 721         &stack_pop(4);
 722 }
 723 &function_end("Camellia_Ekeygen");
 724 
 725 if ($OPENSSL) {
     # Emitted only when $OPENSSL is true: an argument-checking front end to
     # Camellia_Ekeygen.  The value left in eax on return is
     #    0   key schedule generated and grandRounds stored,
     #   -1   userKey or key is NULL,
     #   -2   bits is not one of 128, 192 or 256.
 726 # int private_Camellia_set_key (
 727 #               const unsigned char *userKey,
 728 #               int bits,
 729 #               CAMELLIA_KEY *key)
 730 &function_begin_B("private_Camellia_set_key");
             # ebx is used for "bits" below, so the caller's value is saved
             # here and restored at "done".
 731         &push       ("ebx");
 732         &mov        ("ecx",&wparam(0)); # pull arguments
 733         &mov        ("ebx",&wparam(1));
 734         &mov        ("edx",&wparam(2));
 735
             # eax is preloaded with the error code before each group of
             # checks, so a failing check can branch straight to "done".
 736         &mov        ("eax",-1);
 737         &test       ("ecx","ecx");
 738         &jz (&label("done"));   # userKey==NULL?
 739         &test       ("edx","edx");
 740         &jz (&label("done"));   # key==NULL?
 741
 742         &mov        ("eax",-2);
 743         &cmp        ("ebx",256);
 744         &je (&label("arg_ok")); # bits==256?
 745         &cmp        ("ebx",192);
 746         &je (&label("arg_ok")); # bits==192?
 747         &cmp        ("ebx",128);
 748         &jne        (&label("done"));   # bits!=128?
 749 &set_label("arg_ok",4);
 750
             # The arguments are pushed last-to-first, i.e. Camellia_Ekeygen is
             # called as (bits, userKey, key); the three words are discarded
             # again by stack_pop(3) after the call.
 751         &push       ("edx");                # push arguments
 752         &push       ("ecx");
 753         &push       ("ebx");
 754         &call       ("Camellia_Ekeygen");
 755         &stack_pop(3);
 756
 757         # eax holds grandRounds and edx points at where to put it
 758         &mov        (&DWP(0,"edx"),"eax");
 759         &xor        ("eax","eax");      # return 0 for success
 760 &set_label("done",4);
 761         &pop        ("ebx");
 762         &ret        ();
 763 &function_end_B("private_Camellia_set_key");
 764 }
 765 
 766 @SBOX=(
 767 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 768  35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
 769 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
 770 166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
 771 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
 772 223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 773  20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
 774 254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
 775 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 776  16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
 777 135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 778  82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
 779 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
 780 120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
 781 114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 782  64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
 783 
 784 sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
 785 sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
 786 sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
 787 sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }
 788 
 789 &set_label("Camellia_SIGMA",64);
 790 &data_word(
 791     0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
 792     0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
 793     0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
 794     0,          0,          0,          0);
 795 &set_label("Camellia_SBOX",64);
 796 # tables are interleaved, remember?
 797 for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
 798 for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }
 799 
 800 # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
 801 #                       size_t length, const CAMELLIA_KEY *key,
 802 #                       unsigned char *ivp,const int enc);
     #
     # CBC driver built on the private _x86_Camellia_encrypt/_decrypt routines.
     # Data is processed in 16-byte blocks; enc!=0 selects encryption, enc==0
     # decryption.  A zero length returns immediately.  The chaining value is
     # read from ivp and the updated value is written back to it.
     # Handling of a trailing partial block (length not a multiple of 16):
     #   encrypt - the block is zero-padded to 16 bytes and then encrypted,
     #             so a full 16 bytes are written to out;
     #   decrypt - only the remaining bytes are delivered to out.
 803 {
 804 # stack frame layout
     # (two offset columns: the left one matches the offsets used in this
     #  function; the right one is the same slot shifted by the 4-byte return
     #  address, presumably as seen by the called _x86_Camellia_* routines)
 805 #             -4(%esp)          # return address         0(%esp)
 806 #              0(%esp)          # s0                     4(%esp)
 807 #              4(%esp)          # s1                     8(%esp)
 808 #              8(%esp)          # s2                    12(%esp)
 809 #             12(%esp)          # s3                    16(%esp)
 810 #             16(%esp)          # end of key schedule   20(%esp)
 811 #             20(%esp)          # %esp backup
     # $_end and $_esp, used below, are defined outside this block
     # (presumably the 16(%esp) and 20(%esp) slots listed above).
 812 my $_inp=&DWP(24,"esp");    #copy of wparam(0)
 813 my $_out=&DWP(28,"esp");    #copy of wparam(1)
 814 my $_len=&DWP(32,"esp");    #copy of wparam(2)
 815 my $_key=&DWP(36,"esp");    #copy of wparam(3)
 816 my $_ivp=&DWP(40,"esp");    #copy of wparam(4)
 817 my $ivec=&DWP(44,"esp");    #ivec[16]
 818 my $_tmp=&DWP(44,"esp");    #volatile variable [yes, aliases with ivec]
 819 my ($s0,$s1,$s2,$s3) = @T;
 820
     # Operands written as ($reg eq "ecx" ? $reg : "") and similar act as
     # build-time assertions: the rep movsb/stosb sequences below need the
     # count in ecx, the source in esi and the destination in edi, and an empty
     # operand is generated if the register assignment ever stops matching.
 821 &function_begin("Camellia_cbc_encrypt");
 822         &mov        ($s2 eq "ecx"? $s2 : "",&wparam(2));        # load len
 823         &cmp        ($s2,0);
 824         &je (&label("enc_out"));        # nothing to do for length==0
 825
             # save the caller's flags and clear the direction flag so that the
             # string instructions used for partial blocks move forward
 826         &pushf      ();
 827         &cld        ();
 828
 829         &mov        ($s0,&wparam(0));   # load inp
 830         &mov        ($s1,&wparam(1));   # load out
 831         #&mov       ($s2,&wparam(2));   # load len
 832         &mov        ($s3,&wparam(3));   # load key
 833         &mov        ($Tbl,&wparam(4));  # load ivp
 834
 835         # allocate aligned stack frame...
 836         &lea        ($idx,&DWP(-64,"esp"));
 837         &and        ($idx,-64);
 838
 839         # place stack frame just "above mod 1024" the key schedule
 840         # this ensures that cache associativity of 2 suffices
             # ($key = ($idx - (key-64-63)) & 0x3C0 is the amount by which the
             #  candidate frame address in $idx is lowered)
 841         &lea        ($key,&DWP(-64-63,$s3));
 842         &sub        ($key,$idx);
 843         &neg        ($key);
 844         &and        ($key,0x3C0);   # modulo 1024, but aligned to cache-line
 845         &sub        ($idx,$key);
 846
 847         &mov        ($key,&wparam(5));  # load enc
 848
             # switch to the new frame; $idx now carries the caller's esp
 849         &exch       ("esp",$idx);
 850         &add        ("esp",4);              # reserve for return address!
 851         &mov        ($_esp,$idx);           # save %esp
 852
 853         &mov        ($_inp,$s0);            # save copy of inp
 854         &mov        ($_out,$s1);            # save copy of out
 855         &mov        ($_len,$s2);            # save copy of len
 856         &mov        ($_key,$s3);            # save copy of key
 857         &mov        ($_ivp,$Tbl);           # save copy of ivp
 858
             # the call pushes the address of pic_point, which is popped into
             # $Tbl and turned into the run-time address of Camellia_SBOX
 859         &call   (&label("pic_point"));  # make it PIC!
 860         &set_label("pic_point");
 861         &blindpop($Tbl);
 862         &lea    ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
 863
             # touch the table: 32 iterations, each reading one dword at every
             # 32nd byte of a 128-byte stretch, i.e. 4096 bytes are walked;
             # the loaded values themselves are discarded
 864         &mov        ($idx,32);
 865         &set_label("prefetch_sbox",4);
 866                 &mov        ($s0,&DWP(0,$Tbl));
 867                 &mov        ($s1,&DWP(32,$Tbl));
 868                 &mov        ($s2,&DWP(64,$Tbl));
 869                 &mov        ($s3,&DWP(96,$Tbl));
 870                 &lea        ($Tbl,&DWP(128,$Tbl));
 871                 &dec        ($idx);
 872         &jnz        (&label("prefetch_sbox"));
 873         &mov        ($s0,$_key);
 874         &sub        ($Tbl,4096);            # rewind $Tbl to the table start
 875         &mov        ($idx,$_inp);
 876         &mov        ($s3,&DWP(272,$s0));                # load grandRounds
 877
 878         &cmp        ($key,0);               # $key still holds enc here
 879         &je (&label("DECRYPT"));
 880
     #----------------------------- ENCRYPT -----------------------------#
 881         &mov        ($s2,$_len);
 882         &mov        ($key,$_ivp);
             # end of key schedule = key + grandRounds*64
 883         &shl        ($s3,6);
 884         &lea        ($s3,&DWP(0,$s0,$s3));
 885         &mov        ($_end,$s3);
 886
 887         &test       ($s2,0xFFFFFFF0);
 888         &jz (&label("enc_tail"));               # short input...
 889
 890         &mov        ($s0,&DWP(0,$key));         # load iv
 891         &mov        ($s1,&DWP(4,$key));
 892
             # Loop invariant on entry: $s0/$s1 hold the first two words of the
             # chaining value and $key points at a buffer whose words 2 and 3
             # are the rest of it (the user iv at first, afterwards the output
             # block written last); $idx points at the input block.
 893         &set_label("enc_loop",4);
 894                 &mov        ($s2,&DWP(8,$key));
 895                 &mov        ($s3,&DWP(12,$key));
 896
 897                 &xor        ($s0,&DWP(0,$idx)); # xor input data
 898                 &xor        ($s1,&DWP(4,$idx));
 899                 &xor        ($s2,&DWP(8,$idx));
 900                 &bswap      ($s0);
 901                 &xor        ($s3,&DWP(12,$idx));
 902                 &bswap      ($s1);
 903                 &mov        ($key,$_key);           # load key
 904                 &bswap      ($s2);
 905                 &bswap      ($s3);
 906
 907                 &call       ("_x86_Camellia_encrypt");
 908
 909                 &mov        ($idx,$_inp);           # load inp
 910                 &mov        ($key,$_out);           # load out
 911
 912                 &bswap      ($s0);
 913                 &bswap      ($s1);
 914                 &bswap      ($s2);
 915                 &mov        (&DWP(0,$key),$s0); # save output data
 916                 &bswap      ($s3);
 917                 &mov        (&DWP(4,$key),$s1);
 918                 &mov        (&DWP(8,$key),$s2);
 919                 &mov        (&DWP(12,$key),$s3);
 920
 921                 &mov        ($s2,$_len);            # load len
 922
 923                 &lea        ($idx,&DWP(16,$idx));
 924                 &mov        ($_inp,$idx);           # save inp
 925
                     # $key keeps pointing at the block just written; only the
                     # saved copy of out is advanced
 926                 &lea        ($s3,&DWP(16,$key));
 927                 &mov        ($_out,$s3);            # save out
 928
 929                 &sub        ($s2,16);
 930                 &test       ($s2,0xFFFFFFF0);
 931                 &mov        ($_len,$s2);            # save len
 932         &jnz        (&label("enc_loop"));       # at least 16 bytes left
 933         &test       ($s2,15);
 934         &jnz        (&label("enc_tail"));       # 1..15 bytes left
 935         &mov        ($idx,$_ivp);           # load ivp
 936         &mov        ($s2,&DWP(8,$key)); # restore last dwords
 937         &mov        ($s3,&DWP(12,$key));
 938         &mov        (&DWP(0,$idx),$s0); # save ivec
 939         &mov        (&DWP(4,$idx),$s1);
 940         &mov        (&DWP(8,$idx),$s2);
 941         &mov        (&DWP(12,$idx),$s3);
 942
 943         &mov        ("esp",$_esp);
 944         &popf       ();
 945     &set_label("enc_out");
 946         &function_end_A();
             # NOTE(review): presumably re-balances the generator's stack
             # bookkeeping for the code that follows -- confirm in x86asm.pl
 947         &pushf      ();                     # kludge, never executed
 948
             # Partial final block.  On entry $s2(ecx) is the number of bytes
             # left (1..15), $idx(esi) the input pointer and $key(edi) the
             # pointer to the current chaining value.  The bytes are copied to
             # out (unless processing in place), the rest of the 16-byte block
             # in out is zeroed, and that block is run through enc_loop once.
 949     &set_label("enc_tail",4);
 950         &mov        ($s0,$key eq "edi" ? $key : "");
 951         &mov        ($key,$_out);                   # load out
 952         &push       ($s0);                          # push ivp
 953         &mov        ($s1,16);
 954         &sub        ($s1,$s2);                      # number of pad bytes
 955         &cmp        ($key,$idx);                    # compare with inp
 956         &je (&label("enc_in_place"));
 957         &align      (4);
             # bytes 89 F6 F3 A4: a two-byte "mov %esi,%esi" filler followed by
             # rep movsb (copies ecx bytes from esi to edi)
 958         &data_word(0xA4F3F689);     # rep movsb     # copy input
 959         &jmp        (&label("enc_skip_in_place"));
 960     &set_label("enc_in_place");
 961         &lea        ($key,&DWP(0,$key,$s2));        # skip over the data
 962     &set_label("enc_skip_in_place");
 963         &mov        ($s2,$s1);
 964         &xor        ($s0,$s0);
 965         &align      (4);
             # bytes 89 F6 F3 AA: the same filler followed by rep stosb
             # (stores al, zero here, ecx times at edi)
 966         &data_word(0xAAF3F689);     # rep stosb     # zero tail
 967         &pop        ($key);                         # pop ivp
 968
 969         &mov        ($idx,$_out);                   # output as input
 970         &mov        ($s0,&DWP(0,$key));
 971         &mov        ($s1,&DWP(4,$key));
 972         &mov        ($_len,16);                     # len=16
 973         &jmp        (&label("enc_loop"));               # one more spin...
 974
 975 #----------------------------- DECRYPT -----------------------------#
     # On entry $s0 = key, $s3 = grandRounds, $idx = inp.  Note that the roles
     # of the two slots are swapped relative to encryption: $_key receives
     # key + grandRounds*64 and $_end receives the start of the schedule.
 976 &set_label("DECRYPT",16);
 977         &shl        ($s3,6);
 978         &lea        ($s3,&DWP(0,$s0,$s3));
 979         &mov        ($_end,$s0);
 980         &mov        ($_key,$s3);
 981
 982         &cmp        ($idx,$_out);
 983         &je (&label("dec_in_place"));   # in-place processing...
 984
             # Out-of-place: $_tmp tracks a pointer to the current chaining
             # value -- the user iv first, then the previous input block.
 985         &mov        ($key,$_ivp);                   # load ivp
 986         &mov        ($_tmp,$key);
 987
 988         &set_label("dec_loop",4);
 989                 &mov        ($s0,&DWP(0,$idx)); # read input
 990                 &mov        ($s1,&DWP(4,$idx));
 991                 &mov        ($s2,&DWP(8,$idx));
 992                 &bswap      ($s0);
 993                 &mov        ($s3,&DWP(12,$idx));
 994                 &bswap      ($s1);
 995                 &mov        ($key,$_key);           # load key
 996                 &bswap      ($s2);
 997                 &bswap      ($s3);
 998
 999                 &call       ("_x86_Camellia_decrypt");
1000
1001                 &mov        ($key,$_tmp);           # load ivp
1002                 &mov        ($idx,$_len);           # load len
1003
1004                 &bswap      ($s0);
1005                 &bswap      ($s1);
1006                 &bswap      ($s2);
1007                 &xor        ($s0,&DWP(0,$key)); # xor iv
1008                 &bswap      ($s3);
1009                 &xor        ($s1,&DWP(4,$key));
1010                 &xor        ($s2,&DWP(8,$key));
1011                 &xor        ($s3,&DWP(12,$key));
1012
                     # carry set: fewer than 16 bytes were left; the zero flag
                     # from this sub is also what the jnz at the loop bottom
                     # tests (only mov and lea are issued in between)
1013                 &sub        ($idx,16);
1014                 &jc (&label("dec_partial"));
1015                 &mov        ($_len,$idx);           # save len
1016                 &mov        ($idx,$_inp);           # load inp
1017                 &mov        ($key,$_out);           # load out
1018
1019                 &mov        (&DWP(0,$key),$s0); # write output
1020                 &mov        (&DWP(4,$key),$s1);
1021                 &mov        (&DWP(8,$key),$s2);
1022                 &mov        (&DWP(12,$key),$s3);
1023
                     # the input block just consumed is the next chaining value
1024                 &mov        ($_tmp,$idx);           # save ivp
1025                 &lea        ($idx,&DWP(16,$idx));
1026                 &mov        ($_inp,$idx);           # save inp
1027
1028                 &lea        ($key,&DWP(16,$key));
1029                 &mov        ($_out,$key);           # save out
1030
1031         &jnz        (&label("dec_loop"));
1032         &mov        ($key,$_tmp);           # load temp ivp
             # dec_end: copy the 16 bytes at $key to the user's iv buffer
1033     &set_label("dec_end");
1034         &mov        ($idx,$_ivp);           # load user ivp
1035         &mov        ($s0,&DWP(0,$key)); # load iv
1036         &mov        ($s1,&DWP(4,$key));
1037         &mov        ($s2,&DWP(8,$key));
1038         &mov        ($s3,&DWP(12,$key));
1039         &mov        (&DWP(0,$idx),$s0); # copy back to user
1040         &mov        (&DWP(4,$idx),$s1);
1041         &mov        (&DWP(8,$idx),$s2);
1042         &mov        (&DWP(12,$idx),$s3);
1043         &jmp        (&label("dec_out"));
1044
             # Partial final block, out-of-place.  $idx holds len-16 (negative),
             # so $idx+16 is the number of bytes still wanted.  The decrypted
             # block is parked in the ivec stack slot (this overwrites $_tmp,
             # which is no longer needed) and only that many bytes are copied.
1045     &set_label("dec_partial",4);
1046         &lea        ($key,$ivec);
1047         &mov        (&DWP(0,$key),$s0); # dump output to stack
1048         &mov        (&DWP(4,$key),$s1);
1049         &mov        (&DWP(8,$key),$s2);
1050         &mov        (&DWP(12,$key),$s3);
1051         &lea        ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));
1052         &mov        ($idx eq "esi" ? $idx : "",$key);
1053         &mov        ($key eq "edi" ? $key : "",$_out);      # load out
1054         &data_word(0xA4F3F689);     # rep movsb             # copy output
1055         &mov        ($key,$_inp);                           # use inp as temp ivp
1056         &jmp        (&label("dec_end"));
1057
             # In-place decryption: the input block is overwritten by its own
             # output, so it is first copied to the ivec stack slot and, once
             # the output has been written, copied from there into the user's
             # iv buffer (which is thus updated after every block).
1058     &set_label("dec_in_place",4);
1059         &set_label("dec_in_place_loop");
1060                 &lea        ($key,$ivec);
1061                 &mov        ($s0,&DWP(0,$idx)); # read input
1062                 &mov        ($s1,&DWP(4,$idx));
1063                 &mov        ($s2,&DWP(8,$idx));
1064                 &mov        ($s3,&DWP(12,$idx));
1065
1066                 &mov        (&DWP(0,$key),$s0); # copy to temp
1067                 &mov        (&DWP(4,$key),$s1);
1068                 &mov        (&DWP(8,$key),$s2);
1069                 &bswap      ($s0);
1070                 &mov        (&DWP(12,$key),$s3);
1071                 &bswap      ($s1);
1072                 &mov        ($key,$_key);           # load key
1073                 &bswap      ($s2);
1074                 &bswap      ($s3);
1075
1076                 &call       ("_x86_Camellia_decrypt");
1077
1078                 &mov        ($key,$_ivp);           # load ivp
1079                 &mov        ($idx,$_out);           # load out
1080
1081                 &bswap      ($s0);
1082                 &bswap      ($s1);
1083                 &bswap      ($s2);
1084                 &xor        ($s0,&DWP(0,$key)); # xor iv
1085                 &bswap      ($s3);
1086                 &xor        ($s1,&DWP(4,$key));
1087                 &xor        ($s2,&DWP(8,$key));
1088                 &xor        ($s3,&DWP(12,$key));
1089
1090                 &mov        (&DWP(0,$idx),$s0); # write output
1091                 &mov        (&DWP(4,$idx),$s1);
1092                 &mov        (&DWP(8,$idx),$s2);
1093                 &mov        (&DWP(12,$idx),$s3);
1094
1095                 &lea        ($idx,&DWP(16,$idx));
1096                 &mov        ($_out,$idx);           # save out
1097
1098                 &lea        ($idx,$ivec);
1099                 &mov        ($s0,&DWP(0,$idx)); # read temp
1100                 &mov        ($s1,&DWP(4,$idx));
1101                 &mov        ($s2,&DWP(8,$idx));
1102                 &mov        ($s3,&DWP(12,$idx));
1103
1104                 &mov        (&DWP(0,$key),$s0); # copy iv
1105                 &mov        (&DWP(4,$key),$s1);
1106                 &mov        (&DWP(8,$key),$s2);
1107                 &mov        (&DWP(12,$key),$s3);
1108
1109                 &mov        ($idx,$_inp);           # load inp
1110
1111                 &lea        ($idx,&DWP(16,$idx));
1112                 &mov        ($_inp,$idx);           # save inp
1113
1114                 &mov        ($s2,$_len);            # load len
1115                 &sub        ($s2,16);
1116                 &jc (&label("dec_in_place_partial"));
1117                 &mov        ($_len,$s2);            # save len
1118         &jnz        (&label("dec_in_place_loop"));
1119         &jmp        (&label("dec_out"));
1120
             # Partial final block, in place.  A full 16 bytes were written
             # above although fewer were requested; $s2 holds len-16
             # (negative).  The bytes past the requested length are put back
             # from the saved copy of the input: edi = (advanced out) + $s2,
             # esi = ivec + 16 + $s2, count = -$s2.
1121     &set_label("dec_in_place_partial",4);
1122         # one can argue if this is actually required...
1123         &mov        ($key eq "edi" ? $key : "",$_out);
1124         &lea        ($idx eq "esi" ? $idx : "",$ivec);
1125         &lea        ($key,&DWP(0,$key,$s2));
1126         &lea        ($idx,&DWP(16,$idx,$s2));
1127         &neg        ($s2 eq "ecx" ? $s2 : "");
1128         &data_word(0xA4F3F689);     # rep movsb     # restore tail
1129
             # common exit for the decrypt paths: back to the caller's stack
             # and flags
1130     &set_label("dec_out",4);
1131     &mov    ("esp",$_esp);
1132     &popf   ();
1133 &function_end("Camellia_cbc_encrypt");
1134 }
1135 
1136 &asciz("Camellia for x86 by <appro\@openssl.org>");     # identification string placed in the generated module
1137
     # Last call of the script; asm_finish is defined outside this file
     # (presumably it is where the accumulated assembly is written out).
1138 &asm_finish();