1 #!/usr/bin/env perl 2 3 # ==================================================================== 4 # Copyright (c) 2008 Andy Polyakov <appro@openssl.org> 5 # 6 # This module may be used under the terms of either the GNU General 7 # Public License version 2 or later, the GNU Lesser General Public 8 # License version 2.1 or later, the Mozilla Public License version 9 # 1.1 or the BSD License. The exact terms of either license are 10 # distributed along with this module. For further details see 11 # http://www.openssl.org/~appro/camellia/. 12 # ==================================================================== 13 14 # Performance in cycles per processed byte (less is better) in 15 # 'openssl speed ...' benchmark: 16 # 17 # AMD K8 Core2 PIII P4 18 # -evp camellia-128-ecb 21.5 22.8 27.0 28.9 19 # + over gcc 3.4.6 +90/11% +70/10% +53/4% +160/64% 20 # + over icc 8.0 +48/19% +21/15% +21/17% +55/37% 21 # 22 # camellia-128-cbc 17.3 21.1 23.9 25.9 23 # 24 # 128-bit key setup 196 280 256 240 cycles/key 25 # + over gcc 3.4.6 +30/0% +17/11% +11/0% +63/40% 26 # + over icc 8.0 +18/3% +10/0% +10/3% +21/10% 27 # 28 # Pairs of numbers in "+" rows represent performance improvement over 29 # compiler generated position-independent code, PIC, and non-PIC 30 # respectively. PIC results are of greater relevance, as this module 31 # is position-independent, i.e. suitable for a shared library or PIE. 32 # Position independence "costs" one register, which is why compilers 33 # are so close with non-PIC results, they have an extra register to 34 # spare. CBC results are better than ECB ones thanks to "zero-copy" 35 # private _x86_* interface, and are ~30-40% better than with compiler 36 # generated cmll_cbc.o, and reach ~80-90% of x86_64 performance on 37 # same CPU (where applicable). 
# Pick the directory part out of $0 and make both it and the shared
# perlasm directory (two levels up) searchable by require().
$0 =~ m{(.*[/\\])[^/\\]+$};
$dir = $1;
push @INC, "${dir}", "${dir}../../perlasm";
require "x86asm.pl";

$OPENSSL = 1;

# Pass the first command-line argument, the module name and a flag
# telling whether the last argument equals "386" to the perlasm
# framework.
&asm_init($ARGV[0], "cmll-586.pl", $ARGV[$#ARGV] eq "386");

# Register names used throughout this module.
@T = qw(eax ebx ecx edx);
($idx, $key, $Tbl) = qw(esi edi ebp);

# stack frame layout in _x86_Camellia_* routines, frame is allocated
# by caller; consecutive dwords starting at 0(%esp):
#   $__ra   return address
#   $__s0   s0 backing store
#   $__s1   s1 backing store
#   $__s2   s2 backing store
#   $__s3   s3 backing store
#   $__end  pointer to end/start of key schedule
($__ra, $__s0, $__s1, $__s2, $__s3, $__end) =
    map { my $slot = $_; &DWP(4*$slot, "esp") } 0..5;

# stack frame layout in Camellia_[en|crypt] routines, which differs from
# above by 4 and overlaps by pointer to end/start of key schedule
($_end, $_esp) = map { my $off = $_; &DWP($off, "esp") } (16, 20);

# const unsigned int Camellia_SBOX[4][256];
# Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
# and [2][] - with [3][]. This is done to optimize code size.
# Byte offsets of the four logical S-box tables inside the interleaved
# Camellia_SBOX data emitted at the end of this file.
$SBOX1_1110=0;      # Camellia_SBOX[0]
$SBOX4_4404=4;      # Camellia_SBOX[1]
$SBOX2_0222=2048;   # Camellia_SBOX[2]
$SBOX3_3033=2052;   # Camellia_SBOX[3]
&static_label("Camellia_SIGMA");
&static_label("Camellia_SBOX");

# Emit the instructions for one Feistel round.
#
# Arguments:
#   $_[0]  round index $i; its parity selects which pair of @T
#          registers is the active half (even: @T[0],@T[1];
#          odd: @T[2],@T[3])
#   $_[1]  optional byte offset of the first round key relative to
#          $key (default 0); a negative value makes the round keys
#          be walked in steps of -8 instead of +8
#   $_[2]  optional byte offset, relative to %esp, of the s0 backing
#          store (default 0)
#
# On entry $idx must already hold the first key word of this round;
# on exit it holds the first key word of the next round (prefetch).
sub Camellia_Feistel {
    my $i     = $_[0];
    my $seed  = defined($_[1]) ? $_[1] : 0;
    my $scale = $seed<0 ? -8 : 8;
    my $frame = defined($_[2]) ? $_[2] : 0;
    my $j     = ($i&1)*2;
    # all four temporaries are lexical; a comma-chained "my $t0=...,$t1=..."
    # would declare only the first one and leak the rest as globals
    my ($t0,$t1,$t2,$t3) = @T[$j%4, ($j+1)%4, ($j+2)%4, ($j+3)%4];

    &xor ($t0,$idx);                            # t0^=key[0]
    &xor ($t1,&DWP($seed+$i*$scale+4,$key));    # t1^=key[1]
    &movz ($idx,&HB($t0));                      # (t0>>8)&0xff
    &mov ($t3,&DWP($SBOX3_3033,$Tbl,$idx,8));   # t3=SBOX3_3033[0]
    &movz ($idx,&LB($t0));                      # (t0>>0)&0xff
    &xor ($t3,&DWP($SBOX4_4404,$Tbl,$idx,8));   # t3^=SBOX4_4404[0]
    &shr ($t0,16);
    &movz ($idx,&LB($t1));                      # (t1>>0)&0xff
    &mov ($t2,&DWP($SBOX1_1110,$Tbl,$idx,8));   # t2=SBOX1_1110[1]
    &movz ($idx,&HB($t0));                      # (t0>>24)&0xff
    &xor ($t3,&DWP($SBOX1_1110,$Tbl,$idx,8));   # t3^=SBOX1_1110[0]
    &movz ($idx,&HB($t1));                      # (t1>>8)&0xff
    &xor ($t2,&DWP($SBOX4_4404,$Tbl,$idx,8));   # t2^=SBOX4_4404[1]
    &shr ($t1,16);
    &movz ($t0,&LB($t0));                       # (t0>>16)&0xff
    &xor ($t3,&DWP($SBOX2_0222,$Tbl,$t0,8));    # t3^=SBOX2_0222[0]
    &movz ($idx,&HB($t1));                      # (t1>>24)&0xff
    &mov ($t0,&DWP($frame+4*(($j+3)%4),"esp")); # prefetch "s3"
    &xor ($t2,$t3);                             # t2^=t3
    &rotr ($t3,8);                              # t3=RightRotate(t3,8)
    &xor ($t2,&DWP($SBOX2_0222,$Tbl,$idx,8));   # t2^=SBOX2_0222[1]
    &movz ($idx,&LB($t1));                      # (t1>>16)&0xff
    &mov ($t1,&DWP($frame+4*(($j+2)%4),"esp")); # prefetch "s2"
    &xor ($t3,$t0);                             # t3^=s3
    &xor ($t2,&DWP($SBOX3_3033,$Tbl,$idx,8));   # t2^=SBOX3_3033[1]
    &mov ($idx,&DWP($seed+($i+1)*$scale,$key)); # prefetch key[i+1]
    &xor ($t3,$t2);                             # t3^=t2
    &mov (&DWP($frame+4*(($j+3)%4),"esp"),$t3); # s3=t3
    &xor ($t2,$t1);                             # t2^=s2
    &mov (&DWP($frame+4*(($j+2)%4),"esp"),$t2); # s2=t2
}

# void Camellia_EncryptBlock_Rounds(
#       int grandRounds,
#       const Byte plaintext[],
#       const KEY_TABLE_TYPE keyTable,
#       Byte ciphertext[])
&function_begin("Camellia_EncryptBlock_Rounds");
    &mov ("eax",&wparam(0));    # load grandRounds
    &mov ($idx,&wparam(1));     # load plaintext pointer
    &mov ($key,&wparam(2));     # load key schedule pointer

    &mov ("ebx","esp");
    &sub ("esp",7*4);           # place for s[0-3],keyEnd,esp and ra
    &and ("esp",-64);

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea ("ecx",&DWP(-64-63,$key));
    &sub ("ecx","esp");
    &neg ("ecx");
    &and ("ecx",0x3C0);         # modulo 1024, but aligned to cache-line
    &sub ("esp","ecx");
    &add ("esp",4);             # 4 is reserved for callee's return address

    &shl ("eax",6);             # grandRounds*64 bytes of key schedule
    &lea ("eax",&DWP(0,$key,"eax"));
    &mov ($_esp,"ebx");         # save %esp
    &mov ($_end,"eax");         # save keyEnd

    &call (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov (@T[0],&DWP(0,$idx));  # load plaintext
    &mov (@T[1],&DWP(4,$idx));
    &mov (@T[2],&DWP(8,$idx));
    &bswap (@T[0]);
    &mov (@T[3],&DWP(12,$idx));
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &call ("_x86_Camellia_encrypt");

    &mov ("esp",$_esp);
    &bswap (@T[0]);
    &mov ($idx,&wparam(3));     # load ciphertext pointer
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);
    &mov (&DWP(0,$idx),@T[0]);  # write ciphertext
    &mov (&DWP(4,$idx),@T[1]);
    &mov (&DWP(8,$idx),@T[2]);
    &mov (&DWP(12,$idx),@T[3]);
&function_end("Camellia_EncryptBlock_Rounds");
# V1.x API
# Converts keyBitLength in the first argument slot into a grandRounds
# count in place and tail-jumps to Camellia_EncryptBlock_Rounds.
&function_begin_B("Camellia_EncryptBlock");
    &mov ("eax",128);
    &sub ("eax",&wparam(0));    # load keyBitLength
    &mov ("eax",3);             # mov leaves the borrow from sub intact
    &adc ("eax",0);             # keyBitLength==128?3:4
    &mov (&wparam(0),"eax");
    &jmp (&label("Camellia_EncryptBlock_Rounds"));
&function_end_B("Camellia_EncryptBlock");

if ($OPENSSL) {
# void Camellia_encrypt(
#       const unsigned char *in,
#       unsigned char *out,
#       const CAMELLIA_KEY *key)
#
# Same flow as Camellia_EncryptBlock_Rounds, except that the
# grandRounds counter is read from offset 272 of the key structure.
&function_begin("Camellia_encrypt");
    &mov ($idx,&wparam(0));     # load plaintext pointer
    &mov ($key,&wparam(2));     # load key schedule pointer

    &mov ("ebx","esp");
    &sub ("esp",7*4);           # place for s[0-3],keyEnd,esp and ra
    &and ("esp",-64);
    &mov ("eax",&DWP(272,$key)); # load grandRounds counter

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea ("ecx",&DWP(-64-63,$key));
    &sub ("ecx","esp");
    &neg ("ecx");
    &and ("ecx",0x3C0);         # modulo 1024, but aligned to cache-line
    &sub ("esp","ecx");
    &add ("esp",4);             # 4 is reserved for callee's return address

    &shl ("eax",6);
    &lea ("eax",&DWP(0,$key,"eax"));
    &mov ($_esp,"ebx");         # save %esp
    &mov ($_end,"eax");         # save keyEnd

    &call (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov (@T[0],&DWP(0,$idx));  # load plaintext
    &mov (@T[1],&DWP(4,$idx));
    &mov (@T[2],&DWP(8,$idx));
    &bswap (@T[0]);
    &mov (@T[3],&DWP(12,$idx));
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &call ("_x86_Camellia_encrypt");

    &mov ("esp",$_esp);
    &bswap (@T[0]);
    &mov ($idx,&wparam(1));     # load ciphertext pointer
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);
    &mov (&DWP(0,$idx),@T[0]);  # write ciphertext
    &mov (&DWP(4,$idx),@T[1]);
    &mov (&DWP(8,$idx),@T[2]);
    &mov (&DWP(12,$idx),@T[3]);
&function_end("Camellia_encrypt");
}

# Private encryption core. Expects the block in @T[0-3], $key pointing
# at the key schedule, $Tbl at Camellia_SBOX and a caller-allocated
# frame ($__s0..$__s3, $__end). Returns the result in @T[0-3].
&function_begin_B("_x86_Camellia_encrypt");
    &xor (@T[0],&DWP(0,$key));  # ^=key[0-3]
    &xor (@T[1],&DWP(4,$key));
    &xor (@T[2],&DWP(8,$key));
    &xor (@T[3],&DWP(12,$key));
    &mov ($idx,&DWP(16,$key));  # prefetch key[4]

    &mov ($__s0,@T[0]);         # save s[0-3]
    &mov ($__s1,@T[1]);
    &mov ($__s2,@T[2]);
    &mov ($__s3,@T[3]);

&set_label("loop",16);
    # six unrolled Feistel rounds per iteration
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16,4); }

    &add ($key,16*4);
    &cmp ($key,$__end);
    &je (&label("done"));

    # @T[0-1] are preloaded, $idx is preloaded with key[0]
    &and ($idx,@T[0]);
    &mov (@T[3],$__s3);
    &rotl ($idx,1);
    &mov (@T[2],@T[3]);
    &xor (@T[1],$idx);
    &or (@T[2],&DWP(12,$key));
    &mov ($__s1,@T[1]);         # s1^=LeftRotate(s0&key[0],1);
    &xor (@T[2],$__s2);

    &mov ($idx,&DWP(4,$key));
    &mov ($__s2,@T[2]);         # s2^=s3|key[3];
    &or ($idx,@T[1]);
    &and (@T[2],&DWP(8,$key));
    &xor (@T[0],$idx);
    &rotl (@T[2],1);
    &mov ($__s0,@T[0]);         # s0^=s1|key[1];
    &xor (@T[3],@T[2]);
    &mov ($idx,&DWP(16,$key));  # prefetch key[4]
    &mov ($__s3,@T[3]);         # s3^=LeftRotate(s2&key[2],1);
    &jmp (&label("loop"));

&set_label("done",8);
    &mov (@T[2],@T[0]);         # SwapHalf
    &mov (@T[3],@T[1]);
    &mov (@T[0],$__s2);
    &mov (@T[1],$__s3);
    &xor (@T[0],$idx);          # $idx is preloaded with key[0]
    &xor (@T[1],&DWP(4,$key));
    &xor (@T[2],&DWP(8,$key));
    &xor (@T[3],&DWP(12,$key));
    &ret ();
&function_end_B("_x86_Camellia_encrypt");

# void Camellia_DecryptBlock_Rounds(
#       int grandRounds,
#       const Byte ciphertext[],
#       const KEY_TABLE_TYPE keyTable,
#       Byte plaintext[])
&function_begin("Camellia_DecryptBlock_Rounds");
    &mov ("eax",&wparam(0));    # load grandRounds
    &mov ($idx,&wparam(1));     # load ciphertext pointer
    &mov ($key,&wparam(2));     # load key schedule pointer

    &mov ("ebx","esp");
    &sub ("esp",7*4);           # place for s[0-3],keyEnd,esp and ra
    &and ("esp",-64);

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea ("ecx",&DWP(-64-63,$key));
    &sub ("ecx","esp");
    &neg ("ecx");
    &and ("ecx",0x3C0);         # modulo 1024, but aligned to cache-line
    &sub ("esp","ecx");
    &add ("esp",4);             # 4 is reserved for callee's return address

    &shl ("eax",6);
    &mov (&DWP(4*4,"esp"),$key); # save keyStart (same slot as $_end)
    &lea ($key,&DWP(0,$key,"eax")); # decryption starts at the schedule's end
    &mov (&DWP(5*4,"esp"),"ebx");# save %esp (same slot as $_esp)

    &call (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov (@T[0],&DWP(0,$idx));  # load ciphertext
    &mov (@T[1],&DWP(4,$idx));
    &mov (@T[2],&DWP(8,$idx));
    &bswap (@T[0]);
    &mov (@T[3],&DWP(12,$idx));
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &call ("_x86_Camellia_decrypt");

    &mov ("esp",&DWP(5*4,"esp"));
    &bswap (@T[0]);
    &mov ($idx,&wparam(3));     # load plaintext pointer
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);
    &mov (&DWP(0,$idx),@T[0]);  # write plaintext
    &mov (&DWP(4,$idx),@T[1]);
    &mov (&DWP(8,$idx),@T[2]);
    &mov (&DWP(12,$idx),@T[3]);
&function_end("Camellia_DecryptBlock_Rounds");
# V1.x API
# Converts keyBitLength in the first argument slot into a grandRounds
# count in place and tail-jumps to Camellia_DecryptBlock_Rounds.
&function_begin_B("Camellia_DecryptBlock");
    &mov ("eax",128);
    &sub ("eax",&wparam(0));    # load keyBitLength
    &mov ("eax",3);
    &adc ("eax",0);             # keyBitLength==128?3:4
    &mov (&wparam(0),"eax");
    &jmp (&label("Camellia_DecryptBlock_Rounds"));
&function_end_B("Camellia_DecryptBlock");

if ($OPENSSL) {
# void Camellia_decrypt(
#       const unsigned char *in,
#       unsigned char *out,
#       const CAMELLIA_KEY *key)
#
# Same flow as Camellia_DecryptBlock_Rounds, except that the
# grandRounds counter is read from offset 272 of the key structure.
&function_begin("Camellia_decrypt");
    &mov ($idx,&wparam(0));     # load ciphertext pointer
    &mov ($key,&wparam(2));     # load key schedule pointer

    &mov ("ebx","esp");
    &sub ("esp",7*4);           # place for s[0-3],keyEnd,esp and ra
    &and ("esp",-64);
    &mov ("eax",&DWP(272,$key)); # load grandRounds counter

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea ("ecx",&DWP(-64-63,$key));
    &sub ("ecx","esp");
    &neg ("ecx");
    &and ("ecx",0x3C0);         # modulo 1024, but aligned to cache-line
    &sub ("esp","ecx");
    &add ("esp",4);             # 4 is reserved for callee's return address

    &shl ("eax",6);
    &mov (&DWP(4*4,"esp"),$key); # save keyStart
    &lea ($key,&DWP(0,$key,"eax"));
    &mov (&DWP(5*4,"esp"),"ebx");# save %esp

    &call (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    &mov (@T[0],&DWP(0,$idx));  # load ciphertext
    &mov (@T[1],&DWP(4,$idx));
    &mov (@T[2],&DWP(8,$idx));
    &bswap (@T[0]);
    &mov (@T[3],&DWP(12,$idx));
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &call ("_x86_Camellia_decrypt");

    &mov ("esp",&DWP(5*4,"esp"));
    &bswap (@T[0]);
    &mov ($idx,&wparam(1));     # load plaintext pointer
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);
    &mov (&DWP(0,$idx),@T[0]);  # write plaintext
    &mov (&DWP(4,$idx),@T[1]);
    &mov (&DWP(8,$idx),@T[2]);
    &mov (&DWP(12,$idx),@T[3]);
&function_end("Camellia_decrypt");
}

# Private decryption core. Mirrors _x86_Camellia_encrypt but walks the
# key schedule downwards: $key starts at the end of the schedule and is
# decremented by 64 bytes per iteration until it equals $__end.
# key[n] in the comments below is the dword at 4*n($key).
&function_begin_B("_x86_Camellia_decrypt");
    &xor (@T[0],&DWP(0,$key));  # ^=key[0-3]
    &xor (@T[1],&DWP(4,$key));
    &xor (@T[2],&DWP(8,$key));
    &xor (@T[3],&DWP(12,$key));
    &mov ($idx,&DWP(-8,$key));  # prefetch key[-2]

    &mov ($__s0,@T[0]);         # save s[0-3]
    &mov ($__s1,@T[1]);
    &mov ($__s2,@T[2]);
    &mov ($__s3,@T[3]);

&set_label("loop",16);
    # six unrolled Feistel rounds, round keys taken at descending offsets
    for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8,4); }

    &sub ($key,16*4);
    &cmp ($key,$__end);
    &je (&label("done"));

    # @T[0-1] are preloaded, $idx is preloaded with key[2]
    &and ($idx,@T[0]);
    &mov (@T[3],$__s3);
    &rotl ($idx,1);
    &mov (@T[2],@T[3]);
    &xor (@T[1],$idx);
    &or (@T[2],&DWP(4,$key));
    &mov ($__s1,@T[1]);         # s1^=LeftRotate(s0&key[2],1);
    &xor (@T[2],$__s2);

    &mov ($idx,&DWP(12,$key));
    &mov ($__s2,@T[2]);         # s2^=s3|key[1];
    &or ($idx,@T[1]);
    &and (@T[2],&DWP(0,$key));
    &xor (@T[0],$idx);
    &rotl (@T[2],1);
    &mov ($__s0,@T[0]);         # s0^=s1|key[3];
    &xor (@T[3],@T[2]);
    &mov ($idx,&DWP(-8,$key));  # prefetch key[-2]
    &mov ($__s3,@T[3]);         # s3^=LeftRotate(s2&key[0],1);
    &jmp (&label("loop"));

&set_label("done",8);
    &mov (@T[2],@T[0]);         # SwapHalf
    &mov (@T[3],@T[1]);
    &mov (@T[0],$__s2);
    &mov (@T[1],$__s3);
    &xor (@T[2],$idx);          # $idx is preloaded with key[2]
    &xor (@T[3],&DWP(12,$key));
    &xor (@T[0],&DWP(0,$key));
    &xor (@T[1],&DWP(4,$key));
    &ret ();
&function_end_B("_x86_Camellia_decrypt");

# shld is very slow on Intel P4 family. Even on AMD it limits
# instruction decode rate [because it's VectorPath] and consequently
# performance. PIII, PM and Core[2] seem to be the only ones which
# execute this code ~7% faster...
#
# shld-based 128-bit left rotate of ($i0,$i1,$i2,$i3) by $rot bits.
# Each of the four registers is then stored at -128+4*(2*$rnd+n)($key),
# but only while it matches the head of the trailing register list.
# NOTE(review): no call to __rotl128 is visible in this file; only
# _rotl128 below is used.
sub __rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;                  # rounds are 8 bytes, stores are 4 bytes
    if ($rot) {
        &mov ($idx,$i0);        # keep original $i0 for the wrap-around
        &shld ($i0,$i1,$rot);
        &shld ($i1,$i2,$rot);
        &shld ($i2,$i3,$rot);
        &shld ($i3,$idx,$rot);
    }
    &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
    &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
    &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
    &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
}

# ... Implementing 128-bit rotate without shld gives >3x performance
# improvement on P4, only ~7% degradation on other Intel CPUs and
# not worse performance on AMD. This is therefore preferred.
# Emit a 128-bit left rotate of the register quadruple ($i0,$i1,$i2,$i3)
# by $rot bits using shl/shr/or, interleaved with stores of the result.
#
# Arguments: four register names, the rotate count, the destination
# round number, then the list of registers to store. A register is
# stored at -128+4*(2*$rnd+n)($key) only while it matches the head of
# that list, which lets callers store just a leading pair, or a pair
# starting at an odd round boundary.
# Uses $idx and $Tbl as scratch registers when $rot is non-zero.
sub _rotl128 {
    my ($i0,$i1,$i2,$i3,$rot,$rnd,@T)=@_;

    $rnd *= 2;                  # rounds are 8 bytes, stores are 4 bytes
    if ($rot) {
        &mov ($Tbl,$i0);        # keep original $i0 for the wrap-around
        &shl ($i0,$rot);
        &mov ($idx,$i1);
        &shr ($idx,32-$rot);
        &shl ($i1,$rot);
        &or ($i0,$idx);
        &mov ($idx,$i2);
        &shl ($i2,$rot);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
        &shr ($idx,32-$rot);
        &or ($i1,$idx);
        &shr ($Tbl,32-$rot);
        &mov ($idx,$i3);
        &shr ($idx,32-$rot);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
        &shl ($i3,$rot);
        &or ($i2,$idx);
        &or ($i3,$Tbl);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
    } else {
        # zero rotate: nothing to shift, only the stores remain
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i0 eq @T[0]);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i1 eq @T[0]);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i2 eq @T[0]);
        &mov (&DWP(-128+4*$rnd++,$key),shift(@T)) if ($i3 eq @T[0]);
    }
}

# Store up to four registers at round $rnd (8 bytes per round) of the
# key schedule addressed by $key. If the first element after $key is
# numeric and non-zero it is taken as a byte bias added to every address.
sub _saveround {
    my ($rnd,$key,@T)=@_;
    my $bias=int(@T[0])?shift(@T):0;    # register names evaluate to 0

    &mov (&DWP($bias+$rnd*8+0,$key),@T[0]);
    &mov (&DWP($bias+$rnd*8+4,$key),@T[1]) if ($#T>=1);
    &mov (&DWP($bias+$rnd*8+8,$key),@T[2]) if ($#T>=2);
    &mov (&DWP($bias+$rnd*8+12,$key),@T[3]) if ($#T>=3);
}

# Counterpart of _saveround: load up to four registers from round $rnd
# of the key schedule, with the same optional leading bias.
sub _loadround {
    my ($rnd,$key,@T)=@_;
    my $bias=int(@T[0])?shift(@T):0;    # register names evaluate to 0

    &mov (@T[0],&DWP($bias+$rnd*8+0,$key));
    &mov (@T[1],&DWP($bias+$rnd*8+4,$key)) if ($#T>=1);
    &mov (@T[2],&DWP($bias+$rnd*8+8,$key)) if ($#T>=2);
    &mov (@T[3],&DWP($bias+$rnd*8+12,$key)) if ($#T>=3);
}

# void Camellia_Ekeygen(
#       const int keyBitLength,
#       const Byte *rawKey,
#       KEY_TABLE_TYPE keyTable)
#
# Expands rawKey into keyTable. On exit eax holds the grandRounds
# count (3 for 128-bit keys, 4 otherwise) and edx points 272 bytes
# into keyTable.
&function_begin("Camellia_Ekeygen");
{ my $step=0;   # index of the next SIGMA constant / Feistel step

    &stack_push(4);             # place for s[0-3]

    &mov ($Tbl,&wparam(0));     # load arguments
    &mov ($idx,&wparam(1));
    &mov ($key,&wparam(2));

    &mov (@T[0],&DWP(0,$idx));  # load 0-127 bits
    &mov (@T[1],&DWP(4,$idx));
    &mov (@T[2],&DWP(8,$idx));
    &mov (@T[3],&DWP(12,$idx));

    &bswap (@T[0]);
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &_saveround (0,$key,@T);    # KL<<<0

    &cmp ($Tbl,128);
    &je (&label("1st128"));

    &mov (@T[0],&DWP(16,$idx)); # load 128-191 bits
    &mov (@T[1],&DWP(20,$idx));
    &cmp ($Tbl,192);
    &je (&label("1st192"));
    &mov (@T[2],&DWP(24,$idx)); # load 192-255 bits
    &mov (@T[3],&DWP(28,$idx));
    &jmp (&label("1st256"));
&set_label("1st192",4);
    # 192-bit key: the low half of KR is the bitwise complement of
    # its high half
    &mov (@T[2],@T[0]);
    &mov (@T[3],@T[1]);
    &not (@T[2]);
    &not (@T[3]);
&set_label("1st256",4);
    &bswap (@T[0]);
    &bswap (@T[1]);
    &bswap (@T[2]);
    &bswap (@T[3]);

    &_saveround (4,$key,@T);    # temporary storage for KR!

    &xor (@T[0],&DWP(0*8+0,$key));      # KR^KL
    &xor (@T[1],&DWP(0*8+4,$key));
    &xor (@T[2],&DWP(1*8+0,$key));
    &xor (@T[3],&DWP(1*8+4,$key));

&set_label("1st128",4);
    &call (&label("pic_point"));
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));
    # from here on $key addresses the SIGMA constants, not keyTable
    &lea ($key,&DWP(&label("Camellia_SIGMA")."-".&label("Camellia_SBOX"),$Tbl));

    &mov ($idx,&DWP($step*8,$key));     # prefetch SIGMA[0]
    &mov (&swtmp(0),@T[0]);     # save s[0-3]
    &mov (&swtmp(1),@T[1]);
    &mov (&swtmp(2),@T[2]);
    &mov (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov (@T[2],&swtmp(2));
    &mov (@T[3],&swtmp(3));

    &mov ($idx,&wparam(2));
    &xor (@T[0],&DWP(0*8+0,$idx));      # ^KL
    &xor (@T[1],&DWP(0*8+4,$idx));
    &xor (@T[2],&DWP(1*8+0,$idx));
    &xor (@T[3],&DWP(1*8+4,$idx));

    &mov ($idx,&DWP($step*8,$key));     # prefetch SIGMA[4]
    &mov (&swtmp(0),@T[0]);     # save s[0-3]
    &mov (&swtmp(1),@T[1]);
    &mov (&swtmp(2),@T[2]);
    &mov (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov (@T[2],&swtmp(2));
    &mov (@T[3],&swtmp(3));

    &mov ($idx,&wparam(0));
    &cmp ($idx,128);
    &jne (&label("2nd256"));

    &mov ($key,&wparam(2));
    &lea ($key,&DWP(128,$key)); # size optimization

    # Below, push(@T,shift(@T)) renames the registers at generation
    # time, which amounts to a 32-bit rotate without emitting any code.

    ####### process KA
    &_saveround (2,$key,-128,@T);       # KA<<<0
    &_rotl128 (@T,15,6,@T);             # KA<<<15
    &_rotl128 (@T,15,8,@T);             # KA<<<(15+15=30)
    &_rotl128 (@T,15,12,@T[0],@T[1]);   # KA<<<(30+15=45)
    &_rotl128 (@T,15,14,@T);            # KA<<<(45+15=60)
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,2,20,@T);             # KA<<<(60+32+2=94)
    &_rotl128 (@T,17,24,@T);            # KA<<<(94+17=111)

    ####### process KL
    &_loadround (0,$key,-128,@T);       # load KL
    &_rotl128 (@T,15,4,@T);             # KL<<<15
    &_rotl128 (@T,30,10,@T);            # KL<<<(15+30=45)
    &_rotl128 (@T,15,13,@T[2],@T[3]);   # KL<<<(45+15=60)
    &_rotl128 (@T,17,16,@T);            # KL<<<(60+17=77)
    &_rotl128 (@T,17,18,@T);            # KL<<<(77+17=94)
    &_rotl128 (@T,17,22,@T);            # KL<<<(94+17=111)

    while (@T[0] ne "eax")              # restore order
    {   unshift (@T,pop(@T));   }

    &mov ("eax",3);             # 3 grandRounds
    &jmp (&label("done"));

&set_label("2nd256",16);
    &mov ($idx,&wparam(2));
    &_saveround (6,$idx,@T);    # temporary storage for KA!

    &xor (@T[0],&DWP(4*8+0,$idx));      # KA^KR
    &xor (@T[1],&DWP(4*8+4,$idx));
    &xor (@T[2],&DWP(5*8+0,$idx));
    &xor (@T[3],&DWP(5*8+4,$idx));

    &mov ($idx,&DWP($step*8,$key));     # prefetch SIGMA[8]
    &mov (&swtmp(0),@T[0]);     # save s[0-3]
    &mov (&swtmp(1),@T[1]);
    &mov (&swtmp(2),@T[2]);
    &mov (&swtmp(3),@T[3]);
    &Camellia_Feistel($step++);
    &Camellia_Feistel($step++);
    &mov (@T[2],&swtmp(2));
    &mov (@T[3],&swtmp(3));

    &mov ($key,&wparam(2));
    &lea ($key,&DWP(128,$key)); # size optimization

    ####### process KB
    &_saveround (2,$key,-128,@T);       # KB<<<0
    &_rotl128 (@T,30,10,@T);            # KB<<<30
    &_rotl128 (@T,30,20,@T);            # KB<<<(30+30=60)
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,19,32,@T);            # KB<<<(60+32+19=111)

    ####### process KR
    &_loadround (4,$key,-128,@T);       # load KR
    &_rotl128 (@T,15,4,@T);             # KR<<<15
    &_rotl128 (@T,15,8,@T);             # KR<<<(15+15=30)
    &_rotl128 (@T,30,18,@T);            # KR<<<(30+30=60)
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,2,26,@T);             # KR<<<(60+32+2=94)

    ####### process KA
    &_loadround (6,$key,-128,@T);       # load KA
    &_rotl128 (@T,15,6,@T);             # KA<<<15
    &_rotl128 (@T,30,14,@T);            # KA<<<(15+30=45)
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,0,24,@T);             # KA<<<(45+32+0=77)
    &_rotl128 (@T,17,28,@T);            # KA<<<(77+17=94)

    ####### process KL
    &_loadround (0,$key,-128,@T);       # load KL
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,13,12,@T);            # KL<<<(32+13=45)
    &_rotl128 (@T,15,16,@T);            # KL<<<(45+15=60)
    &_rotl128 (@T,17,22,@T);            # KL<<<(60+17=77)
    push (@T,shift(@T));                # rotl128(@T,32);
    &_rotl128 (@T,2,30,@T);             # KL<<<(77+32+2=111)

    while (@T[0] ne "eax")              # restore order
    {   unshift (@T,pop(@T));   }

    &mov ("eax",4);             # 4 grandRounds
&set_label("done");
    &lea ("edx",&DWP(272-128,$key));    # end of key schedule
    &stack_pop(4);
}
&function_end("Camellia_Ekeygen");

if ($OPENSSL) {
# int private_Camellia_set_key (
#       const unsigned char *userKey,
#       int bits,
#       CAMELLIA_KEY *key)
#
# Returns -1 in eax if userKey or key is NULL, -2 if bits is not one
# of 128/192/256, and 0 after a successful key expansion.
&function_begin_B("private_Camellia_set_key");
    &push ("ebx");
    &mov ("ecx",&wparam(0));    # pull arguments
    &mov ("ebx",&wparam(1));
    &mov ("edx",&wparam(2));

    &mov ("eax",-1);
    &test ("ecx","ecx");
    &jz (&label("done"));       # userKey==NULL?
    &test ("edx","edx");
    &jz (&label("done"));       # key==NULL?

    &mov ("eax",-2);
    &cmp ("ebx",256);
    &je (&label("arg_ok"));     # bits==256?
    &cmp ("ebx",192);
    &je (&label("arg_ok"));     # bits==192?
    &cmp ("ebx",128);
    &jne (&label("done"));      # bits!=128?
&set_label("arg_ok",4);

    # Camellia_Ekeygen takes (keyBitLength, rawKey, keyTable)
    &push ("edx");              # push arguments
    &push ("ecx");
    &push ("ebx");
    &call ("Camellia_Ekeygen");
    &stack_pop(3);

    # eax holds grandRounds and edx points at where to put it
    &mov (&DWP(0,"edx"),"eax");
    &xor ("eax","eax");
&set_label("done",4);
    &pop ("ebx");
    &ret ();
&function_end_B("private_Camellia_set_key");
}

# Base 8-bit substitution table from which the four 32-bit lookup
# tables are derived by the helpers below.
@SBOX=(
112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
166,225, 57,202,213, 71, 93, 61,217,  1, 90,214, 81, 86,108, 77,
139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
223, 76,203,194, 52,126,118,  5,109,183,169, 49,209, 23,  4,215,
 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
254, 68,207,178,195,181,122,145, 36,  8,232,168, 96,252,105, 80,
170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
 16,196,  0, 72,163,247,117,219,138,  3,230,218,  9, 63,221,148,
135, 92,131,  2,205, 74,144, 51,115,103,246,243,157,127,191,226,
 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
120,152,  6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
114,  7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);

# Each helper maps an index 0..255 to one 32-bit table entry. The name
# shows which byte lanes carry the substituted value (0 = empty lane):
#   S1110  SBOX[i] in the three upper bytes
#   S4404  SBOX[rotl8(i,1)] in bytes 3, 2 and 0
#   S0222  rotl8(SBOX[i],1) in the three lower bytes
#   S3033  rotr8(SBOX[i],1) in bytes 3, 1 and 0
sub S1110 { my $i=shift; $i=@SBOX[$i]; return $i<<24|$i<<16|$i<<8; }
sub S4404 { my $i=shift; $i=($i<<1|$i>>7)&0xff; $i=@SBOX[$i]; return $i<<24|$i<<16|$i; }
sub S0222 { my $i=shift; $i=@SBOX[$i]; $i=($i<<1|$i>>7)&0xff; return $i<<16|$i<<8|$i; }
sub S3033 { my $i=shift; $i=@SBOX[$i]; $i=($i>>1|$i<<7)&0xff; return $i<<24|$i<<8|$i; }

# Key schedule constants: six 64-bit SIGMA values followed by four
# zero words.
&set_label("Camellia_SIGMA",64);
&data_word(
    0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2,
    0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c,
    0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd,
    0, 0, 0, 0);
&set_label("Camellia_SBOX",64);
# tables are interleaved, remember?
for ($i=0;$i<256;$i++) { &data_word(&S1110($i),&S4404($i)); }
for ($i=0;$i<256;$i++) { &data_word(&S0222($i),&S3033($i)); }

# void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
#                            size_t length, const CAMELLIA_KEY *key,
#                            unsigned char *ivp,const int enc);
{
# stack frame layout
#       -4(%esp)                # return address         0(%esp)
#       0(%esp)                 # s0                     4(%esp)
#       4(%esp)                 # s1                     8(%esp)
#       8(%esp)                 # s2                    12(%esp)
#       12(%esp)                # s3                    16(%esp)
#       16(%esp)                # end of key schedule   20(%esp)
#       20(%esp)                # %esp backup
my $_inp=&DWP(24,"esp");        #copy of wparam(0)
my $_out=&DWP(28,"esp");        #copy of wparam(1)
my $_len=&DWP(32,"esp");        #copy of wparam(2)
my $_key=&DWP(36,"esp");        #copy of wparam(3)
my $_ivp=&DWP(40,"esp");        #copy of wparam(4)
my $ivec=&DWP(44,"esp");        #ivec[16]
my $_tmp=&DWP(44,"esp");        #volatile variable [yes, aliases with ivec]
my ($s0,$s1,$s2,$s3) = @T;

# Several operands below are written as '$reg eq "name" ? $reg : ""'.
# They yield an empty operand if the register assignment ever changes,
# because the raw-encoded string instructions (rep movsb/stosb) depend
# on those exact registers.

&function_begin("Camellia_cbc_encrypt");
    &mov ($s2 eq "ecx"? $s2 : "",&wparam(2));   # load len
    &cmp ($s2,0);
    &je (&label("enc_out"));    # nothing to do for zero length

    &pushf ();
    &cld ();

    &mov ($s0,&wparam(0));      # load inp
    &mov ($s1,&wparam(1));      # load out
    #&mov ($s2,&wparam(2));     # load len
    &mov ($s3,&wparam(3));      # load key
    &mov ($Tbl,&wparam(4));     # load ivp

    # allocate aligned stack frame...
    &lea ($idx,&DWP(-64,"esp"));
    &and ($idx,-64);

    # place stack frame just "above mod 1024" the key schedule
    # this ensures that cache associativity of 2 suffices
    &lea ($key,&DWP(-64-63,$s3));
    &sub ($key,$idx);
    &neg ($key);
    &and ($key,0x3C0);          # modulo 1024, but aligned to cache-line
    &sub ($idx,$key);

    &mov ($key,&wparam(5));     # load enc

    &exch ("esp",$idx);
    &add ("esp",4);             # reserve for return address!
    &mov ($_esp,$idx);          # save %esp

    &mov ($_inp,$s0);           # save copy of inp
    &mov ($_out,$s1);           # save copy of out
    &mov ($_len,$s2);           # save copy of len
    &mov ($_key,$s3);           # save copy of key
    &mov ($_ivp,$Tbl);          # save copy of ivp

    &call (&label("pic_point"));        # make it PIC!
    &set_label("pic_point");
    &blindpop($Tbl);
    &lea ($Tbl,&DWP(&label("Camellia_SBOX")."-".&label("pic_point"),$Tbl));

    # touch the whole 4096-byte table: 32 iterations of 128 bytes,
    # one load every 32 bytes
    &mov ($idx,32);
    &set_label("prefetch_sbox",4);
        &mov ($s0,&DWP(0,$Tbl));
        &mov ($s1,&DWP(32,$Tbl));
        &mov ($s2,&DWP(64,$Tbl));
        &mov ($s3,&DWP(96,$Tbl));
        &lea ($Tbl,&DWP(128,$Tbl));
        &dec ($idx);
    &jnz (&label("prefetch_sbox"));
    &mov ($s0,$_key);
    &sub ($Tbl,4096);           # rewind $Tbl to the start of the table
    &mov ($idx,$_inp);
    &mov ($s3,&DWP(272,$s0));   # load grandRounds

    &cmp ($key,0);
    &je (&label("DECRYPT"));

    &mov ($s2,$_len);
    &mov ($key,$_ivp);
    &shl ($s3,6);
    &lea ($s3,&DWP(0,$s0,$s3));
    &mov ($_end,$s3);

    &test ($s2,0xFFFFFFF0);
    &jz (&label("enc_tail"));   # short input...

    &mov ($s0,&DWP(0,$key));    # load iv
    &mov ($s1,&DWP(4,$key));

    &set_label("enc_loop",4);
        &mov ($s2,&DWP(8,$key));
        &mov ($s3,&DWP(12,$key));

        &xor ($s0,&DWP(0,$idx));        # xor input data
        &xor ($s1,&DWP(4,$idx));
        &xor ($s2,&DWP(8,$idx));
        &bswap ($s0);
        &xor ($s3,&DWP(12,$idx));
        &bswap ($s1);
        &mov ($key,$_key);              # load key
        &bswap ($s2);
        &bswap ($s3);

        &call ("_x86_Camellia_encrypt");

        &mov ($idx,$_inp);              # load inp
        &mov ($key,$_out);              # load out

        &bswap ($s0);
        &bswap ($s1);
        &bswap ($s2);
        &mov (&DWP(0,$key),$s0);        # save output data
        &bswap ($s3);
        &mov (&DWP(4,$key),$s1);
        &mov (&DWP(8,$key),$s2);
        &mov (&DWP(12,$key),$s3);

        &mov ($s2,$_len);               # load len

        &lea ($idx,&DWP(16,$idx));
        &mov ($_inp,$idx);              # save inp

        &lea ($s3,&DWP(16,$key));
        &mov ($_out,$s3);               # save out

        &sub ($s2,16);
        &test ($s2,0xFFFFFFF0);
        &mov ($_len,$s2);               # save len
    &jnz (&label("enc_loop"));
    &test ($s2,15);
    &jnz (&label("enc_tail"));
    &mov ($idx,$_ivp);          # load ivp
    &mov ($s2,&DWP(8,$key));    # restore last dwords
    &mov ($s3,&DWP(12,$key));
    &mov (&DWP(0,$idx),$s0);    # save ivec
    &mov (&DWP(4,$idx),$s1);
    &mov (&DWP(8,$idx),$s2);
    &mov (&DWP(12,$idx),$s3);

    &mov ("esp",$_esp);
    &popf ();
&set_label("enc_out");
    &function_end_A();
    &pushf ();                  # kludge, never executed

    # Partial final block: copy the remaining bytes to out, zero-pad
    # to 16 bytes and run one more loop iteration on the output buffer.
    &set_label("enc_tail",4);
    &mov ($s0,$key eq "edi" ? $key : "");
    &mov ($key,$_out);          # load out
    &push ($s0);                # push ivp
    &mov ($s1,16);
    &sub ($s1,$s2);             # number of padding bytes
    &cmp ($key,$idx);           # compare with inp
    &je (&label("enc_in_place"));
    &align (4);
    &data_word(0xA4F3F689);     # rep movsb     # copy input
    &jmp (&label("enc_skip_in_place"));
&set_label("enc_in_place");
    &lea ($key,&DWP(0,$key,$s2));
&set_label("enc_skip_in_place");
    &mov ($s2,$s1);
    &xor ($s0,$s0);
    &align (4);
    &data_word(0xAAF3F689);     # rep stosb     # zero tail
    &pop ($key);                # pop ivp

    &mov ($idx,$_out);          # output as input
    &mov ($s0,&DWP(0,$key));
    &mov ($s1,&DWP(4,$key));
    &mov ($_len,16);            # len=16
    &jmp (&label("enc_loop"));  # one more spin...

#----------------------------- DECRYPT -----------------------------#
&set_label("DECRYPT",16);
    &shl ($s3,6);
    &lea ($s3,&DWP(0,$s0,$s3));
    &mov ($_end,$s0);           # decryption stops at the schedule's start
    &mov ($_key,$s3);           # ...and begins at its end

    &cmp ($idx,$_out);
    &je (&label("dec_in_place"));       # in-place processing...

    &mov ($key,$_ivp);          # load ivp
    &mov ($_tmp,$key);

    &set_label("dec_loop",4);
        &mov ($s0,&DWP(0,$idx));        # read input
        &mov ($s1,&DWP(4,$idx));
        &mov ($s2,&DWP(8,$idx));
        &bswap ($s0);
        &mov ($s3,&DWP(12,$idx));
        &bswap ($s1);
        &mov ($key,$_key);              # load key
        &bswap ($s2);
        &bswap ($s3);

        &call ("_x86_Camellia_decrypt");

        &mov ($key,$_tmp);              # load ivp
        &mov ($idx,$_len);              # load len

        &bswap ($s0);
        &bswap ($s1);
        &bswap ($s2);
        &xor ($s0,&DWP(0,$key));        # xor iv
        &bswap ($s3);
        &xor ($s1,&DWP(4,$key));
        &xor ($s2,&DWP(8,$key));
        &xor ($s3,&DWP(12,$key));

        &sub ($idx,16);
        &jc (&label("dec_partial"));
        &mov ($_len,$idx);              # save len
        &mov ($idx,$_inp);              # load inp
        &mov ($key,$_out);              # load out

        &mov (&DWP(0,$key),$s0);        # write output
        &mov (&DWP(4,$key),$s1);
        &mov (&DWP(8,$key),$s2);
        &mov (&DWP(12,$key),$s3);

        # the ciphertext block just consumed is the next block's iv
        &mov ($_tmp,$idx);              # save ivp
        &lea ($idx,&DWP(16,$idx));
        &mov ($_inp,$idx);              # save inp

        &lea ($key,&DWP(16,$key));
        &mov ($_out,$key);              # save out

    # flags still come from the sub above: mov and lea leave them alone
    &jnz (&label("dec_loop"));
    &mov ($key,$_tmp);          # load temp ivp
&set_label("dec_end");
    &mov ($idx,$_ivp);          # load user ivp
    &mov ($s0,&DWP(0,$key));    # load iv
    &mov ($s1,&DWP(4,$key));
    &mov ($s2,&DWP(8,$key));
    &mov ($s3,&DWP(12,$key));
    &mov (&DWP(0,$idx),$s0);    # copy back to user
    &mov (&DWP(4,$idx),$s1);
    &mov (&DWP(8,$idx),$s2);
    &mov (&DWP(12,$idx),$s3);
    &jmp (&label("dec_out"));

    &set_label("dec_partial",4);
    &lea ($key,$ivec);
    &mov (&DWP(0,$key),$s0);    # dump output to stack
    &mov (&DWP(4,$key),$s1);
    &mov (&DWP(8,$key),$s2);
    &mov (&DWP(12,$key),$s3);
    &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$idx));       # remaining byte count
    &mov ($idx eq "esi" ? $idx : "",$key);
    &mov ($key eq "edi" ? $key : "",$_out);     # load out
    &data_word(0xA4F3F689);     # rep movsb     # copy output
    &mov ($key,$_inp);          # use inp as temp ivp
    &jmp (&label("dec_end"));

    &set_label("dec_in_place",4);
    &set_label("dec_in_place_loop");
        # keep a copy of the ciphertext block on the stack, since the
        # output overwrites it and it is needed as the next iv
        &lea ($key,$ivec);
        &mov ($s0,&DWP(0,$idx));        # read input
        &mov ($s1,&DWP(4,$idx));
        &mov ($s2,&DWP(8,$idx));
        &mov ($s3,&DWP(12,$idx));

        &mov (&DWP(0,$key),$s0);        # copy to temp
        &mov (&DWP(4,$key),$s1);
        &mov (&DWP(8,$key),$s2);
        &bswap ($s0);
        &mov (&DWP(12,$key),$s3);
        &bswap ($s1);
        &mov ($key,$_key);              # load key
        &bswap ($s2);
        &bswap ($s3);

        &call ("_x86_Camellia_decrypt");

        &mov ($key,$_ivp);              # load ivp
        &mov ($idx,$_out);              # load out

        &bswap ($s0);
        &bswap ($s1);
        &bswap ($s2);
        &xor ($s0,&DWP(0,$key));        # xor iv
        &bswap ($s3);
        &xor ($s1,&DWP(4,$key));
        &xor ($s2,&DWP(8,$key));
        &xor ($s3,&DWP(12,$key));

        &mov (&DWP(0,$idx),$s0);        # write output
        &mov (&DWP(4,$idx),$s1);
        &mov (&DWP(8,$idx),$s2);
        &mov (&DWP(12,$idx),$s3);

        &lea ($idx,&DWP(16,$idx));
        &mov ($_out,$idx);              # save out

        &lea ($idx,$ivec);
        &mov ($s0,&DWP(0,$idx));        # read temp
        &mov ($s1,&DWP(4,$idx));
        &mov ($s2,&DWP(8,$idx));
        &mov ($s3,&DWP(12,$idx));

        &mov (&DWP(0,$key),$s0);        # copy iv
        &mov (&DWP(4,$key),$s1);
        &mov (&DWP(8,$key),$s2);
        &mov (&DWP(12,$key),$s3);

        &mov ($idx,$_inp);              # load inp

        &lea ($idx,&DWP(16,$idx));
        &mov ($_inp,$idx);              # save inp

        &mov ($s2,$_len);               # load len
        &sub ($s2,16);
        &jc (&label("dec_in_place_partial"));
        &mov ($_len,$s2);               # save len
    &jnz (&label("dec_in_place_loop"));
    &jmp (&label("dec_out"));

    &set_label("dec_in_place_partial",4);
    # one can argue if this is actually required...
    &mov ($key eq "edi" ? $key : "",$_out);
    &lea ($idx eq "esi" ? $idx : "",$ivec);
    &lea ($key,&DWP(0,$key,$s2));       # $s2 is negative here
    &lea ($idx,&DWP(16,$idx,$s2));
    &neg ($s2 eq "ecx" ? $s2 : "");
    &data_word(0xA4F3F689);     # rep movsb     # restore tail

    &set_label("dec_out",4);
    &mov ("esp",$_esp);
    &popf ();
&function_end("Camellia_cbc_encrypt");
}

&asciz("Camellia for x86 by <appro\@openssl.org>");

&asm_finish();