#!/usr/local/bin/perl

# Driver for the x86 bignum word routines generated by this file.  It loads
# the x86asm.pl helper, initialises it, emits one routine per generator sub
# defined in this file and finally calls asm_finish().
#
# $ARGV[0] is handed to asm_init() together with $0.  Any argument matching
# -DOPENSSL_IA32_SSE2 switches on the SSE2 code paths in the generators.

# Directory part of $0, trailing separator included, so the helper modules
# can be found relative to this script.  When $0 carries no directory
# component the match fails and $1 must not be used (it would be undefined
# or left over from an earlier match), so fall back to "./".
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "./";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# $sse2 is read by the generator subs to decide whether to emit SSE2 code.
$sse2=0;
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }

&external_label("OPENSSL_ia32cap_P") if ($sse2);

&bn_mul_add_words("bn_mul_add_words");
&bn_mul_words("bn_mul_words");
&bn_sqr_words("bn_sqr_words");
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
&bn_sub_part_words("bn_sub_part_words");

&asm_finish();
  23 
  24 sub bn_mul_add_words
  25         {
  26         local($name)=@_;
  27 
  28         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
  29 
  30         $r="eax";
  31         $a="edx";
  32         $c="ecx";
  33 
  34         if ($sse2) {
  35                 &picmeup("eax","OPENSSL_ia32cap_P");
  36                 &bt(&DWP(0,"eax"),26);
  37                 &jnc(&label("maw_non_sse2"));
  38 
  39                 &mov($r,&wparam(0));
  40                 &mov($a,&wparam(1));
  41                 &mov($c,&wparam(2));
  42                 &movd("mm0",&wparam(3));        # mm0 = w
  43                 &pxor("mm1","mm1");         # mm1 = carry_in
  44                 &jmp(&label("maw_sse2_entry"));
  45 
  46         &set_label("maw_sse2_unrolled",16);
  47                 &movd("mm3",&DWP(0,$r,"",0));   # mm3 = r[0]
  48                 &paddq("mm1","mm3");                # mm1 = carry_in + r[0]
  49                 &movd("mm2",&DWP(0,$a,"",0));   # mm2 = a[0]
  50                 &pmuludq("mm2","mm0");              # mm2 = w*a[0]
  51                 &movd("mm4",&DWP(4,$a,"",0));   # mm4 = a[1]
  52                 &pmuludq("mm4","mm0");              # mm4 = w*a[1]
  53                 &movd("mm6",&DWP(8,$a,"",0));   # mm6 = a[2]
  54                 &pmuludq("mm6","mm0");              # mm6 = w*a[2]
  55                 &movd("mm7",&DWP(12,$a,"",0));  # mm7 = a[3]
  56                 &pmuludq("mm7","mm0");              # mm7 = w*a[3]
  57                 &paddq("mm1","mm2");                # mm1 = carry_in + r[0] + w*a[0]
  58                 &movd("mm3",&DWP(4,$r,"",0));   # mm3 = r[1]
  59                 &paddq("mm3","mm4");                # mm3 = r[1] + w*a[1]
  60                 &movd("mm5",&DWP(8,$r,"",0));   # mm5 = r[2]
  61                 &paddq("mm5","mm6");                # mm5 = r[2] + w*a[2]
  62                 &movd("mm4",&DWP(12,$r,"",0));  # mm4 = r[3]
  63                 &paddq("mm7","mm4");                # mm7 = r[3] + w*a[3]
  64                 &movd(&DWP(0,$r,"",0),"mm1");
  65                 &movd("mm2",&DWP(16,$a,"",0));  # mm2 = a[4]
  66                 &pmuludq("mm2","mm0");              # mm2 = w*a[4]
  67                 &psrlq("mm1",32);           # mm1 = carry0
  68                 &movd("mm4",&DWP(20,$a,"",0));  # mm4 = a[5]
  69                 &pmuludq("mm4","mm0");              # mm4 = w*a[5]
  70                 &paddq("mm1","mm3");                # mm1 = carry0 + r[1] + w*a[1]
  71                 &movd("mm6",&DWP(24,$a,"",0));  # mm6 = a[6]
  72                 &pmuludq("mm6","mm0");              # mm6 = w*a[6]
  73                 &movd(&DWP(4,$r,"",0),"mm1");
  74                 &psrlq("mm1",32);           # mm1 = carry1
  75                 &movd("mm3",&DWP(28,$a,"",0));  # mm3 = a[7]
  76                 &add($a,32);
  77                 &pmuludq("mm3","mm0");              # mm3 = w*a[7]
  78                 &paddq("mm1","mm5");                # mm1 = carry1 + r[2] + w*a[2]
  79                 &movd("mm5",&DWP(16,$r,"",0));  # mm5 = r[4]
  80                 &paddq("mm2","mm5");                # mm2 = r[4] + w*a[4]
  81                 &movd(&DWP(8,$r,"",0),"mm1");
  82                 &psrlq("mm1",32);           # mm1 = carry2
  83                 &paddq("mm1","mm7");                # mm1 = carry2 + r[3] + w*a[3]
  84                 &movd("mm5",&DWP(20,$r,"",0));  # mm5 = r[5]
  85                 &paddq("mm4","mm5");                # mm4 = r[5] + w*a[5]
  86                 &movd(&DWP(12,$r,"",0),"mm1");
  87                 &psrlq("mm1",32);           # mm1 = carry3
  88                 &paddq("mm1","mm2");                # mm1 = carry3 + r[4] + w*a[4]
  89                 &movd("mm5",&DWP(24,$r,"",0));  # mm5 = r[6]
  90                 &paddq("mm6","mm5");                # mm6 = r[6] + w*a[6]
  91                 &movd(&DWP(16,$r,"",0),"mm1");
  92                 &psrlq("mm1",32);           # mm1 = carry4
  93                 &paddq("mm1","mm4");                # mm1 = carry4 + r[5] + w*a[5]
  94                 &movd("mm5",&DWP(28,$r,"",0));  # mm5 = r[7]
  95                 &paddq("mm3","mm5");                # mm3 = r[7] + w*a[7]
  96                 &movd(&DWP(20,$r,"",0),"mm1");
  97                 &psrlq("mm1",32);           # mm1 = carry5
  98                 &paddq("mm1","mm6");                # mm1 = carry5 + r[6] + w*a[6]
  99                 &movd(&DWP(24,$r,"",0),"mm1");
 100                 &psrlq("mm1",32);           # mm1 = carry6
 101                 &paddq("mm1","mm3");                # mm1 = carry6 + r[7] + w*a[7]
 102                 &movd(&DWP(28,$r,"",0),"mm1");
 103                 &lea($r,&DWP(32,$r));
 104                 &psrlq("mm1",32);           # mm1 = carry_out
 105 
 106                 &sub($c,8);
 107                 &jz(&label("maw_sse2_exit"));
 108         &set_label("maw_sse2_entry");
 109                 &test($c,0xfffffff8);
 110                 &jnz(&label("maw_sse2_unrolled"));
 111 
 112         &set_label("maw_sse2_loop",4);
 113                 &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
 114                 &movd("mm3",&DWP(0,$r));        # mm3 = r[i]
 115                 &pmuludq("mm2","mm0");              # a[i] *= w
 116                 &lea($a,&DWP(4,$a));
 117                 &paddq("mm1","mm3");                # carry += r[i]
 118                 &paddq("mm1","mm2");                # carry += a[i]*w
 119                 &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
 120                 &sub($c,1);
 121                 &psrlq("mm1",32);           # carry = carry_high
 122                 &lea($r,&DWP(4,$r));
 123                 &jnz(&label("maw_sse2_loop"));
 124         &set_label("maw_sse2_exit");
 125                 &movd("eax","mm1");         # c = carry_out
 126                 &emms();
 127                 &ret();
 128 
 129         &set_label("maw_non_sse2",16);
 130         }
 131 
 132         # function_begin prologue
 133         &push("ebp");
 134         &push("ebx");
 135         &push("esi");
 136         &push("edi");
 137 
 138         &comment("");
 139         $Low="eax";
 140         $High="edx";
 141         $a="ebx";
 142         $w="ebp";
 143         $r="edi";
 144         $c="esi";
 145 
 146         &xor($c,$c);                # clear carry
 147         &mov($r,&wparam(0));    #
 148 
 149         &mov("ecx",&wparam(2)); #
 150         &mov($a,&wparam(1));    #
 151 
 152         &and("ecx",0xfffffff8);     # num / 8
 153         &mov($w,&wparam(3));    #
 154 
 155         &push("ecx");               # Up the stack for a tmp variable
 156 
 157         &jz(&label("maw_finish"));
 158 
 159         &set_label("maw_loop",16);
 160 
 161         for ($i=0; $i<32; $i+=4)
 162                 {
 163                 &comment("Round $i");
 164 
 165                  &mov("eax",&DWP($i,$a));       # *a
 166                 &mul($w);                   # *a * w
 167                 &add("eax",$c);                     # L(t)+= c
 168                 &adc("edx",0);                      # H(t)+=carry
 169                  &add("eax",&DWP($i,$r));       # L(t)+= *r
 170                 &adc("edx",0);                      # H(t)+=carry
 171                  &mov(&DWP($i,$r),"eax");       # *r= L(t);
 172                 &mov($c,"edx");                     # c=  H(t);
 173                 }
 174 
 175         &comment("");
 176         &sub("ecx",8);
 177         &lea($a,&DWP(32,$a));
 178         &lea($r,&DWP(32,$r));
 179         &jnz(&label("maw_loop"));
 180 
 181         &set_label("maw_finish",0);
 182         &mov("ecx",&wparam(2)); # get num
 183         &and("ecx",7);
 184         &jnz(&label("maw_finish2"));    # helps branch prediction
 185         &jmp(&label("maw_end"));
 186 
 187         &set_label("maw_finish2",1);
 188         for ($i=0; $i<7; $i++)
 189                 {
 190                 &comment("Tail Round $i");
 191                  &mov("eax",&DWP($i*4,$a));     # *a
 192                 &mul($w);                   # *a * w
 193                 &add("eax",$c);                     # L(t)+=c
 194                 &adc("edx",0);                      # H(t)+=carry
 195                  &add("eax",&DWP($i*4,$r));     # L(t)+= *r
 196                 &adc("edx",0);                      # H(t)+=carry
 197                  &dec("ecx") if ($i != 7-1);
 198                 &mov(&DWP($i*4,$r),"eax");      # *r= L(t);
 199                  &mov($c,"edx");            # c=  H(t);
 200                 &jz(&label("maw_end")) if ($i != 7-1);
 201                 }
 202         &set_label("maw_end",0);
 203         &mov("eax",$c);
 204 
 205         &pop("ecx");        # clear variable from
 206 
 207         &function_end($name);
 208         }
 209 
 210 sub bn_mul_words
 211         {
 212         local($name)=@_;
 213 
 214         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 215 
 216         $r="eax";
 217         $a="edx";
 218         $c="ecx";
 219 
 220         if ($sse2) {
 221                 &picmeup("eax","OPENSSL_ia32cap_P");
 222                 &bt(&DWP(0,"eax"),26);
 223                 &jnc(&label("mw_non_sse2"));
 224 
 225                 &mov($r,&wparam(0));
 226                 &mov($a,&wparam(1));
 227                 &mov($c,&wparam(2));
 228                 &movd("mm0",&wparam(3));        # mm0 = w
 229                 &pxor("mm1","mm1");         # mm1 = carry = 0
 230 
 231         &set_label("mw_sse2_loop",16);
 232                 &movd("mm2",&DWP(0,$a));        # mm2 = a[i]
 233                 &pmuludq("mm2","mm0");              # a[i] *= w
 234                 &lea($a,&DWP(4,$a));
 235                 &paddq("mm1","mm2");                # carry += a[i]*w
 236                 &movd(&DWP(0,$r),"mm1");        # r[i] = carry_low
 237                 &sub($c,1);
 238                 &psrlq("mm1",32);           # carry = carry_high
 239                 &lea($r,&DWP(4,$r));
 240                 &jnz(&label("mw_sse2_loop"));
 241 
 242                 &movd("eax","mm1");         # return carry
 243                 &emms();
 244                 &ret();
 245         &set_label("mw_non_sse2",16);
 246         }
 247 
 248         # function_begin prologue
 249         &push("ebp");
 250         &push("ebx");
 251         &push("esi");
 252         &push("edi");
 253 
 254         &comment("");
 255         $Low="eax";
 256         $High="edx";
 257         $a="ebx";
 258         $w="ecx";
 259         $r="edi";
 260         $c="esi";
 261         $num="ebp";
 262 
 263         &xor($c,$c);                # clear carry
 264         &mov($r,&wparam(0));    #
 265         &mov($a,&wparam(1));    #
 266         &mov($num,&wparam(2));  #
 267         &mov($w,&wparam(3));    #
 268 
 269         &and($num,0xfffffff8);      # num / 8
 270         &jz(&label("mw_finish"));
 271 
 272         &set_label("mw_loop",0);
 273         for ($i=0; $i<32; $i+=4)
 274                 {
 275                 &comment("Round $i");
 276 
 277                  &mov("eax",&DWP($i,$a,"",0));  # *a
 278                 &mul($w);                   # *a * w
 279                 &add("eax",$c);                     # L(t)+=c
 280                  # XXX
 281 
 282                 &adc("edx",0);                      # H(t)+=carry
 283                  &mov(&DWP($i,$r,"",0),"eax");  # *r= L(t);
 284 
 285                 &mov($c,"edx");                     # c=  H(t);
 286                 }
 287 
 288         &comment("");
 289         &add($a,32);
 290         &add($r,32);
 291         &sub($num,8);
 292         &jz(&label("mw_finish"));
 293         &jmp(&label("mw_loop"));
 294 
 295         &set_label("mw_finish",0);
 296         &mov($num,&wparam(2));  # get num
 297         &and($num,7);
 298         &jnz(&label("mw_finish2"));
 299         &jmp(&label("mw_end"));
 300 
 301         &set_label("mw_finish2",1);
 302         for ($i=0; $i<7; $i++)
 303                 {
 304                 &comment("Tail Round $i");
 305                  &mov("eax",&DWP($i*4,$a,"",0));# *a
 306                 &mul($w);                   # *a * w
 307                 &add("eax",$c);                     # L(t)+=c
 308                  # XXX
 309                 &adc("edx",0);                      # H(t)+=carry
 310                  &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
 311                 &mov($c,"edx");                     # c=  H(t);
 312                  &dec($num) if ($i != 7-1);
 313                 &jz(&label("mw_end")) if ($i != 7-1);
 314                 }
 315         &set_label("mw_end",0);
 316         &mov("eax",$c);
 317 
 318         &function_end($name);
 319         }
 320 
 321 sub bn_sqr_words
 322         {
 323         local($name)=@_;
 324 
 325         &function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
 326 
 327         $r="eax";
 328         $a="edx";
 329         $c="ecx";
 330 
 331         if ($sse2) {
 332                 &picmeup("eax","OPENSSL_ia32cap_P");
 333                 &bt(&DWP(0,"eax"),26);
 334                 &jnc(&label("sqr_non_sse2"));
 335 
 336                 &mov($r,&wparam(0));
 337                 &mov($a,&wparam(1));
 338                 &mov($c,&wparam(2));
 339 
 340         &set_label("sqr_sse2_loop",16);
 341                 &movd("mm0",&DWP(0,$a));        # mm0 = a[i]
 342                 &pmuludq("mm0","mm0");              # a[i] *= a[i]
 343                 &lea($a,&DWP(4,$a));            # a++
 344                 &movq(&QWP(0,$r),"mm0");        # r[i] = a[i]*a[i]
 345                 &sub($c,1);
 346                 &lea($r,&DWP(8,$r));            # r += 2
 347                 &jnz(&label("sqr_sse2_loop"));
 348 
 349                 &emms();
 350                 &ret();
 351         &set_label("sqr_non_sse2",16);
 352         }
 353 
 354         # function_begin prologue
 355         &push("ebp");
 356         &push("ebx");
 357         &push("esi");
 358         &push("edi");
 359 
 360         &comment("");
 361         $r="esi";
 362         $a="edi";
 363         $num="ebx";
 364 
 365         &mov($r,&wparam(0));    #
 366         &mov($a,&wparam(1));    #
 367         &mov($num,&wparam(2));  #
 368 
 369         &and($num,0xfffffff8);      # num / 8
 370         &jz(&label("sw_finish"));
 371 
 372         &set_label("sw_loop",0);
 373         for ($i=0; $i<32; $i+=4)
 374                 {
 375                 &comment("Round $i");
 376                 &mov("eax",&DWP($i,$a,"",0));   # *a
 377                  # XXX
 378                 &mul("eax");                        # *a * *a
 379                 &mov(&DWP($i*2,$r,"",0),"eax"); #
 380                  &mov(&DWP($i*2+4,$r,"",0),"edx");#
 381                 }
 382 
 383         &comment("");
 384         &add($a,32);
 385         &add($r,64);
 386         &sub($num,8);
 387         &jnz(&label("sw_loop"));
 388 
 389         &set_label("sw_finish",0);
 390         &mov($num,&wparam(2));  # get num
 391         &and($num,7);
 392         &jz(&label("sw_end"));
 393 
 394         for ($i=0; $i<7; $i++)
 395                 {
 396                 &comment("Tail Round $i");
 397                 &mov("eax",&DWP($i*4,$a,"",0)); # *a
 398                  # XXX
 399                 &mul("eax");                        # *a * *a
 400                 &mov(&DWP($i*8,$r,"",0),"eax"); #
 401                  &dec($num) if ($i != 7-1);
 402                 &mov(&DWP($i*8+4,$r,"",0),"edx");
 403                  &jz(&label("sw_end")) if ($i != 7-1);
 404                 }
 405         &set_label("sw_end",0);
 406 
 407         &function_end($name);
 408         }
 409 
 410 sub bn_div_words
 411         {
 412         local($name)=@_;
 413 
 414         &function_begin_B($name,"");
 415         &mov("edx",&wparam(0)); #
 416         &mov("eax",&wparam(1)); #
 417         &mov("ecx",&wparam(2)); #
 418         &div("ecx");
 419         &ret();
 420         &function_end_B($name);
 421         }
 422 
 423 sub bn_add_words
 424         {
 425         local($name)=@_;
 426 
 427         &function_begin($name,"");
 428 
 429         &comment("");
 430         $a="esi";
 431         $b="edi";
 432         $c="eax";
 433         $r="ebx";
 434         $tmp1="ecx";
 435         $tmp2="edx";
 436         $num="ebp";
 437 
 438         &mov($r,&wparam(0));    # get r
 439          &mov($a,&wparam(1));   # get a
 440         &mov($b,&wparam(2));    # get b
 441          &mov($num,&wparam(3)); # get num
 442         &xor($c,$c);                # clear carry
 443          &and($num,0xfffffff8);     # num / 8
 444 
 445         &jz(&label("aw_finish"));
 446 
 447         &set_label("aw_loop",0);
 448         for ($i=0; $i<8; $i++)
 449                 {
 450                 &comment("Round $i");
 451 
 452                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
 453                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
 454                 &add($tmp1,$c);
 455                  &mov($c,0);
 456                 &adc($c,$c);
 457                  &add($tmp1,$tmp2);
 458                 &adc($c,0);
 459                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
 460                 }
 461 
 462         &comment("");
 463         &add($a,32);
 464          &add($b,32);
 465         &add($r,32);
 466          &sub($num,8);
 467         &jnz(&label("aw_loop"));
 468 
 469         &set_label("aw_finish",0);
 470         &mov($num,&wparam(3));  # get num
 471         &and($num,7);
 472          &jz(&label("aw_end"));
 473 
 474         for ($i=0; $i<7; $i++)
 475                 {
 476                 &comment("Tail Round $i");
 477                 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
 478                  &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 479                 &add($tmp1,$c);
 480                  &mov($c,0);
 481                 &adc($c,$c);
 482                  &add($tmp1,$tmp2);
 483                 &adc($c,0);
 484                  &dec($num) if ($i != 6);
 485                 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
 486                  &jz(&label("aw_end")) if ($i != 6);
 487                 }
 488         &set_label("aw_end",0);
 489 
 490 #       &mov("eax",$c);             # $c is "eax"
 491 
 492         &function_end($name);
 493         }
 494 
 495 sub bn_sub_words
 496         {
 497         local($name)=@_;
 498 
 499         &function_begin($name,"");
 500 
 501         &comment("");
 502         $a="esi";
 503         $b="edi";
 504         $c="eax";
 505         $r="ebx";
 506         $tmp1="ecx";
 507         $tmp2="edx";
 508         $num="ebp";
 509 
 510         &mov($r,&wparam(0));    # get r
 511          &mov($a,&wparam(1));   # get a
 512         &mov($b,&wparam(2));    # get b
 513          &mov($num,&wparam(3)); # get num
 514         &xor($c,$c);                # clear carry
 515          &and($num,0xfffffff8);     # num / 8
 516 
 517         &jz(&label("aw_finish"));
 518 
 519         &set_label("aw_loop",0);
 520         for ($i=0; $i<8; $i++)
 521                 {
 522                 &comment("Round $i");
 523 
 524                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
 525                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
 526                 &sub($tmp1,$c);
 527                  &mov($c,0);
 528                 &adc($c,$c);
 529                  &sub($tmp1,$tmp2);
 530                 &adc($c,0);
 531                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
 532                 }
 533 
 534         &comment("");
 535         &add($a,32);
 536          &add($b,32);
 537         &add($r,32);
 538          &sub($num,8);
 539         &jnz(&label("aw_loop"));
 540 
 541         &set_label("aw_finish",0);
 542         &mov($num,&wparam(3));  # get num
 543         &and($num,7);
 544          &jz(&label("aw_end"));
 545 
 546         for ($i=0; $i<7; $i++)
 547                 {
 548                 &comment("Tail Round $i");
 549                 &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
 550                  &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 551                 &sub($tmp1,$c);
 552                  &mov($c,0);
 553                 &adc($c,$c);
 554                  &sub($tmp1,$tmp2);
 555                 &adc($c,0);
 556                  &dec($num) if ($i != 6);
 557                 &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
 558                  &jz(&label("aw_end")) if ($i != 6);
 559                 }
 560         &set_label("aw_end",0);
 561 
 562 #       &mov("eax",$c);             # $c is "eax"
 563 
 564         &function_end($name);
 565         }
 566 
 567 sub bn_sub_part_words
 568         {
 569         local($name)=@_;
 570 
 571         &function_begin($name,"");
 572 
 573         &comment("");
 574         $a="esi";
 575         $b="edi";
 576         $c="eax";
 577         $r="ebx";
 578         $tmp1="ecx";
 579         $tmp2="edx";
 580         $num="ebp";
 581 
 582         &mov($r,&wparam(0));    # get r
 583          &mov($a,&wparam(1));   # get a
 584         &mov($b,&wparam(2));    # get b
 585          &mov($num,&wparam(3)); # get num
 586         &xor($c,$c);                # clear carry
 587          &and($num,0xfffffff8);     # num / 8
 588 
 589         &jz(&label("aw_finish"));
 590 
 591         &set_label("aw_loop",0);
 592         for ($i=0; $i<8; $i++)
 593                 {
 594                 &comment("Round $i");
 595 
 596                 &mov($tmp1,&DWP($i*4,$a,"",0));         # *a
 597                  &mov($tmp2,&DWP($i*4,$b,"",0));        # *b
 598                 &sub($tmp1,$c);
 599                  &mov($c,0);
 600                 &adc($c,$c);
 601                  &sub($tmp1,$tmp2);
 602                 &adc($c,0);
 603                  &mov(&DWP($i*4,$r,"",0),$tmp1);        # *r
 604                 }
 605 
 606         &comment("");
 607         &add($a,32);
 608          &add($b,32);
 609         &add($r,32);
 610          &sub($num,8);
 611         &jnz(&label("aw_loop"));
 612 
 613         &set_label("aw_finish",0);
 614         &mov($num,&wparam(3));  # get num
 615         &and($num,7);
 616          &jz(&label("aw_end"));
 617 
 618         for ($i=0; $i<7; $i++)
 619                 {
 620                 &comment("Tail Round $i");
 621                 &mov($tmp1,&DWP(0,$a,"",0));    # *a
 622                  &mov($tmp2,&DWP(0,$b,"",0));# *b
 623                 &sub($tmp1,$c);
 624                  &mov($c,0);
 625                 &adc($c,$c);
 626                  &sub($tmp1,$tmp2);
 627                 &adc($c,0);
 628                 &mov(&DWP(0,$r,"",0),$tmp1);    # *r
 629                 &add($a, 4);
 630                 &add($b, 4);
 631                 &add($r, 4);
 632                  &dec($num) if ($i != 6);
 633                  &jz(&label("aw_end")) if ($i != 6);
 634                 }
 635         &set_label("aw_end",0);
 636 
 637         &cmp(&wparam(4),0);
 638         &je(&label("pw_end"));
 639 
 640         &mov($num,&wparam(4));  # get dl
 641         &cmp($num,0);
 642         &je(&label("pw_end"));
 643         &jge(&label("pw_pos"));
 644 
 645         &comment("pw_neg");
 646         &mov($tmp2,0);
 647         &sub($tmp2,$num);
 648         &mov($num,$tmp2);
 649         &and($num,0xfffffff8);      # num / 8
 650         &jz(&label("pw_neg_finish"));
 651 
 652         &set_label("pw_neg_loop",0);
 653         for ($i=0; $i<8; $i++)
 654         {
 655             &comment("dl<0 Round $i");
 656 
 657             &mov($tmp1,0);
 658             &mov($tmp2,&DWP($i*4,$b,"",0));     # *b
 659             &sub($tmp1,$c);
 660             &mov($c,0);
 661             &adc($c,$c);
 662             &sub($tmp1,$tmp2);
 663             &adc($c,0);
 664             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 665         }
 666 
 667         &comment("");
 668         &add($b,32);
 669         &add($r,32);
 670         &sub($num,8);
 671         &jnz(&label("pw_neg_loop"));
 672 
 673         &set_label("pw_neg_finish",0);
 674         &mov($tmp2,&wparam(4)); # get dl
 675         &mov($num,0);
 676         &sub($num,$tmp2);
 677         &and($num,7);
 678         &jz(&label("pw_end"));
 679 
 680         for ($i=0; $i<7; $i++)
 681         {
 682             &comment("dl<0 Tail Round $i");
 683             &mov($tmp1,0);
 684             &mov($tmp2,&DWP($i*4,$b,"",0));# *b
 685             &sub($tmp1,$c);
 686             &mov($c,0);
 687             &adc($c,$c);
 688             &sub($tmp1,$tmp2);
 689             &adc($c,0);
 690             &dec($num) if ($i != 6);
 691             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 692             &jz(&label("pw_end")) if ($i != 6);
 693         }
 694 
 695         &jmp(&label("pw_end"));
 696 
 697         &set_label("pw_pos",0);
 698 
 699         &and($num,0xfffffff8);      # num / 8
 700         &jz(&label("pw_pos_finish"));
 701 
 702         &set_label("pw_pos_loop",0);
 703 
 704         for ($i=0; $i<8; $i++)
 705         {
 706             &comment("dl>0 Round $i");
 707 
 708             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
 709             &sub($tmp1,$c);
 710             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 711             &jnc(&label("pw_nc".$i));
 712         }
 713 
 714         &comment("");
 715         &add($a,32);
 716         &add($r,32);
 717         &sub($num,8);
 718         &jnz(&label("pw_pos_loop"));
 719 
 720         &set_label("pw_pos_finish",0);
 721         &mov($num,&wparam(4));  # get dl
 722         &and($num,7);
 723         &jz(&label("pw_end"));
 724 
 725         for ($i=0; $i<7; $i++)
 726         {
 727             &comment("dl>0 Tail Round $i");
 728             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
 729             &sub($tmp1,$c);
 730             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 731             &jnc(&label("pw_tail_nc".$i));
 732             &dec($num) if ($i != 6);
 733             &jz(&label("pw_end")) if ($i != 6);
 734         }
 735         &mov($c,1);
 736         &jmp(&label("pw_end"));
 737 
 738         &set_label("pw_nc_loop",0);
 739         for ($i=0; $i<8; $i++)
 740         {
 741             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
 742             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 743             &set_label("pw_nc".$i,0);
 744         }
 745 
 746         &comment("");
 747         &add($a,32);
 748         &add($r,32);
 749         &sub($num,8);
 750         &jnz(&label("pw_nc_loop"));
 751 
 752         &mov($num,&wparam(4));  # get dl
 753         &and($num,7);
 754         &jz(&label("pw_nc_end"));
 755 
 756         for ($i=0; $i<7; $i++)
 757         {
 758             &mov($tmp1,&DWP($i*4,$a,"",0));     # *a
 759             &mov(&DWP($i*4,$r,"",0),$tmp1);     # *r
 760             &set_label("pw_tail_nc".$i,0);
 761             &dec($num) if ($i != 6);
 762             &jz(&label("pw_nc_end")) if ($i != 6);
 763         }
 764 
 765         &set_label("pw_nc_end",0);
 766         &mov($c,0);
 767 
 768         &set_label("pw_end",0);
 769 
 770 #       &mov("eax",$c);             # $c is "eax"
 771 
 772         &function_end($name);
 773         }