1 #!/usr/bin/env perl
   2 
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 
  10 # October 2005
  11 #
  12 # This is a "teaser" code, as it can be improved in several ways...
  13 # First of all non-SSE2 path should be implemented (yes, for now it
  14 # performs Montgomery multiplication/convolution only on SSE2-capable
  15 # CPUs such as P4, others fall down to original code). Then inner loop
  16 # can be unrolled and modulo-scheduled to improve ILP and possibly
  17 # moved to 128-bit XMM register bank (though it would require input
  18 # rearrangement and/or increase bus bandwidth utilization). Dedicated
  19 # squaring procedure should give further performance improvement...
  20 # Yet, for being draft, the code improves rsa512 *sign* benchmark by
  21 # 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
  22 
  23 # December 2006
  24 #
  25 # Modulo-scheduling SSE2 loops results in further 15-20% improvement.
  26 # Integer-only code [being equipped with dedicated squaring procedure]
  27 # gives ~40% on rsa512 sign benchmark...
  28 
  29 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # $dir = directory part of this script's own path
  30 push(@INC,"${dir}","${dir}../../perlasm");  # look for modules next to the script and in ../../perlasm
  31 require "x86asm.pl";  # presumably supplies the &mov/&DWP/&label... helpers used below -- confirm
  32 
  33 &asm_init($ARGV[0],$0);
  34 
     # The SSE2 code path is generated only when -DOPENSSL_IA32_SSE2 appears
     # among the command-line arguments.
  35 $sse2=0;
  36 for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
  37 
  38 &external_label("OPENSSL_ia32cap_P") if ($sse2);
  39 
  40 &function_begin("bn_mul_mont");
  41 
     # Symbolic register names. $ap/$tp share esi and $rp/$bp share edi, so
     # only one name of each pair is meaningful at any given point.
  42 $i="edx";
  43 $j="ecx";
  44 $ap="esi";      $tp="esi";              # overlapping variables!!!
  45 $rp="edi";      $bp="edi";              # overlapping variables!!!
  46 $np="ebp";
  47 $num="ebx";
  48 
     # Fixed slots at the bottom of the frame, addressed relative to the new
     # esp established below; the tp[] vector starts at esp+$frame.
  49 $_num=&DWP(4*0,"esp");                      # stack top layout
  50 $_rp=&DWP(4*1,"esp");
  51 $_ap=&DWP(4*2,"esp");
  52 $_bp=&DWP(4*3,"esp");
  53 $_np=&DWP(4*4,"esp");
  54 $_n0=&DWP(4*5,"esp");       $_n0q=&QWP(4*5,"esp");  # same slot, dword and qword views
  55 $_sp=&DWP(4*6,"esp");
  56 $_bpend=&DWP(4*7,"esp");
  57 $frame=32;                              # size of above frame rounded up to 16n
  58 
             # num < 4: jump straight to the exit with eax still 0.
  59         &xor        ("eax","eax");
  60         &mov        ("edi",&wparam(5)); # int num
  61         &cmp        ("edi",4);
  62         &jl (&label("just_leave"));
  63 
             # NOTE(review): the next instruction is lea, so edx receives the
             # address of the ap argument slot rather than the ap pointer
             # itself, even though the comment says "load ap" -- confirm intent.
  64         &lea        ("esi",&wparam(0)); # put aside pointer to argument block
  65         &lea        ("edx",&wparam(1)); # load ap
  66         &mov        ("ebp","esp");          # saved stack pointer!
  67         &add        ("edi",2);              # extra two words on top of tp
  68         &neg        ("edi");
  69         &lea        ("esp",&DWP(-$frame,"esp","edi",4));        # alloca($frame+4*(num+2))
  70         &neg        ("edi");                # back to +(num+2)
  71 
  72         # minimize cache contention by arranging 2K window between stack
  73         # pointer and ap argument [np is also position sensitive vector,
  74         # but it's assumed to be near ap, as it's allocated at ~same
  75         # time].
  76         &mov        ("eax","esp");
  77         &sub        ("eax","edx");
  78         &and        ("eax",2047);
  79         &sub        ("esp","eax");          # this aligns sp and ap modulo 2048
  80 
  81         &xor        ("edx","esp");
  82         &and        ("edx",2048);
  83         &xor        ("edx",2048);
  84         &sub        ("esp","edx");          # this splits them apart modulo 4096
  85 
  86         &and        ("esp",-64);            # align to cache line
  87 
  88         ################################# load argument block...
  89         &mov        ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
  90         &mov        ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
  91         &mov        ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
  92         &mov        ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
  93         &mov        ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
  94         #&mov       ("edi",&DWP(5*4,"esi"));# int num
  95 
  96         &mov        ("esi",&DWP(0,"esi"));      # pull n0[0]
  97         &mov        ($_rp,"eax");           # ... save a copy of argument block
  98         &mov        ($_ap,"ebx");
  99         &mov        ($_bp,"ecx");
 100         &mov        ($_np,"edx");
 101         &mov        ($_n0,"esi");
             # edi holds num+2 at this point, hence the -3 to obtain num-1.
 102         &lea        ($num,&DWP(-3,"edi"));      # num=num-1 to assist modulo-scheduling
 103         #&mov       ($_num,$num);           # redundant as $num is not reused
 104         &mov        ($_sp,"ebp");           # saved stack pointer!
 105 
 106 if($sse2) {
     # SSE2/MMX code path. It is emitted only when $sse2 was set at generation
     # time. At run time it is entered only if bit 26 of OPENSSL_ia32cap_P is
     # set; otherwise control jumps to the "non_sse2" label at the end of
     # this block and continues with the code generated after it.
 107 $acc0="mm0";    # mmx register bank layout
 108 $acc1="mm1";
 109 $car0="mm2";
 110 $car1="mm3";
 111 $mul0="mm4";
 112 $mul1="mm5";
 113 $temp="mm6";
 114 $mask="mm7";
 115 
 116         &picmeup("eax","OPENSSL_ia32cap_P");
 117         &bt (&DWP(0,"eax"),26);             # CF = capability bit 26
 118         &jnc        (&label("non_sse2"));       # bit clear: skip this path
 119 
 120         &mov        ("eax",-1);
 121         &movd       ($mask,"eax");          # mask 32 lower bits
 122 
 123         &mov        ($ap,$_ap);             # load input pointers
 124         &mov        ($bp,$_bp);
 125         &mov        ($np,$_np);
 126 
 127         &xor        ($i,$i);                # i=0
 128         &xor        ($j,$j);                # j=0
 129 
             # First pass (i=0): tp[] is written from scratch using bp[0].
 130         &movd       ($mul0,&DWP(0,$bp));                # bp[0]
 131         &movd       ($mul1,&DWP(0,$ap));                # ap[0]
 132         &movd       ($car1,&DWP(0,$np));                # np[0]
 133 
 134         &pmuludq($mul1,$mul0);                      # ap[0]*bp[0]
 135         &movq       ($car0,$mul1);
 136         &movq       ($acc0,$mul1);                  # I wish movd worked for
 137         &pand       ($acc0,$mask);                  # inter-register transfers
 138 
 139         &pmuludq($mul1,$_n0q);                      # *=n0
 140 
 141         &pmuludq($car1,$mul1);                      # "t[0]"*np[0]*n0
 142         &paddq      ($car1,$acc0);
 143 
 144         &movd       ($acc1,&DWP(4,$np));                # np[1]
 145         &movd       ($acc0,&DWP(4,$ap));                # ap[1]
 146 
 147         &psrlq      ($car0,32);
 148         &psrlq      ($car1,32);
 149 
 150         &inc        ($j);                           # j++
 151 &set_label("1st",16);
 152         &pmuludq($acc0,$mul0);                      # ap[j]*bp[0]
 153         &pmuludq($acc1,$mul1);                      # np[j]*m1
 154         &paddq      ($car0,$acc0);                  # +=c0
 155         &paddq      ($car1,$acc1);                  # +=c1
 156 
 157         &movq       ($acc0,$car0);
 158         &pand       ($acc0,$mask);
 159         &movd       ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
 160         &paddq      ($car1,$acc0);                  # +=ap[j]*bp[0];
 161         &movd       ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
 162         &psrlq      ($car0,32);
 163         &movd       (&DWP($frame-4,"esp",$j,4),$car1);  # tp[j-1]=
 164         &psrlq      ($car1,32);
 165 
 166         &lea        ($j,&DWP(1,$j));
 167         &cmp        ($j,$num);
 168         &jl (&label("1st"));
 169 
             # Last word of the first pass, handled outside the loop because
             # the operands for it were already loaded by the final iteration.
 170         &pmuludq($acc0,$mul0);                      # ap[num-1]*bp[0]
 171         &pmuludq($acc1,$mul1);                      # np[num-1]*m1
 172         &paddq      ($car0,$acc0);                  # +=c0
 173         &paddq      ($car1,$acc1);                  # +=c1
 174 
 175         &movq       ($acc0,$car0);
 176         &pand       ($acc0,$mask);
 177         &paddq      ($car1,$acc0);                  # +=ap[num-1]*bp[0];
 178         &movd       (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
 179 
 180         &psrlq      ($car0,32);
 181         &psrlq      ($car1,32);
 182 
 183         &paddq      ($car1,$car0);
 184         &movq       (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]
 185 
 186         &inc        ($i);                           # i++
             # Remaining passes: each iteration of "outer" folds bp[i] into the
             # existing tp[] contents.
 187 &set_label("outer");
 188         &xor        ($j,$j);                        # j=0
 189 
 190         &movd       ($mul0,&DWP(0,$bp,$i,4));   # bp[i]
 191         &movd       ($mul1,&DWP(0,$ap));                # ap[0]
 192         &movd       ($temp,&DWP($frame,"esp")); # tp[0]
 193         &movd       ($car1,&DWP(0,$np));                # np[0]
 194         &pmuludq($mul1,$mul0);                      # ap[0]*bp[i]
 195 
 196         &paddq      ($mul1,$temp);                  # +=tp[0]
 197         &movq       ($acc0,$mul1);
 198         &movq       ($car0,$mul1);
 199         &pand       ($acc0,$mask);
 200 
 201         &pmuludq($mul1,$_n0q);                      # *=n0
 202 
 203         &pmuludq($car1,$mul1);
 204         &paddq      ($car1,$acc0);
 205 
 206         &movd       ($temp,&DWP($frame+4,"esp"));       # tp[1]
 207         &movd       ($acc1,&DWP(4,$np));                # np[1]
 208         &movd       ($acc0,&DWP(4,$ap));                # ap[1]
 209 
 210         &psrlq      ($car0,32);
 211         &psrlq      ($car1,32);
 212         &paddq      ($car0,$temp);                  # +=tp[1]
 213 
 214         &inc        ($j);                           # j++
             # $num doubles as the down-counter of the inner loop; it is
             # reloaded from $j right after the loop.
 215         &dec        ($num);
 216 &set_label("inner");
 217         &pmuludq($acc0,$mul0);                      # ap[j]*bp[i]
 218         &pmuludq($acc1,$mul1);                      # np[j]*m1
 219         &paddq      ($car0,$acc0);                  # +=c0
 220         &paddq      ($car1,$acc1);                  # +=c1
 221 
 222         &movq       ($acc0,$car0);
 223         &movd       ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
 224         &pand       ($acc0,$mask);
 225         &movd       ($acc1,&DWP(4,$np,$j,4));   # np[j+1]
 226         &paddq      ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
 227         &movd       ($acc0,&DWP(4,$ap,$j,4));   # ap[j+1]
 228         &psrlq      ($car0,32);
 229         &movd       (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
 230         &psrlq      ($car1,32);
 231         &paddq      ($car0,$temp);                  # +=tp[j+1]
 232 
             # lea does not modify flags, so jnz below still tests the result
             # of this dec.
 233         &dec        ($num);
 234         &lea        ($j,&DWP(1,$j));            # j++
 235         &jnz        (&label("inner"));
 236 
 237         &mov        ($num,$j);                      # restore $num from the loop index
 238         &pmuludq($acc0,$mul0);                      # ap[num-1]*bp[i]
 239         &pmuludq($acc1,$mul1);                      # np[num-1]*m1
 240         &paddq      ($car0,$acc0);                  # +=c0
 241         &paddq      ($car1,$acc1);                  # +=c1
 242 
 243         &movq       ($acc0,$car0);
 244         &pand       ($acc0,$mask);
 245         &paddq      ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
 246         &movd       (&DWP($frame-4,"esp",$j,4),$car1);  # tp[num-2]=
 247         &psrlq      ($car0,32);
 248         &psrlq      ($car1,32);
 249 
 250         &movd       ($temp,&DWP($frame+4,"esp",$num,4));        # += tp[num]
 251         &paddq      ($car1,$car0);
 252         &paddq      ($car1,$temp);
 253         &movq       (&QWP($frame,"esp",$num,4),$car1);  # tp[num].tp[num-1]
 254 
 255         &lea        ($i,&DWP(1,$i));            # i++
 256         &cmp        ($i,$num);
 257         &jle        (&label("outer"));              # loop while i <= $num ($num holds num-1)
 258 
 259         &emms       ();                             # done with mmx bank
 260         &jmp        (&label("common_tail"));
 261 
 262 &set_label("non_sse2",16);
 263 }
 264 
 265 if (0) {       # constant false: this bail-out stub is never emitted
 266         &mov        ("esp",$_sp);
 267         &xor        ("eax","eax");  # signal "not fast enough [yet]"
 268         &jmp        (&label("just_leave"));
 269         # While the below code provides competitive performance for
 270         # all key lengths on modern Intel cores, it's still more
 271         # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
 272         # means compared to the original integer-only assembler.
 273         # 512-bit RSA sign is better by ~40%, but that's about all
 274         # one can say about all CPUs...
 275 } else {
     # Integer-only code path. It starts with a dispatch: when ap==bp and num
     # is even the dedicated squaring code at "bn_sqr_mont" is used, otherwise
     # the general multiplication loops that follow.
 276 $inp="esi";     # integer path uses these registers differently
 277 $word="edi";
 278 $carry="ebp";
 279 
 280         &mov        ($inp,$_ap);
 281         &lea        ($carry,&DWP(1,$num));              # $num holds num-1, so this is num
 282         &mov        ($word,$_bp);
 283         &xor        ($j,$j);                                # j=0
 284         &mov        ("edx",$inp);
 285         &and        ($carry,1);                             # see if num is even
 286         &sub        ("edx",$word);                          # see if ap==bp
 287         &lea        ("eax",&DWP(4,$word,$num,4));               # &bp[num]
 288         &or ($carry,"edx");                         # ZF set only if num even and ap==bp
 289         &mov        ($word,&DWP(0,$word));                      # bp[0]
             # mov leaves the flags alone, so jz still tests the result of
             # the "or" above.
 290         &jz (&label("bn_sqr_mont"));
 291         &mov        ($_bpend,"eax");
 292         &mov        ("eax",&DWP(0,$inp));
 293         &xor        ("edx","edx");
 294 
             # First pass: tp[] = ap[] * bp[0].
 295 &set_label("mull",16);
 296         &mov        ($carry,"edx");
 297         &mul        ($word);                                # ap[j]*bp[0]
 298         &add        ($carry,"eax");
 299         &lea        ($j,&DWP(1,$j));
 300         &adc        ("edx",0);
 301         &mov        ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
 302         &cmp        ($j,$num);
 303         &mov        (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
 304         &jl (&label("mull"));
 305 
 306         &mov        ($carry,"edx");
 307         &mul        ($word);                                # ap[num-1]*bp[0]
 308          &mov       ($word,$_n0);
 309         &add        ("eax",$carry);
 310          &mov       ($inp,$_np);
 311         &adc        ("edx",0);
 312          &imul      ($word,&DWP($frame,"esp"));         # n0*tp[0]
 313 
 314         &mov        (&DWP($frame,"esp",$num,4),"eax");  # tp[num-1]=
 315         &xor        ($j,$j);
 316         &mov        (&DWP($frame+4,"esp",$num,4),"edx");        # tp[num]=
 317         &mov        (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=
 318 
 319         &mov        ("eax",&DWP(0,$inp));                       # np[0]
 320         &mul        ($word);                                # np[0]*m
 321         &add        ("eax",&DWP($frame,"esp"));         # +=tp[0]
 322         &mov        ("eax",&DWP(4,$inp));                       # np[1]
 323         &adc        ("edx",0);
 324         &inc        ($j);
 325 
 326         &jmp        (&label("2ndmadd"));
 327 
             # Subsequent passes: tp[] += ap[] * bp[i].
 328 &set_label("1stmadd",16);
 329         &mov        ($carry,"edx");
 330         &mul        ($word);                                # ap[j]*bp[i]
 331         &add        ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
 332         &lea        ($j,&DWP(1,$j));
 333         &adc        ("edx",0);
 334         &add        ($carry,"eax");
 335         &mov        ("eax",&DWP(0,$inp,$j,4));          # ap[j+1]
 336         &adc        ("edx",0);
 337         &cmp        ($j,$num);
 338         &mov        (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
 339         &jl (&label("1stmadd"));
 340 
 341         &mov        ($carry,"edx");
 342         &mul        ($word);                                # ap[num-1]*bp[i]
 343         &add        ("eax",&DWP($frame,"esp",$num,4));  # +=tp[num-1]
 344          &mov       ($word,$_n0);
 345         &adc        ("edx",0);
 346          &mov       ($inp,$_np);
 347         &add        ($carry,"eax");
 348         &adc        ("edx",0);
 349          &imul      ($word,&DWP($frame,"esp"));         # n0*tp[0]
 350 
 351         &xor        ($j,$j);
 352         &add        ("edx",&DWP($frame+4,"esp",$num,4));        # carry+=tp[num]
 353         &mov        (&DWP($frame,"esp",$num,4),$carry); # tp[num-1]=
 354         &adc        ($j,0);
 355          &mov       ("eax",&DWP(0,$inp));                       # np[0]
 356         &mov        (&DWP($frame+4,"esp",$num,4),"edx");        # tp[num]=
 357         &mov        (&DWP($frame+8,"esp",$num,4),$j);   # tp[num+1]=
 358 
 359         &mul        ($word);                                # np[0]*m
 360         &add        ("eax",&DWP($frame,"esp"));         # +=tp[0]
 361         &mov        ("eax",&DWP(4,$inp));                       # np[1]
 362         &adc        ("edx",0);
 363         &mov        ($j,1);
 364 
             # Reduction step: tp[] = (tp[] + np[] * m) shifted down one word.
 365 &set_label("2ndmadd",16);
 366         &mov        ($carry,"edx");
 367         &mul        ($word);                                # np[j]*m
 368         &add        ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
 369         &lea        ($j,&DWP(1,$j));
 370         &adc        ("edx",0);
 371         &add        ($carry,"eax");
 372         &mov        ("eax",&DWP(0,$inp,$j,4));          # np[j+1]
 373         &adc        ("edx",0);
 374         &cmp        ($j,$num);
 375         &mov        (&DWP($frame-8,"esp",$j,4),$carry); # tp[j-1]=
 376         &jl (&label("2ndmadd"));
 377 
 378         &mov        ($carry,"edx");
 379         &mul        ($word);                                # np[j]*m
 380         &add        ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
 381         &adc        ("edx",0);
 382         &add        ($carry,"eax");
 383         &adc        ("edx",0);
 384         &mov        (&DWP($frame-4,"esp",$num,4),$carry);       # tp[num-2]=
 385 
 386         &xor        ("eax","eax");
 387          &mov       ($j,$_bp);                              # &bp[i]
 388         &add        ("edx",&DWP($frame+4,"esp",$num,4));        # carry+=tp[num]
 389         &adc        ("eax",&DWP($frame+8,"esp",$num,4));        # +=tp[num+1]
 390          &lea       ($j,&DWP(4,$j));
 391         &mov        (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
 392          &cmp       ($j,$_bpend);                           # reached &bp[num]?
 393         &mov        (&DWP($frame+4,"esp",$num,4),"eax");        # tp[num]=
 394         &je (&label("common_tail"));
 395 
 396         &mov        ($word,&DWP(0,$j));                 # bp[i+1]
 397         &mov        ($inp,$_ap);
 398         &mov        ($_bp,$j);                              # &bp[++i]
 399         &xor        ($j,$j);
 400         &xor        ("edx","edx");
 401         &mov        ("eax",&DWP(0,$inp));
 402         &jmp        (&label("1stmadd"));
 403 
             # Dedicated squaring path, entered from the dispatch above. On
             # this path the $_bp slot is reused to hold the outer index i.
 404 &set_label("bn_sqr_mont",16);
 405 $sbit=$num;     # $sbit aliases $num's register; the count lives in $_num meanwhile
 406         &mov        ($_num,$num);
 407         &mov        ($_bp,$j);                              # i=0
 408 
 409         &mov        ("eax",$word);                          # ap[0]
 410         &mul        ($word);                                # ap[0]*ap[0]
 411         &mov        (&DWP($frame,"esp"),"eax");         # tp[0]=
 412         &mov        ($sbit,"edx");
 413         &shr        ("edx",1);
 414         &and        ($sbit,1);
 415         &inc        ($j);
 416 &set_label("sqr",16);
 417         &mov        ("eax",&DWP(0,$inp,$j,4));          # ap[j]
 418         &mov        ($carry,"edx");
 419         &mul        ($word);                                # ap[j]*ap[0]
 420         &add        ("eax",$carry);
 421         &lea        ($j,&DWP(1,$j));
 422         &adc        ("edx",0);
 423         &lea        ($carry,&DWP(0,$sbit,"eax",2));         # 2*eax + bit shifted out previously
 424         &shr        ("eax",31);                             # top bit becomes the next $sbit
 425         &cmp        ($j,$_num);
 426         &mov        ($sbit,"eax");
 427         &mov        (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
 428         &jl (&label("sqr"));
 429 
 430         &mov        ("eax",&DWP(0,$inp,$j,4));          # ap[num-1]
 431         &mov        ($carry,"edx");
 432         &mul        ($word);                                # ap[num-1]*ap[0]
 433         &add        ("eax",$carry);
 434          &mov       ($word,$_n0);
 435         &adc        ("edx",0);
 436          &mov       ($inp,$_np);
 437         &lea        ($carry,&DWP(0,$sbit,"eax",2));
 438          &imul      ($word,&DWP($frame,"esp"));         # n0*tp[0]
 439         &shr        ("eax",31);
 440         &mov        (&DWP($frame,"esp",$j,4),$carry);   # tp[num-1]=
 441 
 442         &lea        ($carry,&DWP(0,"eax","edx",2));
 443          &mov       ("eax",&DWP(0,$inp));                       # np[0]
 444         &shr        ("edx",31);
 445         &mov        (&DWP($frame+4,"esp",$j,4),$carry); # tp[num]=
 446         &mov        (&DWP($frame+8,"esp",$j,4),"edx");  # tp[num+1]=
 447 
 448         &mul        ($word);                                # np[0]*m
 449         &add        ("eax",&DWP($frame,"esp"));         # +=tp[0]
 450         &mov        ($num,$j);                              # $sbit no longer needed; reload $num
 451         &adc        ("edx",0);
 452         &mov        ("eax",&DWP(4,$inp));                       # np[1]
 453         &mov        ($j,1);
 454 
             # Reduction step for the squaring path; handles two words of np[]
             # per iteration.
 455 &set_label("3rdmadd",16);
 456         &mov        ($carry,"edx");
 457         &mul        ($word);                                # np[j]*m
 458         &add        ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
 459         &adc        ("edx",0);
 460         &add        ($carry,"eax");
 461         &mov        ("eax",&DWP(4,$inp,$j,4));          # np[j+1]
 462         &adc        ("edx",0);
 463         &mov        (&DWP($frame-4,"esp",$j,4),$carry); # tp[j-1]=
 464 
 465         &mov        ($carry,"edx");
 466         &mul        ($word);                                # np[j+1]*m
 467         &add        ($carry,&DWP($frame+4,"esp",$j,4)); # +=tp[j+1]
 468         &lea        ($j,&DWP(2,$j));
 469         &adc        ("edx",0);
 470         &add        ($carry,"eax");
 471         &mov        ("eax",&DWP(0,$inp,$j,4));          # np[j+2]
 472         &adc        ("edx",0);
 473         &cmp        ($j,$num);
 474         &mov        (&DWP($frame-8,"esp",$j,4),$carry); # tp[j]=
 475         &jl (&label("3rdmadd"));
 476 
 477         &mov        ($carry,"edx");
 478         &mul        ($word);                                # np[j]*m
 479         &add        ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
 480         &adc        ("edx",0);
 481         &add        ($carry,"eax");
 482         &adc        ("edx",0);
 483         &mov        (&DWP($frame-4,"esp",$num,4),$carry);       # tp[num-2]=
 484 
 485         &mov        ($j,$_bp);                              # i
 486         &xor        ("eax","eax");
 487         &mov        ($inp,$_ap);
 488         &add        ("edx",&DWP($frame+4,"esp",$num,4));        # carry+=tp[num]
 489         &adc        ("eax",&DWP($frame+8,"esp",$num,4));        # +=tp[num+1]
 490         &mov        (&DWP($frame,"esp",$num,4),"edx");  # tp[num-1]=
 491         &cmp        ($j,$num);                              # all words of ap processed?
 492         &mov        (&DWP($frame+4,"esp",$num,4),"eax");        # tp[num]=
 493         &je (&label("common_tail"));
 494 
 495         &mov        ($word,&DWP(4,$inp,$j,4));          # ap[i]
 496         &lea        ($j,&DWP(1,$j));
 497         &mov        ("eax",$word);
 498         &mov        ($_bp,$j);                              # ++i
 499         &mul        ($word);                                # ap[i]*ap[i]
 500         &add        ("eax",&DWP($frame,"esp",$j,4));    # +=tp[i]
 501         &adc        ("edx",0);
 502         &mov        (&DWP($frame,"esp",$j,4),"eax");    # tp[i]=
 503         &xor        ($carry,$carry);
 504         &cmp        ($j,$num);
 505         &lea        ($j,&DWP(1,$j));                    # lea keeps the cmp flags for je
 506         &je (&label("sqrlast"));
 507 
 508         &mov        ($sbit,"edx");                          # zaps $num
 509         &shr        ("edx",1);
 510         &and        ($sbit,1);
 511 &set_label("sqradd",16);
 512         &mov        ("eax",&DWP(0,$inp,$j,4));          # ap[j]
 513         &mov        ($carry,"edx");
 514         &mul        ($word);                                # ap[j]*ap[i]
 515         &add        ("eax",$carry);
 516         &lea        ($carry,&DWP(0,"eax","eax"));           # 2*eax without touching CF
 517         &adc        ("edx",0);
 518         &shr        ("eax",31);
 519         &add        ($carry,&DWP($frame,"esp",$j,4));   # +=tp[j]
 520         &lea        ($j,&DWP(1,$j));
 521         &adc        ("eax",0);
 522         &add        ($carry,$sbit);
 523         &adc        ("eax",0);
 524         &cmp        ($j,$_num);
 525         &mov        (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
 526         &mov        ($sbit,"eax");
 527         &jle        (&label("sqradd"));
 528 
 529         &mov        ($carry,"edx");
 530         &add        ("edx","edx");
 531         &shr        ($carry,31);
 532         &add        ("edx",$sbit);
 533         &adc        ($carry,0);
 534 &set_label("sqrlast");
 535         &mov        ($word,$_n0);
 536         &mov        ($inp,$_np);
 537         &imul       ($word,&DWP($frame,"esp"));         # n0*tp[0]
 538 
 539         &add        ("edx",&DWP($frame,"esp",$j,4));    # +=tp[num]
 540         &mov        ("eax",&DWP(0,$inp));                       # np[0]
 541         &adc        ($carry,0);
 542         &mov        (&DWP($frame,"esp",$j,4),"edx");    # tp[num]=
 543         &mov        (&DWP($frame+4,"esp",$j,4),$carry); # tp[num+1]=
 544 
 545         &mul        ($word);                                # np[0]*m
 546         &add        ("eax",&DWP($frame,"esp"));         # +=tp[0]
 547         &lea        ($num,&DWP(-1,$j));                 # rebuild $num (zapped by $sbit) from $j
 548         &adc        ("edx",0);
 549         &mov        ($j,1);
 550         &mov        ("eax",&DWP(4,$inp));                       # np[1]
 551 
 552         &jmp        (&label("3rdmadd"));
 553 }
 554 
 555 &set_label("common_tail",16);
     # Final step shared by every code path above: compute rp[] = tp[] - np[],
     # then copy either tp[] or the already-written rp[] into rp[] depending
     # on the final borrow, wiping the temporary vector along the way.
 556         &mov        ($np,$_np);                     # load modulus pointer
 557         &mov        ($rp,$_rp);                     # load result pointer
 558         &lea        ($tp,&DWP($frame,"esp"));   # [$ap and $bp are zapped]
 559 
 560         &mov        ("eax",&DWP(0,$tp));                # tp[0]
 561         &mov        ($j,$num);                      # j=num-1
 562         &xor        ($i,$i);                        # i=0 and clear CF!
 563 
             # The borrow is carried from one iteration to the next in CF, so
             # only instructions that leave CF alone are used between the sbb's.
 564 &set_label("sub",16);
 565         &sbb        ("eax",&DWP(0,$np,$i,4));
 566         &mov        (&DWP(0,$rp,$i,4),"eax");   # rp[i]=tp[i]-np[i]
 567         &dec        ($j);                           # doesn't affect CF!
 568         &mov        ("eax",&DWP(4,$tp,$i,4));   # tp[i+1]
 569         &lea        ($i,&DWP(1,$i));            # i++
 570         &jge        (&label("sub"));                # tests the flags left by dec above
 571 
             # eax holds the word above the top of tp here; after the sbb it is
             # used as a mask to pick the copy source without a branch.
 572         &sbb        ("eax",0);                      # handle upmost overflow bit
 573         &and        ($tp,"eax");
 574         &not        ("eax");
 575         &mov        ($np,$rp);
 576         &and        ($np,"eax");
 577         &or ($tp,$np);                      # tp=carry?tp:rp
 578 
 579 &set_label("copy",16);                              # copy or in-place refresh
 580         &mov        ("eax",&DWP(0,$tp,$num,4));
 581         &mov        (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
 582         &mov        (&DWP($frame,"esp",$num,4),$j);     # zap temporary vector
 583         &dec        ($num);
 584         &jge        (&label("copy"));
 585 
 586         &mov        ("esp",$_sp);           # pull saved stack pointer
 587         &mov        ("eax",1);              # return value on the completed path
 588 &set_label("just_leave");
 589 &function_end("bn_mul_mont");
 590 
 591 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
 592 
 593 &asm_finish();