1 /*
   2  * inffast.S is a hand tuned assembler version of:
   3  *
   4  * inffast.c -- fast decoding
   5  * Copyright (C) 1995-2003 Mark Adler
   6  * For conditions of distribution and use, see copyright notice in zlib.h
   7  *
   8  * Copyright (C) 2003 Chris Anderson <christop@charm.net>
   9  * Please use the copyright conditions above.
  10  *
  11  * This version (Jan-23-2003) of inflate_fast was coded and tested under
  12  * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution.  On that
  13  * machine, I found that gzip style archives decompressed about 20% faster than
  14  * the gcc-3.2 -O3 -fomit-frame-pointer compiled version.  Your results will
  15  * depend on how large of a buffer is used for z_stream.next_in & next_out
  16  * (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
  17  * stream processing I/O and crc32/adler32.  In my case, this routine used
  18  * 70% of the cpu time and crc32 used 20%.
  19  *
  20  * I am confident that this version will work in the general case, but I have
  21  * not tested a wide variety of datasets or a wide variety of platforms.
  22  *
  23  * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
  24  * It should be a runtime flag instead of compile time flag...
  25  *
  26  * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
  27  * With -DUSE_MMX, only MMX code is compiled.  With -DNO_MMX, only non-MMX code
  28  * is compiled.  Without either option, runtime detection is enabled.  Runtime
  29  * detection should work on all modern cpus and the recommended algorithm (flip
  30  * ID bit on eflags and then use the cpuid instruction) is used in many
  31  * multimedia applications.  Tested under win2k with gcc-2.95 and gas-2.12
  32  * distributed with cygwin3.  Compiling with gcc-2.95 -c inffast.S -o
  33  * inffast.obj generates a COFF object which can then be linked with MSVC++
  34  * compiled code.  Tested under FreeBSD 4.7 with gcc-2.95.
  35  *
  36  * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
  37  * slower than compiler generated code).  Adjusted cpuid check to use the MMX
  38  * code only for Pentiums < P4 until I have more data on the P4.  Speed
  39  * improvement is only about 15% on the Athlon when compared with code generated
  40  * with MSVC++.  Not sure yet, but I think the P4 will also be slower using the
  41  * MMX mode because many of its x86 ALU instructions execute in .5 cycles and
  42  * have less latency than MMX ops.  Added code to buffer the last 11 bytes of
  43  * the input stream since the MMX code grabs bits in chunks of 32, which
  44  * differs from the inffast.c algorithm.  I don't think there would have been
  45  * read overruns where a page boundary was crossed (a segfault), but there
  46  * could have been overruns when next_in ends on unaligned memory (uninitialized
  47  * memory read).
  48  *
  49  * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX.  I created a C
  50  * version of the non-MMX code so that it doesn't depend on zstrm and zstate
  51  * structure offsets which are hard coded in this file.  This was last tested
  52  * with zlib-1.2.0 which is currently in beta testing, newer versions of this
  53  * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
  54  * http://www.charm.net/~christop/zlib/
  55  */
  56 
  57 
  58 /*
  59  * if you have underscore linking problems (_inflate_fast undefined), try
  60  * using -DGAS_COFF
  61  */
  62 #if ! defined( GAS_COFF ) && ! defined( GAS_ELF )
  63 
  64 #if defined( WIN32 ) || defined( __CYGWIN__ )
  65 #define GAS_COFF /* windows object format */
  66 #else
  67 #define GAS_ELF
  68 #endif
  69 
  70 #endif /* ! GAS_COFF && ! GAS_ELF */
  71 
  72 
  73 #if defined( GAS_COFF )
  74 
  75 /* coff externals have underscores */
  76 #define inflate_fast _inflate_fast
  77 #define inflate_fast_use_mmx _inflate_fast_use_mmx
  78 
  79 #endif /* GAS_COFF */
  80 
  81 
  82 .file "inffast.S"
  83 
  84 .globl inflate_fast
  85 
  86 .text
  87 .align 4,0
  88 .L_invalid_literal_length_code_msg:
  89 .string "invalid literal/length code"
  90 
  91 .align 4,0
  92 .L_invalid_distance_code_msg:
  93 .string "invalid distance code"
  94 
  95 .align 4,0
  96 .L_invalid_distance_too_far_msg:
  97 .string "invalid distance too far back"
  98 
  99 #if ! defined( NO_MMX )
 100 .align 4,0
 101 .L_mask: /* mask[N] = ( 1 << N ) - 1 */
 102 .long 0
 103 .long 1
 104 .long 3
 105 .long 7
 106 .long 15
 107 .long 31
 108 .long 63
 109 .long 127
 110 .long 255
 111 .long 511
 112 .long 1023
 113 .long 2047
 114 .long 4095
 115 .long 8191
 116 .long 16383
 117 .long 32767
 118 .long 65535
 119 .long 131071
 120 .long 262143
 121 .long 524287
 122 .long 1048575
 123 .long 2097151
 124 .long 4194303
 125 .long 8388607
 126 .long 16777215
 127 .long 33554431
 128 .long 67108863
 129 .long 134217727
 130 .long 268435455
 131 .long 536870911
 132 .long 1073741823
 133 .long 2147483647
 134 .long 4294967295
 135 #endif /* NO_MMX */
 136 
 137 .text
 138 
 139 /*
 140  * struct z_stream offsets, in zlib.h
 141  */
 142 #define next_in_strm   0   /* strm->next_in */
 143 #define avail_in_strm  4   /* strm->avail_in */
 144 #define next_out_strm  12  /* strm->next_out */
 145 #define avail_out_strm 16  /* strm->avail_out */
 146 #define msg_strm       24  /* strm->msg */
 147 #define state_strm     28  /* strm->state */
 148 
 149 /*
 150  * struct inflate_state offsets, in inflate.h
 151  */
 152 #define mode_state     0   /* state->mode */
 153 #define wsize_state    32  /* state->wsize */
 154 #define write_state    40  /* state->write */
 155 #define window_state   44  /* state->window */
 156 #define hold_state     48  /* state->hold */
 157 #define bits_state     52  /* state->bits */
 158 #define lencode_state  68  /* state->lencode */
 159 #define distcode_state 72  /* state->distcode */
 160 #define lenbits_state  76  /* state->lenbits */
 161 #define distbits_state 80  /* state->distbits */
 162 
 163 /*
 164  * inflate_fast's activation record
 165  */
 166 #define local_var_size 64 /* how much local space for vars */
 167 #define strm_sp        88 /* first arg: z_stream * (local_var_size + 24) */
 168 #define start_sp       92 /* second arg: unsigned int (local_var_size + 28) */
 169 
 170 /*
 171  * offsets for local vars on stack
 172  */
 173 #define out            60  /* unsigned char* */
 174 #define window         56  /* unsigned char* */
 175 #define wsize          52  /* unsigned int */
 176 #define write          48  /* unsigned int */
 177 #define in             44  /* unsigned char* */
 178 #define beg            40  /* unsigned char* */
 179 #define buf            28  /* char[ 12 ] */
 180 #define len            24  /* unsigned int */
 181 #define last           20  /* unsigned char* */
 182 #define end            16  /* unsigned char* */
 183 #define dcode          12  /* code* */
 184 #define lcode           8  /* code* */
 185 #define dmask           4  /* unsigned int */
 186 #define lmask           0  /* unsigned int */
 187 
 188 /*
 189  * typedef enum inflate_mode consts, in inflate.h
 190  */
 191 #define INFLATE_MODE_TYPE 11  /* state->mode flags enum-ed in inflate.h */
 192 #define INFLATE_MODE_BAD  26
 193 
 194 
 195 #if ! defined( USE_MMX ) && ! defined( NO_MMX )
 196 
 197 #define RUN_TIME_MMX
 198 
 199 #define CHECK_MMX    1
 200 #define DO_USE_MMX   2
 201 #define DONT_USE_MMX 3
 202 
 203 .globl inflate_fast_use_mmx
 204 
 205 .data
 206 
 207 .align 4,0
 208 inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */
 209 .long CHECK_MMX
 210 
 211 #if defined( GAS_ELF )
 212 /* elf info */
 213 .type   inflate_fast_use_mmx,@object
 214 .size   inflate_fast_use_mmx,4
 215 #endif
 216 
 217 #endif /* RUN_TIME_MMX */
 218 
 219 #if defined( GAS_COFF )
 220 /* coff info: scl 2 = extern, type 32 = function */
 221 .def inflate_fast; .scl 2; .type 32; .endef
 222 #endif
 223 
 224 .text
 225 
 226 .align 32,0x90
 227 inflate_fast:
 228         pushl   %edi
 229         pushl   %esi
 230         pushl   %ebp
 231         pushl   %ebx
 232         pushf   /* save eflags (strm_sp, state_sp assumes this is 32 bits) */
 233         subl    $local_var_size, %esp
 234         cld
 235 
 236 #define strm_r  %esi
 237 #define state_r %edi
 238 
 239         movl    strm_sp(%esp), strm_r
 240         movl    state_strm(strm_r), state_r
 241 
 242         /* in = strm->next_in;
 243          * out = strm->next_out;
 244          * last = in + strm->avail_in - 11;
 245          * beg = out - (start - strm->avail_out);
 246          * end = out + (strm->avail_out - 257);
 247          */
 248         movl    avail_in_strm(strm_r), %edx
 249         movl    next_in_strm(strm_r), %eax
 250 
 251         addl    %eax, %edx      /* avail_in += next_in */
 252         subl    $11, %edx       /* avail_in -= 11 */
 253 
 254         movl    %eax, in(%esp)
 255         movl    %edx, last(%esp)
 256 
 257         movl    start_sp(%esp), %ebp
 258         movl    avail_out_strm(strm_r), %ecx
 259         movl    next_out_strm(strm_r), %ebx
 260 
 261         subl    %ecx, %ebp      /* start -= avail_out */
 262         negl    %ebp            /* start = -start */
 263         addl    %ebx, %ebp      /* start += next_out */
 264 
 265         subl    $257, %ecx      /* avail_out -= 257 */
 266         addl    %ebx, %ecx      /* avail_out += out */
 267 
 268         movl    %ebx, out(%esp)
 269         movl    %ebp, beg(%esp)
 270         movl    %ecx, end(%esp)
 271 
 272         /* wsize = state->wsize;
 273          * write = state->write;
 274          * window = state->window;
 275          * hold = state->hold;
 276          * bits = state->bits;
 277          * lcode = state->lencode;
 278          * dcode = state->distcode;
 279          * lmask = ( 1 << state->lenbits ) - 1;
 280          * dmask = ( 1 << state->distbits ) - 1;
 281          */
 282 
 283         movl    lencode_state(state_r), %eax
 284         movl    distcode_state(state_r), %ecx
 285 
 286         movl    %eax, lcode(%esp)
 287         movl    %ecx, dcode(%esp)
 288 
 289         movl    $1, %eax
 290         movl    lenbits_state(state_r), %ecx
 291         shll    %cl, %eax
 292         decl    %eax
 293         movl    %eax, lmask(%esp)
 294 
 295         movl    $1, %eax
 296         movl    distbits_state(state_r), %ecx
 297         shll    %cl, %eax
 298         decl    %eax
 299         movl    %eax, dmask(%esp)
 300 
 301         movl    wsize_state(state_r), %eax
 302         movl    write_state(state_r), %ecx
 303         movl    window_state(state_r), %edx
 304 
 305         movl    %eax, wsize(%esp)
 306         movl    %ecx, write(%esp)
 307         movl    %edx, window(%esp)
 308 
 309         movl    hold_state(state_r), %ebp
 310         movl    bits_state(state_r), %ebx
 311 
 312 #undef strm_r
 313 #undef state_r
 314 
 315 #define in_r       %esi
 316 #define from_r     %esi
 317 #define out_r      %edi
 318 
 319         movl    in(%esp), in_r
 320         movl    last(%esp), %ecx
 321         cmpl    in_r, %ecx
 322         ja      .L_align_long           /* if in < last */
 323 
 324         addl    $11, %ecx               /* ecx = &in[ avail_in ] */
 325         subl    in_r, %ecx              /* ecx = avail_in */
 326         movl    $12, %eax
 327         subl    %ecx, %eax              /* eax = 12 - avail_in */
 328         leal    buf(%esp), %edi
 329         rep     movsb                   /* memcpy( buf, in, avail_in ) */
 330         movl    %eax, %ecx
 331         xorl    %eax, %eax
 332         rep     stosb         /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */
 333         leal    buf(%esp), in_r         /* in = buf */
 334         movl    in_r, last(%esp)        /* last = in, do just one iteration */
 335         jmp     .L_is_aligned
 336 
 337         /* align in_r on long boundary */
 338 .L_align_long:
 339         testl   $3, in_r
 340         jz      .L_is_aligned
 341         xorl    %eax, %eax
 342         movb    (in_r), %al
 343         incl    in_r
 344         movl    %ebx, %ecx
 345         addl    $8, %ebx
 346         shll    %cl, %eax
 347         orl     %eax, %ebp
 348         jmp     .L_align_long
 349 
 350 .L_is_aligned:
 351         movl    out(%esp), out_r
 352 
 353 #if defined( NO_MMX )
 354         jmp     .L_do_loop
 355 #endif
 356 
 357 #if defined( USE_MMX )
 358         jmp     .L_init_mmx
 359 #endif
 360 
 361 /*** Runtime MMX check ***/
 362 
 363 #if defined( RUN_TIME_MMX )
 364 .L_check_mmx:
 365         cmpl    $DO_USE_MMX, inflate_fast_use_mmx
 366         je      .L_init_mmx
 367         ja      .L_do_loop /* > 2 */
 368 
 369         pushl   %eax
 370         pushl   %ebx
 371         pushl   %ecx
 372         pushl   %edx
 373         pushf
 374         movl    (%esp), %eax      /* copy eflags to eax */
 375         xorl    $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21)
 376                                    * to see if cpu supports cpuid...
 377                                    * ID bit method not supported by NexGen but
 378                                    * bios may load a cpuid instruction and
 379                                    * cpuid may be disabled on Cyrix 5-6x86 */
 380         popf
 381         pushf
 382         popl    %edx              /* copy new eflags to edx */
 383         xorl    %eax, %edx        /* test if ID bit is flipped */
 384         jz      .L_dont_use_mmx   /* not flipped if zero */
 385         xorl    %eax, %eax
 386         cpuid
 387         cmpl    $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */
 388         jne     .L_dont_use_mmx
 389         cmpl    $0x6c65746e, %ecx
 390         jne     .L_dont_use_mmx
 391         cmpl    $0x49656e69, %edx
 392         jne     .L_dont_use_mmx
 393         movl    $1, %eax
 394         cpuid                     /* get cpu features */
 395         shrl    $8, %eax
 396         andl    $15, %eax
 397         cmpl    $6, %eax          /* check for Pentium family, is 0xf for P4 */
 398         jne     .L_dont_use_mmx
 399         testl   $0x800000, %edx   /* test if MMX feature is set (bit 23) */
 400         jnz     .L_use_mmx
 401         jmp     .L_dont_use_mmx
 402 .L_use_mmx:
 403         movl    $DO_USE_MMX, inflate_fast_use_mmx
 404         jmp     .L_check_mmx_pop
 405 .L_dont_use_mmx:
 406         movl    $DONT_USE_MMX, inflate_fast_use_mmx
 407 .L_check_mmx_pop:
 408         popl    %edx
 409         popl    %ecx
 410         popl    %ebx
 411         popl    %eax
 412         jmp     .L_check_mmx
 413 #endif
 414 
 415 
 416 /*** Non-MMX code ***/
 417 
 418 #if defined ( NO_MMX ) || defined( RUN_TIME_MMX )
 419 
 420 #define hold_r     %ebp
 421 #define bits_r     %bl
 422 #define bitslong_r %ebx
 423 
 424 .align 32,0x90
 425 .L_while_test:
 426         /* while (in < last && out < end)
 427          */
 428         cmpl    out_r, end(%esp)
 429         jbe     .L_break_loop           /* if (out >= end) */
 430 
 431         cmpl    in_r, last(%esp)
 432         jbe     .L_break_loop
 433 
 434 .L_do_loop:
 435         /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
 436          *
 437          * do {
 438          *   if (bits < 15) {
 439          *     hold |= *((unsigned short *)in)++ << bits;
 440          *     bits += 16
 441          *   }
 442          *   this = lcode[hold & lmask]
 443          */
 444         cmpb    $15, bits_r
 445         ja      .L_get_length_code      /* if (15 < bits) */
 446 
 447         xorl    %eax, %eax
 448         lodsw                           /* al = *(ushort *)in++ */
 449         movb    bits_r, %cl             /* cl = bits, needs it for shifting */
 450         addb    $16, bits_r             /* bits += 16 */
 451         shll    %cl, %eax
 452         orl     %eax, hold_r            /* hold |= *((ushort *)in)++ << bits */
 453 
 454 .L_get_length_code:
 455         movl    lmask(%esp), %edx       /* edx = lmask */
 456         movl    lcode(%esp), %ecx       /* ecx = lcode */
 457         andl    hold_r, %edx            /* edx &= hold */
 458         movl    (%ecx,%edx,4), %eax     /* eax = lcode[hold & lmask] */
 459 
 460 .L_dolen:
 461         /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out
 462          *
 463          * dolen:
 464          *    bits -= this.bits;
 465          *    hold >>= this.bits
 466          */
 467         movb    %ah, %cl                /* cl = this.bits */
 468         subb    %ah, bits_r             /* bits -= this.bits */
 469         shrl    %cl, hold_r             /* hold >>= this.bits */
 470 
 471         /* check if op is a literal
 472          * if (op == 0) {
 473          *    PUP(out) = this.val;
 474          *  }
 475          */
 476         testb   %al, %al
 477         jnz     .L_test_for_length_base /* if (op != 0) 45.7% */
 478 
 479         shrl    $16, %eax               /* output this.val char */
 480         stosb
 481         jmp     .L_while_test
 482 
 483 .L_test_for_length_base:
 484         /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len
 485          *
 486          * else if (op & 16) {
 487          *   len = this.val
 488          *   op &= 15
 489          *   if (op) {
 490          *     if (op > bits) {
 491          *       hold |= *((unsigned short *)in)++ << bits;
 492          *       bits += 16
 493          *     }
 494          *     len += hold & mask[op];
 495          *     bits -= op;
 496          *     hold >>= op;
 497          *   }
 498          */
 499 #define len_r %edx
 500         movl    %eax, len_r             /* len = this */
 501         shrl    $16, len_r              /* len = this.val */
 502         movb    %al, %cl
 503 
 504         testb   $16, %al
 505         jz      .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */
 506         andb    $15, %cl                /* op &= 15 */
 507         jz      .L_save_len             /* if (!op) */
 508         cmpb    %cl, bits_r
 509         jae     .L_add_bits_to_len      /* if (op <= bits) */
 510 
 511         movb    %cl, %ch                /* stash op in ch, freeing cl */
 512         xorl    %eax, %eax
 513         lodsw                           /* al = *(ushort *)in++ */
 514         movb    bits_r, %cl             /* cl = bits, needs it for shifting */
 515         addb    $16, bits_r             /* bits += 16 */
 516         shll    %cl, %eax
 517         orl     %eax, hold_r            /* hold |= *((ushort *)in)++ << bits */
 518         movb    %ch, %cl                /* move op back to ecx */
 519 
 520 .L_add_bits_to_len:
 521         movl    $1, %eax
 522         shll    %cl, %eax
 523         decl    %eax
 524         subb    %cl, bits_r
 525         andl    hold_r, %eax            /* eax &= hold */
 526         shrl    %cl, hold_r
 527         addl    %eax, len_r             /* len += hold & mask[op] */
 528 
 529 .L_save_len:
 530         movl    len_r, len(%esp)        /* save len */
 531 #undef  len_r
 532 
 533 .L_decode_distance:
 534         /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 535          *
 536          *   if (bits < 15) {
 537          *     hold |= *((unsigned short *)in)++ << bits;
 538          *     bits += 16
 539          *   }
 540          *   this = dcode[hold & dmask];
 541          * dodist:
 542          *   bits -= this.bits;
 543          *   hold >>= this.bits;
 544          *   op = this.op;
 545          */
 546 
 547         cmpb    $15, bits_r
 548         ja      .L_get_distance_code    /* if (15 < bits) */
 549 
 550         xorl    %eax, %eax
 551         lodsw                           /* al = *(ushort *)in++ */
 552         movb    bits_r, %cl             /* cl = bits, needs it for shifting */
 553         addb    $16, bits_r             /* bits += 16 */
 554         shll    %cl, %eax
 555         orl     %eax, hold_r            /* hold |= *((ushort *)in)++ << bits */
 556 
 557 .L_get_distance_code:
 558         movl    dmask(%esp), %edx       /* edx = dmask */
 559         movl    dcode(%esp), %ecx       /* ecx = dcode */
 560         andl    hold_r, %edx            /* edx &= hold */
 561         movl    (%ecx,%edx,4), %eax     /* eax = dcode[hold & dmask] */
 562 
 563 #define dist_r %edx
 564 .L_dodist:
 565         movl    %eax, dist_r            /* dist = this */
 566         shrl    $16, dist_r             /* dist = this.val */
 567         movb    %ah, %cl
 568         subb    %ah, bits_r             /* bits -= this.bits */
 569         shrl    %cl, hold_r             /* hold >>= this.bits */
 570 
 571         /* if (op & 16) {
 572          *   dist = this.val
 573          *   op &= 15
 574          *   if (op > bits) {
 575          *     hold |= *((unsigned short *)in)++ << bits;
 576          *     bits += 16
 577          *   }
 578          *   dist += hold & mask[op];
 579          *   bits -= op;
 580          *   hold >>= op;
 581          */
 582         movb    %al, %cl                /* cl = this.op */
 583 
 584         testb   $16, %al                /* if ((op & 16) == 0) */
 585         jz      .L_test_for_second_level_dist
 586         andb    $15, %cl                /* op &= 15 */
 587         jz      .L_check_dist_one
 588         cmpb    %cl, bits_r
 589         jae     .L_add_bits_to_dist     /* if (op <= bits) 97.6% */
 590 
 591         movb    %cl, %ch                /* stash op in ch, freeing cl */
 592         xorl    %eax, %eax
 593         lodsw                           /* al = *(ushort *)in++ */
 594         movb    bits_r, %cl             /* cl = bits, needs it for shifting */
 595         addb    $16, bits_r             /* bits += 16 */
 596         shll    %cl, %eax
 597         orl     %eax, hold_r            /* hold |= *((ushort *)in)++ << bits */
 598         movb    %ch, %cl                /* move op back to ecx */
 599 
 600 .L_add_bits_to_dist:
 601         movl    $1, %eax
 602         shll    %cl, %eax
 603         decl    %eax                    /* (1 << op) - 1 */
 604         subb    %cl, bits_r
 605         andl    hold_r, %eax            /* eax &= hold */
 606         shrl    %cl, hold_r
 607         addl    %eax, dist_r            /* dist += hold & ((1 << op) - 1) */
 608         jmp     .L_check_window
 609 
 610 .L_check_window:
 611         /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 612          *       %ecx = nbytes
 613          *
 614          * nbytes = out - beg;
 615          * if (dist <= nbytes) {
 616          *   from = out - dist;
 617          *   do {
 618          *     PUP(out) = PUP(from);
 619          *   } while (--len > 0);
 620          * }
 621          */
 622 
 623         movl    in_r, in(%esp)          /* save in so from can use it's reg */
 624         movl    out_r, %eax
 625         subl    beg(%esp), %eax         /* nbytes = out - beg */
 626 
 627         cmpl    dist_r, %eax
 628         jb      .L_clip_window          /* if (dist > nbytes) 4.2% */
 629 
 630         movl    len(%esp), %ecx
 631         movl    out_r, from_r
 632         subl    dist_r, from_r          /* from = out - dist */
 633 
 634         subl    $3, %ecx
 635         movb    (from_r), %al
 636         movb    %al, (out_r)
 637         movb    1(from_r), %al
 638         movb    2(from_r), %dl
 639         addl    $3, from_r
 640         movb    %al, 1(out_r)
 641         movb    %dl, 2(out_r)
 642         addl    $3, out_r
 643         rep     movsb
 644 
 645         movl    in(%esp), in_r          /* move in back to %esi, toss from */
 646         jmp     .L_while_test
 647 
 648 .align 16,0x90
 649 .L_check_dist_one:
 650         cmpl    $1, dist_r
 651         jne     .L_check_window
 652         cmpl    out_r, beg(%esp)
 653         je      .L_check_window
 654 
 655         decl    out_r
 656         movl    len(%esp), %ecx
 657         movb    (out_r), %al
 658         subl    $3, %ecx
 659 
 660         movb    %al, 1(out_r)
 661         movb    %al, 2(out_r)
 662         movb    %al, 3(out_r)
 663         addl    $4, out_r
 664         rep     stosb
 665 
 666         jmp     .L_while_test
 667 
 668 .align 16,0x90
 669 .L_test_for_second_level_length:
 670         /* else if ((op & 64) == 0) {
 671          *   this = lcode[this.val + (hold & mask[op])];
 672          * }
 673          */
 674         testb   $64, %al
 675         jnz     .L_test_for_end_of_block  /* if ((op & 64) != 0) */
 676 
 677         movl    $1, %eax
 678         shll    %cl, %eax
 679         decl    %eax
 680         andl    hold_r, %eax            /* eax &= hold */
 681         addl    %edx, %eax              /* eax += this.val */
 682         movl    lcode(%esp), %edx       /* edx = lcode */
 683         movl    (%edx,%eax,4), %eax     /* eax = lcode[val + (hold&mask[op])] */
 684         jmp     .L_dolen
 685 
 686 .align 16,0x90
 687 .L_test_for_second_level_dist:
 688         /* else if ((op & 64) == 0) {
 689          *   this = dcode[this.val + (hold & mask[op])];
 690          * }
 691          */
 692         testb   $64, %al
 693         jnz     .L_invalid_distance_code  /* if ((op & 64) != 0) */
 694 
 695         movl    $1, %eax
 696         shll    %cl, %eax
 697         decl    %eax
 698         andl    hold_r, %eax            /* eax &= hold */
 699         addl    %edx, %eax              /* eax += this.val */
 700         movl    dcode(%esp), %edx       /* edx = dcode */
 701         movl    (%edx,%eax,4), %eax     /* eax = dcode[val + (hold&mask[op])] */
 702         jmp     .L_dodist
 703 
 704 .align 16,0x90
 705 .L_clip_window:
 706         /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 707          *       %ecx = nbytes
 708          *
 709          * else {
 710          *   if (dist > wsize) {
 711          *     invalid distance
 712          *   }
 713          *   from = window;
 714          *   nbytes = dist - nbytes;
 715          *   if (write == 0) {
 716          *     from += wsize - nbytes;
 717          */
 718 #define nbytes_r %ecx
 719         movl    %eax, nbytes_r
 720         movl    wsize(%esp), %eax       /* prepare for dist compare */
 721         negl    nbytes_r                /* nbytes = -nbytes */
 722         movl    window(%esp), from_r    /* from = window */
 723 
 724         cmpl    dist_r, %eax
 725         jb      .L_invalid_distance_too_far /* if (dist > wsize) */
 726 
 727         addl    dist_r, nbytes_r        /* nbytes = dist - nbytes */
 728         cmpl    $0, write(%esp)
 729         jne     .L_wrap_around_window   /* if (write != 0) */
 730 
 731         subl    nbytes_r, %eax
 732         addl    %eax, from_r            /* from += wsize - nbytes */
 733 
 734         /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 735          *       %ecx = nbytes, %eax = len
 736          *
 737          *     if (nbytes < len) {
 738          *       len -= nbytes;
 739          *       do {
 740          *         PUP(out) = PUP(from);
 741          *       } while (--nbytes);
 742          *       from = out - dist;
 743          *     }
 744          *   }
 745          */
 746 #define len_r %eax
 747         movl    len(%esp), len_r
 748         cmpl    nbytes_r, len_r
 749         jbe     .L_do_copy1             /* if (nbytes >= len) */
 750 
 751         subl    nbytes_r, len_r         /* len -= nbytes */
 752         rep     movsb
 753         movl    out_r, from_r
 754         subl    dist_r, from_r          /* from = out - dist */
 755         jmp     .L_do_copy1
 756 
 757         cmpl    nbytes_r, len_r
 758         jbe     .L_do_copy1             /* if (nbytes >= len) */
 759 
 760         subl    nbytes_r, len_r         /* len -= nbytes */
 761         rep     movsb
 762         movl    out_r, from_r
 763         subl    dist_r, from_r          /* from = out - dist */
 764         jmp     .L_do_copy1
 765 
 766 .L_wrap_around_window:
 767         /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 768          *       %ecx = nbytes, %eax = write, %eax = len
 769          *
 770          *   else if (write < nbytes) {
 771          *     from += wsize + write - nbytes;
 772          *     nbytes -= write;
 773          *     if (nbytes < len) {
 774          *       len -= nbytes;
 775          *       do {
 776          *         PUP(out) = PUP(from);
 777          *       } while (--nbytes);
 778          *       from = window;
 779          *       nbytes = write;
 780          *       if (nbytes < len) {
 781          *         len -= nbytes;
 782          *         do {
 783          *           PUP(out) = PUP(from);
 784          *         } while(--nbytes);
 785          *         from = out - dist;
 786          *       }
 787          *     }
 788          *   }
 789          */
 790 #define write_r %eax
 791         movl    write(%esp), write_r
 792         cmpl    write_r, nbytes_r
 793         jbe     .L_contiguous_in_window /* if (write >= nbytes) */
 794 
 795         addl    wsize(%esp), from_r
 796         addl    write_r, from_r
 797         subl    nbytes_r, from_r        /* from += wsize + write - nbytes */
 798         subl    write_r, nbytes_r       /* nbytes -= write */
 799 #undef write_r
 800 
 801         movl    len(%esp), len_r
 802         cmpl    nbytes_r, len_r
 803         jbe     .L_do_copy1             /* if (nbytes >= len) */
 804 
 805         subl    nbytes_r, len_r         /* len -= nbytes */
 806         rep     movsb
 807         movl    window(%esp), from_r    /* from = window */
 808         movl    write(%esp), nbytes_r   /* nbytes = write */
 809         cmpl    nbytes_r, len_r
 810         jbe     .L_do_copy1             /* if (nbytes >= len) */
 811 
 812         subl    nbytes_r, len_r         /* len -= nbytes */
 813         rep     movsb
 814         movl    out_r, from_r
 815         subl    dist_r, from_r          /* from = out - dist */
 816         jmp     .L_do_copy1
 817 
 818 .L_contiguous_in_window:
 819         /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist
 820          *       %ecx = nbytes, %eax = write, %eax = len
 821          *
 822          *   else {
 823          *     from += write - nbytes;
 824          *     if (nbytes < len) {
 825          *       len -= nbytes;
 826          *       do {
 827          *         PUP(out) = PUP(from);
 828          *       } while (--nbytes);
 829          *       from = out - dist;
 830          *     }
 831          *   }
 832          */
 833 #define write_r %eax
 834         addl    write_r, from_r
 835         subl    nbytes_r, from_r        /* from += write - nbytes */
 836 #undef write_r
 837 
 838         movl    len(%esp), len_r
 839         cmpl    nbytes_r, len_r
 840         jbe     .L_do_copy1             /* if (nbytes >= len) */
 841 
 842         subl    nbytes_r, len_r         /* len -= nbytes */
 843         rep     movsb
 844         movl    out_r, from_r
 845         subl    dist_r, from_r          /* from = out - dist */
 846 
 847 .L_do_copy1:
 848         /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out
 849          *       %eax = len
 850          *
 851          *     while (len > 0) {
 852          *       PUP(out) = PUP(from);
 853          *       len--;
 854          *     }
 855          *   }
 856          * } while (in < last && out < end);
 857          */
 858 #undef nbytes_r
 859 #define in_r %esi
 860         movl    len_r, %ecx
 861         rep     movsb
 862 
 863         movl    in(%esp), in_r          /* move in back to %esi, toss from */
 864         jmp     .L_while_test
 865 
 866 #undef len_r
 867 #undef dist_r
 868 
 869 #endif /* NO_MMX || RUN_TIME_MMX */
 870 
 871 
 872 /*** MMX code ***/
 873 
 874 #if defined( USE_MMX ) || defined( RUN_TIME_MMX )
 875 
 876 .align 32,0x90
 877 .L_init_mmx:
             /* Entry to the MMX decode loop.  Moves the bit accumulator from
              * %ebp into %mm0 and the bit count from %ebx into %ebp, loads the
              * length and distance masks into MMX registers, and enters the
              * loop at .L_do_loop_mmx with %ebx = lcode.
              */
 878         emms                            /* mark the x87/MMX register file empty */
 879 
 880 #undef  bits_r
 881 #undef  bitslong_r
 882 #define bitslong_r %ebp
 883 #define hold_mm    %mm0
 884         movd    %ebp, hold_mm           /* hold_mm = hold (was in %ebp) */
 885         movl    %ebx, bitslong_r        /* bits now lives in %ebp */
 886 
 887 #define used_mm   %mm1
 888 #define dmask2_mm %mm2
 889 #define lmask2_mm %mm3
 890 #define lmask_mm  %mm4
 891 #define dmask_mm  %mm5
 892 #define tmp_mm    %mm6
 893 
             /* lmask2_mm / dmask2_mm keep untouched copies of the masks: the
              * loop ANDs hold into lmask_mm / dmask_mm (destroying them) and
              * then restores them from these copies.
              */
 894         movd    lmask(%esp), lmask_mm
 895         movq    lmask_mm, lmask2_mm
 896         movd    dmask(%esp), dmask_mm
 897         movq    dmask_mm, dmask2_mm
 898         pxor    used_mm, used_mm        /* no bits pending: first psrlq shifts by 0 */
 899         movl    lcode(%esp), %ebx       /* ebx = lcode */
 900         jmp     .L_do_loop_mmx
 901 
 902 .align 32,0x90
 903 .L_while_test_mmx:
 904         /* while (in < last && out < end)
              *
              * Loop condition of the MMX decode loop; either test failing
              * leaves through .L_break_loop, otherwise execution falls into
              * .L_do_loop_mmx.
 905          */
 906         cmpl    out_r, end(%esp)
 907         jbe     .L_break_loop           /* if (out >= end) */
 908 
 909         cmpl    in_r, last(%esp)
 910         jbe     .L_break_loop           /* if (in >= last) */
 911 
 912 .L_do_loop_mmx:
             /* Top of the MMX decode loop: drop the bits consumed by the
              * previous code, refill hold_mm with 32 more input bits when 32
              * or fewer remain, look up the next literal/length code and, if
              * it is a literal (op == 0), store it and loop.
              * regs: %ebx = lcode, %ebp = bits, %esi = in, %edi = out
              */
 913         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
 914 
 915         cmpl    $32, bitslong_r
 916         ja      .L_get_length_code_mmx  /* if (32 < bits) */
 917 
 918         movd    bitslong_r, tmp_mm      /* shift count = current bit count */
 919         movd    (in_r), %mm7            /* fetch 4 input bytes */
 920         addl    $4, in_r
 921         psllq   tmp_mm, %mm7
 922         addl    $32, bitslong_r
 923         por     %mm7, hold_mm           /* hold_mm |= *((uint *)in)++ << bits */
 924 
 925 .L_get_length_code_mmx:
 926         pand    hold_mm, lmask_mm       /* lmask_mm = hold & lmask (clobbers mask) */
 927         movd    lmask_mm, %eax
 928         movq    lmask2_mm, lmask_mm     /* restore lmask from its spare copy */
 929         movl    (%ebx,%eax,4), %eax     /* eax = lcode[hold & lmask] */
 930 
 931 .L_dolen_mmx:
             /* eax = table entry: %al = op, %ah = bits, upper 16 bits = val */
 932         movzbl  %ah, %ecx               /* ecx = this.bits */
 933         movd    %ecx, used_mm           /* consumed lazily by the next psrlq */
 934         subl    %ecx, bitslong_r        /* bits -= this.bits */
 935 
 936         testb   %al, %al
 937         jnz     .L_test_for_length_base_mmx /* if (op != 0) 45.7% */
 938 
 939         shrl    $16, %eax               /* output this.val char */
 940         stosb                           /* store %al at out, advance out */
 941         jmp     .L_while_test_mmx
 942 
 943 .L_test_for_length_base_mmx:
             /* op != 0: the entry is a length base (op & 16) or something
              * handled at .L_test_for_second_level_length_mmx.  For a length
              * base, op & 15 is the number of extra bits to add to the length.
              */
 944 #define len_r  %edx
 945         movl    %eax, len_r             /* len = this */
 946         shrl    $16, len_r              /* len = this.val */
 947 
 948         testb   $16, %al
 949         jz      .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */
 950         andl    $15, %eax               /* op &= 15 */
 951         jz      .L_decode_distance_mmx  /* if (!op) */
 952 
 953         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
 954         movd    %eax, used_mm           /* the op extra bits are now pending */
 955         movd    hold_mm, %ecx           /* low 32 bits of hold */
 956         subl    %eax, bitslong_r        /* bits -= op */
 957         andl    .L_mask(,%eax,4), %ecx
 958         addl    %ecx, len_r             /* len += hold & mask[op] */
 959 
 960 .L_decode_distance_mmx:
             /* Length is complete in len_r.  Drop the pending bits, refill
              * hold_mm when 32 or fewer bits remain, then look up the
              * distance code.  %ebx is switched from lcode to dcode here.
              */
 961         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
 962 
 963         cmpl    $32, bitslong_r
 964         ja      .L_get_dist_code_mmx    /* if (32 < bits) */
 965 
 966         movd    bitslong_r, tmp_mm      /* shift count = current bit count */
 967         movd    (in_r), %mm7            /* fetch 4 input bytes */
 968         addl    $4, in_r
 969         psllq   tmp_mm, %mm7
 970         addl    $32, bitslong_r
 971         por     %mm7, hold_mm           /* hold_mm |= *((uint *)in)++ << bits */
 972 
 973 .L_get_dist_code_mmx:
 974         movl    dcode(%esp), %ebx       /* ebx = dcode */
 975         pand    hold_mm, dmask_mm       /* dmask_mm = hold & dmask (clobbers mask) */
 976         movd    dmask_mm, %eax
 977         movq    dmask2_mm, dmask_mm     /* restore dmask from its spare copy */
 978         movl    (%ebx,%eax,4), %eax     /* eax = dcode[hold & dmask] */
 979 
 980 .L_dodist_mmx:
             /* eax = distance table entry (%al = op, %ah = bits, upper 16
              * bits = val).  From here %ebx holds dist instead of the table
              * pointer.  A distance base (op & 16) with extra bits has them
              * added below; one with no extra bits goes to
              * .L_check_dist_one_mmx.
              */
 981 #define dist_r %ebx
 982         movzbl  %ah, %ecx               /* ecx = this.bits */
 983         movl    %eax, dist_r
 984         shrl    $16, dist_r             /* dist  = this.val */
 985         subl    %ecx, bitslong_r        /* bits -= this.bits */
 986         movd    %ecx, used_mm           /* consumed lazily by the next psrlq */
 987 
 988         testb   $16, %al                /* if ((op & 16) == 0) */
 989         jz      .L_test_for_second_level_dist_mmx
 990         andl    $15, %eax               /* op &= 15 */
 991         jz      .L_check_dist_one_mmx
 992 
 993 .L_add_bits_to_dist_mmx:
 994         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
 995         movd    %eax, used_mm           /* save bit length of current op */
 996         movd    hold_mm, %ecx           /* get the next bits on input stream */
 997         subl    %eax, bitslong_r        /* bits -= op bits */
 998         andl    .L_mask(,%eax,4), %ecx  /* ecx   = hold & mask[op] */
 999         addl    %ecx, dist_r            /* dist += hold & mask[op] */
1000 
1001 .L_check_window_mmx:
             /* len and dist are known.  If the whole match source lies in
              * output already produced by this call (dist <= out - beg), copy
              * it directly; otherwise go to .L_clip_window_mmx with
              * %eax = out - beg.  %esi is borrowed for from, so in is parked
              * on the stack first.
              * NOTE(review): the unrolled 3-byte copy and the subl $3 assume
              * len >= 3 (presumably the minimum deflate match length) -
              * confirm against the table builder.
              */
1002         movl    in_r, in(%esp)          /* save in so from can use it's reg */
1003         movl    out_r, %eax
1004         subl    beg(%esp), %eax         /* nbytes = out - beg */
1005 
1006         cmpl    dist_r, %eax
1007         jb      .L_clip_window_mmx      /* if (dist > nbytes) 4.2% */
1008 
1009         movl    len_r, %ecx             /* len -> %ecx; %edx is free from here */
1010         movl    out_r, from_r
1011         subl    dist_r, from_r          /* from = out - dist */
1012 
1013         subl    $3, %ecx                /* first 3 bytes are copied by hand */
1014         movb    (from_r), %al
1015         movb    %al, (out_r)            /* store byte 0 before loading byte 1 */
1016         movb    1(from_r), %al
1017         movb    2(from_r), %dl          /* %dl is the low byte of len_r (%edx) */
1018         addl    $3, from_r
1019         movb    %al, 1(out_r)
1020         movb    %dl, 2(out_r)
1021         addl    $3, out_r
1022         rep     movsb                   /* remaining len - 3 bytes */
1023 
1024         movl    in(%esp), in_r          /* move in back to %esi, toss from */
1025         movl    lcode(%esp), %ebx       /* move lcode back to %ebx, toss dist */
1026         jmp     .L_while_test_mmx
1027 
1028 .align 16,0x90
1029 .L_check_dist_one_mmx:
             /* Special case for dist == 1 with at least one byte already
              * written (out != beg): the match is a run of the previous
              * output byte, so it is written with stores and rep stosb
              * instead of a byte copy.  %esi is never borrowed on this path,
              * so in does not need to be saved or restored.
              * NOTE(review): like the general copy, this assumes len >= 3.
              */
1030         cmpl    $1, dist_r
1031         jne     .L_check_window_mmx
1032         cmpl    out_r, beg(%esp)
1033         je      .L_check_window_mmx     /* nothing written yet: general path */
1034 
1035         decl    out_r                   /* out -> previous output byte */
1036         movl    len_r, %ecx
1037         movb    (out_r), %al            /* al = byte to replicate */
1038         subl    $3, %ecx                /* first 3 copies are stored by hand */
1039 
1040         movb    %al, 1(out_r)
1041         movb    %al, 2(out_r)
1042         movb    %al, 3(out_r)
1043         addl    $4, out_r               /* net effect: out advanced by 3 */
1044         rep     stosb                   /* remaining len - 3 copies of %al */
1045 
1046         movl    lcode(%esp), %ebx       /* move lcode back to %ebx, toss dist */
1047         jmp     .L_while_test_mmx
1048 
1049 .align 16,0x90
1050 .L_test_for_second_level_length_mmx:
             /* Length entry without bit 16 set.  If bit 64 is set it is an
              * end-of-block or invalid code; otherwise the entry links to a
              * second-level table: index lcode with this.val plus the next
              * op bits of hold and decode the entry found there.
              * regs: %eax = entry, %edx = this.val (len_r), %ebx = lcode
              */
1051         testb   $64, %al
1052         jnz     .L_test_for_end_of_block  /* if ((op & 64) != 0) */
1053 
1054         andl    $15, %eax
1055         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
1056         movd    hold_mm, %ecx
1057         andl    .L_mask(,%eax,4), %ecx  /* ecx = hold & mask[op] */
1058         addl    len_r, %ecx             /* ecx += this.val */
1059         movl    (%ebx,%ecx,4), %eax     /* eax = lcode[val + (hold & mask[op])] */
1060         jmp     .L_dolen_mmx
1061 
1062 .align 16,0x90
1063 .L_test_for_second_level_dist_mmx:
             /* Distance entry without bit 16 set.  If bit 64 is set the code
              * is invalid; otherwise the entry links to a second-level table:
              * index dcode with this.val plus the next op bits of hold and
              * decode the entry found there.
              * regs: %eax = entry, %ebx = this.val (dist_r)
              */
1064         testb   $64, %al
1065         jnz     .L_invalid_distance_code  /* if ((op & 64) != 0) */
1066 
1067         andl    $15, %eax
1068         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
1069         movd    hold_mm, %ecx
1070         andl    .L_mask(,%eax,4), %ecx  /* ecx = hold & mask[op] */
1071         movl    dcode(%esp), %eax       /* eax = dcode */
1072         addl    dist_r, %ecx            /* ecx += this.val */
1073         movl    (%eax,%ecx,4), %eax     /* eax = dcode[val + (hold & mask[op])] */
1074         jmp     .L_dodist_mmx
1075 
1076 .align 16,0x90
1077 .L_clip_window_mmx:
             /* Reached from .L_check_window_mmx when dist > out - beg, i.e.
              * part of the match source lies in the sliding window rather
              * than in output written by this call.
              *
              * in:   %eax = out - beg, %ebx = dist, %edx = len, %edi = out
              * out:  from_r (%esi) = window address to start copying from,
              *       nbytes_r (%ecx) = dist - (out - beg) = number of bytes
              *       that must come from the window
              *
              * Distances larger than wsize are rejected.  With write != 0
              * the window may wrap and .L_wrap_around_window_mmx takes over.
              * With write == 0 the needed bytes are the last nbytes of the
              * window: copy them (or just len of them), then continue from
              * out - dist in .L_do_copy1_mmx.
              */
1078 #define nbytes_r %ecx
1079         movl    %eax, nbytes_r
1080         movl    wsize(%esp), %eax       /* prepare for dist compare */
1081         negl    nbytes_r                /* nbytes = -nbytes */
1082         movl    window(%esp), from_r    /* from = window */
1083 
1084         cmpl    dist_r, %eax
1085         jb      .L_invalid_distance_too_far /* if (dist > wsize) */
1086 
1087         addl    dist_r, nbytes_r        /* nbytes = dist - nbytes */
1088         cmpl    $0, write(%esp)
1089         jne     .L_wrap_around_window_mmx /* if (write != 0) */
1090 
1091         subl    nbytes_r, %eax
1092         addl    %eax, from_r            /* from += wsize - nbytes */
1093 
1094         cmpl    nbytes_r, len_r
1095         jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */
1096 
1097         subl    nbytes_r, len_r         /* len -= nbytes */
1098         rep     movsb                   /* copy the nbytes window bytes */
1099         movl    out_r, from_r
1100         subl    dist_r, from_r          /* from = out - dist */
1101         jmp     .L_do_copy1_mmx
1111 
1112 .L_wrap_around_window_mmx:
             /* write != 0.  If write >= nbytes the needed bytes are
              * contiguous (handled at .L_contiguous_in_window_mmx).
              * Otherwise the source is split in two: first the
              * nbytes - write bytes at the end of the window, then up to
              * write bytes from the start of the window, and finally any
              * remainder from out - dist.
              * regs: %esi = from (= window on entry), %ecx = nbytes,
              *       %edx = len, %ebx = dist, %edi = out
              */
1113 #define write_r %eax
1114         movl    write(%esp), write_r
1115         cmpl    write_r, nbytes_r
1116         jbe     .L_contiguous_in_window_mmx /* if (write >= nbytes) */
1117 
1118         addl    wsize(%esp), from_r
1119         addl    write_r, from_r
1120         subl    nbytes_r, from_r        /* from += wsize + write - nbytes */
1121         subl    write_r, nbytes_r       /* nbytes -= write */
1122 #undef write_r
1123 
1124         cmpl    nbytes_r, len_r
1125         jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */
1126 
1127         subl    nbytes_r, len_r         /* len -= nbytes */
1128         rep     movsb                   /* copy the tail-of-window part */
1129         movl    window(%esp), from_r    /* from = window */
1130         movl    write(%esp), nbytes_r   /* nbytes = write */
1131         cmpl    nbytes_r, len_r
1132         jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */
1133 
1134         subl    nbytes_r, len_r         /* len -= nbytes */
1135         rep     movsb                   /* copy the start-of-window part */
1136         movl    out_r, from_r
1137         subl    dist_r, from_r          /* from = out - dist */
1138         jmp     .L_do_copy1_mmx
1139 
1140 .L_contiguous_in_window_mmx:
             /* write >= nbytes: the needed window bytes form one run starting
              * at window + write - nbytes.  %eax still holds write, loaded at
              * .L_wrap_around_window_mmx (the only branch to this label
              * visible in this section).  Copy the run if the match is longer
              * than it, then continue from out - dist; otherwise
              * .L_do_copy1_mmx copies all len bytes.
              */
1141 #define write_r %eax
1142         addl    write_r, from_r
1143         subl    nbytes_r, from_r        /* from += write - nbytes */
1144 #undef write_r
1145 
1146         cmpl    nbytes_r, len_r
1147         jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */
1148 
1149         subl    nbytes_r, len_r         /* len -= nbytes */
1150         rep     movsb                   /* copy the nbytes window bytes */
1151         movl    out_r, from_r
1152         subl    dist_r, from_r          /* from = out - dist */
1153 
1154 .L_do_copy1_mmx:
             /* Last stage of a window copy in the MMX path: copy the
              * remaining len bytes from from to out, then give %esi back to
              * in and %ebx back to lcode before returning to the loop test.
              */
1155 #undef nbytes_r
1156 #define in_r %esi
1157         movl    len_r, %ecx             /* rep count = remaining len */
1158         rep     movsb
1159 
1160         movl    in(%esp), in_r          /* move in back to %esi, toss from */
1161         movl    lcode(%esp), %ebx       /* move lcode back to %ebx, toss dist */
1162         jmp     .L_while_test_mmx
1163 
1164 #undef hold_r
1165 #undef bitslong_r
1166 
1167 #endif /* USE_MMX || RUN_TIME_MMX */
1168 
1169 
1170 /*** USE_MMX, NO_MMX, and RUN_TIME_MMX from here on ***/
1171 
1172 .L_invalid_distance_code:
1173         /* else {
1174          *   strm->msg = "invalid distance code";
1175          *   state->mode = BAD;
1176          * }
              *
              * Hands the message pointer (%ecx) and the new mode (%edx) to
              * .L_update_stream_state.
1177          */
1178         movl    $.L_invalid_distance_code_msg, %ecx
1179         movl    $INFLATE_MODE_BAD, %edx
1180         jmp     .L_update_stream_state
1181 
1182 .L_test_for_end_of_block:
1183         /* else if (op & 32) {
1184          *   state->mode = TYPE;
1185          *   break;
1186          * }
              *
              * %al = op of the current literal/length entry.  Without bit 32
              * the code is invalid.  With it, this is end-of-block: set the
              * mode to TYPE and pass a NULL message so that
              * .L_update_stream_state leaves strm->msg untouched.
1187          */
1188         testb   $32, %al
1189         jz      .L_invalid_literal_length_code  /* if ((op & 32) == 0) */
1190 
             /* Flags are dead here: .L_update_stream_state re-tests %ecx. */
1191         xorl    %ecx, %ecx              /* msg = NULL */
1192         movl    $INFLATE_MODE_TYPE, %edx
1193         jmp     .L_update_stream_state
1194 
1195 .L_invalid_literal_length_code:
1196         /* else {
1197          *   strm->msg = "invalid literal/length code";
1198          *   state->mode = BAD;
1199          * }
              *
              * Hands the message pointer (%ecx) and the new mode (%edx) to
              * .L_update_stream_state.
1200          */
1201         movl    $.L_invalid_literal_length_code_msg, %ecx
1202         movl    $INFLATE_MODE_BAD, %edx
1203         jmp     .L_update_stream_state
1204 
1205 .L_invalid_distance_too_far:
1206         /* strm->msg = "invalid distance too far back";
1207          * state->mode = BAD;
              *
              * Reached from the window-clipping code after %esi has been
              * loaded with from, so in is restored from its stack slot before
              * the shared exit code uses it.
1208          */
1209         movl    in(%esp), in_r          /* from_r has in's reg, put in back */
1210         movl    $.L_invalid_distance_too_far_msg, %ecx
1211         movl    $INFLATE_MODE_BAD, %edx
1212         jmp     .L_update_stream_state
1213 
1214 .L_update_stream_state:
1215         /* set strm->msg = %ecx, strm->state->mode = %edx
              *
              * in:  %ecx = message pointer, or 0 to leave strm->msg unchanged
              *      %edx = new value for state->mode
              * Clobbers %eax, then continues at .L_break_loop.
              */
1216         movl    strm_sp(%esp), %eax
1217         testl   %ecx, %ecx              /* if (msg != NULL) */
1218         jz      .L_skip_msg
1219         movl    %ecx, msg_strm(%eax)    /* strm->msg = msg */
1220 .L_skip_msg:
1221         movl    state_strm(%eax), %eax  /* state = strm->state */
1222         movl    %edx, mode_state(%eax)  /* state->mode = edx (BAD | TYPE) */
1223         jmp     .L_break_loop           /* jump over the alignment padding */
1224 
1225 .align 32,0x90
1226 .L_break_loop:
1227 
1228 /*
1229  * Regs:
1230  *
1231  * bits = %ebp when mmx, and in %ebx when non-mmx
1232  * hold = hold_mm (%mm0) when mmx, and in %ebp when non-mmx
1233  * in   = %esi
1234  * out  = %edi
      *
      * Common exit of both decode loops.  The write-back code that follows
      * expects bits in %ebx, so the MMX path copies it over from %ebp here;
      * with RUN_TIME_MMX the copy is skipped when the non-MMX loop was used.
1235  */
1236 
1237 #if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1238 
1239 #if defined( RUN_TIME_MMX )
1240 
1241         cmpl    $DO_USE_MMX, inflate_fast_use_mmx
1242         jne     .L_update_next_in       /* non-MMX loop: bits already in %ebx */
1243 
1244 #endif /* RUN_TIME_MMX */
1245 
1246         movl    %ebp, %ebx              /* bits: %ebp -> %ebx */
1247 
1248 .L_update_next_in:
1249 
1250 #endif
1251 
1252 #define strm_r  %eax
1253 #define state_r %edx
1254 
1255         /* len = bits >> 3;
1256          * in -= len;
1257          * bits -= len << 3;
1258          * hold &= (1U << bits) - 1;
1259          * state->hold = hold;
1260          * state->bits = bits;
1261          * strm->next_in = in;
1262          * strm->next_out = out;
              *
              * Returns whole unused bytes from the bit accumulator to the
              * input, then stores next_out, bits and next_in.  On exit from
              * this part %ebx = (1 << bits) - 1, the mask applied to hold
              * further down.
              * regs: %ebx = bits on entry, %esi = in, %edi = out
1263          */
1264         movl    strm_sp(%esp), strm_r
1265         movl    %ebx, %ecx
1266         movl    state_strm(strm_r), state_r
1267         shrl    $3, %ecx                /* len = bits >> 3 */
1268         subl    %ecx, in_r              /* in -= len */
1269         shll    $3, %ecx
1270         subl    %ecx, %ebx              /* bits -= len << 3 */
1271         movl    out_r, next_out_strm(strm_r)
1272         movl    %ebx, bits_state(state_r)
1273         movl    %ebx, %ecx              /* keep bits in %cl for the shift below */
1274 
             /* If last points at the local buf, in is an address inside buf:
              * rebase it onto strm->next_in and recompute last from the
              * stream.  NOTE(review): presumably the prologue copies short
              * input into buf; that code is not visible in this section.
              */
1275         leal    buf(%esp), %ebx
1276         cmpl    %ebx, last(%esp)
1277         jne     .L_buf_not_used         /* if buf != last */
1278 
1279         subl    %ebx, in_r              /* in -= buf */
1280         movl    next_in_strm(strm_r), %ebx
1281         movl    %ebx, last(%esp)        /* last = strm->next_in */
1282         addl    %ebx, in_r              /* in += strm->next_in */
1283         movl    avail_in_strm(strm_r), %ebx
1284         subl    $11, %ebx
1285         addl    %ebx, last(%esp)    /* last = &strm->next_in[ avail_in - 11 ] */
1286 
1287 .L_buf_not_used:
1288         movl    in_r, next_in_strm(strm_r)
1289 
1290         movl    $1, %ebx
1291         shll    %cl, %ebx
1292         decl    %ebx                    /* ebx = (1 << bits) - 1 */
1293 
1294 #if defined( USE_MMX ) || defined( RUN_TIME_MMX )
1295 
1296 #if defined( RUN_TIME_MMX )
1297 
1298         cmpl    $DO_USE_MMX, inflate_fast_use_mmx
1299         jne     .L_update_hold          /* non-MMX loop: hold already in %ebp */
1300 
1301 #endif /* RUN_TIME_MMX */
1302 
             /* MMX path: apply the still-pending shift, bring the low 32 bits
              * of the accumulator back into %ebp, and release the MMX state.
              */
1303         psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
1304         movd    hold_mm, %ebp
1305 
1306         emms
1307 
1308 .L_update_hold:
1309 
1310 #endif /* USE_MMX || RUN_TIME_MMX */
1311 
1312         andl    %ebx, %ebp              /* hold &= (1U << bits) - 1 */
1313         movl    %ebp, hold_state(state_r)
1314 
1315 #define last_r %ebx
1316 
1317         /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last)
              *
              * The two branches compute the same value, 11 + last - in; they
              * are split so that each subtraction is done on the larger
              * operand first.  Both continue at .L_fixup_out.
              */
1318         movl    last(%esp), last_r
1319         cmpl    in_r, last_r
1320         jbe     .L_last_is_smaller     /* if (in >= last) */
1321 
1322         subl    in_r, last_r           /* last -= in */
1323         addl    $11, last_r            /* last += 11 */
1324         movl    last_r, avail_in_strm(strm_r)
1325         jmp     .L_fixup_out
1326 .L_last_is_smaller:
1327         subl    last_r, in_r           /* in -= last */
1328         negl    in_r                   /* in = -in */
1329         addl    $11, in_r              /* in += 11 */
1330         movl    in_r, avail_in_strm(strm_r)
1331 
1332 #undef last_r
1333 #define end_r %ebx
1334 
1335 .L_fixup_out:
1336         /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end)
              *
              * Same scheme as the avail_in computation: both branches store
              * 257 + end - out and continue at .L_done.
              */
1337         movl    end(%esp), end_r
1338         cmpl    out_r, end_r
1339         jbe     .L_end_is_smaller      /* if (out >= end) */
1340 
1341         subl    out_r, end_r           /* end -= out */
1342         addl    $257, end_r            /* end += 257 */
1343         movl    end_r, avail_out_strm(strm_r)
1344         jmp     .L_done
1345 .L_end_is_smaller:
1346         subl    end_r, out_r           /* out -= end */
1347         negl    out_r                  /* out = -out */
1348         addl    $257, out_r            /* out += 257 */
1349         movl    out_r, avail_out_strm(strm_r)
1350 
1351 #undef end_r
1352 #undef strm_r
1353 #undef state_r
1354 
1355 .L_done:
             /* Epilogue: release the local variable area, then restore the
              * flags and the %ebx, %ebp, %esi, %edi registers from the stack
              * (presumably pushed in the reverse order by the prologue, which
              * is outside this section) and return.
              */
1356         addl    $local_var_size, %esp
1357         popf
1358         popl    %ebx
1359         popl    %ebp
1360         popl    %esi
1361         popl    %edi
1362         ret
1363 
1364 #if defined( GAS_ELF )
1365 /* elf info: mark inflate_fast as a function symbol and record its size
      * (the distance from the inflate_fast label to this point) */
1366 .type inflate_fast,@function
1367 .size inflate_fast,.-inflate_fast
1368 #endif