1 /* zran.c -- example of zlib/gzip stream indexing and random access
   2  * Copyright (C) 2005, 2012 Mark Adler
   3  * For conditions of distribution and use, see copyright notice in zlib.h
   4    Version 1.1  29 Sep 2012  Mark Adler */
   5 
   6 /* Version History:
   7  1.0  29 May 2005  First version
   8  1.1  29 Sep 2012  Fix memory reallocation error
   9  */
  10 
  11 /* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
  12    for random access of a compressed file.  A file containing a zlib or gzip
  13    stream is provided on the command line.  The compressed stream is decoded in
  14    its entirety, and an index built with access points about every SPAN bytes
  15    in the uncompressed output.  The compressed file is left open, and can then
  16    be read randomly, having to decompress on the average SPAN/2 uncompressed
  17    bytes before getting to the desired block of data.
  18 
  19    An access point can be created at the start of any deflate block, by saving
  20    the starting file offset and bit of that block, and the 32K bytes of
  21    uncompressed data that precede that block.  Also the uncompressed offset of
  22    that block is saved to provide a referece for locating a desired starting
  23    point in the uncompressed stream.  build_index() works by decompressing the
  24    input zlib or gzip stream a block at a time, and at the end of each block
  25    deciding if enough uncompressed data has gone by to justify the creation of
  26    a new access point.  If so, that point is saved in a data structure that
  27    grows as needed to accommodate the points.
  28 
  29    To use the index, an offset in the uncompressed data is provided, for which
  30    the latest accees point at or preceding that offset is located in the index.
  31    The input file is positioned to the specified location in the index, and if
  32    necessary the first few bits of the compressed data is read from the file.
  33    inflate is initialized with those bits and the 32K of uncompressed data, and
  34    the decompression then proceeds until the desired offset in the file is
  35    reached.  Then the decompression continues to read the desired uncompressed
  36    data from the file.
  37 
  38    Another approach would be to generate the index on demand.  In that case,
  39    requests for random access reads from the compressed data would try to use
  40    the index, but if a read far enough past the end of the index is required,
  41    then further index entries would be generated and added.
  42 
  43    There is some fair bit of overhead to starting inflation for the random
  44    access, mainly copying the 32K byte dictionary.  So if small pieces of the
  45    file are being accessed, it would make sense to implement a cache to hold
  46    some lookahead and avoid many calls to extract() for small lengths.
  47 
  48    Another way to build an index would be to use inflateCopy().  That would
  49    not be constrained to have access points at block boundaries, but requires
  50    more memory per access point, and also cannot be saved to file due to the
  51    use of pointers in the state.  The approach here allows for storage of the
  52    index in a file.
  53  */
  54 
  55 #include <stdio.h>
  56 #include <stdlib.h>
  57 #include <string.h>
  58 #include "zlib.h"
  59 
  60 #define local static
  61 
  62 #define SPAN 1048576L       /* desired distance between access points */
  63 #define WINSIZE 32768U      /* sliding window size */
  64 #define CHUNK 16384         /* file input buffer size */
  65 
  66 /* access point entry */
  67 struct point {
  68     off_t out;          /* corresponding offset in uncompressed data */
  69     off_t in;           /* offset in input file of first full byte */
  70     int bits;           /* number of bits (1-7) from byte at in - 1, or 0 */
  71     unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
  72 };
  73 
  74 /* access point list */
  75 struct access {
  76     int have;           /* number of list entries filled in */
  77     int size;           /* number of list entries allocated */
  78     struct point *list; /* allocated list */
  79 };
  80 
  81 /* Deallocate an index built by build_index() */
  82 local void free_index(struct access *index)
  83 {
  84     if (index != NULL) {
  85         free(index->list);
  86         free(index);
  87     }
  88 }
  89 
  90 /* Add an entry to the access point list.  If out of memory, deallocate the
  91    existing list and return NULL. */
  92 local struct access *addpoint(struct access *index, int bits,
  93     off_t in, off_t out, unsigned left, unsigned char *window)
  94 {
  95     struct point *next;
  96 
  97     /* if list is empty, create it (start with eight points) */
  98     if (index == NULL) {
  99         index = malloc(sizeof(struct access));
 100         if (index == NULL) return NULL;
 101         index->list = malloc(sizeof(struct point) << 3);
 102         if (index->list == NULL) {
 103             free(index);
 104             return NULL;
 105         }
 106         index->size = 8;
 107         index->have = 0;
 108     }
 109 
 110     /* if list is full, make it bigger */
 111     else if (index->have == index->size) {
 112         index->size <<= 1;
 113         next = realloc(index->list, sizeof(struct point) * index->size);
 114         if (next == NULL) {
 115             free_index(index);
 116             return NULL;
 117         }
 118         index->list = next;
 119     }
 120 
 121     /* fill in entry and increment how many we have */
 122     next = index->list + index->have;
 123     next->bits = bits;
 124     next->in = in;
 125     next->out = out;
 126     if (left)
 127         memcpy(next->window, window + WINSIZE - left, left);
 128     if (left < WINSIZE)
 129         memcpy(next->window + left, window, WINSIZE - left);
 130     index->have++;
 131 
 132     /* return list, possibly reallocated */
 133     return index;
 134 }
 135 
 136 /* Make one entire pass through the compressed stream and build an index, with
 137    access points about every span bytes of uncompressed output -- span is
 138    chosen to balance the speed of random access against the memory requirements
 139    of the list, about 32K bytes per access point.  Note that data after the end
 140    of the first zlib or gzip stream in the file is ignored.  build_index()
 141    returns the number of access points on success (>= 1), Z_MEM_ERROR for out
 142    of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
 143    file read error.  On success, *built points to the resulting index. */
 144 local int build_index(FILE *in, off_t span, struct access **built)
 145 {
 146     int ret;
 147     off_t totin, totout;        /* our own total counters to avoid 4GB limit */
 148     off_t last;                 /* totout value of last access point */
 149     struct access *index;       /* access points being generated */
 150     z_stream strm;
 151     unsigned char input[CHUNK];
 152     unsigned char window[WINSIZE];
 153 
 154     /* initialize inflate */
 155     strm.zalloc = Z_NULL;
 156     strm.zfree = Z_NULL;
 157     strm.opaque = Z_NULL;
 158     strm.avail_in = 0;
 159     strm.next_in = Z_NULL;
 160     ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
 161     if (ret != Z_OK)
 162         return ret;
 163 
 164     /* inflate the input, maintain a sliding window, and build an index -- this
 165        also validates the integrity of the compressed data using the check
 166        information at the end of the gzip or zlib stream */
 167     totin = totout = last = 0;
 168     index = NULL;               /* will be allocated by first addpoint() */
 169     strm.avail_out = 0;
 170     do {
 171         /* get some compressed data from input file */
 172         strm.avail_in = fread(input, 1, CHUNK, in);
 173         if (ferror(in)) {
 174             ret = Z_ERRNO;
 175             goto build_index_error;
 176         }
 177         if (strm.avail_in == 0) {
 178             ret = Z_DATA_ERROR;
 179             goto build_index_error;
 180         }
 181         strm.next_in = input;
 182 
 183         /* process all of that, or until end of stream */
 184         do {
 185             /* reset sliding window if necessary */
 186             if (strm.avail_out == 0) {
 187                 strm.avail_out = WINSIZE;
 188                 strm.next_out = window;
 189             }
 190 
 191             /* inflate until out of input, output, or at end of block --
 192                update the total input and output counters */
 193             totin += strm.avail_in;
 194             totout += strm.avail_out;
 195             ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
 196             totin -= strm.avail_in;
 197             totout -= strm.avail_out;
 198             if (ret == Z_NEED_DICT)
 199                 ret = Z_DATA_ERROR;
 200             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
 201                 goto build_index_error;
 202             if (ret == Z_STREAM_END)
 203                 break;
 204 
 205             /* if at end of block, consider adding an index entry (note that if
 206                data_type indicates an end-of-block, then all of the
 207                uncompressed data from that block has been delivered, and none
 208                of the compressed data after that block has been consumed,
 209                except for up to seven bits) -- the totout == 0 provides an
 210                entry point after the zlib or gzip header, and assures that the
 211                index always has at least one access point; we avoid creating an
 212                access point after the last block by checking bit 6 of data_type
 213              */
 214             if ((strm.data_type & 128) && !(strm.data_type & 64) &&
 215                 (totout == 0 || totout - last > span)) {
 216                 index = addpoint(index, strm.data_type & 7, totin,
 217                                  totout, strm.avail_out, window);
 218                 if (index == NULL) {
 219                     ret = Z_MEM_ERROR;
 220                     goto build_index_error;
 221                 }
 222                 last = totout;
 223             }
 224         } while (strm.avail_in != 0);
 225     } while (ret != Z_STREAM_END);
 226 
 227     /* clean up and return index (release unused entries in list) */
 228     (void)inflateEnd(&strm);
 229     index->list = realloc(index->list, sizeof(struct point) * index->have);
 230     index->size = index->have;
 231     *built = index;
 232     return index->size;
 233 
 234     /* return error */
 235   build_index_error:
 236     (void)inflateEnd(&strm);
 237     if (index != NULL)
 238         free_index(index);
 239     return ret;
 240 }
 241 
 242 /* Use the index to read len bytes from offset into buf, return bytes read or
 243    negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
 244    the end of the uncompressed data, then extract() will return a value less
 245    than len, indicating how much as actually read into buf.  This function
 246    should not return a data error unless the file was modified since the index
 247    was generated.  extract() may also return Z_ERRNO if there is an error on
 248    reading or seeking the input file. */
 249 local int extract(FILE *in, struct access *index, off_t offset,
 250                   unsigned char *buf, int len)
 251 {
 252     int ret, skip;
 253     z_stream strm;
 254     struct point *here;
 255     unsigned char input[CHUNK];
 256     unsigned char discard[WINSIZE];
 257 
 258     /* proceed only if something reasonable to do */
 259     if (len < 0)
 260         return 0;
 261 
 262     /* find where in stream to start */
 263     here = index->list;
 264     ret = index->have;
 265     while (--ret && here[1].out <= offset)
 266         here++;
 267 
 268     /* initialize file and inflate state to start there */
 269     strm.zalloc = Z_NULL;
 270     strm.zfree = Z_NULL;
 271     strm.opaque = Z_NULL;
 272     strm.avail_in = 0;
 273     strm.next_in = Z_NULL;
 274     ret = inflateInit2(&strm, -15);         /* raw inflate */
 275     if (ret != Z_OK)
 276         return ret;
 277     ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
 278     if (ret == -1)
 279         goto extract_ret;
 280     if (here->bits) {
 281         ret = getc(in);
 282         if (ret == -1) {
 283             ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
 284             goto extract_ret;
 285         }
 286         (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
 287     }
 288     (void)inflateSetDictionary(&strm, here->window, WINSIZE);
 289 
 290     /* skip uncompressed bytes until offset reached, then satisfy request */
 291     offset -= here->out;
 292     strm.avail_in = 0;
 293     skip = 1;                               /* while skipping to offset */
 294     do {
 295         /* define where to put uncompressed data, and how much */
 296         if (offset == 0 && skip) {          /* at offset now */
 297             strm.avail_out = len;
 298             strm.next_out = buf;
 299             skip = 0;                       /* only do this once */
 300         }
 301         if (offset > WINSIZE) {             /* skip WINSIZE bytes */
 302             strm.avail_out = WINSIZE;
 303             strm.next_out = discard;
 304             offset -= WINSIZE;
 305         }
 306         else if (offset != 0) {             /* last skip */
 307             strm.avail_out = (unsigned)offset;
 308             strm.next_out = discard;
 309             offset = 0;
 310         }
 311 
 312         /* uncompress until avail_out filled, or end of stream */
 313         do {
 314             if (strm.avail_in == 0) {
 315                 strm.avail_in = fread(input, 1, CHUNK, in);
 316                 if (ferror(in)) {
 317                     ret = Z_ERRNO;
 318                     goto extract_ret;
 319                 }
 320                 if (strm.avail_in == 0) {
 321                     ret = Z_DATA_ERROR;
 322                     goto extract_ret;
 323                 }
 324                 strm.next_in = input;
 325             }
 326             ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
 327             if (ret == Z_NEED_DICT)
 328                 ret = Z_DATA_ERROR;
 329             if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
 330                 goto extract_ret;
 331             if (ret == Z_STREAM_END)
 332                 break;
 333         } while (strm.avail_out != 0);
 334 
 335         /* if reach end of stream, then don't keep trying to get more */
 336         if (ret == Z_STREAM_END)
 337             break;
 338 
 339         /* do until offset reached and requested data read, or stream ends */
 340     } while (skip);
 341 
 342     /* compute number of uncompressed bytes read after offset */
 343     ret = skip ? 0 : len - strm.avail_out;
 344 
 345     /* clean up and return bytes read or error */
 346   extract_ret:
 347     (void)inflateEnd(&strm);
 348     return ret;
 349 }
 350 
 351 /* Demonstrate the use of build_index() and extract() by processing the file
 352    provided on the command line, and the extracting 16K from about 2/3rds of
 353    the way through the uncompressed output, and writing that to stdout. */
 354 int main(int argc, char **argv)
 355 {
 356     int len;
 357     off_t offset;
 358     FILE *in;
 359     struct access *index = NULL;
 360     unsigned char buf[CHUNK];
 361 
 362     /* open input file */
 363     if (argc != 2) {
 364         fprintf(stderr, "usage: zran file.gz\n");
 365         return 1;
 366     }
 367     in = fopen(argv[1], "rb");
 368     if (in == NULL) {
 369         fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
 370         return 1;
 371     }
 372 
 373     /* build index */
 374     len = build_index(in, SPAN, &index);
 375     if (len < 0) {
 376         fclose(in);
 377         switch (len) {
 378         case Z_MEM_ERROR:
 379             fprintf(stderr, "zran: out of memory\n");
 380             break;
 381         case Z_DATA_ERROR:
 382             fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
 383             break;
 384         case Z_ERRNO:
 385             fprintf(stderr, "zran: read error on %s\n", argv[1]);
 386             break;
 387         default:
 388             fprintf(stderr, "zran: error %d while building index\n", len);
 389         }
 390         return 1;
 391     }
 392     fprintf(stderr, "zran: built index with %d access points\n", len);
 393 
 394     /* use index by reading some bytes from an arbitrary offset */
 395     offset = (index->list[index->have - 1].out << 1) / 3;
 396     len = extract(in, index, offset, buf, CHUNK);
 397     if (len < 0)
 398         fprintf(stderr, "zran: extraction failed: %s error\n",
 399                 len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
 400     else {
 401         fwrite(buf, 1, len, stdout);
 402         fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
 403     }
 404 
 405     /* clean up and exit */
 406     free_index(index);
 407     fclose(in);
 408     return 0;
 409 }