1 /* gzjoin -- command to join gzip files into one gzip file
   2 
   3   Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved
   4   version 1.2, 14 Aug 2012
   5 
   6   This software is provided 'as-is', without any express or implied
   7   warranty.  In no event will the author be held liable for any damages
   8   arising from the use of this software.
   9 
  10   Permission is granted to anyone to use this software for any purpose,
  11   including commercial applications, and to alter it and redistribute it
  12   freely, subject to the following restrictions:
  13 
  14   1. The origin of this software must not be misrepresented; you must not
  15      claim that you wrote the original software. If you use this software
  16      in a product, an acknowledgment in the product documentation would be
  17      appreciated but is not required.
  18   2. Altered source versions must be plainly marked as such, and must not be
  19      misrepresented as being the original software.
  20   3. This notice may not be removed or altered from any source distribution.
  21 
  22   Mark Adler    madler@alumni.caltech.edu
  23  */
  24 
  25 /*
  26  * Change history:
  27  *
  28  * 1.0  11 Dec 2004     - First version
  29  * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
  30  * 1.2  14 Aug 2012     - Clean up for z_const usage
  31  */
  32 
  33 /*
  34    gzjoin takes one or more gzip files on the command line and writes out a
  35    single gzip file that will uncompress to the concatenation of the
  36    uncompressed data from the individual gzip files.  gzjoin does this without
  37    having to recompress any of the data and without having to calculate a new
  38    crc32 for the concatenated uncompressed data.  gzjoin does however have to
  39    decompress all of the input data in order to find the bits in the compressed
  40    data that need to be modified to concatenate the streams.
  41 
  42    gzjoin does not do an integrity check on the input gzip files other than
  43    checking the gzip header and decompressing the compressed data.  They are
  44    otherwise assumed to be complete and correct.
  45 
  46    Each joint between gzip files removes at least 18 bytes of previous trailer
  47    and subsequent header, and inserts an average of about three bytes to the
  48    compressed data in order to connect the streams.  The output gzip file
  49    has a minimal ten-byte gzip header with no file name or modification time.
  50 
  51    This program was written to illustrate the use of the Z_BLOCK option of
  52    inflate() and the crc32_combine() function.  gzjoin will not compile with
  53    versions of zlib earlier than 1.2.3.
  54  */
  55 
  56 #include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
  57 #include <stdlib.h>     /* exit(), malloc(), free() */
  58 #include <fcntl.h>      /* open() */
  59 #include <unistd.h>     /* close(), read(), lseek() */
  60 #include "zlib.h"
  61     /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
  62 
/* functions private to this file are declared "local", i.e. static */
#define local static
  64 
  65 /* exit with an error (return a value to allow use in an expression) */
  66 local int bail(char *why1, char *why2)
  67 {
  68     fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2);
  69     exit(1);
  70     return 0;
  71 }
  72 
  73 /* -- simple buffered file input with access to the buffer -- */
  74 
/* size in bytes of each input buffer and of the buffer for discarded
   uncompressed data; the power-of-two requirement comes from bskip(), which
   computes a remainder with skip & (CHUNK - 1) */
#define CHUNK 32768         /* must be a power of two and fit in unsigned */
  76 
/* bin buffered input file type: the state of one input file that is read
   through a single buffer of CHUNK bytes, with the unread part of the buffer
   exposed to callers through next and left */
typedef struct {
    char *name;             /* name of file for error messages (the pointer
                               given to bopen() is stored, not a copy) */
    int fd;                 /* file descriptor */
    unsigned left;          /* bytes remaining at next */
    unsigned char *next;    /* next byte to read */
    unsigned char *buf;     /* allocated buffer of length CHUNK */
} bin;
  85 
  86 /* close a buffered file and free allocated memory */
  87 local void bclose(bin *in)
  88 {
  89     if (in != NULL) {
  90         if (in->fd != -1)
  91             close(in->fd);
  92         if (in->buf != NULL)
  93             free(in->buf);
  94         free(in);
  95     }
  96 }
  97 
  98 /* open a buffered file for input, return a pointer to type bin, or NULL on
  99    failure */
 100 local bin *bopen(char *name)
 101 {
 102     bin *in;
 103 
 104     in = malloc(sizeof(bin));
 105     if (in == NULL)
 106         return NULL;
 107     in->buf = malloc(CHUNK);
 108     in->fd = open(name, O_RDONLY, 0);
 109     if (in->buf == NULL || in->fd == -1) {
 110         bclose(in);
 111         return NULL;
 112     }
 113     in->left = 0;
 114     in->next = in->buf;
 115     in->name = name;
 116     return in;
 117 }
 118 
 119 /* load buffer from file, return -1 on read error, 0 or 1 on success, with
 120    1 indicating that end-of-file was reached */
 121 local int bload(bin *in)
 122 {
 123     long len;
 124 
 125     if (in == NULL)
 126         return -1;
 127     if (in->left != 0)
 128         return 0;
 129     in->next = in->buf;
 130     do {
 131         len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
 132         if (len < 0)
 133             return -1;
 134         in->left += (unsigned)len;
 135     } while (len != 0 && in->left < CHUNK);
 136     return len == 0 ? 1 : 0;
 137 }
 138 
 139 /* get a byte from the file, bail if end of file */
 140 #define bget(in) (in->left ? 0 : bload(in), \
 141                   in->left ? (in->left--, *(in->next)++) : \
 142                     bail("unexpected end of file on ", in->name))
 143 
 144 /* get a four-byte little-endian unsigned integer from file */
 145 local unsigned long bget4(bin *in)
 146 {
 147     unsigned long val;
 148 
 149     val = bget(in);
 150     val += (unsigned long)(bget(in)) << 8;
 151     val += (unsigned long)(bget(in)) << 16;
 152     val += (unsigned long)(bget(in)) << 24;
 153     return val;
 154 }
 155 
 156 /* skip bytes in file */
 157 local void bskip(bin *in, unsigned skip)
 158 {
 159     /* check pointer */
 160     if (in == NULL)
 161         return;
 162 
 163     /* easy case -- skip bytes in buffer */
 164     if (skip <= in->left) {
 165         in->left -= skip;
 166         in->next += skip;
 167         return;
 168     }
 169 
 170     /* skip what's in buffer, discard buffer contents */
 171     skip -= in->left;
 172     in->left = 0;
 173 
 174     /* seek past multiples of CHUNK bytes */
 175     if (skip > CHUNK) {
 176         unsigned left;
 177 
 178         left = skip & (CHUNK - 1);
 179         if (left == 0) {
 180             /* exact number of chunks: seek all the way minus one byte to check
 181                for end-of-file with a read */
 182             lseek(in->fd, skip - 1, SEEK_CUR);
 183             if (read(in->fd, in->buf, 1) != 1)
 184                 bail("unexpected end of file on ", in->name);
 185             return;
 186         }
 187 
 188         /* skip the integral chunks, update skip with remainder */
 189         lseek(in->fd, skip - left, SEEK_CUR);
 190         skip = left;
 191     }
 192 
 193     /* read more input and skip remainder */
 194     bload(in);
 195     if (skip > in->left)
 196         bail("unexpected end of file on ", in->name);
 197     in->left -= skip;
 198     in->next += skip;
 199 }
 200 
 201 /* -- end of buffered input functions -- */
 202 
 203 /* skip the gzip header from file in */
 204 local void gzhead(bin *in)
 205 {
 206     int flags;
 207 
 208     /* verify gzip magic header and compression method */
 209     if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
 210         bail(in->name, " is not a valid gzip file");
 211 
 212     /* get and verify flags */
 213     flags = bget(in);
 214     if ((flags & 0xe0) != 0)
 215         bail("unknown reserved bits set in ", in->name);
 216 
 217     /* skip modification time, extra flags, and os */
 218     bskip(in, 6);
 219 
 220     /* skip extra field if present */
 221     if (flags & 4) {
 222         unsigned len;
 223 
 224         len = bget(in);
 225         len += (unsigned)(bget(in)) << 8;
 226         bskip(in, len);
 227     }
 228 
 229     /* skip file name if present */
 230     if (flags & 8)
 231         while (bget(in) != 0)
 232             ;
 233 
 234     /* skip comment if present */
 235     if (flags & 16)
 236         while (bget(in) != 0)
 237             ;
 238 
 239     /* skip header crc if present */
 240     if (flags & 2)
 241         bskip(in, 2);
 242 }
 243 
 244 /* write a four-byte little-endian unsigned integer to out */
 245 local void put4(unsigned long val, FILE *out)
 246 {
 247     putc(val & 0xff, out);
 248     putc((val >> 8) & 0xff, out);
 249     putc((val >> 16) & 0xff, out);
 250     putc((val >> 24) & 0xff, out);
 251 }
 252 
 253 /* Load up zlib stream from buffered input, bail if end of file */
 254 local void zpull(z_streamp strm, bin *in)
 255 {
 256     if (in->left == 0)
 257         bload(in);
 258     if (in->left == 0)
 259         bail("unexpected end of file on ", in->name);
 260     strm->avail_in = in->left;
 261     strm->next_in = in->next;
 262 }
 263 
 264 /* Write header for gzip file to out and initialize trailer. */
 265 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
 266 {
 267     fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
 268     *crc = crc32(0L, Z_NULL, 0);
 269     *tot = 0;
 270 }
 271 
/* Copy the compressed data from name, zeroing the last block bit of the last
   block if clr is true, and adding empty blocks as needed to get to a byte
   boundary.  If clr is false, then the last block becomes the last block of
   the output, and the gzip trailer is written.  crc and tot maintain the
   crc and length (modulo 2^32) of the output for the trailer.  The resulting
   gzip file is written to out.  gzinit() must be called before the first call
   of gzcopy() to write the gzip header and to initialize crc and tot.

   Any failure on the input side (file cannot be opened, bad gzip header,
   premature end of file, out of memory, invalid compressed data) ends the
   program through bail().  Errors writing to out are not checked here. */
local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
                  FILE *out)
{
    int ret;                /* return value from zlib functions */
    int pos;                /* where the "last block" bit is in byte */
    int last;               /* true if processing the last block */
    bin *in;                /* buffered input file */
    unsigned char *start;   /* start of compressed data in buffer */
    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
    z_stream strm;          /* zlib inflate stream */

    /* open gzip file and skip header */
    in = bopen(name);
    if (in == NULL)
        bail("could not open ", name);
    gzhead(in);

    /* allocate buffer for uncompressed data and initialize raw inflate
       stream */
    junk = malloc(CHUNK);
    strm.zalloc = Z_NULL;
    strm.zfree = Z_NULL;
    strm.opaque = Z_NULL;
    strm.avail_in = 0;
    strm.next_in = Z_NULL;
    ret = inflateInit2(&strm, -15);
    if (junk == NULL || ret != Z_OK)
        bail("out of memory", "");

    /* inflate and copy compressed data, clear last-block bit if requested --
       the last-block bit of the first block is the low bit of the first byte
       of compressed data, so it is examined (and cleared) here before any
       inflating is done */
    len = 0;
    zpull(&strm, in);
    start = in->next;
    last = start[0] & 1;
    if (last && clr)
        start[0] &= ~1;
    /* avail_out of zero keeps the write-and-refill test below from firing on
       the first pass, when nothing has been consumed yet */
    strm.avail_out = 0;
    for (;;) {
        /* if input used and output done, write used input and get more */
        if (strm.avail_in == 0 && strm.avail_out != 0) {
            fwrite(start, 1, strm.next_in - start, out);
            start = in->buf;
            in->left = 0;
            zpull(&strm, in);
        }

        /* decompress -- return early when end-of-block reached */
        strm.avail_out = CHUNK;
        strm.next_out = junk;
        ret = inflate(&strm, Z_BLOCK);
        /* bail() does not return, so the cases need no break */
        switch (ret) {
        case Z_MEM_ERROR:
            bail("out of memory", "");
        case Z_DATA_ERROR:
            bail("invalid compressed data in ", in->name);
        }

        /* update length of uncompressed data */
        len += CHUNK - strm.avail_out;

        /* check for block boundary (only get this when block copied out) */
        if (strm.data_type & 128) {
            /* if that was the last block, then done */
            if (last)
                break;

            /* number of unused bits in last byte */
            pos = strm.data_type & 7;

            /* find the next last-block bit */
            if (pos != 0) {
                /* next last-block bit is in last used byte */
                pos = 0x100 >> pos;
                last = strm.next_in[-1] & pos;
                /* the byte is modified through in->buf rather than through
                   strm.next_in -- presumably because next_in may be
                   const-qualified (see the 1.2 z_const entry in the change
                   history) */
                if (last && clr)
                    in->buf[strm.next_in - in->buf - 1] &= ~pos;
            }
            else {
                /* next last-block bit is in next unused byte */
                if (strm.avail_in == 0) {
                    /* don't have that byte yet -- get it */
                    fwrite(start, 1, strm.next_in - start, out);
                    start = in->buf;
                    in->left = 0;
                    zpull(&strm, in);
                }
                last = strm.next_in[0] & 1;
                if (last && clr)
                    in->buf[strm.next_in - in->buf] &= ~1;
            }
        }
    }

    /* update buffer with unused input */
    in->left = strm.avail_in;
    in->next = in->buf + (strm.next_in - in->buf);

    /* copy used input, write empty blocks to get to byte boundary -- the last
       used byte is held back from the fwrite() since it may need to be
       modified before it is written */
    pos = strm.data_type & 7;
    fwrite(start, 1, in->next - start - 1, out);
    last = in->next[-1];
    if (pos == 0 || !clr)
        /* already at byte boundary, or last file: write last byte */
        putc(last, out);
    else {
        /* append empty blocks to last byte */
        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
        if (pos & 1) {
            /* odd -- append an empty stored block */
            putc(last, out);
            if (pos == 1)
                putc(0, out);               /* two more bits in block header */
            fwrite("\0\0\xff\xff", 1, 4, out);
        }
        else {
            /* even -- append 1, 2, or 3 empty fixed blocks; each case
               deliberately falls through to the next one */
            switch (pos) {
            case 6:
                putc(last | 8, out);
                last = 0;
                /* fall through */
            case 4:
                putc(last | 0x20, out);
                last = 0;
                /* fall through */
            case 2:
                putc(last | 0x80, out);
                putc(0, out);
            }
        }
    }

    /* update crc and tot -- the four bytes that follow the compressed data in
       the input are taken as the crc of this file's uncompressed data; the
       length used is the one counted while inflating */
    *crc = crc32_combine(*crc, bget4(in), len);
    *tot += (unsigned long)len;

    /* clean up */
    inflateEnd(&strm);
    free(junk);
    bclose(in);

    /* write trailer if this is the last gzip file */
    if (!clr) {
        put4(*crc, out);
        put4(*tot, out);
    }
}
 425 
 426 /* join the gzip files on the command line, write result to stdout */
 427 int main(int argc, char **argv)
 428 {
 429     unsigned long crc, tot;     /* running crc and total uncompressed length */
 430 
 431     /* skip command name */
 432     argc--;
 433     argv++;
 434 
 435     /* show usage if no arguments */
 436     if (argc == 0) {
 437         fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
 438               stderr);
 439         return 0;
 440     }
 441 
 442     /* join gzip files on command line and write to stdout */
 443     gzinit(&crc, &tot, stdout);
 444     while (argc--)
 445         gzcopy(*argv++, argc, &crc, &tot, stdout);
 446 
 447     /* done */
 448     return 0;
 449 }