1 /* gzappend -- command to append to a gzip file
   2 
   3   Copyright (C) 2003, 2012 Mark Adler, all rights reserved
   4   version 1.2, 11 Oct 2012
   5 
   6   This software is provided 'as-is', without any express or implied
   7   warranty.  In no event will the author be held liable for any damages
   8   arising from the use of this software.
   9 
  10   Permission is granted to anyone to use this software for any purpose,
  11   including commercial applications, and to alter it and redistribute it
  12   freely, subject to the following restrictions:
  13 
  14   1. The origin of this software must not be misrepresented; you must not
  15      claim that you wrote the original software. If you use this software
  16      in a product, an acknowledgment in the product documentation would be
  17      appreciated but is not required.
  18   2. Altered source versions must be plainly marked as such, and must not be
  19      misrepresented as being the original software.
  20   3. This notice may not be removed or altered from any source distribution.
  21 
  22   Mark Adler    madler@alumni.caltech.edu
  23  */
  24 
  25 /*
  26  * Change history:
  27  *
  28  * 1.0  19 Oct 2003     - First version
  29  * 1.1   4 Nov 2003     - Expand and clarify some comments and notes
  30  *                      - Add version and copyright to help
  31  *                      - Send help to stdout instead of stderr
  32  *                      - Add some preemptive typecasts
  33  *                      - Add L to constants in lseek() calls
  34  *                      - Remove some debugging information in error messages
  35  *                      - Use new data_type definition for zlib 1.2.1
   36  *                      - Simplify and unify file operations
  37  *                      - Finish off gzip file in gztack()
  38  *                      - Use deflatePrime() instead of adding empty blocks
  39  *                      - Keep gzip file clean on appended file read errors
  40  *                      - Use in-place rotate instead of auxiliary buffer
  41  *                        (Why you ask?  Because it was fun to write!)
  42  * 1.2  11 Oct 2012     - Fix for proper z_const usage
  43  *                      - Check for input buffer malloc failure
  44  */
  45 
  46 /*
  47    gzappend takes a gzip file and appends to it, compressing files from the
  48    command line or data from stdin.  The gzip file is written to directly, to
  49    avoid copying that file, in case it's large.  Note that this results in the
  50    unfriendly behavior that if gzappend fails, the gzip file is corrupted.
  51 
  52    This program was written to illustrate the use of the new Z_BLOCK option of
  53    zlib 1.2.x's inflate() function.  This option returns from inflate() at each
  54    block boundary to facilitate locating and modifying the last block bit at
  55    the start of the final deflate block.  Also whether using Z_BLOCK or not,
  56    another required feature of zlib 1.2.x is that inflate() now provides the
   57    number of unused bits in the last input byte used.  gzappend will not work
  58    with versions of zlib earlier than 1.2.1.
  59 
  60    gzappend first decompresses the gzip file internally, discarding all but
  61    the last 32K of uncompressed data, and noting the location of the last block
  62    bit and the number of unused bits in the last byte of the compressed data.
  63    The gzip trailer containing the CRC-32 and length of the uncompressed data
  64    is verified.  This trailer will be later overwritten.
  65 
  66    Then the last block bit is cleared by seeking back in the file and rewriting
  67    the byte that contains it.  Seeking forward, the last byte of the compressed
  68    data is saved along with the number of unused bits to initialize deflate.
  69 
  70    A deflate process is initialized, using the last 32K of the uncompressed
  71    data from the gzip file to initialize the dictionary.  If the total
  72    uncompressed data was less than 32K, then all of it is used to initialize
  73    the dictionary.  The deflate output bit buffer is also initialized with the
  74    last bits from the original deflate stream.  From here on, the data to
  75    append is simply compressed using deflate, and written to the gzip file.
  76    When that is complete, the new CRC-32 and uncompressed length are written
  77    as the trailer of the gzip file.
  78  */
  79 
  80 #include <stdio.h>
  81 #include <stdlib.h>
  82 #include <string.h>
  83 #include <fcntl.h>
  84 #include <unistd.h>
  85 #include "zlib.h"
  86 
/* "local" marks functions private to this file; it expands to static */
#define local static
/* log base 2 of the size in bytes of the I/O buffers */
#define LGCHUNK 14
/* size in bytes of the I/O buffers, i.e. 1 << LGCHUNK */
#define CHUNK (1U << LGCHUNK)
/* size in bytes of the buffer that holds the most recent uncompressed data,
   later used as the deflate dictionary (32K) */
#define DSIZE 32768U
  91 
  92 /* print an error message and terminate with extreme prejudice */
  93 local void bye(char *msg1, char *msg2)
  94 {
  95     fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
  96     exit(1);
  97 }
  98 
  99 /* return the greatest common divisor of a and b using Euclid's algorithm,
 100    modified to be fast when one argument much greater than the other, and
 101    coded to avoid unnecessary swapping */
 102 local unsigned gcd(unsigned a, unsigned b)
 103 {
 104     unsigned c;
 105 
 106     while (a && b)
 107         if (a > b) {
 108             c = b;
 109             while (a - c >= c)
 110                 c <<= 1;
 111             a -= c;
 112         }
 113         else {
 114             c = a;
 115             while (b - c >= c)
 116                 c <<= 1;
 117             b -= c;
 118         }
 119     return a + b;
 120 }
 121 
 122 /* rotate list[0..len-1] left by rot positions, in place */
 123 local void rotate(unsigned char *list, unsigned len, unsigned rot)
 124 {
 125     unsigned char tmp;
 126     unsigned cycles;
 127     unsigned char *start, *last, *to, *from;
 128 
 129     /* normalize rot and handle degenerate cases */
 130     if (len < 2) return;
 131     if (rot >= len) rot %= len;
 132     if (rot == 0) return;
 133 
 134     /* pointer to last entry in list */
 135     last = list + (len - 1);
 136 
 137     /* do simple left shift by one */
 138     if (rot == 1) {
 139         tmp = *list;
 140         memcpy(list, list + 1, len - 1);
 141         *last = tmp;
 142         return;
 143     }
 144 
 145     /* do simple right shift by one */
 146     if (rot == len - 1) {
 147         tmp = *last;
 148         memmove(list + 1, list, len - 1);
 149         *list = tmp;
 150         return;
 151     }
 152 
 153     /* otherwise do rotate as a set of cycles in place */
 154     cycles = gcd(len, rot);             /* number of cycles */
 155     do {
 156         start = from = list + cycles;   /* start index is arbitrary */
 157         tmp = *from;                    /* save entry to be overwritten */
 158         for (;;) {
 159             to = from;                  /* next step in cycle */
 160             from += rot;                /* go right rot positions */
 161             if (from > last) from -= len;   /* (pointer better not wrap) */
 162             if (from == start) break;   /* all but one shifted */
 163             *to = *from;                /* shift left */
 164         }
 165         *to = tmp;                      /* complete the circle */
 166     } while (--cycles);
 167 }
 168 
/* structure for gzip file read operations -- a file descriptor paired with a
   read buffer of 1 << size bytes; readin() refills the buffer and resets
   left and next, and the read1() macro and skip() consume from it */
typedef struct {
    int fd;                     /* file descriptor */
    int size;                   /* 1 << size is bytes in buf */
    unsigned left;              /* bytes available at next */
    unsigned char *buf;         /* buffer */
    z_const unsigned char *next;    /* next byte in buffer */
    char *name;                 /* file name for error messages */
} file;
 178 
 179 /* reload buffer */
 180 local int readin(file *in)
 181 {
 182     int len;
 183 
 184     len = read(in->fd, in->buf, 1 << in->size);
 185     if (len == -1) bye("error reading ", in->name);
 186     in->left = (unsigned)len;
 187     in->next = in->buf;
 188     return len;
 189 }
 190 
 191 /* read from file in, exit if end-of-file */
 192 local int readmore(file *in)
 193 {
 194     if (readin(in) == 0) bye("unexpected end of ", in->name);
 195     return 0;
 196 }
 197 
 198 #define read1(in) (in->left == 0 ? readmore(in) : 0, \
 199                    in->left--, *(in->next)++)
 200 
 201 /* skip over n bytes of in */
 202 local void skip(file *in, unsigned n)
 203 {
 204     unsigned bypass;
 205 
 206     if (n > in->left) {
 207         n -= in->left;
 208         bypass = n & ~((1U << in->size) - 1);
 209         if (bypass) {
 210             if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
 211                 bye("seeking ", in->name);
 212             n -= bypass;
 213         }
 214         readmore(in);
 215         if (n > in->left)
 216             bye("unexpected end of ", in->name);
 217     }
 218     in->left -= n;
 219     in->next += n;
 220 }
 221 
 222 /* read a four-byte unsigned integer, little-endian, from in */
 223 unsigned long read4(file *in)
 224 {
 225     unsigned long val;
 226 
 227     val = read1(in);
 228     val += (unsigned)read1(in) << 8;
 229     val += (unsigned long)read1(in) << 16;
 230     val += (unsigned long)read1(in) << 24;
 231     return val;
 232 }
 233 
 234 /* skip over gzip header */
 235 local void gzheader(file *in)
 236 {
 237     int flags;
 238     unsigned n;
 239 
 240     if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
 241     if (read1(in) != 8) bye("unknown compression method in", in->name);
 242     flags = read1(in);
 243     if (flags & 0xe0) bye("unknown header flags set in", in->name);
 244     skip(in, 6);
 245     if (flags & 4) {
 246         n = read1(in);
 247         n += (unsigned)(read1(in)) << 8;
 248         skip(in, n);
 249     }
 250     if (flags & 8) while (read1(in) != 0) ;
 251     if (flags & 16) while (read1(in) != 0) ;
 252     if (flags & 2) skip(in, 2);
 253 }
 254 
 255 /* decompress gzip file "name", return strm with a deflate stream ready to
 256    continue compression of the data in the gzip file, and return a file
 257    descriptor pointing to where to write the compressed data -- the deflate
 258    stream is initialized to compress using level "level" */
 259 local int gzscan(char *name, z_stream *strm, int level)
 260 {
 261     int ret, lastbit, left, full;
 262     unsigned have;
 263     unsigned long crc, tot;
 264     unsigned char *window;
 265     off_t lastoff, end;
 266     file gz;
 267 
 268     /* open gzip file */
 269     gz.name = name;
 270     gz.fd = open(name, O_RDWR, 0);
 271     if (gz.fd == -1) bye("cannot open ", name);
 272     gz.buf = malloc(CHUNK);
 273     if (gz.buf == NULL) bye("out of memory", "");
 274     gz.size = LGCHUNK;
 275     gz.left = 0;
 276 
 277     /* skip gzip header */
 278     gzheader(&gz);
 279 
 280     /* prepare to decompress */
 281     window = malloc(DSIZE);
 282     if (window == NULL) bye("out of memory", "");
 283     strm->zalloc = Z_NULL;
 284     strm->zfree = Z_NULL;
 285     strm->opaque = Z_NULL;
 286     ret = inflateInit2(strm, -15);
 287     if (ret != Z_OK) bye("out of memory", " or library mismatch");
 288 
 289     /* decompress the deflate stream, saving append information */
 290     lastbit = 0;
 291     lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
 292     left = 0;
 293     strm->avail_in = gz.left;
 294     strm->next_in = gz.next;
 295     crc = crc32(0L, Z_NULL, 0);
 296     have = full = 0;
 297     do {
 298         /* if needed, get more input */
 299         if (strm->avail_in == 0) {
 300             readmore(&gz);
 301             strm->avail_in = gz.left;
 302             strm->next_in = gz.next;
 303         }
 304 
 305         /* set up output to next available section of sliding window */
 306         strm->avail_out = DSIZE - have;
 307         strm->next_out = window + have;
 308 
 309         /* inflate and check for errors */
 310         ret = inflate(strm, Z_BLOCK);
 311         if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
 312         if (ret == Z_MEM_ERROR) bye("out of memory", "");
 313         if (ret == Z_DATA_ERROR)
 314             bye("invalid compressed data--format violated in", name);
 315 
 316         /* update crc and sliding window pointer */
 317         crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
 318         if (strm->avail_out)
 319             have = DSIZE - strm->avail_out;
 320         else {
 321             have = 0;
 322             full = 1;
 323         }
 324 
 325         /* process end of block */
 326         if (strm->data_type & 128) {
 327             if (strm->data_type & 64)
 328                 left = strm->data_type & 0x1f;
 329             else {
 330                 lastbit = strm->data_type & 0x1f;
 331                 lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
 332             }
 333         }
 334     } while (ret != Z_STREAM_END);
 335     inflateEnd(strm);
 336     gz.left = strm->avail_in;
 337     gz.next = strm->next_in;
 338 
 339     /* save the location of the end of the compressed data */
 340     end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
 341 
 342     /* check gzip trailer and save total for deflate */
 343     if (crc != read4(&gz))
 344         bye("invalid compressed data--crc mismatch in ", name);
 345     tot = strm->total_out;
 346     if ((tot & 0xffffffffUL) != read4(&gz))
 347         bye("invalid compressed data--length mismatch in", name);
 348 
 349     /* if not at end of file, warn */
 350     if (gz.left || readin(&gz))
 351         fprintf(stderr,
 352             "gzappend warning: junk at end of gzip file overwritten\n");
 353 
 354     /* clear last block bit */
 355     lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
 356     if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
 357     *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
 358     lseek(gz.fd, -1L, SEEK_CUR);
 359     if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
 360 
 361     /* if window wrapped, build dictionary from window by rotating */
 362     if (full) {
 363         rotate(window, DSIZE, have);
 364         have = DSIZE;
 365     }
 366 
 367     /* set up deflate stream with window, crc, total_in, and leftover bits */
 368     ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
 369     if (ret != Z_OK) bye("out of memory", "");
 370     deflateSetDictionary(strm, window, have);
 371     strm->adler = crc;
 372     strm->total_in = tot;
 373     if (left) {
 374         lseek(gz.fd, --end, SEEK_SET);
 375         if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
 376         deflatePrime(strm, 8 - left, *gz.buf);
 377     }
 378     lseek(gz.fd, end, SEEK_SET);
 379 
 380     /* clean up and return */
 381     free(window);
 382     free(gz.buf);
 383     return gz.fd;
 384 }
 385 
 386 /* append file "name" to gzip file gd using deflate stream strm -- if last
 387    is true, then finish off the deflate stream at the end */
 388 local void gztack(char *name, int gd, z_stream *strm, int last)
 389 {
 390     int fd, len, ret;
 391     unsigned left;
 392     unsigned char *in, *out;
 393 
 394     /* open file to compress and append */
 395     fd = 0;
 396     if (name != NULL) {
 397         fd = open(name, O_RDONLY, 0);
 398         if (fd == -1)
 399             fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
 400                     name);
 401     }
 402 
 403     /* allocate buffers */
 404     in = malloc(CHUNK);
 405     out = malloc(CHUNK);
 406     if (in == NULL || out == NULL) bye("out of memory", "");
 407 
 408     /* compress input file and append to gzip file */
 409     do {
 410         /* get more input */
 411         len = read(fd, in, CHUNK);
 412         if (len == -1) {
 413             fprintf(stderr,
 414                     "gzappend warning: error reading %s, skipping rest ...\n",
 415                     name);
 416             len = 0;
 417         }
 418         strm->avail_in = (unsigned)len;
 419         strm->next_in = in;
 420         if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
 421 
 422         /* compress and write all available output */
 423         do {
 424             strm->avail_out = CHUNK;
 425             strm->next_out = out;
 426             ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
 427             left = CHUNK - strm->avail_out;
 428             while (left) {
 429                 len = write(gd, out + CHUNK - strm->avail_out - left, left);
 430                 if (len == -1) bye("writing gzip file", "");
 431                 left -= (unsigned)len;
 432             }
 433         } while (strm->avail_out == 0 && ret != Z_STREAM_END);
 434     } while (len != 0);
 435 
 436     /* write trailer after last entry */
 437     if (last) {
 438         deflateEnd(strm);
 439         out[0] = (unsigned char)(strm->adler);
 440         out[1] = (unsigned char)(strm->adler >> 8);
 441         out[2] = (unsigned char)(strm->adler >> 16);
 442         out[3] = (unsigned char)(strm->adler >> 24);
 443         out[4] = (unsigned char)(strm->total_in);
 444         out[5] = (unsigned char)(strm->total_in >> 8);
 445         out[6] = (unsigned char)(strm->total_in >> 16);
 446         out[7] = (unsigned char)(strm->total_in >> 24);
 447         len = 8;
 448         do {
 449             ret = write(gd, out + 8 - len, len);
 450             if (ret == -1) bye("writing gzip file", "");
 451             len -= ret;
 452         } while (len);
 453         close(gd);
 454     }
 455 
 456     /* clean up and return */
 457     free(out);
 458     free(in);
 459     if (fd > 0) close(fd);
 460 }
 461 
 462 /* process the compression level option if present, scan the gzip file, and
 463    append the specified files, or append the data from stdin if no other file
 464    names are provided on the command line -- the gzip file must be writable
 465    and seekable */
 466 int main(int argc, char **argv)
 467 {
 468     int gd, level;
 469     z_stream strm;
 470 
 471     /* ignore command name */
 472     argc--; argv++;
 473 
 474     /* provide usage if no arguments */
 475     if (*argv == NULL) {
 476         printf(
 477             "gzappend 1.2 (11 Oct 2012) Copyright (C) 2003, 2012 Mark Adler\n"
 478                );
 479         printf(
 480             "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
 481         return 0;
 482     }
 483 
 484     /* set compression level */
 485     level = Z_DEFAULT_COMPRESSION;
 486     if (argv[0][0] == '-') {
 487         if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
 488             bye("invalid compression level", "");
 489         level = argv[0][1] - '0';
 490         if (*++argv == NULL) bye("no gzip file name after options", "");
 491     }
 492 
 493     /* prepare to append to gzip file */
 494     gd = gzscan(*argv++, &strm, level);
 495 
 496     /* append files on command line, or from stdin if none */
 497     if (*argv == NULL)
 498         gztack(NULL, gd, &strm, 1);
 499     else
 500         do {
 501             gztack(*argv, gd, &strm, argv[1] == NULL);
 502         } while (*++argv != NULL);
 503     return 0;
 504 }