1 /* gzjoin -- command to join gzip files into one gzip file 2 3 Copyright (C) 2004, 2005, 2012 Mark Adler, all rights reserved 4 version 1.2, 14 Aug 2012 5 6 This software is provided 'as-is', without any express or implied 7 warranty. In no event will the author be held liable for any damages 8 arising from the use of this software. 9 10 Permission is granted to anyone to use this software for any purpose, 11 including commercial applications, and to alter it and redistribute it 12 freely, subject to the following restrictions: 13 14 1. The origin of this software must not be misrepresented; you must not 15 claim that you wrote the original software. If you use this software 16 in a product, an acknowledgment in the product documentation would be 17 appreciated but is not required. 18 2. Altered source versions must be plainly marked as such, and must not be 19 misrepresented as being the original software. 20 3. This notice may not be removed or altered from any source distribution. 21 22 Mark Adler madler@alumni.caltech.edu 23 */ 24 25 /* 26 * Change history: 27 * 28 * 1.0 11 Dec 2004 - First version 29 * 1.1 12 Jun 2005 - Changed ssize_t to long for portability 30 * 1.2 14 Aug 2012 - Clean up for z_const usage 31 */ 32 33 /* 34 gzjoin takes one or more gzip files on the command line and writes out a 35 single gzip file that will uncompress to the concatenation of the 36 uncompressed data from the individual gzip files. gzjoin does this without 37 having to recompress any of the data and without having to calculate a new 38 crc32 for the concatenated uncompressed data. gzjoin does however have to 39 decompress all of the input data in order to find the bits in the compressed 40 data that need to be modified to concatenate the streams. 41 42 gzjoin does not do an integrity check on the input gzip files other than 43 checking the gzip header and decompressing the compressed data. They are 44 otherwise assumed to be complete and correct. 45 46 Each joint between gzip files removes at least 18 bytes of previous trailer 47 and subsequent header, and inserts an average of about three bytes to the 48 compressed data in order to connect the streams. The output gzip file 49 has a minimal ten-byte gzip header with no file name or modification time. 50 51 This program was written to illustrate the use of the Z_BLOCK option of 52 inflate() and the crc32_combine() function. gzjoin will not compile with 53 versions of zlib earlier than 1.2.3. 54 */ 55 56 #include <stdio.h> /* fputs(), fprintf(), fwrite(), putc() */ 57 #include <stdlib.h> /* exit(), malloc(), free() */ 58 #include <fcntl.h> /* open() */ 59 #include <unistd.h> /* close(), read(), lseek() */ 60 #include "zlib.h" 61 /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */ 62 63 #define local static 64 65 /* exit with an error (return a value to allow use in an expression) */ 66 local int bail(char *why1, char *why2) 67 { 68 fprintf(stderr, "gzjoin error: %s%s, output incomplete\n", why1, why2); 69 exit(1); 70 return 0; 71 } 72 73 /* -- simple buffered file input with access to the buffer -- */ 74 75 #define CHUNK 32768 /* must be a power of two and fit in unsigned */ 76 77 /* bin buffered input file type */ 78 typedef struct { 79 char *name; /* name of file for error messages */ 80 int fd; /* file descriptor */ 81 unsigned left; /* bytes remaining at next */ 82 unsigned char *next; /* next byte to read */ 83 unsigned char *buf; /* allocated buffer of length CHUNK */ 84 } bin; 85 86 /* close a buffered file and free allocated memory */ 87 local void bclose(bin *in) 88 { 89 if (in != NULL) { 90 if (in->fd != -1) 91 close(in->fd); 92 if (in->buf != NULL) 93 free(in->buf); 94 free(in); 95 } 96 } 97 98 /* open a buffered file for input, return a pointer to type bin, or NULL on 99 failure */ 100 local bin *bopen(char *name) 101 { 102 bin *in; 103 104 in = malloc(sizeof(bin)); 105 if (in == NULL) 106 return NULL; 107 in->buf = malloc(CHUNK); 108 in->fd = open(name, O_RDONLY, 0); 109 if (in->buf == NULL || in->fd == -1) { 110 bclose(in); 111 return NULL; 112 } 113 in->left = 0; 114 in->next = in->buf; 115 in->name = name; 116 return in; 117 } 118 119 /* load buffer from file, return -1 on read error, 0 or 1 on success, with 120 1 indicating that end-of-file was reached */ 121 local int bload(bin *in) 122 { 123 long len; 124 125 if (in == NULL) 126 return -1; 127 if (in->left != 0) 128 return 0; 129 in->next = in->buf; 130 do { 131 len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left); 132 if (len < 0) 133 return -1; 134 in->left += (unsigned)len; 135 } while (len != 0 && in->left < CHUNK); 136 return len == 0 ? 1 : 0; 137 } 138 139 /* get a byte from the file, bail if end of file */ 140 #define bget(in) (in->left ? 0 : bload(in), \ 141 in->left ? (in->left--, *(in->next)++) : \ 142 bail("unexpected end of file on ", in->name)) 143 144 /* get a four-byte little-endian unsigned integer from file */ 145 local unsigned long bget4(bin *in) 146 { 147 unsigned long val; 148 149 val = bget(in); 150 val += (unsigned long)(bget(in)) << 8; 151 val += (unsigned long)(bget(in)) << 16; 152 val += (unsigned long)(bget(in)) << 24; 153 return val; 154 } 155 156 /* skip bytes in file */ 157 local void bskip(bin *in, unsigned skip) 158 { 159 /* check pointer */ 160 if (in == NULL) 161 return; 162 163 /* easy case -- skip bytes in buffer */ 164 if (skip <= in->left) { 165 in->left -= skip; 166 in->next += skip; 167 return; 168 } 169 170 /* skip what's in buffer, discard buffer contents */ 171 skip -= in->left; 172 in->left = 0; 173 174 /* seek past multiples of CHUNK bytes */ 175 if (skip > CHUNK) { 176 unsigned left; 177 178 left = skip & (CHUNK - 1); 179 if (left == 0) { 180 /* exact number of chunks: seek all the way minus one byte to check 181 for end-of-file with a read */ 182 lseek(in->fd, skip - 1, SEEK_CUR); 183 if (read(in->fd, in->buf, 1) != 1) 184 bail("unexpected end of file on ", in->name); 185 return; 186 } 187 188 /* skip the integral chunks, update skip with remainder */ 189 lseek(in->fd, skip - left, SEEK_CUR); 190 skip = left; 191 } 192 193 /* read more input and skip remainder */ 194 bload(in); 195 if (skip > in->left) 196 bail("unexpected end of file on ", in->name); 197 in->left -= skip; 198 in->next += skip; 199 } 200 201 /* -- end of buffered input functions -- */ 202 203 /* skip the gzip header from file in */ 204 local void gzhead(bin *in) 205 { 206 int flags; 207 208 /* verify gzip magic header and compression method */ 209 if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8) 210 bail(in->name, " is not a valid gzip file"); 211 212 /* get and verify flags */ 213 flags = bget(in); 214 if ((flags & 0xe0) != 0) 215 bail("unknown reserved bits set in ", in->name); 216 217 /* skip modification time, extra flags, and os */ 218 bskip(in, 6); 219 220 /* skip extra field if present */ 221 if (flags & 4) { 222 unsigned len; 223 224 len = bget(in); 225 len += (unsigned)(bget(in)) << 8; 226 bskip(in, len); 227 } 228 229 /* skip file name if present */ 230 if (flags & 8) 231 while (bget(in) != 0) 232 ; 233 234 /* skip comment if present */ 235 if (flags & 16) 236 while (bget(in) != 0) 237 ; 238 239 /* skip header crc if present */ 240 if (flags & 2) 241 bskip(in, 2); 242 } 243 244 /* write a four-byte little-endian unsigned integer to out */ 245 local void put4(unsigned long val, FILE *out) 246 { 247 putc(val & 0xff, out); 248 putc((val >> 8) & 0xff, out); 249 putc((val >> 16) & 0xff, out); 250 putc((val >> 24) & 0xff, out); 251 } 252 253 /* Load up zlib stream from buffered input, bail if end of file */ 254 local void zpull(z_streamp strm, bin *in) 255 { 256 if (in->left == 0) 257 bload(in); 258 if (in->left == 0) 259 bail("unexpected end of file on ", in->name); 260 strm->avail_in = in->left; 261 strm->next_in = in->next; 262 } 263 264 /* Write header for gzip file to out and initialize trailer. */ 265 local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out) 266 { 267 fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out); 268 *crc = crc32(0L, Z_NULL, 0); 269 *tot = 0; 270 } 271 272 /* Copy the compressed data from name, zeroing the last block bit of the last 273 block if clr is true, and adding empty blocks as needed to get to a byte 274 boundary. If clr is false, then the last block becomes the last block of 275 the output, and the gzip trailer is written. crc and tot maintains the 276 crc and length (modulo 2^32) of the output for the trailer. The resulting 277 gzip file is written to out. gzinit() must be called before the first call 278 of gzcopy() to write the gzip header and to initialize crc and tot. */ 279 local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot, 280 FILE *out) 281 { 282 int ret; /* return value from zlib functions */ 283 int pos; /* where the "last block" bit is in byte */ 284 int last; /* true if processing the last block */ 285 bin *in; /* buffered input file */ 286 unsigned char *start; /* start of compressed data in buffer */ 287 unsigned char *junk; /* buffer for uncompressed data -- discarded */ 288 z_off_t len; /* length of uncompressed data (support > 4 GB) */ 289 z_stream strm; /* zlib inflate stream */ 290 291 /* open gzip file and skip header */ 292 in = bopen(name); 293 if (in == NULL) 294 bail("could not open ", name); 295 gzhead(in); 296 297 /* allocate buffer for uncompressed data and initialize raw inflate 298 stream */ 299 junk = malloc(CHUNK); 300 strm.zalloc = Z_NULL; 301 strm.zfree = Z_NULL; 302 strm.opaque = Z_NULL; 303 strm.avail_in = 0; 304 strm.next_in = Z_NULL; 305 ret = inflateInit2(&strm, -15); 306 if (junk == NULL || ret != Z_OK) 307 bail("out of memory", ""); 308 309 /* inflate and copy compressed data, clear last-block bit if requested */ 310 len = 0; 311 zpull(&strm, in); 312 start = in->next; 313 last = start[0] & 1; 314 if (last && clr) 315 start[0] &= ~1; 316 strm.avail_out = 0; 317 for (;;) { 318 /* if input used and output done, write used input and get more */ 319 if (strm.avail_in == 0 && strm.avail_out != 0) { 320 fwrite(start, 1, strm.next_in - start, out); 321 start = in->buf; 322 in->left = 0; 323 zpull(&strm, in); 324 } 325 326 /* decompress -- return early when end-of-block reached */ 327 strm.avail_out = CHUNK; 328 strm.next_out = junk; 329 ret = inflate(&strm, Z_BLOCK); 330 switch (ret) { 331 case Z_MEM_ERROR: 332 bail("out of memory", ""); 333 case Z_DATA_ERROR: 334 bail("invalid compressed data in ", in->name); 335 } 336 337 /* update length of uncompressed data */ 338 len += CHUNK - strm.avail_out; 339 340 /* check for block boundary (only get this when block copied out) */ 341 if (strm.data_type & 128) { 342 /* if that was the last block, then done */ 343 if (last) 344 break; 345 346 /* number of unused bits in last byte */ 347 pos = strm.data_type & 7; 348 349 /* find the next last-block bit */ 350 if (pos != 0) { 351 /* next last-block bit is in last used byte */ 352 pos = 0x100 >> pos; 353 last = strm.next_in[-1] & pos; 354 if (last && clr) 355 in->buf[strm.next_in - in->buf - 1] &= ~pos; 356 } 357 else { 358 /* next last-block bit is in next unused byte */ 359 if (strm.avail_in == 0) { 360 /* don't have that byte yet -- get it */ 361 fwrite(start, 1, strm.next_in - start, out); 362 start = in->buf; 363 in->left = 0; 364 zpull(&strm, in); 365 } 366 last = strm.next_in[0] & 1; 367 if (last && clr) 368 in->buf[strm.next_in - in->buf] &= ~1; 369 } 370 } 371 } 372 373 /* update buffer with unused input */ 374 in->left = strm.avail_in; 375 in->next = in->buf + (strm.next_in - in->buf); 376 377 /* copy used input, write empty blocks to get to byte boundary */ 378 pos = strm.data_type & 7; 379 fwrite(start, 1, in->next - start - 1, out); 380 last = in->next[-1]; 381 if (pos == 0 || !clr) 382 /* already at byte boundary, or last file: write last byte */ 383 putc(last, out); 384 else { 385 /* append empty blocks to last byte */ 386 last &= ((0x100 >> pos) - 1); /* assure unused bits are zero */ 387 if (pos & 1) { 388 /* odd -- append an empty stored block */ 389 putc(last, out); 390 if (pos == 1) 391 putc(0, out); /* two more bits in block header */ 392 fwrite("\0\0\xff\xff", 1, 4, out); 393 } 394 else { 395 /* even -- append 1, 2, or 3 empty fixed blocks */ 396 switch (pos) { 397 case 6: 398 putc(last | 8, out); 399 last = 0; 400 case 4: 401 putc(last | 0x20, out); 402 last = 0; 403 case 2: 404 putc(last | 0x80, out); 405 putc(0, out); 406 } 407 } 408 } 409 410 /* update crc and tot */ 411 *crc = crc32_combine(*crc, bget4(in), len); 412 *tot += (unsigned long)len; 413 414 /* clean up */ 415 inflateEnd(&strm); 416 free(junk); 417 bclose(in); 418 419 /* write trailer if this is the last gzip file */ 420 if (!clr) { 421 put4(*crc, out); 422 put4(*tot, out); 423 } 424 } 425 426 /* join the gzip files on the command line, write result to stdout */ 427 int main(int argc, char **argv) 428 { 429 unsigned long crc, tot; /* running crc and total uncompressed length */ 430 431 /* skip command name */ 432 argc--; 433 argv++; 434 435 /* show usage if no arguments */ 436 if (argc == 0) { 437 fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n", 438 stderr); 439 return 0; 440 } 441 442 /* join gzip files on command line and write to stdout */ 443 gzinit(&crc, &tot, stdout); 444 while (argc--) 445 gzcopy(*argv++, argc, &crc, &tot, stdout); 446 447 /* done */ 448 return 0; 449 }