diff --git a/lib/decompress_impl.h b/lib/decompress_impl.h index d9e01c1..d0ccad3 100644 --- a/lib/decompress_impl.h +++ b/lib/decompress_impl.h @@ -37,7 +37,7 @@ static enum libdeflate_result ATTRIBUTES FUNCNAME(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { u8 *out_next = out; u8 * const out_end = out_next + out_nbytes_avail; @@ -394,6 +394,14 @@ block_done: /* That was the last block. */ + /* Discard any readahead bits and check for excessive overread */ + ALIGN_INPUT(); + + /* Optionally return the actual number of bytes read */ + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + /* Optionally return the actual number of bytes written */ if (actual_out_nbytes_ret) { *actual_out_nbytes_ret = out_next - (u8 *)out; } else { diff --git a/lib/deflate_decompress.c b/lib/deflate_decompress.c index 8284fc8..819b753 100644 --- a/lib/deflate_decompress.c +++ b/lib/deflate_decompress.c @@ -204,18 +204,20 @@ typedef machine_word_t bitbuf_t; /* * Fill the bitbuffer variable, reading one byte at a time. * - * Note: if we would overrun the input buffer, we just don't read anything, - * leaving the bits as 0 but marking them as filled. This makes the - * implementation simpler because this removes the need to distinguish between - * "real" overruns and overruns that occur because of our own lookahead during - * Huffman decoding. The disadvantage is that a "real" overrun can go - * undetected, and libdeflate_deflate_decompress() may return a success status - * rather than the expected failure status if one occurs. However, this is - * irrelevant because even if this specific case were to be handled "correctly", - * one could easily come up with a different case where the compressed data - * would be corrupted in such a way that fully retains its validity. 
Users - * should run a checksum against the uncompressed data if they wish to detect - * corruptions. + * If we would overread the input buffer, we just don't read anything, leaving + * the bits zeroed but marking them filled. This simplifies the decompressor + * because it removes the need to distinguish between real overreads and + * overreads that occur only because of the decompressor's own lookahead. + * + * The disadvantage is that real overreads are not detected immediately. + * However, this is safe because the decompressor is still guaranteed to make + * forward progress when presented with never-ending 0 bits. In an existing + * block, output will be generated, whereas new blocks can only be uncompressed + * (since the type code for uncompressed blocks is 0), for which we check for + * previous overread. But even if we didn't check, uncompressed blocks would + * fail to validate because LEN would not equal ~NLEN. So the decompressor will + * eventually either detect that the output buffer is full, or detect invalid + * input, or finish the final block. */ #define FILL_BITS_BYTEWISE() \ do { \ @@ -277,17 +279,19 @@ if (!HAVE_BITS(n)) { \ #define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32) /* - * Align the input to the next byte boundary, discarding any remaining bits in - * the current byte. + * Verify that the input buffer hasn't been overread, then align the input to + * the next byte boundary, discarding any remaining bits in the current byte. * - * Note that if the bitbuffer variable currently contains more than 8 bits, then + * Note that if the bitbuffer variable currently contains more than 7 bits, then * we must rewind 'in_next', effectively putting those bits back. Only the bits * in what would be the "current" byte if we were reading one byte at a time can * be actually discarded. 
*/ #define ALIGN_INPUT() \ do { \ - in_next -= (bitsleft >> 3) - MIN(overrun_count, bitsleft >> 3); \ + SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \ + in_next -= (bitsleft >> 3) - overrun_count; \ + overrun_count = 0; \ bitbuf = 0; \ bitsleft = 0; \ } while(0) @@ -824,13 +828,13 @@ static enum libdeflate_result dispatch(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); typedef enum libdeflate_result (*decompress_func_t) (struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); static decompress_func_t decompress_impl = dispatch; @@ -838,7 +842,7 @@ static enum libdeflate_result dispatch(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { decompress_func_t f = deflate_decompress_default; #if X86_CPU_FEATURES_ENABLED @@ -847,7 +851,7 @@ dispatch(struct libdeflate_decompressor * restrict d, #endif decompress_impl = f; return (*f)(d, in, in_nbytes, out, out_nbytes_avail, - actual_out_nbytes_ret); + actual_in_nbytes_ret, actual_out_nbytes_ret); } #endif /* DISPATCH_ENABLED */ @@ -860,20 +864,33 @@ dispatch(struct libdeflate_decompressor * restrict d, * calling the appropriate implementation depending on the CPU features at * runtime. 
*/ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) +{ +#if DISPATCH_ENABLED + return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); +#else + return deflate_decompress_default(d, in, in_nbytes, + out, out_nbytes_avail, + actual_in_nbytes_ret, + actual_out_nbytes_ret); +#endif +} + LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d, const void * restrict in, size_t in_nbytes, void * restrict out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret) { -#if DISPATCH_ENABLED - return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail, - actual_out_nbytes_ret); -#else - return deflate_decompress_default(d, in, in_nbytes, out, - out_nbytes_avail, - actual_out_nbytes_ret); -#endif + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); } LIBDEFLATEAPI struct libdeflate_decompressor * diff --git a/lib/gzip_decompress.c b/lib/gzip_decompress.c index e3ce3d7..5703093 100644 --- a/lib/gzip_decompress.c +++ b/lib/gzip_decompress.c @@ -33,14 +33,16 @@ #include "libdeflate.h" LIBDEFLATEAPI enum libdeflate_result -libdeflate_gzip_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { const u8 *in_next = in; const u8 * const in_end = in_next + in_nbytes; u8 flg; + size_t actual_in_nbytes; size_t actual_out_nbytes; enum libdeflate_result result; @@ -102,9 +104,10 @@ 
libdeflate_gzip_decompress(struct libdeflate_decompressor *d, } /* Compressed data */ - result = libdeflate_deflate_decompress(d, in_next, + result = libdeflate_deflate_decompress_ex(d, in_next, in_end - GZIP_FOOTER_SIZE - in_next, out, out_nbytes_avail, + &actual_in_nbytes, actual_out_nbytes_ret); if (result != LIBDEFLATE_SUCCESS) return result; @@ -114,7 +117,7 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d, else actual_out_nbytes = out_nbytes_avail; - in_next = in_end - GZIP_FOOTER_SIZE; + in_next += actual_in_nbytes; /* CRC32 */ if (libdeflate_crc32(0, out, actual_out_nbytes) != @@ -125,6 +128,21 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d, /* ISIZE */ if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; return LIBDEFLATE_SUCCESS; } + +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress(struct libdeflate_decompressor *d, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) +{ + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); +} diff --git a/libdeflate.h b/libdeflate.h index 8f57295..af34d35 100644 --- a/libdeflate.h +++ b/libdeflate.h @@ -182,14 +182,17 @@ enum libdeflate_result { }; /* - * libdeflate_deflate_decompress() decompresses 'in_nbytes' bytes of - * raw DEFLATE-compressed data at 'in' and writes the uncompressed data to - * 'out', which is a buffer of at least 'out_nbytes_avail' bytes. If - * decompression was successful, then 0 (LIBDEFLATE_SUCCESS) is returned; - * otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If + * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream + * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. 
The + * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' + * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. + * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If * a nonzero result code is returned, then the contents of the output buffer are * undefined. * + * Decompression stops at the end of the DEFLATE stream (as indicated by the + * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. + * * libdeflate_deflate_decompress() can be used in cases where the actual * uncompressed size is known (recommended) or unknown (not recommended): * @@ -216,6 +219,19 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret); +/* + * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, + * then the actual compressed size of the DEFLATE stream (aligned to the next + * byte boundary) is written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + /* * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format * instead of raw DEFLATE. @@ -229,6 +245,10 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, /* * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format * instead of raw DEFLATE. + * + * If multiple gzip-compressed members are concatenated, then only the first + * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need + * multi-member support. 
*/ LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, @@ -236,6 +256,20 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, void *out, size_t out_nbytes_avail, size_t *actual_out_nbytes_ret); +/* + * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' + * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression + * succeeds (indicating that the first gzip-compressed member in the input + * buffer was decompressed), then the actual number of input bytes consumed is + * written to *actual_in_nbytes_ret. + */ +LIBDEFLATEAPI enum libdeflate_result +libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret); + /* * libdeflate_free_decompressor() frees a decompressor that was allocated with * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action diff --git a/programs/gzip.c b/programs/gzip.c index 259fdac..a08d415 100644 --- a/programs/gzip.c +++ b/programs/gzip.c @@ -189,8 +189,10 @@ do_decompress(struct libdeflate_decompressor *decompressor, size_t compressed_size = in->mmap_size; void *uncompressed_data = NULL; size_t uncompressed_size; + size_t actual_in_nbytes; + size_t actual_out_nbytes; enum libdeflate_result result; - int ret; + int ret = 0; if (compressed_size < sizeof(u32)) { msg("%"TS": not in gzip format", in->name); @@ -200,34 +202,69 @@ do_decompress(struct libdeflate_decompressor *decompressor, uncompressed_size = load_u32_gzip(&compressed_data[compressed_size - 4]); - uncompressed_data = xmalloc(uncompressed_size); - if (uncompressed_data == NULL) { - msg("%"TS": file is probably too large to be processed by this " - "program", in->name); - ret = -1; - goto out; - } + /* + * The trailing ISIZE field is only an initial guess at the buffer size. + * Never start from 0: the doubling in the retry path below cannot grow a + * zero-sized buffer, so a valid file would be rejected. + */ + if (uncompressed_size == 0) + uncompressed_size = 1; + + do { + if (uncompressed_data == NULL) { + uncompressed_data = xmalloc(uncompressed_size); + if 
(uncompressed_data == NULL) { + msg("%"TS": file is probably too large to be " + "processed by this program", in->name); + ret = -1; + goto out; + } + } - result = libdeflate_gzip_decompress(decompressor, - compressed_data, - compressed_size, - uncompressed_data, - uncompressed_size, NULL); + result = libdeflate_gzip_decompress_ex(decompressor, + compressed_data, + compressed_size, + uncompressed_data, + uncompressed_size, + &actual_in_nbytes, + &actual_out_nbytes); - if (result == LIBDEFLATE_INSUFFICIENT_SPACE) { - msg("%"TS": file corrupt or too large to be processed by this " - "program", in->name); - ret = -1; - goto out; - } + if (result == LIBDEFLATE_INSUFFICIENT_SPACE) { + if (uncompressed_size * 2 <= uncompressed_size) { + msg("%"TS": file corrupt or too large to be " + "processed by this program", in->name); + ret = -1; + goto out; + } + uncompressed_size *= 2; + free(uncompressed_data); + uncompressed_data = NULL; + continue; + } - if (result != LIBDEFLATE_SUCCESS) { - msg("%"TS": file corrupt or not in gzip format", in->name); - ret = -1; - goto out; - } + if (result != LIBDEFLATE_SUCCESS) { + msg("%"TS": file corrupt or not in gzip format", + in->name); + ret = -1; + goto out; + } - ret = full_write(out, uncompressed_data, uncompressed_size); + if (actual_in_nbytes == 0 || + actual_in_nbytes > compressed_size || + actual_out_nbytes > uncompressed_size) { + msg("Bug in libdeflate_gzip_decompress_ex()!"); + ret = -1; + goto out; + } + + ret = full_write(out, uncompressed_data, actual_out_nbytes); + if (ret != 0) + goto out; + + compressed_data += actual_in_nbytes; + compressed_size -= actual_in_nbytes; + + } while (compressed_size != 0); out: free(uncompressed_data); return ret; diff --git a/tools/gzip_tests.sh b/tools/gzip_tests.sh index d409d82..367a9e6 100755 --- a/tools/gzip_tests.sh +++ b/tools/gzip_tests.sh @@ -392,6 +392,23 @@ gunzip file.gz [ "$(stat -c '%a;%x;%y' file)" = "$orig_stat" ] +begin_test 'Decompressing multi-member gzip file' +cat file 
file > orig +gzip -c file > file.gz +gzip -c file >> file.gz +gunzip -f file.gz +cmp file orig + + +begin_test 'Decompressing multi-member gzip file (final member smaller)' +echo 'hello world' > 2 +cat file 2 > orig +gzip -c file > file.gz +gzip -c 2 >> file.gz +gunzip -f file.gz +cmp file orig + + begin_test 'Help option' gzip -h 2>&1 | grep -q 'Usage' gunzip -h 2>&1 | grep -q 'Usage'