Support multi-member gzip files

This commit is contained in:
Eric Biggers 2017-11-20 00:18:20 -08:00
parent 3d96a83ef9
commit 5a9d25a892
6 changed files with 188 additions and 65 deletions

View File

@ -37,7 +37,7 @@ static enum libdeflate_result ATTRIBUTES
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes_avail;
@ -394,6 +394,14 @@ block_done:
/* That was the last block. */
/* Discard any readahead bits and check for excessive overread */
ALIGN_INPUT();
/* Optionally return the actual number of bytes read */
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
/* Optionally return the actual number of bytes written */
if (actual_out_nbytes_ret) {
*actual_out_nbytes_ret = out_next - (u8 *)out;
} else {

View File

@ -204,18 +204,20 @@ typedef machine_word_t bitbuf_t;
/*
* Fill the bitbuffer variable, reading one byte at a time.
*
* Note: if we would overrun the input buffer, we just don't read anything,
* leaving the bits as 0 but marking them as filled. This makes the
* implementation simpler because this removes the need to distinguish between
* "real" overruns and overruns that occur because of our own lookahead during
* Huffman decoding. The disadvantage is that a "real" overrun can go
* undetected, and libdeflate_deflate_decompress() may return a success status
* rather than the expected failure status if one occurs. However, this is
* irrelevant because even if this specific case were to be handled "correctly",
* one could easily come up with a different case where the compressed data
* would be corrupted in such a way that fully retains its validity. Users
* should run a checksum against the uncompressed data if they wish to detect
* corruptions.
* If we would overread the input buffer, we just don't read anything, leaving
* the bits zeroed but marking them filled. This simplifies the decompressor
* because it removes the need to distinguish between real overreads and
* overreads that occur only because of the decompressor's own lookahead.
*
* The disadvantage is that real overreads are not detected immediately.
* However, this is safe because the decompressor is still guaranteed to make
* forward progress when presented with never-ending 0 bits. Within an existing
* block, output keeps being generated, whereas any new block can only be an
* uncompressed block (since the type code for uncompressed blocks is 0), and
* for those we check for a previous overread. But even if we didn't check,
* uncompressed blocks would fail to validate because LEN would not equal ~NLEN.
* So the decompressor will eventually either detect that the output buffer is
* full, or detect invalid input, or finish the final block.
*/
#define FILL_BITS_BYTEWISE() \
do { \
@ -277,17 +279,19 @@ if (!HAVE_BITS(n)) { \
#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
/*
* Align the input to the next byte boundary, discarding any remaining bits in
* the current byte.
* Verify that the input buffer hasn't been overread, then align the input to
* the next byte boundary, discarding any remaining bits in the current byte.
*
* Note that if the bitbuffer variable currently contains more than 8 bits, then
* Note that if the bitbuffer variable currently contains more than 7 bits, then
* we must rewind 'in_next', effectively putting those bits back. Only the bits
* in what would be the "current" byte if we were reading one byte at a time can
* be actually discarded.
*/
#define ALIGN_INPUT() \
do { \
in_next -= (bitsleft >> 3) - MIN(overrun_count, bitsleft >> 3); \
SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \
in_next -= (bitsleft >> 3) - overrun_count; \
overrun_count = 0; \
bitbuf = 0; \
bitsleft = 0; \
} while(0)
@ -824,13 +828,13 @@ static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
typedef enum libdeflate_result (*decompress_func_t)
(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
static decompress_func_t decompress_impl = dispatch;
@ -838,7 +842,7 @@ static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
decompress_func_t f = deflate_decompress_default;
#if X86_CPU_FEATURES_ENABLED
@ -847,7 +851,7 @@ dispatch(struct libdeflate_decompressor * restrict d,
#endif
decompress_impl = f;
return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
actual_out_nbytes_ret);
actual_in_nbytes_ret, actual_out_nbytes_ret);
}
#endif /* DISPATCH_ENABLED */
@ -860,20 +864,33 @@ dispatch(struct libdeflate_decompressor * restrict d,
* calling the appropriate implementation depending on the CPU features at
* runtime.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret)
{
/*
* Extended entry point: every argument, including the optional
* 'actual_in_nbytes_ret' pointer, is forwarded unchanged to the selected
* decompression implementation and its result code is returned as-is.
*/
#if DISPATCH_ENABLED
/*
* 'decompress_impl' initially points at dispatch(), which picks an
* implementation, stores it back into 'decompress_impl', and calls it.
*/
return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
actual_in_nbytes_ret, actual_out_nbytes_ret);
#else
/* No runtime dispatch: call the default implementation directly. */
return deflate_decompress_default(d, in, in_nbytes,
out, out_nbytes_avail,
actual_in_nbytes_ret,
actual_out_nbytes_ret);
#endif
}
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
{
#if DISPATCH_ENABLED
return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
actual_out_nbytes_ret);
#else
return deflate_decompress_default(d, in, in_nbytes, out,
out_nbytes_avail,
actual_out_nbytes_ret);
#endif
return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
out, out_nbytes_avail,
NULL, actual_out_nbytes_ret);
}
LIBDEFLATEAPI struct libdeflate_decompressor *

View File

@ -33,14 +33,16 @@
#include "libdeflate.h"
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret)
{
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
u8 flg;
size_t actual_in_nbytes;
size_t actual_out_nbytes;
enum libdeflate_result result;
@ -102,9 +104,10 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
}
/* Compressed data */
result = libdeflate_deflate_decompress(d, in_next,
result = libdeflate_deflate_decompress_ex(d, in_next,
in_end - GZIP_FOOTER_SIZE - in_next,
out, out_nbytes_avail,
&actual_in_nbytes,
actual_out_nbytes_ret);
if (result != LIBDEFLATE_SUCCESS)
return result;
@ -114,7 +117,7 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
else
actual_out_nbytes = out_nbytes_avail;
in_next = in_end - GZIP_FOOTER_SIZE;
in_next += actual_in_nbytes;
/* CRC32 */
if (libdeflate_crc32(0, out, actual_out_nbytes) !=
@ -125,6 +128,21 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
/* ISIZE */
if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
return LIBDEFLATE_BAD_DATA;
in_next += 4;
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
return LIBDEFLATE_SUCCESS;
}
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
{
/*
* Thin wrapper around libdeflate_gzip_decompress_ex(): NULL is passed for
* 'actual_in_nbytes_ret', so the number of input bytes consumed is not
* reported to the caller. All other arguments are forwarded unchanged.
*/
return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
out, out_nbytes_avail,
NULL, actual_out_nbytes_ret);
}

View File

@ -182,14 +182,17 @@ enum libdeflate_result {
};
/*
* libdeflate_deflate_decompress() decompresses 'in_nbytes' bytes of
* raw DEFLATE-compressed data at 'in' and writes the uncompressed data to
* 'out', which is a buffer of at least 'out_nbytes_avail' bytes. If
* decompression was successful, then 0 (LIBDEFLATE_SUCCESS) is returned;
* otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
* from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The
* uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
* bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
* Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* a nonzero result code is returned, then the contents of the output buffer are
* undefined.
*
* Decompression stops at the end of the DEFLATE stream (as indicated by the
* BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
*
* libdeflate_deflate_decompress() can be used in cases where the actual
* uncompressed size is known (recommended) or unknown (not recommended):
*
@ -216,6 +219,19 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
* then the actual compressed size of the DEFLATE stream (aligned to the next
* byte boundary) is written to *actual_in_nbytes_ret. The size is measured
* from the start of 'in'.
*
* 'actual_in_nbytes_ret' may be NULL if the caller does not need this value;
* libdeflate_deflate_decompress() is implemented by calling this function
* that way.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
* instead of raw DEFLATE.
@ -229,6 +245,10 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
/*
* Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
* instead of raw DEFLATE.
*
* If multiple gzip-compressed members are concatenated, then only the first
* will be decompressed. Use libdeflate_gzip_decompress_ex() if you need
* multi-member support.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
@ -236,6 +256,20 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
* succeeds (indicating that the first gzip-compressed member in the input
* buffer was decompressed), then the actual number of input bytes consumed is
* written to *actual_in_nbytes_ret.
*
* The count is measured from the start of 'in' and extends past the member's
* trailing CRC32 and ISIZE fields, so 'in' + *actual_in_nbytes_ret is where a
* following member, if any, begins. To decompress a multi-member file, call
* this function repeatedly, advancing 'in' and reducing 'in_nbytes' by the
* returned count until no input remains.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* libdeflate_free_decompressor() frees a decompressor that was allocated with
* libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action

View File

@ -189,8 +189,10 @@ do_decompress(struct libdeflate_decompressor *decompressor,
size_t compressed_size = in->mmap_size;
void *uncompressed_data = NULL;
size_t uncompressed_size;
size_t actual_in_nbytes;
size_t actual_out_nbytes;
enum libdeflate_result result;
int ret;
int ret = 0;
if (compressed_size < sizeof(u32)) {
msg("%"TS": not in gzip format", in->name);
@ -200,34 +202,61 @@ do_decompress(struct libdeflate_decompressor *decompressor,
uncompressed_size = load_u32_gzip(&compressed_data[compressed_size - 4]);
uncompressed_data = xmalloc(uncompressed_size);
if (uncompressed_data == NULL) {
msg("%"TS": file is probably too large to be processed by this "
"program", in->name);
ret = -1;
goto out;
}
do {
if (uncompressed_data == NULL) {
uncompressed_data = xmalloc(uncompressed_size);
if (uncompressed_data == NULL) {
msg("%"TS": file is probably too large to be "
"processed by this program", in->name);
ret = -1;
goto out;
}
}
result = libdeflate_gzip_decompress(decompressor,
compressed_data,
compressed_size,
uncompressed_data,
uncompressed_size, NULL);
result = libdeflate_gzip_decompress_ex(decompressor,
compressed_data,
compressed_size,
uncompressed_data,
uncompressed_size,
&actual_in_nbytes,
&actual_out_nbytes);
if (result == LIBDEFLATE_INSUFFICIENT_SPACE) {
msg("%"TS": file corrupt or too large to be processed by this "
"program", in->name);
ret = -1;
goto out;
}
if (result == LIBDEFLATE_INSUFFICIENT_SPACE) {
if (uncompressed_size * 2 <= uncompressed_size) {
msg("%"TS": file corrupt or too large to be "
"processed by this program", in->name);
ret = -1;
goto out;
}
uncompressed_size *= 2;
free(uncompressed_data);
uncompressed_data = NULL;
continue;
}
if (result != LIBDEFLATE_SUCCESS) {
msg("%"TS": file corrupt or not in gzip format", in->name);
ret = -1;
goto out;
}
if (result != LIBDEFLATE_SUCCESS) {
msg("%"TS": file corrupt or not in gzip format",
in->name);
ret = -1;
goto out;
}
ret = full_write(out, uncompressed_data, uncompressed_size);
if (actual_in_nbytes == 0 ||
actual_in_nbytes > compressed_size ||
actual_out_nbytes > uncompressed_size) {
msg("Bug in libdeflate_gzip_decompress_ex()!");
ret = -1;
goto out;
}
ret = full_write(out, uncompressed_data, actual_out_nbytes);
if (ret != 0)
goto out;
compressed_data += actual_in_nbytes;
compressed_size -= actual_in_nbytes;
} while (compressed_size != 0);
out:
free(uncompressed_data);
return ret;

View File

@ -392,6 +392,23 @@ gunzip file.gz
[ "$(stat -c '%a;%x;%y' file)" = "$orig_stat" ]
# Two gzip members appended into one .gz file must decompress to the
# concatenation of both members' contents.
begin_test 'Decompressing multi-member gzip file'
cat file file > orig
gzip -c file > file.gz
gzip -c file >> file.gz
gunzip -f file.gz
cmp file orig
# Same as above, but the last member is a short file ('hello world').
# NOTE(review): presumably this targets a decompressor that sizes its output
# buffer from the size field at the very end of the file, which here describes
# only the small final member, so the buffer must grow for the first member --
# confirm against the gunzip implementation under test.
begin_test 'Decompressing multi-member gzip file (final member smaller)'
echo 'hello world' > 2
cat file 2 > orig
gzip -c file > file.gz
gzip -c 2 >> file.gz
gunzip -f file.gz
cmp file orig
begin_test 'Help option'
gzip -h 2>&1 | grep -q 'Usage'
gunzip -h 2>&1 | grep -q 'Usage'