Support multi-member gzip files

2025-09-08 11:50:00 -04:00 · 2017-11-20 00:18:20 -08:00 · 2017-11-20 00:18:20 -08:00 · 5a9d25a892
commit 5a9d25a892
parent 3d96a83ef9
6 changed files with 188 additions and 65 deletions
--- a/lib/decompress_impl.h
+++ b/lib/decompress_impl.h
@ -37,7 +37,7 @@ static enum libdeflate_result ATTRIBUTES
 FUNCNAME(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_out_nbytes_ret)
+	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
 {
 	u8 *out_next = out;
 	u8 * const out_end = out_next + out_nbytes_avail;
@ -394,6 +394,14 @@ block_done:

 	/* That was the last block.  */

+	/* Discard any readahead bits and check for excessive overread */
+	ALIGN_INPUT();
+
+	/* Optionally return the actual number of bytes read */
+	if (actual_in_nbytes_ret)
+		*actual_in_nbytes_ret = in_next - (u8 *)in;
+
+	/* Optionally return the actual number of bytes written */
 	if (actual_out_nbytes_ret) {
 		*actual_out_nbytes_ret = out_next - (u8 *)out;
 	} else {
--- a/lib/deflate_decompress.c
+++ b/lib/deflate_decompress.c
@ -204,18 +204,20 @@ typedef machine_word_t bitbuf_t;
 /*
 * Fill the bitbuffer variable, reading one byte at a time.
 *
- * Note: if we would overrun the input buffer, we just don't read anything,
- * leaving the bits as 0 but marking them as filled.  This makes the
- * implementation simpler because this removes the need to distinguish between
- * "real" overruns and overruns that occur because of our own lookahead during
- * Huffman decoding.  The disadvantage is that a "real" overrun can go
- * undetected, and libdeflate_deflate_decompress() may return a success status
- * rather than the expected failure status if one occurs.  However, this is
- * irrelevant because even if this specific case were to be handled "correctly",
- * one could easily come up with a different case where the compressed data
- * would be corrupted in such a way that fully retains its validity.  Users
- * should run a checksum against the uncompressed data if they wish to detect
- * corruptions.
+ * If we would overread the input buffer, we just don't read anything, leaving
+ * the bits zeroed but marking them filled.  This simplifies the decompressor
+ * because it removes the need to distinguish between real overreads and
+ * overreads that occur only because of the decompressor's own lookahead.
+ *
+ * The disadvantage is that real overreads are not detected immediately.
+ * However, this is safe because the decompressor is still guaranteed to make
+ * forward progress when presented with never-ending 0 bits.  Within an
+ * existing block, output keeps being generated; any new block can only be an
+ * uncompressed block (since the type code for uncompressed blocks is 0), and
+ * for those we check for a previous overread.  But even if we didn't check,
+ * such blocks would fail to validate because LEN would not equal ~NLEN.  So
+ * the decompressor will eventually either detect that the output buffer is
+ * full, or detect invalid input, or finish the final block.
 */
 #define FILL_BITS_BYTEWISE()					\
 do {								\
@ -277,17 +279,19 @@ if (!HAVE_BITS(n)) {						\
 #define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)

 /*
- * Align the input to the next byte boundary, discarding any remaining bits in
- * the current byte.
+ * Verify that the input buffer hasn't been overread, then align the input to
+ * the next byte boundary, discarding any remaining bits in the current byte.
 *
- * Note that if the bitbuffer variable currently contains more than 8 bits, then
+ * Note that if the bitbuffer variable currently contains more than 7 bits, then
 * we must rewind 'in_next', effectively putting those bits back.  Only the bits
 * in what would be the "current" byte if we were reading one byte at a time can
 * be actually discarded.
 */
 #define ALIGN_INPUT()							\
 do {									\
-	in_next -= (bitsleft >> 3) - MIN(overrun_count, bitsleft >> 3);	\
+	SAFETY_CHECK(overrun_count <= (bitsleft >> 3));			\
+	in_next -= (bitsleft >> 3) - overrun_count;			\
+	overrun_count = 0;						\
 	bitbuf = 0;							\
 	bitsleft = 0;							\
 } while(0)
@ -824,13 +828,13 @@ static enum libdeflate_result
 dispatch(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_out_nbytes_ret);
+	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

 typedef enum libdeflate_result (*decompress_func_t)
 	(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_out_nbytes_ret);
+	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);

 static decompress_func_t decompress_impl = dispatch;

@ -838,7 +842,7 @@ static enum libdeflate_result
 dispatch(struct libdeflate_decompressor * restrict d,
 	 const void * restrict in, size_t in_nbytes,
 	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_out_nbytes_ret)
+	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
 {
 	decompress_func_t f = deflate_decompress_default;
 #if X86_CPU_FEATURES_ENABLED
@ -847,7 +851,7 @@ dispatch(struct libdeflate_decompressor * restrict d,
 #endif
 	decompress_impl = f;
 	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
-		    actual_out_nbytes_ret);
+		    actual_in_nbytes_ret, actual_out_nbytes_ret);
 }
 #endif /* DISPATCH_ENABLED */

@ -860,20 +864,33 @@ dispatch(struct libdeflate_decompressor * restrict d,
 * calling the appropriate implementation depending on the CPU features at
 * runtime.
 */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
+				 const void * restrict in, size_t in_nbytes,
+				 void * restrict out, size_t out_nbytes_avail,
+				 size_t *actual_in_nbytes_ret,
+				 size_t *actual_out_nbytes_ret)
+{
+#if DISPATCH_ENABLED
+	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
+				  actual_in_nbytes_ret, actual_out_nbytes_ret);
+#else
+	return deflate_decompress_default(d, in, in_nbytes,
+					  out, out_nbytes_avail,
+					  actual_in_nbytes_ret,
+					  actual_out_nbytes_ret);
+#endif
+}
+
 LIBDEFLATEAPI enum libdeflate_result
 libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
 			      const void * restrict in, size_t in_nbytes,
 			      void * restrict out, size_t out_nbytes_avail,
 			      size_t *actual_out_nbytes_ret)
 {
-#if DISPATCH_ENABLED
-	return (*decompress_impl)(d, in, in_nbytes, out, out_nbytes_avail,
-				  actual_out_nbytes_ret);
-#else
-	return deflate_decompress_default(d, in, in_nbytes, out,
-					  out_nbytes_avail,
-					  actual_out_nbytes_ret);
-#endif
+	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
+						out, out_nbytes_avail,
+						NULL, actual_out_nbytes_ret);
 }

 LIBDEFLATEAPI struct libdeflate_decompressor *
--- a/lib/gzip_decompress.c
+++ b/lib/gzip_decompress.c
@ -33,14 +33,16 @@
 #include "libdeflate.h"

 LIBDEFLATEAPI enum libdeflate_result
-libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret)
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
+			      const void *in, size_t in_nbytes,
+			      void *out, size_t out_nbytes_avail,
+			      size_t *actual_in_nbytes_ret,
+			      size_t *actual_out_nbytes_ret)
 {
 	const u8 *in_next = in;
 	const u8 * const in_end = in_next + in_nbytes;
 	u8 flg;
+	size_t actual_in_nbytes;
 	size_t actual_out_nbytes;
 	enum libdeflate_result result;

@ -102,9 +104,10 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
 	}

 	/* Compressed data  */
-	result = libdeflate_deflate_decompress(d, in_next,
+	result = libdeflate_deflate_decompress_ex(d, in_next,
 					in_end - GZIP_FOOTER_SIZE - in_next,
 					out, out_nbytes_avail,
+					&actual_in_nbytes,
 					actual_out_nbytes_ret);
 	if (result != LIBDEFLATE_SUCCESS)
 		return result;
@ -114,7 +117,7 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
 	else
 		actual_out_nbytes = out_nbytes_avail;

-	in_next = in_end - GZIP_FOOTER_SIZE;
+	in_next += actual_in_nbytes;

 	/* CRC32 */
 	if (libdeflate_crc32(0, out, actual_out_nbytes) !=
@ -125,6 +128,21 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
 	/* ISIZE */
 	if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
 		return LIBDEFLATE_BAD_DATA;
+	in_next += 4;
+
+	if (actual_in_nbytes_ret)
+		*actual_in_nbytes_ret = in_next - (u8 *)in;

 	return LIBDEFLATE_SUCCESS;
 }
+
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
+			   const void *in, size_t in_nbytes,
+			   void *out, size_t out_nbytes_avail,
+			   size_t *actual_out_nbytes_ret)
+{
+	return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
+					     out, out_nbytes_avail,
+					     NULL, actual_out_nbytes_ret);
+}
--- a/libdeflate.h
+++ b/libdeflate.h
@ -182,14 +182,17 @@ enum libdeflate_result {
 };

 /*
- * libdeflate_deflate_decompress() decompresses 'in_nbytes' bytes of
- * raw DEFLATE-compressed data at 'in' and writes the uncompressed data to
- * 'out', which is a buffer of at least 'out_nbytes_avail' bytes.  If
- * decompression was successful, then 0 (LIBDEFLATE_SUCCESS) is returned;
- * otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned.  If
+ * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
+ * from the buffer 'in' with compressed size up to 'in_nbytes' bytes.  The
+ * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
+ * bytes.  If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
+ * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned.  If
 * a nonzero result code is returned, then the contents of the output buffer are
 * undefined.
 *
+ * Decompression stops at the end of the DEFLATE stream (as indicated by the
+ * BFINAL flag), even if the stream is shorter than 'in_nbytes' bytes.
+ *
 * libdeflate_deflate_decompress() can be used in cases where the actual
 * uncompressed size is known (recommended) or unknown (not recommended):
 *
@ -216,6 +219,19 @@ libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
 			      void *out, size_t out_nbytes_avail,
 			      size_t *actual_out_nbytes_ret);

+/*
+ * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument.  If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
+ * then the actual compressed size of the DEFLATE stream (aligned to the next
+ * byte boundary) is written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
+				 const void *in, size_t in_nbytes,
+				 void *out, size_t out_nbytes_avail,
+				 size_t *actual_in_nbytes_ret,
+				 size_t *actual_out_nbytes_ret);
+
 /*
 * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
 * instead of raw DEFLATE.
@ -229,6 +245,10 @@ libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
 /*
 * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
 * instead of raw DEFLATE.
+ *
+ * If multiple gzip-compressed members are concatenated, then only the first
+ * will be decompressed.  Use libdeflate_gzip_decompress_ex() if you need
+ * multi-member support.
 */
 LIBDEFLATEAPI enum libdeflate_result
 libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
@ -236,6 +256,20 @@ libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
 			   void *out, size_t out_nbytes_avail,
 			   size_t *actual_out_nbytes_ret);

+/*
+ * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
+ * argument.  If 'actual_in_nbytes_ret' is not NULL and decompression succeeds
+ * (meaning the first gzip member in the input buffer was decompressed), then
+ * the number of input bytes that member occupied, through the end of its
+ * footer, is written to *actual_in_nbytes_ret.
+ */
+LIBDEFLATEAPI enum libdeflate_result
+libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
+			      const void *in, size_t in_nbytes,
+			      void *out, size_t out_nbytes_avail,
+			      size_t *actual_in_nbytes_ret,
+			      size_t *actual_out_nbytes_ret);
+
 /*
 * libdeflate_free_decompressor() frees a decompressor that was allocated with
 * libdeflate_alloc_decompressor().  If a NULL pointer is passed in, no action
--- a/programs/gzip.c
+++ b/programs/gzip.c
@ -189,8 +189,10 @@ do_decompress(struct libdeflate_decompressor *decompressor,
 	size_t compressed_size = in->mmap_size;
 	void *uncompressed_data = NULL;
 	size_t uncompressed_size;
+	size_t actual_in_nbytes;
+	size_t actual_out_nbytes;
 	enum libdeflate_result result;
-	int ret;
+	int ret = 0;

 	if (compressed_size < sizeof(u32)) {
 	       msg("%"TS": not in gzip format", in->name);
@ -200,34 +202,61 @@ do_decompress(struct libdeflate_decompressor *decompressor,

 	uncompressed_size = load_u32_gzip(&compressed_data[compressed_size - 4]);

-	uncompressed_data = xmalloc(uncompressed_size);
-	if (uncompressed_data == NULL) {
-		msg("%"TS": file is probably too large to be processed by this "
-		    "program", in->name);
-		ret = -1;
-		goto out;
-	}
+	do {
+		if (uncompressed_data == NULL) {
+			uncompressed_data = xmalloc(uncompressed_size);
+			if (uncompressed_data == NULL) {
+				msg("%"TS": file is probably too large to be "
+				    "processed by this program", in->name);
+				ret = -1;
+				goto out;
+			}
+		}

-	result = libdeflate_gzip_decompress(decompressor,
-					    compressed_data,
-					    compressed_size,
-					    uncompressed_data,
-					    uncompressed_size, NULL);
+		result = libdeflate_gzip_decompress_ex(decompressor,
+						       compressed_data,
+						       compressed_size,
+						       uncompressed_data,
+						       uncompressed_size,
+						       &actual_in_nbytes,
+						       &actual_out_nbytes);

-	if (result == LIBDEFLATE_INSUFFICIENT_SPACE) {
-		msg("%"TS": file corrupt or too large to be processed by this "
-		    "program", in->name);
-		ret = -1;
-		goto out;
-	}
+		if (result == LIBDEFLATE_INSUFFICIENT_SPACE) {
+			if (uncompressed_size * 2 <= uncompressed_size) {
+				msg("%"TS": file corrupt or too large to be "
+				    "processed by this program", in->name);
+				ret = -1;
+				goto out;
+			}
+			uncompressed_size *= 2;
+			free(uncompressed_data);
+			uncompressed_data = NULL;
+			continue;
+		}

-	if (result != LIBDEFLATE_SUCCESS) {
-		msg("%"TS": file corrupt or not in gzip format", in->name);
-		ret = -1;
-		goto out;
-	}
+		if (result != LIBDEFLATE_SUCCESS) {
+			msg("%"TS": file corrupt or not in gzip format",
+			    in->name);
+			ret = -1;
+			goto out;
+		}

-	ret = full_write(out, uncompressed_data, uncompressed_size);
+		if (actual_in_nbytes == 0 ||
+		    actual_in_nbytes > compressed_size ||
+		    actual_out_nbytes > uncompressed_size) {
+			msg("Bug in libdeflate_gzip_decompress_ex()!");
+			ret = -1;
+			goto out;
+		}
+
+		ret = full_write(out, uncompressed_data, actual_out_nbytes);
+		if (ret != 0)
+			goto out;
+
+		compressed_data += actual_in_nbytes;
+		compressed_size -= actual_in_nbytes;
+
+	} while (compressed_size != 0);
 out:
 	free(uncompressed_data);
 	return ret;
--- a/tools/gzip_tests.sh
+++ b/tools/gzip_tests.sh
@ -392,6 +392,23 @@ gunzip file.gz
 [ "$(stat -c '%a;%x;%y' file)" = "$orig_stat" ]


+begin_test 'Decompressing multi-member gzip file'
+cat file file > orig
+gzip -c file > file.gz
+gzip -c file >> file.gz
+gunzip -f file.gz
+cmp file orig
+
+
+begin_test 'Decompressing multi-member gzip file (final member smaller)'
+echo 'hello world' > 2
+cat file 2 > orig
+gzip -c file > file.gz
+gzip -c 2 >> file.gz
+gunzip -f file.gz
+cmp file orig
+
+
 begin_test 'Help option'
 gzip -h 2>&1 | grep -q 'Usage'
 gunzip -h 2>&1 | grep -q 'Usage'