deflate_compress: fix corruption with long literal run

When the block splitting algorithm was implemented, it became possible
for the compressor to use longer blocks, up to ~300KB.  Unfortunately it
was overlooked that this can allow literal runs > 65535 bytes, while in
one place the length of a literal run was still being stored in a u16.
To overflow the litrunlen and hit the bug the data would have had to
have far fewer matches than random data, which is possible but very
unusual.  Fix the bug by reserving more space to hold the litrunlen, and
add a test for it.
This commit is contained in:
Eric Biggers 2017-01-14 20:44:09 -08:00
parent e79444be27
commit f2f0df7274
2 changed files with 51 additions and 26 deletions

View File

@ -246,10 +246,19 @@ struct deflate_costs {
*/ */
struct deflate_sequence { struct deflate_sequence {
/* The number of literals in the run. This may be 0. The literals are /* Bits 0..22: the number of literals in this run. This may be 0 and
* not stored explicitly in this structure; instead, they are read * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not
* directly from the uncompressed data. */ * stored explicitly in this structure; instead, they are read directly
u16 litrunlen; * from the uncompressed data.
*
* Bits 23..31: the length of the match which follows the literals, or 0
* if this literal run was the last in the block, so there is no match
* which follows it. */
u32 litrunlen_and_length;
/* If 'length' doesn't indicate end-of-block, then this is the offset of
* the match which follows the literals. */
u16 offset;
/* If 'length' doesn't indicate end-of-block, then this is the offset /* If 'length' doesn't indicate end-of-block, then this is the offset
* symbol of the match which follows the literals. */ * symbol of the match which follows the literals. */
@ -258,15 +267,6 @@ struct deflate_sequence {
/* If 'length' doesn't indicate end-of-block, then this is the length /* If 'length' doesn't indicate end-of-block, then this is the length
* slot of the match which follows the literals. */ * slot of the match which follows the literals. */
u8 length_slot; u8 length_slot;
/* The length of the match which follows the literals, or 0 if this this
* sequence's literal run was the last literal run in the block, so
* there is no match that follows it. */
u16 length;
/* If 'length' doesn't indicate end-of-block, then this is the offset of
* the match which follows the literals. */
u16 offset;
}; };
#if SUPPORT_NEAR_OPTIMAL_PARSING #if SUPPORT_NEAR_OPTIMAL_PARSING
@ -1459,8 +1459,8 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os,
const struct deflate_sequence *seq = sequences; const struct deflate_sequence *seq = sequences;
for (;;) { for (;;) {
unsigned litrunlen = seq->litrunlen; u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
unsigned length; unsigned length = seq->litrunlen_and_length >> 23;
unsigned length_slot; unsigned length_slot;
unsigned litlen_symbol; unsigned litlen_symbol;
unsigned offset_symbol; unsigned offset_symbol;
@ -1527,9 +1527,6 @@ deflate_write_sequences(struct deflate_output_bitstream * restrict os,
#endif #endif
} }
length = seq->length;
if (length == 0) if (length == 0)
return; return;
@ -1802,7 +1799,7 @@ deflate_flush_block(struct libdeflate_compressor * restrict c,
static forceinline void static forceinline void
deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
unsigned *litrunlen_p) u32 *litrunlen_p)
{ {
c->freqs.litlen[literal]++; c->freqs.litlen[literal]++;
++*litrunlen_p; ++*litrunlen_p;
@ -1811,9 +1808,8 @@ deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
static forceinline void static forceinline void
deflate_choose_match(struct libdeflate_compressor *c, deflate_choose_match(struct libdeflate_compressor *c,
unsigned length, unsigned offset, unsigned length, unsigned offset,
unsigned *litrunlen_p, struct deflate_sequence **next_seq_p) u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
{ {
unsigned litrunlen = *litrunlen_p;
struct deflate_sequence *seq = *next_seq_p; struct deflate_sequence *seq = *next_seq_p;
unsigned length_slot = deflate_length_slot[length]; unsigned length_slot = deflate_length_slot[length];
unsigned offset_slot = deflate_get_offset_slot(c, offset); unsigned offset_slot = deflate_get_offset_slot(c, offset);
@ -1821,8 +1817,7 @@ deflate_choose_match(struct libdeflate_compressor *c,
c->freqs.litlen[257 + length_slot]++; c->freqs.litlen[257 + length_slot]++;
c->freqs.offset[offset_slot]++; c->freqs.offset[offset_slot]++;
seq->litrunlen = litrunlen; seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
seq->length = length;
seq->offset = offset; seq->offset = offset;
seq->length_slot = length_slot; seq->length_slot = length_slot;
seq->offset_symbol = offset_slot; seq->offset_symbol = offset_slot;
@ -1832,10 +1827,9 @@ deflate_choose_match(struct libdeflate_compressor *c,
} }
static forceinline void static forceinline void
deflate_finish_sequence(struct deflate_sequence *seq, unsigned litrunlen) deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
{ {
seq->litrunlen = litrunlen; seq->litrunlen_and_length = litrunlen; /* length = 0 */
seq->length = 0;
} }
/******************************************************************************/ /******************************************************************************/

View File

@ -255,6 +255,36 @@ gzip_tests() {
############################################################################### ###############################################################################
edge_case_tests() {
test_group_included edge_case || return 0
# Regression test for "deflate_compress: fix corruption with long
# literal run". Try to compress a file longer than 65535 bytes where no
# 2-byte sequence (3 would be sufficient) is repeated <= 32768 bytes
# apart, and the distribution of bytes remains constant throughout, and
# yet not all bytes are used so the data is still slightly compressible.
# There will be no matches in this data, but the compressor should still
# output a compressed block, and this block should contain more than
# 65535 consecutive literals, which triggered the bug.
#
# Note: on random data, this situation is extremely unlikely if the
# compressor uses all matches it finds, since random data will on
# average have a 3-byte match every (256**3)/32768 = 512 bytes.
python3 > "$TMPFILE" << EOF
import sys
for i in range(2):
for stride in range(1,251):
b = bytes(stride*multiple % 251 for multiple in range(251))
sys.stdout.buffer.write(b)
EOF
run_cmd make -j$NPROC benchmark
run_cmd ./benchmark -3 "$TMPFILE"
run_cmd ./benchmark -6 "$TMPFILE"
run_cmd ./benchmark -12 "$TMPFILE"
}
###############################################################################
log "Starting libdeflate tests" log "Starting libdeflate tests"
log " TESTGROUPS=(${TESTGROUPS[@]})" log " TESTGROUPS=(${TESTGROUPS[@]})"
log " SMOKEDATA=$SMOKEDATA" log " SMOKEDATA=$SMOKEDATA"
@ -266,6 +296,7 @@ mips_tests
windows_tests windows_tests
static_analysis_tests static_analysis_tests
gzip_tests gzip_tests
edge_case_tests
if (( TESTS_SKIPPED )); then if (( TESTS_SKIPPED )); then
log "No tests failed, but some tests were skipped. See above." log "No tests failed, but some tests were skipped. See above."