diff --git a/src/dwarfs/segmenter.cpp b/src/dwarfs/segmenter.cpp index f08309ca..6c0289f9 100644 --- a/src/dwarfs/segmenter.cpp +++ b/src/dwarfs/segmenter.cpp @@ -38,6 +38,7 @@ #include #include +#include #include #include "dwarfs/block_data.h" @@ -152,7 +153,13 @@ class fast_multimap { collision_t collisions_; }; -using repeating_sequence_map_type = phmap::flat_hash_map; +template +using small_sorted_vector_set = + folly::sorted_vector_set, std::allocator, void, + folly::small_vector>; + +using repeating_sequence_map_type = + phmap::flat_hash_map>; using repeating_collisions_map_type = std::unordered_map; /** @@ -660,11 +667,11 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy { LOG_VERBOSE << cfg_.context << "bloom filter size: " << size_with_unit(global_filter_.size() / 8); - repeating_sequence_hash_values_.reserve(256); - for (int i = 0; i < 256; ++i) { - repeating_sequence_hash_values_.emplace( - rsync_hash::repeating_window(i, frames_to_bytes(window_size_)), i); + auto val = + rsync_hash::repeating_window(i, frames_to_bytes(window_size_)); + DWARFS_CHECK(repeating_sequence_hash_values_[val].emplace(i).second, + "repeating sequence hash value / byte collision"); } } } @@ -785,18 +792,24 @@ bool active_block:: auto& raw = data_->vec(); auto winbeg = raw.begin() + frames_to_bytes(offset); auto winend = winbeg + frames_to_bytes(window_size_); + auto byte = *winbeg; + static_assert(std::is_same_vsecond)::value_type, + decltype(byte)>); - if (std::find_if(winbeg, winend, [byte = it->second](auto b) { - return b != byte; - }) == winend) { + // check if this is a known character for a repeating sequence + if (!it->second.contains(byte)) { + return false; + } + + if (std::find_if(winbeg, winend, [byte](auto b) { return b != byte; }) == + winend) { return offsets_.any_value_is(hashval, [&, this](auto off) { auto offbeg = raw.begin() + frames_to_bytes(off); auto offend = offbeg + frames_to_bytes(window_size_); - if (std::find_if(offbeg, offend, [byte = it->second](auto b) { - return b != byte; - }) == offend) { - ++repeating_collisions_[it->second]; + if (std::find_if(offbeg, offend, + [byte](auto b) { return b != byte; }) == offend) { + ++repeating_collisions_[byte]; return true; } @@ -923,12 +936,11 @@ void segmenter_::finish() { << ", lookups=" << stats_.bloom_lookups << ")"; } if (stats_.total_matches > 0) { - LOG_VERBOSE << cfg_.context - << "segmentation matches: good=" << stats_.good_matches - << ", bad=" << stats_.bad_matches << ", collisions=" - << (stats_.total_matches - - (stats_.bad_matches + stats_.good_matches)) - << ", total=" << stats_.total_matches; + LOG_VERBOSE << fmt::format( + "{}segment matches: good={}, bad={}, collisions={}, total={}", + cfg_.context, stats_.good_matches, stats_.bad_matches, + (stats_.total_matches - (stats_.bad_matches + stats_.good_matches)), + stats_.total_matches); } if (stats_.total_hashes > 0) { LOG_VERBOSE << cfg_.context << "segmentation collisions: L1=" diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp index 1163e544..67c0aba9 100644 --- a/test/tool_main_test.cpp +++ b/test/tool_main_test.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -2044,3 +2045,59 @@ TEST(mkdwarfs_test, filesystem_read_error) { EXPECT_EQ(-EBADF, res.error()); } } + +class segmenter_repeating_sequence_test : public testing::TestWithParam { +}; + +TEST_P(segmenter_repeating_sequence_test, github161) { + auto byte = GetParam(); + + static constexpr int const final_bytes{10'000'000}; + static constexpr int const repetitions{2'000}; + auto match = test::create_random_string(5'000); + auto suffix = test::create_random_string(50); + auto sequence = std::string(3'000, byte); + + std::string content; + content.reserve(match.size() + suffix.size() + + (sequence.size() + match.size()) * repetitions + final_bytes); + + content += match + suffix; + for (int i = 0; i < repetitions; ++i) { + content += sequence + match; + } + content += std::string(final_bytes, byte); + + auto t = mkdwarfs_tester::create_empty(); + t.add_root_dir(); + t.os->add_file("/bug", content); + + ASSERT_EQ(0, t.run("-i / -o - -C lz4 -W12 --log-level=verbose --no-progress")) + << t.err(); + + auto log = t.err(); + + EXPECT_THAT(log, + ::testing::ContainsRegex(test::fix_regex(fmt::format( + "avoided \\d\\d\\d\\d+ collisions in 0x{:02x}-byte sequences", + byte)))); + + std::regex const re{"segment matches: good=(\\d+), bad=(\\d+), " + "collisions=(\\d+), total=(\\d+)"}; + std::smatch m; + + ASSERT_TRUE(std::regex_search(log, m, re)) << log; + + auto good = std::stoi(m[1]); + auto bad = std::stoi(m[2]); + auto collisions = std::stoi(m[3]); + auto total = std::stoi(m[4]); + + EXPECT_GT(good, 2000); + EXPECT_EQ(0, bad); + EXPECT_EQ(0, collisions); + EXPECT_GT(total, 2000); +} + +INSTANTIATE_TEST_SUITE_P(dwarfs, segmenter_repeating_sequence_test, + ::testing::Values('\0', 'G', '\xff'));