fix(segmenter): repeating sequence detection was broken + test

This never properly worked for bytes >= 0x10 due to hash collisions
in the map that was used to verify the repeating sequence.
This commit is contained in:
Marcus Holland-Moritz 2024-01-14 13:05:18 +01:00
parent c322708a7b
commit 3be0715c1f
2 changed files with 87 additions and 18 deletions

View File

@ -38,6 +38,7 @@
#include <folly/hash/Hash.h>
#include <folly/small_vector.h>
#include <folly/sorted_vector_types.h>
#include <folly/stats/Histogram.h>
#include "dwarfs/block_data.h"
@ -152,7 +153,13 @@ class fast_multimap {
collision_t collisions_;
};
using repeating_sequence_map_type = phmap::flat_hash_map<uint32_t, uint8_t>;
template <typename T, size_t MaxInline = 1>
using small_sorted_vector_set =
folly::sorted_vector_set<T, std::less<T>, std::allocator<T>, void,
folly::small_vector<T, MaxInline>>;
using repeating_sequence_map_type =
phmap::flat_hash_map<uint32_t, small_sorted_vector_set<uint8_t, 8>>;
using repeating_collisions_map_type = std::unordered_map<uint8_t, uint32_t>;
/**
@ -660,11 +667,11 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
LOG_VERBOSE << cfg_.context << "bloom filter size: "
<< size_with_unit(global_filter_.size() / 8);
repeating_sequence_hash_values_.reserve(256);
for (int i = 0; i < 256; ++i) {
repeating_sequence_hash_values_.emplace(
rsync_hash::repeating_window(i, frames_to_bytes(window_size_)), i);
auto val =
rsync_hash::repeating_window(i, frames_to_bytes(window_size_));
DWARFS_CHECK(repeating_sequence_hash_values_[val].emplace(i).second,
"repeating sequence hash value / byte collision");
}
}
}
@ -785,18 +792,24 @@ bool active_block<LoggerPolicy, GranularityPolicy>::
auto& raw = data_->vec();
auto winbeg = raw.begin() + frames_to_bytes(offset);
auto winend = winbeg + frames_to_bytes(window_size_);
auto byte = *winbeg;
static_assert(std::is_same_v<typename decltype(it->second)::value_type,
decltype(byte)>);
if (std::find_if(winbeg, winend, [byte = it->second](auto b) {
return b != byte;
}) == winend) {
// check if this is a known character for a repeating sequence
if (!it->second.contains(byte)) {
return false;
}
if (std::find_if(winbeg, winend, [byte](auto b) { return b != byte; }) ==
winend) {
return offsets_.any_value_is(hashval, [&, this](auto off) {
auto offbeg = raw.begin() + frames_to_bytes(off);
auto offend = offbeg + frames_to_bytes(window_size_);
if (std::find_if(offbeg, offend, [byte = it->second](auto b) {
return b != byte;
}) == offend) {
++repeating_collisions_[it->second];
if (std::find_if(offbeg, offend,
[byte](auto b) { return b != byte; }) == offend) {
++repeating_collisions_[byte];
return true;
}
@ -923,12 +936,11 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
<< ", lookups=" << stats_.bloom_lookups << ")";
}
if (stats_.total_matches > 0) {
LOG_VERBOSE << cfg_.context
<< "segmentation matches: good=" << stats_.good_matches
<< ", bad=" << stats_.bad_matches << ", collisions="
<< (stats_.total_matches -
(stats_.bad_matches + stats_.good_matches))
<< ", total=" << stats_.total_matches;
LOG_VERBOSE << fmt::format(
"{}segment matches: good={}, bad={}, collisions={}, total={}",
cfg_.context, stats_.good_matches, stats_.bad_matches,
(stats_.total_matches - (stats_.bad_matches + stats_.good_matches)),
stats_.total_matches);
}
if (stats_.total_hashes > 0) {
LOG_VERBOSE << cfg_.context << "segmentation collisions: L1="

View File

@ -24,6 +24,7 @@
#include <filesystem>
#include <iostream>
#include <random>
#include <regex>
#include <set>
#include <vector>
@ -2044,3 +2045,59 @@ TEST(mkdwarfs_test, filesystem_read_error) {
EXPECT_EQ(-EBADF, res.error());
}
}
class segmenter_repeating_sequence_test : public testing::TestWithParam<char> {
};
TEST_P(segmenter_repeating_sequence_test, github161) {
auto byte = GetParam();
static constexpr int const final_bytes{10'000'000};
static constexpr int const repetitions{2'000};
auto match = test::create_random_string(5'000);
auto suffix = test::create_random_string(50);
auto sequence = std::string(3'000, byte);
std::string content;
content.reserve(match.size() + suffix.size() +
(sequence.size() + match.size()) * repetitions + final_bytes);
content += match + suffix;
for (int i = 0; i < repetitions; ++i) {
content += sequence + match;
}
content += std::string(final_bytes, byte);
auto t = mkdwarfs_tester::create_empty();
t.add_root_dir();
t.os->add_file("/bug", content);
ASSERT_EQ(0, t.run("-i / -o - -C lz4 -W12 --log-level=verbose --no-progress"))
<< t.err();
auto log = t.err();
EXPECT_THAT(log,
::testing::ContainsRegex(test::fix_regex(fmt::format(
"avoided \\d\\d\\d\\d+ collisions in 0x{:02x}-byte sequences",
byte))));
std::regex const re{"segment matches: good=(\\d+), bad=(\\d+), "
"collisions=(\\d+), total=(\\d+)"};
std::smatch m;
ASSERT_TRUE(std::regex_search(log, m, re)) << log;
auto good = std::stoi(m[1]);
auto bad = std::stoi(m[2]);
auto collisions = std::stoi(m[3]);
auto total = std::stoi(m[4]);
EXPECT_GT(good, 2000);
EXPECT_EQ(0, bad);
EXPECT_EQ(0, collisions);
EXPECT_GT(total, 2000);
}
INSTANTIATE_TEST_SUITE_P(dwarfs, segmenter_repeating_sequence_test,
::testing::Values('\0', 'G', '\xff'));