mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-16 15:58:06 -04:00
fix(segmenter): repeating sequence detection was broken + test
This never properly worked for bytes >= 0x10 due to hash collisions in the map that was used to verify the repeating sequence.
This commit is contained in:
parent
c322708a7b
commit
3be0715c1f
@ -38,6 +38,7 @@
|
||||
|
||||
#include <folly/hash/Hash.h>
|
||||
#include <folly/small_vector.h>
|
||||
#include <folly/sorted_vector_types.h>
|
||||
#include <folly/stats/Histogram.h>
|
||||
|
||||
#include "dwarfs/block_data.h"
|
||||
@ -152,7 +153,13 @@ class fast_multimap {
|
||||
collision_t collisions_;
|
||||
};
|
||||
|
||||
using repeating_sequence_map_type = phmap::flat_hash_map<uint32_t, uint8_t>;
|
||||
template <typename T, size_t MaxInline = 1>
|
||||
using small_sorted_vector_set =
|
||||
folly::sorted_vector_set<T, std::less<T>, std::allocator<T>, void,
|
||||
folly::small_vector<T, MaxInline>>;
|
||||
|
||||
using repeating_sequence_map_type =
|
||||
phmap::flat_hash_map<uint32_t, small_sorted_vector_set<uint8_t, 8>>;
|
||||
using repeating_collisions_map_type = std::unordered_map<uint8_t, uint32_t>;
|
||||
|
||||
/**
|
||||
@ -660,11 +667,11 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
|
||||
LOG_VERBOSE << cfg_.context << "bloom filter size: "
|
||||
<< size_with_unit(global_filter_.size() / 8);
|
||||
|
||||
repeating_sequence_hash_values_.reserve(256);
|
||||
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
repeating_sequence_hash_values_.emplace(
|
||||
rsync_hash::repeating_window(i, frames_to_bytes(window_size_)), i);
|
||||
auto val =
|
||||
rsync_hash::repeating_window(i, frames_to_bytes(window_size_));
|
||||
DWARFS_CHECK(repeating_sequence_hash_values_[val].emplace(i).second,
|
||||
"repeating sequence hash value / byte collision");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -785,18 +792,24 @@ bool active_block<LoggerPolicy, GranularityPolicy>::
|
||||
auto& raw = data_->vec();
|
||||
auto winbeg = raw.begin() + frames_to_bytes(offset);
|
||||
auto winend = winbeg + frames_to_bytes(window_size_);
|
||||
auto byte = *winbeg;
|
||||
static_assert(std::is_same_v<typename decltype(it->second)::value_type,
|
||||
decltype(byte)>);
|
||||
|
||||
if (std::find_if(winbeg, winend, [byte = it->second](auto b) {
|
||||
return b != byte;
|
||||
}) == winend) {
|
||||
// check if this is a known character for a repeating sequence
|
||||
if (!it->second.contains(byte)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (std::find_if(winbeg, winend, [byte](auto b) { return b != byte; }) ==
|
||||
winend) {
|
||||
return offsets_.any_value_is(hashval, [&, this](auto off) {
|
||||
auto offbeg = raw.begin() + frames_to_bytes(off);
|
||||
auto offend = offbeg + frames_to_bytes(window_size_);
|
||||
|
||||
if (std::find_if(offbeg, offend, [byte = it->second](auto b) {
|
||||
return b != byte;
|
||||
}) == offend) {
|
||||
++repeating_collisions_[it->second];
|
||||
if (std::find_if(offbeg, offend,
|
||||
[byte](auto b) { return b != byte; }) == offend) {
|
||||
++repeating_collisions_[byte];
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -923,12 +936,11 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
|
||||
<< ", lookups=" << stats_.bloom_lookups << ")";
|
||||
}
|
||||
if (stats_.total_matches > 0) {
|
||||
LOG_VERBOSE << cfg_.context
|
||||
<< "segmentation matches: good=" << stats_.good_matches
|
||||
<< ", bad=" << stats_.bad_matches << ", collisions="
|
||||
<< (stats_.total_matches -
|
||||
(stats_.bad_matches + stats_.good_matches))
|
||||
<< ", total=" << stats_.total_matches;
|
||||
LOG_VERBOSE << fmt::format(
|
||||
"{}segment matches: good={}, bad={}, collisions={}, total={}",
|
||||
cfg_.context, stats_.good_matches, stats_.bad_matches,
|
||||
(stats_.total_matches - (stats_.bad_matches + stats_.good_matches)),
|
||||
stats_.total_matches);
|
||||
}
|
||||
if (stats_.total_hashes > 0) {
|
||||
LOG_VERBOSE << cfg_.context << "segmentation collisions: L1="
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include <filesystem>
|
||||
#include <iostream>
|
||||
#include <random>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
@ -2044,3 +2045,59 @@ TEST(mkdwarfs_test, filesystem_read_error) {
|
||||
EXPECT_EQ(-EBADF, res.error());
|
||||
}
|
||||
}
|
||||
|
||||
class segmenter_repeating_sequence_test : public testing::TestWithParam<char> {
|
||||
};
|
||||
|
||||
TEST_P(segmenter_repeating_sequence_test, github161) {
|
||||
auto byte = GetParam();
|
||||
|
||||
static constexpr int const final_bytes{10'000'000};
|
||||
static constexpr int const repetitions{2'000};
|
||||
auto match = test::create_random_string(5'000);
|
||||
auto suffix = test::create_random_string(50);
|
||||
auto sequence = std::string(3'000, byte);
|
||||
|
||||
std::string content;
|
||||
content.reserve(match.size() + suffix.size() +
|
||||
(sequence.size() + match.size()) * repetitions + final_bytes);
|
||||
|
||||
content += match + suffix;
|
||||
for (int i = 0; i < repetitions; ++i) {
|
||||
content += sequence + match;
|
||||
}
|
||||
content += std::string(final_bytes, byte);
|
||||
|
||||
auto t = mkdwarfs_tester::create_empty();
|
||||
t.add_root_dir();
|
||||
t.os->add_file("/bug", content);
|
||||
|
||||
ASSERT_EQ(0, t.run("-i / -o - -C lz4 -W12 --log-level=verbose --no-progress"))
|
||||
<< t.err();
|
||||
|
||||
auto log = t.err();
|
||||
|
||||
EXPECT_THAT(log,
|
||||
::testing::ContainsRegex(test::fix_regex(fmt::format(
|
||||
"avoided \\d\\d\\d\\d+ collisions in 0x{:02x}-byte sequences",
|
||||
byte))));
|
||||
|
||||
std::regex const re{"segment matches: good=(\\d+), bad=(\\d+), "
|
||||
"collisions=(\\d+), total=(\\d+)"};
|
||||
std::smatch m;
|
||||
|
||||
ASSERT_TRUE(std::regex_search(log, m, re)) << log;
|
||||
|
||||
auto good = std::stoi(m[1]);
|
||||
auto bad = std::stoi(m[2]);
|
||||
auto collisions = std::stoi(m[3]);
|
||||
auto total = std::stoi(m[4]);
|
||||
|
||||
EXPECT_GT(good, 2000);
|
||||
EXPECT_EQ(0, bad);
|
||||
EXPECT_EQ(0, collisions);
|
||||
EXPECT_GT(total, 2000);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(dwarfs, segmenter_repeating_sequence_test,
|
||||
::testing::Values('\0', 'G', '\xff'));
|
||||
|
Loading…
x
Reference in New Issue
Block a user