mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-17 08:18:07 -04:00
fix(segmenter): repeating sequence detection was broken + test
This never properly worked for bytes >= 0x10 due to hash collisions in the map that was used to verify the repeating sequence.
This commit is contained in:
parent
c322708a7b
commit
3be0715c1f
@ -38,6 +38,7 @@
|
|||||||
|
|
||||||
#include <folly/hash/Hash.h>
|
#include <folly/hash/Hash.h>
|
||||||
#include <folly/small_vector.h>
|
#include <folly/small_vector.h>
|
||||||
|
#include <folly/sorted_vector_types.h>
|
||||||
#include <folly/stats/Histogram.h>
|
#include <folly/stats/Histogram.h>
|
||||||
|
|
||||||
#include "dwarfs/block_data.h"
|
#include "dwarfs/block_data.h"
|
||||||
@ -152,7 +153,13 @@ class fast_multimap {
|
|||||||
collision_t collisions_;
|
collision_t collisions_;
|
||||||
};
|
};
|
||||||
|
|
||||||
using repeating_sequence_map_type = phmap::flat_hash_map<uint32_t, uint8_t>;
|
template <typename T, size_t MaxInline = 1>
|
||||||
|
using small_sorted_vector_set =
|
||||||
|
folly::sorted_vector_set<T, std::less<T>, std::allocator<T>, void,
|
||||||
|
folly::small_vector<T, MaxInline>>;
|
||||||
|
|
||||||
|
using repeating_sequence_map_type =
|
||||||
|
phmap::flat_hash_map<uint32_t, small_sorted_vector_set<uint8_t, 8>>;
|
||||||
using repeating_collisions_map_type = std::unordered_map<uint8_t, uint32_t>;
|
using repeating_collisions_map_type = std::unordered_map<uint8_t, uint32_t>;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -660,11 +667,11 @@ class segmenter_ final : public segmenter::impl, private SegmentingPolicy {
|
|||||||
LOG_VERBOSE << cfg_.context << "bloom filter size: "
|
LOG_VERBOSE << cfg_.context << "bloom filter size: "
|
||||||
<< size_with_unit(global_filter_.size() / 8);
|
<< size_with_unit(global_filter_.size() / 8);
|
||||||
|
|
||||||
repeating_sequence_hash_values_.reserve(256);
|
|
||||||
|
|
||||||
for (int i = 0; i < 256; ++i) {
|
for (int i = 0; i < 256; ++i) {
|
||||||
repeating_sequence_hash_values_.emplace(
|
auto val =
|
||||||
rsync_hash::repeating_window(i, frames_to_bytes(window_size_)), i);
|
rsync_hash::repeating_window(i, frames_to_bytes(window_size_));
|
||||||
|
DWARFS_CHECK(repeating_sequence_hash_values_[val].emplace(i).second,
|
||||||
|
"repeating sequence hash value / byte collision");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -785,18 +792,24 @@ bool active_block<LoggerPolicy, GranularityPolicy>::
|
|||||||
auto& raw = data_->vec();
|
auto& raw = data_->vec();
|
||||||
auto winbeg = raw.begin() + frames_to_bytes(offset);
|
auto winbeg = raw.begin() + frames_to_bytes(offset);
|
||||||
auto winend = winbeg + frames_to_bytes(window_size_);
|
auto winend = winbeg + frames_to_bytes(window_size_);
|
||||||
|
auto byte = *winbeg;
|
||||||
|
static_assert(std::is_same_v<typename decltype(it->second)::value_type,
|
||||||
|
decltype(byte)>);
|
||||||
|
|
||||||
if (std::find_if(winbeg, winend, [byte = it->second](auto b) {
|
// check if this is a known character for a repeating sequence
|
||||||
return b != byte;
|
if (!it->second.contains(byte)) {
|
||||||
}) == winend) {
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (std::find_if(winbeg, winend, [byte](auto b) { return b != byte; }) ==
|
||||||
|
winend) {
|
||||||
return offsets_.any_value_is(hashval, [&, this](auto off) {
|
return offsets_.any_value_is(hashval, [&, this](auto off) {
|
||||||
auto offbeg = raw.begin() + frames_to_bytes(off);
|
auto offbeg = raw.begin() + frames_to_bytes(off);
|
||||||
auto offend = offbeg + frames_to_bytes(window_size_);
|
auto offend = offbeg + frames_to_bytes(window_size_);
|
||||||
|
|
||||||
if (std::find_if(offbeg, offend, [byte = it->second](auto b) {
|
if (std::find_if(offbeg, offend,
|
||||||
return b != byte;
|
[byte](auto b) { return b != byte; }) == offend) {
|
||||||
}) == offend) {
|
++repeating_collisions_[byte];
|
||||||
++repeating_collisions_[it->second];
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -923,12 +936,11 @@ void segmenter_<LoggerPolicy, SegmentingPolicy>::finish() {
|
|||||||
<< ", lookups=" << stats_.bloom_lookups << ")";
|
<< ", lookups=" << stats_.bloom_lookups << ")";
|
||||||
}
|
}
|
||||||
if (stats_.total_matches > 0) {
|
if (stats_.total_matches > 0) {
|
||||||
LOG_VERBOSE << cfg_.context
|
LOG_VERBOSE << fmt::format(
|
||||||
<< "segmentation matches: good=" << stats_.good_matches
|
"{}segment matches: good={}, bad={}, collisions={}, total={}",
|
||||||
<< ", bad=" << stats_.bad_matches << ", collisions="
|
cfg_.context, stats_.good_matches, stats_.bad_matches,
|
||||||
<< (stats_.total_matches -
|
(stats_.total_matches - (stats_.bad_matches + stats_.good_matches)),
|
||||||
(stats_.bad_matches + stats_.good_matches))
|
stats_.total_matches);
|
||||||
<< ", total=" << stats_.total_matches;
|
|
||||||
}
|
}
|
||||||
if (stats_.total_hashes > 0) {
|
if (stats_.total_hashes > 0) {
|
||||||
LOG_VERBOSE << cfg_.context << "segmentation collisions: L1="
|
LOG_VERBOSE << cfg_.context << "segmentation collisions: L1="
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <random>
|
#include <random>
|
||||||
|
#include <regex>
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -2044,3 +2045,59 @@ TEST(mkdwarfs_test, filesystem_read_error) {
|
|||||||
EXPECT_EQ(-EBADF, res.error());
|
EXPECT_EQ(-EBADF, res.error());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class segmenter_repeating_sequence_test : public testing::TestWithParam<char> {
|
||||||
|
};
|
||||||
|
|
||||||
|
TEST_P(segmenter_repeating_sequence_test, github161) {
|
||||||
|
auto byte = GetParam();
|
||||||
|
|
||||||
|
static constexpr int const final_bytes{10'000'000};
|
||||||
|
static constexpr int const repetitions{2'000};
|
||||||
|
auto match = test::create_random_string(5'000);
|
||||||
|
auto suffix = test::create_random_string(50);
|
||||||
|
auto sequence = std::string(3'000, byte);
|
||||||
|
|
||||||
|
std::string content;
|
||||||
|
content.reserve(match.size() + suffix.size() +
|
||||||
|
(sequence.size() + match.size()) * repetitions + final_bytes);
|
||||||
|
|
||||||
|
content += match + suffix;
|
||||||
|
for (int i = 0; i < repetitions; ++i) {
|
||||||
|
content += sequence + match;
|
||||||
|
}
|
||||||
|
content += std::string(final_bytes, byte);
|
||||||
|
|
||||||
|
auto t = mkdwarfs_tester::create_empty();
|
||||||
|
t.add_root_dir();
|
||||||
|
t.os->add_file("/bug", content);
|
||||||
|
|
||||||
|
ASSERT_EQ(0, t.run("-i / -o - -C lz4 -W12 --log-level=verbose --no-progress"))
|
||||||
|
<< t.err();
|
||||||
|
|
||||||
|
auto log = t.err();
|
||||||
|
|
||||||
|
EXPECT_THAT(log,
|
||||||
|
::testing::ContainsRegex(test::fix_regex(fmt::format(
|
||||||
|
"avoided \\d\\d\\d\\d+ collisions in 0x{:02x}-byte sequences",
|
||||||
|
byte))));
|
||||||
|
|
||||||
|
std::regex const re{"segment matches: good=(\\d+), bad=(\\d+), "
|
||||||
|
"collisions=(\\d+), total=(\\d+)"};
|
||||||
|
std::smatch m;
|
||||||
|
|
||||||
|
ASSERT_TRUE(std::regex_search(log, m, re)) << log;
|
||||||
|
|
||||||
|
auto good = std::stoi(m[1]);
|
||||||
|
auto bad = std::stoi(m[2]);
|
||||||
|
auto collisions = std::stoi(m[3]);
|
||||||
|
auto total = std::stoi(m[4]);
|
||||||
|
|
||||||
|
EXPECT_GT(good, 2000);
|
||||||
|
EXPECT_EQ(0, bad);
|
||||||
|
EXPECT_EQ(0, collisions);
|
||||||
|
EXPECT_GT(total, 2000);
|
||||||
|
}
|
||||||
|
|
||||||
|
INSTANTIATE_TEST_SUITE_P(dwarfs, segmenter_repeating_sequence_test,
|
||||||
|
::testing::Values('\0', 'G', '\xff'));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user