fix(mkdwarfs): make --max-similarity-size work again + tests

This commit is contained in:
Marcus Holland-Moritz 2024-01-09 01:44:36 +01:00
parent 08a7887d8a
commit ae7de2486a
9 changed files with 263 additions and 41 deletions

View File

@ -23,6 +23,7 @@
#include <iosfwd> #include <iosfwd>
#include <memory> #include <memory>
#include <optional>
#include <vector> #include <vector>
#include <folly/small_vector.h> #include <folly/small_vector.h>
@ -54,8 +55,9 @@ class inode : public object {
virtual void set_num(uint32_t num) = 0; virtual void set_num(uint32_t num) = 0;
virtual uint32_t num() const = 0; virtual uint32_t num() const = 0;
virtual bool has_category(fragment_category cat) const = 0; virtual bool has_category(fragment_category cat) const = 0;
virtual uint32_t similarity_hash(fragment_category cat) const = 0; virtual std::optional<uint32_t>
virtual nilsimsa::hash_type const& similarity_hash(fragment_category cat) const = 0;
virtual nilsimsa::hash_type const*
nilsimsa_similarity_hash(fragment_category cat) const = 0; nilsimsa_similarity_hash(fragment_category cat) const = 0;
virtual size_t size() const = 0; virtual size_t size() const = 0;
virtual file const* any() const = 0; virtual file const* any() const = 0;

View File

@ -25,6 +25,7 @@
#include <iosfwd> #include <iosfwd>
#include <span> #include <span>
#include <string> #include <string>
#include <unordered_map>
#include <folly/small_vector.h> #include <folly/small_vector.h>
@ -99,6 +100,8 @@ class inode_fragments {
std::string std::string
to_string(mapper_function_type const& mapper = mapper_function_type()) const; to_string(mapper_function_type const& mapper = mapper_function_type()) const;
std::unordered_map<fragment_category, file_off_t> get_category_sizes() const;
private: private:
folly::small_vector<single_inode_fragment, 1> fragments_; folly::small_vector<single_inode_fragment, 1> fragments_;
}; };

View File

@ -31,11 +31,12 @@ class logger;
class progress; class progress;
class worker_group; class worker_group;
struct inode_options;
struct similarity_ordering_options; struct similarity_ordering_options;
class inode_ordering { class inode_ordering {
public: public:
inode_ordering(logger& lgr, progress& prog); inode_ordering(logger& lgr, progress& prog, inode_options const& opts);
void by_inode_number(sortable_inode_span& sp) const { void by_inode_number(sortable_inode_span& sp) const {
impl_->by_inode_number(sp); impl_->by_inode_number(sp);

View File

@ -101,12 +101,7 @@ struct file_order_options {
}; };
struct inode_options { struct inode_options {
// TODO: - clean this all up and name properly std::optional<size_t> max_similarity_scan_size;
// - the file_order thing should really be "fragment_order"
// - it should all belong into inode_options, where scanner
// can still access it
// - python scripts need to die
std::optional<size_t> max_similarity_scan_size; // TODO: not sure about this?
std::shared_ptr<categorizer_manager> categorizer_mgr; std::shared_ptr<categorizer_manager> categorizer_mgr;
categorized_option<file_order_options> fragment_order{file_order_options()}; categorized_option<file_order_options> fragment_order{file_order_options()};
}; };

View File

@ -36,7 +36,7 @@ inode_element_view::inode_element_view(
, cat_{cat} { , cat_{cat} {
hash_cache_.resize(inodes_.size()); hash_cache_.resize(inodes_.size());
for (auto i : index) { for (auto i : index) {
hash_cache_[i] = &inodes_[i]->nilsimsa_similarity_hash(cat); hash_cache_[i] = inodes_[i]->nilsimsa_similarity_hash(cat);
} }
} }
@ -84,6 +84,7 @@ std::string inode_element_view::description(size_t i) const {
} }
nilsimsa::hash_type const& inode_element_view::get_bits(size_t i) const { nilsimsa::hash_type const& inode_element_view::get_bits(size_t i) const {
assert(hash_cache_[i] != nullptr);
return *hash_cache_[i]; return *hash_cache_[i];
} }

View File

@ -90,4 +90,15 @@ inode_fragments::to_string(mapper_function_type const& mapper) const {
return oss.str(); return oss.str();
} }
std::unordered_map<fragment_category, file_off_t>
inode_fragments::get_category_sizes() const {
std::unordered_map<fragment_category, file_off_t> result;
for (auto const& f : span()) {
result[f.category()] += f.size();
}
return result;
}
} // namespace dwarfs } // namespace dwarfs

View File

@ -95,11 +95,15 @@ class inode_ : public inode {
fragments_, [cat](auto const& f) { return f.category() == cat; }); fragments_, [cat](auto const& f) { return f.category() == cat; });
} }
uint32_t similarity_hash(fragment_category cat) const override { std::optional<uint32_t>
return find_similarity<uint32_t>(cat); similarity_hash(fragment_category cat) const override {
if (auto sim = find_similarity<uint32_t>(cat)) {
return *sim;
}
return std::nullopt;
} }
nilsimsa::hash_type const& nilsimsa::hash_type const*
nilsimsa_similarity_hash(fragment_category cat) const override { nilsimsa_similarity_hash(fragment_category cat) const override {
return find_similarity<nilsimsa::hash_type>(cat); return find_similarity<nilsimsa::hash_type>(cat);
} }
@ -290,24 +294,26 @@ class inode_ : public inode {
} }
template <typename T> template <typename T>
T const& find_similarity(fragment_category cat) const { T const* find_similarity(fragment_category cat) const {
if (fragments_.empty()) [[unlikely]] { if (fragments_.empty()) [[unlikely]] {
DWARFS_THROW(runtime_error, fmt::format("inode has no fragments ({})", DWARFS_THROW(runtime_error, fmt::format("inode has no fragments ({})",
folly::demangle(typeid(T)))); folly::demangle(typeid(T))));
} }
if (std::holds_alternative<std::monostate>(similarity_)) {
return nullptr;
}
if (fragments_.size() == 1) { if (fragments_.size() == 1) {
if (fragments_.get_single_category() != cat) [[unlikely]] { if (fragments_.get_single_category() != cat) [[unlikely]] {
DWARFS_THROW(runtime_error, fmt::format("category mismatch ({})", DWARFS_THROW(runtime_error, fmt::format("category mismatch ({})",
folly::demangle(typeid(T)))); folly::demangle(typeid(T))));
} }
return std::get<T>(similarity_); return &std::get<T>(similarity_);
} }
auto& m = std::get<similarity_map_type>(similarity_); auto& m = std::get<similarity_map_type>(similarity_);
if (auto it = m.find(cat); it != m.end()) { if (auto it = m.find(cat); it != m.end()) {
return std::get<T>(it->second); return &std::get<T>(it->second);
} }
DWARFS_THROW(runtime_error, fmt::format("category not found ({})", return nullptr;
folly::demangle(typeid(T))));
} }
template <typename T> template <typename T>
@ -343,17 +349,22 @@ class inode_ : public inode {
std::unordered_map<fragment_category, similarity> sc; std::unordered_map<fragment_category, similarity> sc;
std::unordered_map<fragment_category, nilsimsa> nc; std::unordered_map<fragment_category, nilsimsa> nc;
for (auto const& f : fragments_.span()) { for (auto [cat, size] : fragments_.get_category_sizes()) {
switch (opts.fragment_order.get(f.category()).mode) { if (auto max = opts.max_similarity_scan_size;
max && static_cast<size_t>(size) > *max) {
continue;
}
switch (opts.fragment_order.get(cat).mode) {
case file_order_mode::NONE: case file_order_mode::NONE:
case file_order_mode::PATH: case file_order_mode::PATH:
case file_order_mode::REVPATH: case file_order_mode::REVPATH:
break; break;
case file_order_mode::SIMILARITY: case file_order_mode::SIMILARITY:
sc.try_emplace(f.category()); sc.try_emplace(cat);
break; break;
case file_order_mode::NILSIMSA: case file_order_mode::NILSIMSA:
nc.try_emplace(f.category()); nc.try_emplace(cat);
break; break;
} }
} }
@ -396,6 +407,12 @@ class inode_ : public inode {
size_t chunk_size) { size_t chunk_size) {
assert(fragments_.size() <= 1); assert(fragments_.size() <= 1);
if (mm) {
if (auto max = opts.max_similarity_scan_size; max && mm->size() > *max) {
return;
}
}
auto order_mode = auto order_mode =
fragments_.empty() fragments_.empty()
? opts.fragment_order.get().mode ? opts.fragment_order.get().mode
@ -477,7 +494,7 @@ class inode_manager_ final : public inode_manager::impl {
const override { const override {
auto span = sortable_span(); auto span = sortable_span();
span.all(); span.all();
inode_ordering(LOG_GET_LOGGER, prog_).by_inode_number(span); inode_ordering(LOG_GET_LOGGER, prog_, opts_).by_inode_number(span);
for (auto const& i : span) { for (auto const& i : span) {
fn(i); fn(i);
} }
@ -613,7 +630,7 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
auto span = sortable_span(); auto span = sortable_span();
span.select([cat](auto const& v) { return v->has_category(cat); }); span.select([cat](auto const& v) { return v->has_category(cat); });
inode_ordering order(LOG_GET_LOGGER, prog_); inode_ordering order(LOG_GET_LOGGER, prog_, opts_);
switch (opts.mode) { switch (opts.mode) {
case file_order_mode::NONE: case file_order_mode::NONE:

View File

@ -25,6 +25,7 @@
#include "dwarfs/inode_element_view.h" #include "dwarfs/inode_element_view.h"
#include "dwarfs/inode_ordering.h" #include "dwarfs/inode_ordering.h"
#include "dwarfs/logger.h" #include "dwarfs/logger.h"
#include "dwarfs/options.h"
#include "dwarfs/promise_receiver.h" #include "dwarfs/promise_receiver.h"
#include "dwarfs/similarity_ordering.h" #include "dwarfs/similarity_ordering.h"
#include "dwarfs/worker_group.h" #include "dwarfs/worker_group.h"
@ -33,12 +34,19 @@ namespace dwarfs {
namespace { namespace {
// Ordering predicate: an inode with a larger size() sorts before one with a
// smaller size(). Inodes of equal size are ordered by less_revpath() on the
// files returned by any().
bool inode_less_by_size(inode const* a, inode const* b) {
  auto const size_a = a->size();
  auto const size_b = b->size();

  if (size_a != size_b) {
    return size_a > size_b;
  }

  return a->any()->less_revpath(*b->any());
}
template <typename LoggerPolicy> template <typename LoggerPolicy>
class inode_ordering_ final : public inode_ordering::impl { class inode_ordering_ final : public inode_ordering::impl {
public: public:
inode_ordering_(logger& lgr, progress& prog) inode_ordering_(logger& lgr, progress& prog, inode_options const& opts)
: LOG_PROXY_INIT(lgr) : LOG_PROXY_INIT(lgr)
, prog_{prog} {} , prog_{prog}
, opts_{opts} {}
void by_inode_number(sortable_inode_span& sp) const override; void by_inode_number(sortable_inode_span& sp) const override;
void by_path(sortable_inode_span& sp) const override; void by_path(sortable_inode_span& sp) const override;
@ -50,8 +58,14 @@ class inode_ordering_ final : public inode_ordering::impl {
sortable_inode_span& sp, fragment_category cat) const override; sortable_inode_span& sp, fragment_category cat) const override;
private: private:
void
by_nilsimsa_impl(worker_group& wg, similarity_ordering_options const& opts,
std::span<std::shared_ptr<inode> const> inodes,
std::vector<uint32_t>& index, fragment_category cat) const;
LOG_PROXY_DECL(LoggerPolicy); LOG_PROXY_DECL(LoggerPolicy);
progress& prog_; progress& prog_;
inode_options const& opts_;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -93,20 +107,42 @@ void inode_ordering_<LoggerPolicy>::by_reverse_path(
template <typename LoggerPolicy> template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp, void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
fragment_category cat) const { fragment_category cat) const {
std::vector<uint32_t> hash_cache; std::vector<std::optional<uint32_t>> hash_cache;
auto raw = sp.raw(); auto raw = sp.raw();
auto& index = sp.index(); auto& index = sp.index();
bool any_missing = false;
hash_cache.resize(raw.size()); hash_cache.resize(raw.size());
for (auto i : index) { for (auto i : index) {
hash_cache[i] = raw[i]->similarity_hash(cat); auto& cache = hash_cache[i];
cache = raw[i]->similarity_hash(cat);
if (!cache.has_value()) {
any_missing = true;
}
} }
std::sort(index.begin(), index.end(), [&](auto a, auto b) { auto size_pred = [&](auto a, auto b) {
auto const ca = hash_cache[a]; return inode_less_by_size(raw[a].get(), raw[b].get());
auto const cb = hash_cache[b]; };
auto start = index.begin();
if (any_missing) {
start = std::stable_partition(index.begin(), index.end(), [&](auto i) {
return !hash_cache[i].has_value();
});
std::sort(index.begin(), start, size_pred);
}
std::sort(start, index.end(), [&](auto a, auto b) {
assert(hash_cache[a].has_value());
assert(hash_cache[b].has_value());
auto const ca = *hash_cache[a];
auto const cb = *hash_cache[b];
if (ca < cb) { if (ca < cb) {
return true; return true;
@ -116,11 +152,7 @@ void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
return false; return false;
} }
auto ia = raw[a].get(); return size_pred(a, b);
auto ib = raw[b].get();
return ia->size() > ib->size() ||
(ia->size() == ib->size() && ia->any()->less_revpath(*ib->any()));
}); });
} }
@ -128,19 +160,51 @@ template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_nilsimsa( void inode_ordering_<LoggerPolicy>::by_nilsimsa(
worker_group& wg, similarity_ordering_options const& opts, worker_group& wg, similarity_ordering_options const& opts,
sortable_inode_span& sp, fragment_category cat) const { sortable_inode_span& sp, fragment_category cat) const {
auto ev = inode_element_view(sp.raw(), sp.index(), cat); auto raw = sp.raw();
auto& index = sp.index();
if (opts_.max_similarity_scan_size) {
auto mid = std::stable_partition(index.begin(), index.end(), [&](auto i) {
return !raw[i]->nilsimsa_similarity_hash(cat);
});
if (mid != index.begin()) {
std::sort(index.begin(), mid, [&](auto a, auto b) {
return inode_less_by_size(raw[a].get(), raw[b].get());
});
if (mid != index.end()) {
std::vector<uint32_t> small_index(mid, index.end());
by_nilsimsa_impl(wg, opts, raw, small_index, cat);
std::copy(small_index.begin(), small_index.end(), mid);
}
return;
}
}
by_nilsimsa_impl(wg, opts, raw, index, cat);
}
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_nilsimsa_impl(
worker_group& wg, similarity_ordering_options const& opts,
std::span<std::shared_ptr<inode> const> inodes,
std::vector<uint32_t>& index, fragment_category cat) const {
auto ev = inode_element_view(inodes, index, cat);
std::promise<std::vector<uint32_t>> promise; std::promise<std::vector<uint32_t>> promise;
auto future = promise.get_future(); auto future = promise.get_future();
auto sim_order = similarity_ordering(LOG_GET_LOGGER, prog_, wg, opts); auto sim_order = similarity_ordering(LOG_GET_LOGGER, prog_, wg, opts);
sim_order.order_nilsimsa(ev, make_receiver(std::move(promise)), sim_order.order_nilsimsa(ev, make_receiver(std::move(promise)),
std::move(sp.index())); std::move(index));
future.get().swap(sp.index()); future.get().swap(index);
} }
} // namespace } // namespace
inode_ordering::inode_ordering(logger& lgr, progress& prog) inode_ordering::inode_ordering(logger& lgr, progress& prog,
inode_options const& opts)
: impl_(make_unique_logging_object<impl, inode_ordering_, logger_policies>( : impl_(make_unique_logging_object<impl, inode_ordering_, logger_policies>(
lgr, prog)) {} lgr, prog, opts)) {}
} // namespace dwarfs } // namespace dwarfs

View File

@ -1268,3 +1268,131 @@ TEST(dwarfsck_test, export_metadata_close_error) {
EXPECT_THAT(t.err(), EXPECT_THAT(t.err(),
::testing::HasSubstr("failed to close metadata output file")); ::testing::HasSubstr("failed to close metadata output file"));
} }
// Value-parameterized test fixture (parameter type: char const*); it adds no
// members or set-up of its own.
class mkdwarfs_sim_order_test : public testing::TestWithParam<char const*> {};
// Checks how --max-similarity-size influences the order in which file data
// is written, for both --order=similarity and --order=nilsimsa.
//
// Eight files of distinct sizes are packed repeatedly and the order of their
// (single) data chunks by offset is compared against the expectation:
//   * a limit of 0 must yield the same order as not passing the option;
//   * for --order=similarity and a non-zero limit, all files larger than the
//     limit come first in descending size order, followed by the remaining
//     files in the order produced without a limit;
//   * for --order=nilsimsa and a non-zero limit, only the leading files
//     (those larger than the limit, in descending size order) are checked.
TEST(mkdwarfs_test, max_similarity_size) {
  static constexpr std::array sizes{50, 100, 200, 500, 1000, 2000, 5000, 10000};

  // Creates a tester whose input tree contains one file per entry in `sizes`.
  // The RNG is constructed with the same fixed seed on every call.
  auto make_tester = [] {
    std::mt19937_64 rng{42};
    auto t = mkdwarfs_tester::create_empty();
    t.add_root_dir();
    for (auto size : sizes) {
      auto data = test::create_random_string(size, rng);
      t.os->add_file("/file" + std::to_string(size), data);
    }
    return t;
  };

  // Returns the chunk sizes of all test files, sorted by chunk offset.
  auto get_sizes_in_offset_order = [](filesystem_v2 const& fs) {
    std::vector<std::pair<size_t, size_t>> tmp; // (offset, size)

    for (auto size : sizes) {
      auto path = "/file" + std::to_string(size);
      auto iv = fs.find(path.c_str());

      // Use test expectations rather than assert(): assert() vanishes when
      // NDEBUG is defined, and `*iv` must never be evaluated for a file that
      // was not found.
      bool const found = static_cast<bool>(iv);
      EXPECT_TRUE(found) << path;
      if (!found) {
        continue;
      }

      auto info = fs.get_inode_info(*iv);
      auto const num_chunks = info["chunks"].size();
      EXPECT_EQ(1u, num_chunks) << path;
      if (num_chunks != 1) {
        continue;
      }

      auto const& chunk = info["chunks"][0];
      tmp.emplace_back(chunk["offset"].asInt(), chunk["size"].asInt());
    }

    std::sort(tmp.begin(), tmp.end(),
              [](auto const& a, auto const& b) { return a.first < b.first; });

    // Named differently from the static `sizes` array to avoid shadowing it.
    std::vector<size_t> ordered;
    ordered.reserve(tmp.size());
    std::transform(tmp.begin(), tmp.end(), std::back_inserter(ordered),
                   [](auto const& p) { return p.second; });
    return ordered;
  };

  // Moves all entries larger than `max_size` to the front (sorted by
  // descending size) while keeping the relative order of the rest.
  auto partitioned_sizes = [](std::vector<size_t> in, size_t max_size) {
    auto mid = std::stable_partition(
        in.begin(), in.end(), [=](auto size) { return size > max_size; });
    std::sort(in.begin(), mid, std::greater<size_t>());
    return in;
  };

  std::vector<size_t> sim_ordered_sizes;
  std::vector<size_t> nilsimsa_ordered_sizes;

  // Reference orders without any size limit.
  {
    auto t = make_tester();
    EXPECT_EQ(0, t.run("-i / -o - -l0 --order=similarity")) << t.err();
    auto fs = t.fs_from_stdout();
    sim_ordered_sizes = get_sizes_in_offset_order(fs);
  }

  {
    auto t = make_tester();
    EXPECT_EQ(0, t.run("-i / -o - -l0 --order=nilsimsa")) << t.err();
    auto fs = t.fs_from_stdout();
    nilsimsa_ordered_sizes = get_sizes_in_offset_order(fs);
  }

  // If the reference order were already sorted by size, the partitioning
  // checks below could not tell the orderings apart.
  EXPECT_FALSE(
      std::is_sorted(sim_ordered_sizes.begin(), sim_ordered_sizes.end()));

  static constexpr std::array max_sim_sizes{0,    1,    200,  999,
                                            1000, 1001, 5000, 10000};

  // Distinct nilsimsa orderings observed across all limits.
  std::set<std::string> nilsimsa_results;

  for (auto max_sim_size : max_sim_sizes) {
    {
      auto t = make_tester();
      EXPECT_EQ(0,
                t.run(fmt::format(
                    "-i / -o - -l0 --order=similarity --max-similarity-size={}",
                    max_sim_size)))
          << t.err();

      auto fs = t.fs_from_stdout();
      auto ordered_sizes = get_sizes_in_offset_order(fs);

      if (max_sim_size == 0) {
        EXPECT_EQ(sim_ordered_sizes, ordered_sizes) << max_sim_size;
      } else {
        auto partitioned = partitioned_sizes(sim_ordered_sizes, max_sim_size);
        EXPECT_EQ(partitioned, ordered_sizes) << max_sim_size;
      }
    }

    {
      auto t = make_tester();
      EXPECT_EQ(0,
                t.run(fmt::format(
                    "-i / -o - -l0 --order=nilsimsa --max-similarity-size={}",
                    max_sim_size)))
          << t.err();

      auto fs = t.fs_from_stdout();
      auto ordered_sizes = get_sizes_in_offset_order(fs);

      nilsimsa_results.insert(folly::join(",", ordered_sizes));

      if (max_sim_size == 0) {
        EXPECT_EQ(nilsimsa_ordered_sizes, ordered_sizes) << max_sim_size;
      } else {
        // Only the leading part of the order is checked here: the files
        // larger than the limit, in descending size order.
        std::vector<size_t> expected;
        std::copy_if(sizes.begin(), sizes.end(), std::back_inserter(expected),
                     [=](auto size) { return size > max_sim_size; });
        std::sort(expected.begin(), expected.end(), std::greater<size_t>());
        ordered_sizes.resize(expected.size());
        EXPECT_EQ(expected, ordered_sizes) << max_sim_size;
      }
    }
  }

  // The limit must actually change the nilsimsa order for some values.
  EXPECT_GE(nilsimsa_results.size(), 3u);
}