From ae7de2486a9b428bf1c8e07b35b681edfb39e1c3 Mon Sep 17 00:00:00 2001 From: Marcus Holland-Moritz Date: Tue, 9 Jan 2024 01:44:36 +0100 Subject: [PATCH] fix(mkdwarfs): make --max-similarity-size work again + tests --- include/dwarfs/inode.h | 6 +- include/dwarfs/inode_fragments.h | 3 + include/dwarfs/inode_ordering.h | 3 +- include/dwarfs/options.h | 7 +- src/dwarfs/inode_element_view.cpp | 3 +- src/dwarfs/inode_fragments.cpp | 11 +++ src/dwarfs/inode_manager.cpp | 45 +++++++---- src/dwarfs/inode_ordering.cpp | 98 +++++++++++++++++++---- test/tool_main_test.cpp | 128 ++++++++++++++++++++++++++++++ 9 files changed, 263 insertions(+), 41 deletions(-) diff --git a/include/dwarfs/inode.h b/include/dwarfs/inode.h index 5bfd76b4..df3fad9a 100644 --- a/include/dwarfs/inode.h +++ b/include/dwarfs/inode.h @@ -23,6 +23,7 @@ #include #include +#include #include #include @@ -54,8 +55,9 @@ class inode : public object { virtual void set_num(uint32_t num) = 0; virtual uint32_t num() const = 0; virtual bool has_category(fragment_category cat) const = 0; - virtual uint32_t similarity_hash(fragment_category cat) const = 0; - virtual nilsimsa::hash_type const& + virtual std::optional + similarity_hash(fragment_category cat) const = 0; + virtual nilsimsa::hash_type const* nilsimsa_similarity_hash(fragment_category cat) const = 0; virtual size_t size() const = 0; virtual file const* any() const = 0; diff --git a/include/dwarfs/inode_fragments.h b/include/dwarfs/inode_fragments.h index 3cf9bb21..583a3ad8 100644 --- a/include/dwarfs/inode_fragments.h +++ b/include/dwarfs/inode_fragments.h @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -99,6 +100,8 @@ class inode_fragments { std::string to_string(mapper_function_type const& mapper = mapper_function_type()) const; + std::unordered_map get_category_sizes() const; + private: folly::small_vector fragments_; }; diff --git a/include/dwarfs/inode_ordering.h b/include/dwarfs/inode_ordering.h index c557efbe..87767d4d 100644 --- a/include/dwarfs/inode_ordering.h +++ b/include/dwarfs/inode_ordering.h @@ -31,11 +31,12 @@ class logger; class progress; class worker_group; +struct inode_options; struct similarity_ordering_options; class inode_ordering { public: - inode_ordering(logger& lgr, progress& prog); + inode_ordering(logger& lgr, progress& prog, inode_options const& opts); void by_inode_number(sortable_inode_span& sp) const { impl_->by_inode_number(sp); diff --git a/include/dwarfs/options.h b/include/dwarfs/options.h index c9722254..ba775496 100644 --- a/include/dwarfs/options.h +++ b/include/dwarfs/options.h @@ -101,12 +101,7 @@ struct file_order_options { }; struct inode_options { - // TODO: - clean this all up and name properly - // - the file_order thing should really be "fragment_order" - // - it should all belong into inode_options, where scanner - // can still access it - // - python scripts need to die - std::optional max_similarity_scan_size; // TODO: not sure about this? + std::optional max_similarity_scan_size; std::shared_ptr categorizer_mgr; categorized_option fragment_order{file_order_options()}; }; diff --git a/src/dwarfs/inode_element_view.cpp b/src/dwarfs/inode_element_view.cpp index ff4583d8..768758bc 100644 --- a/src/dwarfs/inode_element_view.cpp +++ b/src/dwarfs/inode_element_view.cpp @@ -36,7 +36,7 @@ inode_element_view::inode_element_view( , cat_{cat} { hash_cache_.resize(inodes_.size()); for (auto i : index) { - hash_cache_[i] = &inodes_[i]->nilsimsa_similarity_hash(cat); + hash_cache_[i] = inodes_[i]->nilsimsa_similarity_hash(cat); } } @@ -84,6 +84,7 @@ std::string inode_element_view::description(size_t i) const { } nilsimsa::hash_type const& inode_element_view::get_bits(size_t i) const { + assert(hash_cache_[i] != nullptr); return *hash_cache_[i]; } diff --git a/src/dwarfs/inode_fragments.cpp b/src/dwarfs/inode_fragments.cpp index 1c6ddeb3..ebf04e12 100644 --- a/src/dwarfs/inode_fragments.cpp +++ b/src/dwarfs/inode_fragments.cpp @@ -90,4 +90,15 @@ inode_fragments::to_string(mapper_function_type const& mapper) const { return oss.str(); } +std::unordered_map +inode_fragments::get_category_sizes() const { + std::unordered_map result; + + for (auto const& f : span()) { + result[f.category()] += f.size(); + } + + return result; +} + } // namespace dwarfs diff --git a/src/dwarfs/inode_manager.cpp b/src/dwarfs/inode_manager.cpp index fbfe5ece..5496cc38 100644 --- a/src/dwarfs/inode_manager.cpp +++ b/src/dwarfs/inode_manager.cpp @@ -95,11 +95,15 @@ class inode_ : public inode { fragments_, [cat](auto const& f) { return f.category() == cat; }); } - uint32_t similarity_hash(fragment_category cat) const override { - return find_similarity(cat); + std::optional + similarity_hash(fragment_category cat) const override { + if (auto sim = find_similarity(cat)) { + return *sim; + } + return std::nullopt; } - nilsimsa::hash_type const& + nilsimsa::hash_type const* nilsimsa_similarity_hash(fragment_category cat) const override { return find_similarity(cat); } @@ -290,24 +294,26 @@ class inode_ : public inode { } template - T const& find_similarity(fragment_category cat) const { + T const* find_similarity(fragment_category cat) const { if (fragments_.empty()) [[unlikely]] { DWARFS_THROW(runtime_error, fmt::format("inode has no fragments ({})", folly::demangle(typeid(T)))); } + if (std::holds_alternative(similarity_)) { + return nullptr; + } if (fragments_.size() == 1) { if (fragments_.get_single_category() != cat) [[unlikely]] { DWARFS_THROW(runtime_error, fmt::format("category mismatch ({})", folly::demangle(typeid(T)))); } - return std::get(similarity_); + return &std::get(similarity_); } auto& m = std::get(similarity_); if (auto it = m.find(cat); it != m.end()) { - return std::get(it->second); + return &std::get(it->second); } - DWARFS_THROW(runtime_error, fmt::format("category not found ({})", - folly::demangle(typeid(T)))); + return nullptr; } template @@ -343,17 +349,22 @@ class inode_ : public inode { std::unordered_map sc; std::unordered_map nc; - for (auto const& f : fragments_.span()) { - switch (opts.fragment_order.get(f.category()).mode) { + for (auto [cat, size] : fragments_.get_category_sizes()) { + if (auto max = opts.max_similarity_scan_size; + max && static_cast(size) > *max) { + continue; + } + + switch (opts.fragment_order.get(cat).mode) { case file_order_mode::NONE: case file_order_mode::PATH: case file_order_mode::REVPATH: break; case file_order_mode::SIMILARITY: - sc.try_emplace(f.category()); + sc.try_emplace(cat); break; case file_order_mode::NILSIMSA: - nc.try_emplace(f.category()); + nc.try_emplace(cat); break; } } @@ -396,6 +407,12 @@ class inode_ : public inode { size_t chunk_size) { assert(fragments_.size() <= 1); + if (mm) { + if (auto max = opts.max_similarity_scan_size; max && mm->size() > *max) { + return; + } + } + auto order_mode = fragments_.empty() ? opts.fragment_order.get().mode @@ -477,7 +494,7 @@ class inode_manager_ final : public inode_manager::impl { const override { auto span = sortable_span(); span.all(); - inode_ordering(LOG_GET_LOGGER, prog_).by_inode_number(span); + inode_ordering(LOG_GET_LOGGER, prog_, opts_).by_inode_number(span); for (auto const& i : span) { fn(i); } @@ -613,7 +630,7 @@ auto inode_manager_::ordered_span(fragment_category cat, auto span = sortable_span(); span.select([cat](auto const& v) { return v->has_category(cat); }); - inode_ordering order(LOG_GET_LOGGER, prog_); + inode_ordering order(LOG_GET_LOGGER, prog_, opts_); switch (opts.mode) { case file_order_mode::NONE: diff --git a/src/dwarfs/inode_ordering.cpp b/src/dwarfs/inode_ordering.cpp index bd2c3b9e..ad37c8d9 100644 --- a/src/dwarfs/inode_ordering.cpp +++ b/src/dwarfs/inode_ordering.cpp @@ -25,6 +25,7 @@ #include "dwarfs/inode_element_view.h" #include "dwarfs/inode_ordering.h" #include "dwarfs/logger.h" +#include "dwarfs/options.h" #include "dwarfs/promise_receiver.h" #include "dwarfs/similarity_ordering.h" #include "dwarfs/worker_group.h" @@ -33,12 +34,19 @@ namespace dwarfs { namespace { +bool inode_less_by_size(inode const* a, inode const* b) { + auto sa = a->size(); + auto sb = b->size(); + return sa > sb || (sa == sb && a->any()->less_revpath(*b->any())); +} + template class inode_ordering_ final : public inode_ordering::impl { public: - inode_ordering_(logger& lgr, progress& prog) + inode_ordering_(logger& lgr, progress& prog, inode_options const& opts) : LOG_PROXY_INIT(lgr) - , prog_{prog} {} + , prog_{prog} + , opts_{opts} {} void by_inode_number(sortable_inode_span& sp) const override; void by_path(sortable_inode_span& sp) const override; @@ -50,8 +58,14 @@ class inode_ordering_ final : public inode_ordering::impl { sortable_inode_span& sp, fragment_category cat) const override; private: + void + by_nilsimsa_impl(worker_group& wg, similarity_ordering_options const& opts, + std::span const> inodes, + std::vector& index, fragment_category cat) const; + LOG_PROXY_DECL(LoggerPolicy); progress& prog_; + inode_options const& opts_; }; template @@ -93,20 +107,42 @@ void inode_ordering_::by_reverse_path( template void inode_ordering_::by_similarity(sortable_inode_span& sp, fragment_category cat) const { - std::vector hash_cache; + std::vector> hash_cache; auto raw = sp.raw(); auto& index = sp.index(); + bool any_missing = false; hash_cache.resize(raw.size()); for (auto i : index) { - hash_cache[i] = raw[i]->similarity_hash(cat); + auto& cache = hash_cache[i]; + cache = raw[i]->similarity_hash(cat); + if (!cache.has_value()) { + any_missing = true; + } } - std::sort(index.begin(), index.end(), [&](auto a, auto b) { - auto const ca = hash_cache[a]; - auto const cb = hash_cache[b]; + auto size_pred = [&](auto a, auto b) { + return inode_less_by_size(raw[a].get(), raw[b].get()); + }; + + auto start = index.begin(); + + if (any_missing) { + start = std::stable_partition(index.begin(), index.end(), [&](auto i) { + return !hash_cache[i].has_value(); + }); + + std::sort(index.begin(), start, size_pred); + } + + std::sort(start, index.end(), [&](auto a, auto b) { + assert(hash_cache[a].has_value()); + assert(hash_cache[b].has_value()); + + auto const ca = *hash_cache[a]; + auto const cb = *hash_cache[b]; if (ca < cb) { return true; @@ -116,11 +152,7 @@ void inode_ordering_::by_similarity(sortable_inode_span& sp, return false; } - auto ia = raw[a].get(); - auto ib = raw[b].get(); - - return ia->size() > ib->size() || - (ia->size() == ib->size() && ia->any()->less_revpath(*ib->any())); + return size_pred(a, b); }); } @@ -128,19 +160,51 @@ template void inode_ordering_::by_nilsimsa( worker_group& wg, similarity_ordering_options const& opts, sortable_inode_span& sp, fragment_category cat) const { - auto ev = inode_element_view(sp.raw(), sp.index(), cat); + auto raw = sp.raw(); + auto& index = sp.index(); + + if (opts_.max_similarity_scan_size) { + auto mid = std::stable_partition(index.begin(), index.end(), [&](auto i) { + return !raw[i]->nilsimsa_similarity_hash(cat); + }); + + if (mid != index.begin()) { + std::sort(index.begin(), mid, [&](auto a, auto b) { + return inode_less_by_size(raw[a].get(), raw[b].get()); + }); + + if (mid != index.end()) { + std::vector small_index(mid, index.end()); + by_nilsimsa_impl(wg, opts, raw, small_index, cat); + std::copy(small_index.begin(), small_index.end(), mid); + } + + return; + } + } + + by_nilsimsa_impl(wg, opts, raw, index, cat); +} + +template +void inode_ordering_::by_nilsimsa_impl( + worker_group& wg, similarity_ordering_options const& opts, + std::span const> inodes, + std::vector& index, fragment_category cat) const { + auto ev = inode_element_view(inodes, index, cat); std::promise> promise; auto future = promise.get_future(); auto sim_order = similarity_ordering(LOG_GET_LOGGER, prog_, wg, opts); sim_order.order_nilsimsa(ev, make_receiver(std::move(promise)), - std::move(sp.index())); - future.get().swap(sp.index()); + std::move(index)); + future.get().swap(index); } } // namespace -inode_ordering::inode_ordering(logger& lgr, progress& prog) +inode_ordering::inode_ordering(logger& lgr, progress& prog, + inode_options const& opts) : impl_(make_unique_logging_object( - lgr, prog)) {} + lgr, prog, opts)) {} } // namespace dwarfs diff --git a/test/tool_main_test.cpp b/test/tool_main_test.cpp index c83ea2f1..4920e2f3 100644 --- a/test/tool_main_test.cpp +++ b/test/tool_main_test.cpp @@ -1268,3 +1268,131 @@ TEST(dwarfsck_test, export_metadata_close_error) { EXPECT_THAT(t.err(), ::testing::HasSubstr("failed to close metadata output file")); } + +class mkdwarfs_sim_order_test : public testing::TestWithParam {}; + +TEST(mkdwarfs_test, max_similarity_size) { + static constexpr std::array sizes{50, 100, 200, 500, 1000, 2000, 5000, 10000}; + + auto make_tester = [] { + std::mt19937_64 rng{42}; + auto t = mkdwarfs_tester::create_empty(); + + t.add_root_dir(); + + for (auto size : sizes) { + auto data = test::create_random_string(size, rng); + t.os->add_file("/file" + std::to_string(size), data); + } + + return t; + }; + + auto get_sizes_in_offset_order = [](filesystem_v2 const& fs) { + std::vector> tmp; + + for (auto size : sizes) { + auto path = "/file" + std::to_string(size); + auto iv = fs.find(path.c_str()); + assert(iv); + auto info = fs.get_inode_info(*iv); + assert(1 == info["chunks"].size()); + auto const& chunk = info["chunks"][0]; + tmp.emplace_back(chunk["offset"].asInt(), chunk["size"].asInt()); + } + + std::sort(tmp.begin(), tmp.end(), + [](auto const& a, auto const& b) { return a.first < b.first; }); + + std::vector sizes; + + std::transform(tmp.begin(), tmp.end(), std::back_inserter(sizes), + [](auto const& p) { return p.second; }); + + return sizes; + }; + + auto partitioned_sizes = [&](std::vector in, size_t max_size) { + auto mid = std::stable_partition( + in.begin(), in.end(), [=](auto size) { return size > max_size; }); + + std::sort(in.begin(), mid, std::greater()); + + return in; + }; + + std::vector sim_ordered_sizes; + std::vector nilsimsa_ordered_sizes; + + { + auto t = make_tester(); + EXPECT_EQ(0, t.run("-i / -o - -l0 --order=similarity")) << t.err(); + auto fs = t.fs_from_stdout(); + sim_ordered_sizes = get_sizes_in_offset_order(fs); + } + + { + auto t = make_tester(); + EXPECT_EQ(0, t.run("-i / -o - -l0 --order=nilsimsa")) << t.err(); + auto fs = t.fs_from_stdout(); + nilsimsa_ordered_sizes = get_sizes_in_offset_order(fs); + } + + EXPECT_FALSE( + std::is_sorted(sim_ordered_sizes.begin(), sim_ordered_sizes.end())); + + static constexpr std::array max_sim_sizes{0, 1, 200, 999, + 1000, 1001, 5000, 10000}; + + std::set nilsimsa_results; + + for (auto max_sim_size : max_sim_sizes) { + { + auto t = make_tester(); + EXPECT_EQ(0, + t.run(fmt::format( + "-i / -o - -l0 --order=similarity --max-similarity-size={}", + max_sim_size))) + << t.err(); + + auto fs = t.fs_from_stdout(); + + auto ordered_sizes = get_sizes_in_offset_order(fs); + + if (max_sim_size == 0) { + EXPECT_EQ(sim_ordered_sizes, ordered_sizes) << max_sim_size; + } else { + auto partitioned = partitioned_sizes(sim_ordered_sizes, max_sim_size); + EXPECT_EQ(partitioned, ordered_sizes) << max_sim_size; + } + } + + { + auto t = make_tester(); + EXPECT_EQ(0, + t.run(fmt::format( + "-i / -o - -l0 --order=nilsimsa --max-similarity-size={}", + max_sim_size))) + << t.err(); + + auto fs = t.fs_from_stdout(); + + auto ordered_sizes = get_sizes_in_offset_order(fs); + + nilsimsa_results.insert(folly::join(",", ordered_sizes)); + + if (max_sim_size == 0) { + EXPECT_EQ(nilsimsa_ordered_sizes, ordered_sizes) << max_sim_size; + } else { + std::vector expected; + std::copy_if(sizes.begin(), sizes.end(), std::back_inserter(expected), + [=](auto size) { return size > max_sim_size; }); + std::sort(expected.begin(), expected.end(), std::greater()); + ordered_sizes.resize(expected.size()); + EXPECT_EQ(expected, ordered_sizes) << max_sim_size; + } + } + } + + EXPECT_GE(nilsimsa_results.size(), 3); +}