mirror of
https://github.com/mhx/dwarfs.git
synced 2025-09-11 05:23:29 -04:00
fix(mkdwarfs): make --max-similarity-size work again + tests
This commit is contained in:
parent
08a7887d8a
commit
ae7de2486a
@ -23,6 +23,7 @@
|
||||
|
||||
#include <iosfwd>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <vector>
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
@ -54,8 +55,9 @@ class inode : public object {
|
||||
virtual void set_num(uint32_t num) = 0;
|
||||
virtual uint32_t num() const = 0;
|
||||
virtual bool has_category(fragment_category cat) const = 0;
|
||||
virtual uint32_t similarity_hash(fragment_category cat) const = 0;
|
||||
virtual nilsimsa::hash_type const&
|
||||
virtual std::optional<uint32_t>
|
||||
similarity_hash(fragment_category cat) const = 0;
|
||||
virtual nilsimsa::hash_type const*
|
||||
nilsimsa_similarity_hash(fragment_category cat) const = 0;
|
||||
virtual size_t size() const = 0;
|
||||
virtual file const* any() const = 0;
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <iosfwd>
|
||||
#include <span>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <folly/small_vector.h>
|
||||
|
||||
@ -99,6 +100,8 @@ class inode_fragments {
|
||||
std::string
|
||||
to_string(mapper_function_type const& mapper = mapper_function_type()) const;
|
||||
|
||||
std::unordered_map<fragment_category, file_off_t> get_category_sizes() const;
|
||||
|
||||
private:
|
||||
folly::small_vector<single_inode_fragment, 1> fragments_;
|
||||
};
|
||||
|
@ -31,11 +31,12 @@ class logger;
|
||||
class progress;
|
||||
class worker_group;
|
||||
|
||||
struct inode_options;
|
||||
struct similarity_ordering_options;
|
||||
|
||||
class inode_ordering {
|
||||
public:
|
||||
inode_ordering(logger& lgr, progress& prog);
|
||||
inode_ordering(logger& lgr, progress& prog, inode_options const& opts);
|
||||
|
||||
void by_inode_number(sortable_inode_span& sp) const {
|
||||
impl_->by_inode_number(sp);
|
||||
|
@ -101,12 +101,7 @@ struct file_order_options {
|
||||
};
|
||||
|
||||
struct inode_options {
|
||||
// TODO: - clean this all up and name properly
|
||||
// - the file_order thing should really be "fragment_order"
|
||||
// - it should all belong into inode_options, where scanner
|
||||
// can still access it
|
||||
// - python scripts need to die
|
||||
std::optional<size_t> max_similarity_scan_size; // TODO: not sure about this?
|
||||
std::optional<size_t> max_similarity_scan_size;
|
||||
std::shared_ptr<categorizer_manager> categorizer_mgr;
|
||||
categorized_option<file_order_options> fragment_order{file_order_options()};
|
||||
};
|
||||
|
@ -36,7 +36,7 @@ inode_element_view::inode_element_view(
|
||||
, cat_{cat} {
|
||||
hash_cache_.resize(inodes_.size());
|
||||
for (auto i : index) {
|
||||
hash_cache_[i] = &inodes_[i]->nilsimsa_similarity_hash(cat);
|
||||
hash_cache_[i] = inodes_[i]->nilsimsa_similarity_hash(cat);
|
||||
}
|
||||
}
|
||||
|
||||
@ -84,6 +84,7 @@ std::string inode_element_view::description(size_t i) const {
|
||||
}
|
||||
|
||||
nilsimsa::hash_type const& inode_element_view::get_bits(size_t i) const {
|
||||
assert(hash_cache_[i] != nullptr);
|
||||
return *hash_cache_[i];
|
||||
}
|
||||
|
||||
|
@ -90,4 +90,15 @@ inode_fragments::to_string(mapper_function_type const& mapper) const {
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
std::unordered_map<fragment_category, file_off_t>
|
||||
inode_fragments::get_category_sizes() const {
|
||||
std::unordered_map<fragment_category, file_off_t> result;
|
||||
|
||||
for (auto const& f : span()) {
|
||||
result[f.category()] += f.size();
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -95,11 +95,15 @@ class inode_ : public inode {
|
||||
fragments_, [cat](auto const& f) { return f.category() == cat; });
|
||||
}
|
||||
|
||||
uint32_t similarity_hash(fragment_category cat) const override {
|
||||
return find_similarity<uint32_t>(cat);
|
||||
std::optional<uint32_t>
|
||||
similarity_hash(fragment_category cat) const override {
|
||||
if (auto sim = find_similarity<uint32_t>(cat)) {
|
||||
return *sim;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
nilsimsa::hash_type const&
|
||||
nilsimsa::hash_type const*
|
||||
nilsimsa_similarity_hash(fragment_category cat) const override {
|
||||
return find_similarity<nilsimsa::hash_type>(cat);
|
||||
}
|
||||
@ -290,24 +294,26 @@ class inode_ : public inode {
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T const& find_similarity(fragment_category cat) const {
|
||||
T const* find_similarity(fragment_category cat) const {
|
||||
if (fragments_.empty()) [[unlikely]] {
|
||||
DWARFS_THROW(runtime_error, fmt::format("inode has no fragments ({})",
|
||||
folly::demangle(typeid(T))));
|
||||
}
|
||||
if (std::holds_alternative<std::monostate>(similarity_)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (fragments_.size() == 1) {
|
||||
if (fragments_.get_single_category() != cat) [[unlikely]] {
|
||||
DWARFS_THROW(runtime_error, fmt::format("category mismatch ({})",
|
||||
folly::demangle(typeid(T))));
|
||||
}
|
||||
return std::get<T>(similarity_);
|
||||
return &std::get<T>(similarity_);
|
||||
}
|
||||
auto& m = std::get<similarity_map_type>(similarity_);
|
||||
if (auto it = m.find(cat); it != m.end()) {
|
||||
return std::get<T>(it->second);
|
||||
return &std::get<T>(it->second);
|
||||
}
|
||||
DWARFS_THROW(runtime_error, fmt::format("category not found ({})",
|
||||
folly::demangle(typeid(T))));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -343,17 +349,22 @@ class inode_ : public inode {
|
||||
std::unordered_map<fragment_category, similarity> sc;
|
||||
std::unordered_map<fragment_category, nilsimsa> nc;
|
||||
|
||||
for (auto const& f : fragments_.span()) {
|
||||
switch (opts.fragment_order.get(f.category()).mode) {
|
||||
for (auto [cat, size] : fragments_.get_category_sizes()) {
|
||||
if (auto max = opts.max_similarity_scan_size;
|
||||
max && static_cast<size_t>(size) > *max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
switch (opts.fragment_order.get(cat).mode) {
|
||||
case file_order_mode::NONE:
|
||||
case file_order_mode::PATH:
|
||||
case file_order_mode::REVPATH:
|
||||
break;
|
||||
case file_order_mode::SIMILARITY:
|
||||
sc.try_emplace(f.category());
|
||||
sc.try_emplace(cat);
|
||||
break;
|
||||
case file_order_mode::NILSIMSA:
|
||||
nc.try_emplace(f.category());
|
||||
nc.try_emplace(cat);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -396,6 +407,12 @@ class inode_ : public inode {
|
||||
size_t chunk_size) {
|
||||
assert(fragments_.size() <= 1);
|
||||
|
||||
if (mm) {
|
||||
if (auto max = opts.max_similarity_scan_size; max && mm->size() > *max) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto order_mode =
|
||||
fragments_.empty()
|
||||
? opts.fragment_order.get().mode
|
||||
@ -477,7 +494,7 @@ class inode_manager_ final : public inode_manager::impl {
|
||||
const override {
|
||||
auto span = sortable_span();
|
||||
span.all();
|
||||
inode_ordering(LOG_GET_LOGGER, prog_).by_inode_number(span);
|
||||
inode_ordering(LOG_GET_LOGGER, prog_, opts_).by_inode_number(span);
|
||||
for (auto const& i : span) {
|
||||
fn(i);
|
||||
}
|
||||
@ -613,7 +630,7 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
|
||||
auto span = sortable_span();
|
||||
span.select([cat](auto const& v) { return v->has_category(cat); });
|
||||
|
||||
inode_ordering order(LOG_GET_LOGGER, prog_);
|
||||
inode_ordering order(LOG_GET_LOGGER, prog_, opts_);
|
||||
|
||||
switch (opts.mode) {
|
||||
case file_order_mode::NONE:
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "dwarfs/inode_element_view.h"
|
||||
#include "dwarfs/inode_ordering.h"
|
||||
#include "dwarfs/logger.h"
|
||||
#include "dwarfs/options.h"
|
||||
#include "dwarfs/promise_receiver.h"
|
||||
#include "dwarfs/similarity_ordering.h"
|
||||
#include "dwarfs/worker_group.h"
|
||||
@ -33,12 +34,19 @@ namespace dwarfs {
|
||||
|
||||
namespace {
|
||||
|
||||
bool inode_less_by_size(inode const* a, inode const* b) {
|
||||
auto sa = a->size();
|
||||
auto sb = b->size();
|
||||
return sa > sb || (sa == sb && a->any()->less_revpath(*b->any()));
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class inode_ordering_ final : public inode_ordering::impl {
|
||||
public:
|
||||
inode_ordering_(logger& lgr, progress& prog)
|
||||
inode_ordering_(logger& lgr, progress& prog, inode_options const& opts)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, prog_{prog} {}
|
||||
, prog_{prog}
|
||||
, opts_{opts} {}
|
||||
|
||||
void by_inode_number(sortable_inode_span& sp) const override;
|
||||
void by_path(sortable_inode_span& sp) const override;
|
||||
@ -50,8 +58,14 @@ class inode_ordering_ final : public inode_ordering::impl {
|
||||
sortable_inode_span& sp, fragment_category cat) const override;
|
||||
|
||||
private:
|
||||
void
|
||||
by_nilsimsa_impl(worker_group& wg, similarity_ordering_options const& opts,
|
||||
std::span<std::shared_ptr<inode> const> inodes,
|
||||
std::vector<uint32_t>& index, fragment_category cat) const;
|
||||
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
progress& prog_;
|
||||
inode_options const& opts_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
@ -93,20 +107,42 @@ void inode_ordering_<LoggerPolicy>::by_reverse_path(
|
||||
template <typename LoggerPolicy>
|
||||
void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
|
||||
fragment_category cat) const {
|
||||
std::vector<uint32_t> hash_cache;
|
||||
std::vector<std::optional<uint32_t>> hash_cache;
|
||||
|
||||
auto raw = sp.raw();
|
||||
auto& index = sp.index();
|
||||
bool any_missing = false;
|
||||
|
||||
hash_cache.resize(raw.size());
|
||||
|
||||
for (auto i : index) {
|
||||
hash_cache[i] = raw[i]->similarity_hash(cat);
|
||||
auto& cache = hash_cache[i];
|
||||
cache = raw[i]->similarity_hash(cat);
|
||||
if (!cache.has_value()) {
|
||||
any_missing = true;
|
||||
}
|
||||
}
|
||||
|
||||
std::sort(index.begin(), index.end(), [&](auto a, auto b) {
|
||||
auto const ca = hash_cache[a];
|
||||
auto const cb = hash_cache[b];
|
||||
auto size_pred = [&](auto a, auto b) {
|
||||
return inode_less_by_size(raw[a].get(), raw[b].get());
|
||||
};
|
||||
|
||||
auto start = index.begin();
|
||||
|
||||
if (any_missing) {
|
||||
start = std::stable_partition(index.begin(), index.end(), [&](auto i) {
|
||||
return !hash_cache[i].has_value();
|
||||
});
|
||||
|
||||
std::sort(index.begin(), start, size_pred);
|
||||
}
|
||||
|
||||
std::sort(start, index.end(), [&](auto a, auto b) {
|
||||
assert(hash_cache[a].has_value());
|
||||
assert(hash_cache[b].has_value());
|
||||
|
||||
auto const ca = *hash_cache[a];
|
||||
auto const cb = *hash_cache[b];
|
||||
|
||||
if (ca < cb) {
|
||||
return true;
|
||||
@ -116,11 +152,7 @@ void inode_ordering_<LoggerPolicy>::by_similarity(sortable_inode_span& sp,
|
||||
return false;
|
||||
}
|
||||
|
||||
auto ia = raw[a].get();
|
||||
auto ib = raw[b].get();
|
||||
|
||||
return ia->size() > ib->size() ||
|
||||
(ia->size() == ib->size() && ia->any()->less_revpath(*ib->any()));
|
||||
return size_pred(a, b);
|
||||
});
|
||||
}
|
||||
|
||||
@ -128,19 +160,51 @@ template <typename LoggerPolicy>
|
||||
void inode_ordering_<LoggerPolicy>::by_nilsimsa(
|
||||
worker_group& wg, similarity_ordering_options const& opts,
|
||||
sortable_inode_span& sp, fragment_category cat) const {
|
||||
auto ev = inode_element_view(sp.raw(), sp.index(), cat);
|
||||
auto raw = sp.raw();
|
||||
auto& index = sp.index();
|
||||
|
||||
if (opts_.max_similarity_scan_size) {
|
||||
auto mid = std::stable_partition(index.begin(), index.end(), [&](auto i) {
|
||||
return !raw[i]->nilsimsa_similarity_hash(cat);
|
||||
});
|
||||
|
||||
if (mid != index.begin()) {
|
||||
std::sort(index.begin(), mid, [&](auto a, auto b) {
|
||||
return inode_less_by_size(raw[a].get(), raw[b].get());
|
||||
});
|
||||
|
||||
if (mid != index.end()) {
|
||||
std::vector<uint32_t> small_index(mid, index.end());
|
||||
by_nilsimsa_impl(wg, opts, raw, small_index, cat);
|
||||
std::copy(small_index.begin(), small_index.end(), mid);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
by_nilsimsa_impl(wg, opts, raw, index, cat);
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
void inode_ordering_<LoggerPolicy>::by_nilsimsa_impl(
|
||||
worker_group& wg, similarity_ordering_options const& opts,
|
||||
std::span<std::shared_ptr<inode> const> inodes,
|
||||
std::vector<uint32_t>& index, fragment_category cat) const {
|
||||
auto ev = inode_element_view(inodes, index, cat);
|
||||
std::promise<std::vector<uint32_t>> promise;
|
||||
auto future = promise.get_future();
|
||||
auto sim_order = similarity_ordering(LOG_GET_LOGGER, prog_, wg, opts);
|
||||
sim_order.order_nilsimsa(ev, make_receiver(std::move(promise)),
|
||||
std::move(sp.index()));
|
||||
future.get().swap(sp.index());
|
||||
std::move(index));
|
||||
future.get().swap(index);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
inode_ordering::inode_ordering(logger& lgr, progress& prog)
|
||||
inode_ordering::inode_ordering(logger& lgr, progress& prog,
|
||||
inode_options const& opts)
|
||||
: impl_(make_unique_logging_object<impl, inode_ordering_, logger_policies>(
|
||||
lgr, prog)) {}
|
||||
lgr, prog, opts)) {}
|
||||
|
||||
} // namespace dwarfs
|
||||
|
@ -1268,3 +1268,131 @@ TEST(dwarfsck_test, export_metadata_close_error) {
|
||||
EXPECT_THAT(t.err(),
|
||||
::testing::HasSubstr("failed to close metadata output file"));
|
||||
}
|
||||
|
||||
class mkdwarfs_sim_order_test : public testing::TestWithParam<char const*> {};
|
||||
|
||||
TEST(mkdwarfs_test, max_similarity_size) {
|
||||
static constexpr std::array sizes{50, 100, 200, 500, 1000, 2000, 5000, 10000};
|
||||
|
||||
auto make_tester = [] {
|
||||
std::mt19937_64 rng{42};
|
||||
auto t = mkdwarfs_tester::create_empty();
|
||||
|
||||
t.add_root_dir();
|
||||
|
||||
for (auto size : sizes) {
|
||||
auto data = test::create_random_string(size, rng);
|
||||
t.os->add_file("/file" + std::to_string(size), data);
|
||||
}
|
||||
|
||||
return t;
|
||||
};
|
||||
|
||||
auto get_sizes_in_offset_order = [](filesystem_v2 const& fs) {
|
||||
std::vector<std::pair<size_t, size_t>> tmp;
|
||||
|
||||
for (auto size : sizes) {
|
||||
auto path = "/file" + std::to_string(size);
|
||||
auto iv = fs.find(path.c_str());
|
||||
assert(iv);
|
||||
auto info = fs.get_inode_info(*iv);
|
||||
assert(1 == info["chunks"].size());
|
||||
auto const& chunk = info["chunks"][0];
|
||||
tmp.emplace_back(chunk["offset"].asInt(), chunk["size"].asInt());
|
||||
}
|
||||
|
||||
std::sort(tmp.begin(), tmp.end(),
|
||||
[](auto const& a, auto const& b) { return a.first < b.first; });
|
||||
|
||||
std::vector<size_t> sizes;
|
||||
|
||||
std::transform(tmp.begin(), tmp.end(), std::back_inserter(sizes),
|
||||
[](auto const& p) { return p.second; });
|
||||
|
||||
return sizes;
|
||||
};
|
||||
|
||||
auto partitioned_sizes = [&](std::vector<size_t> in, size_t max_size) {
|
||||
auto mid = std::stable_partition(
|
||||
in.begin(), in.end(), [=](auto size) { return size > max_size; });
|
||||
|
||||
std::sort(in.begin(), mid, std::greater<size_t>());
|
||||
|
||||
return in;
|
||||
};
|
||||
|
||||
std::vector<size_t> sim_ordered_sizes;
|
||||
std::vector<size_t> nilsimsa_ordered_sizes;
|
||||
|
||||
{
|
||||
auto t = make_tester();
|
||||
EXPECT_EQ(0, t.run("-i / -o - -l0 --order=similarity")) << t.err();
|
||||
auto fs = t.fs_from_stdout();
|
||||
sim_ordered_sizes = get_sizes_in_offset_order(fs);
|
||||
}
|
||||
|
||||
{
|
||||
auto t = make_tester();
|
||||
EXPECT_EQ(0, t.run("-i / -o - -l0 --order=nilsimsa")) << t.err();
|
||||
auto fs = t.fs_from_stdout();
|
||||
nilsimsa_ordered_sizes = get_sizes_in_offset_order(fs);
|
||||
}
|
||||
|
||||
EXPECT_FALSE(
|
||||
std::is_sorted(sim_ordered_sizes.begin(), sim_ordered_sizes.end()));
|
||||
|
||||
static constexpr std::array max_sim_sizes{0, 1, 200, 999,
|
||||
1000, 1001, 5000, 10000};
|
||||
|
||||
std::set<std::string> nilsimsa_results;
|
||||
|
||||
for (auto max_sim_size : max_sim_sizes) {
|
||||
{
|
||||
auto t = make_tester();
|
||||
EXPECT_EQ(0,
|
||||
t.run(fmt::format(
|
||||
"-i / -o - -l0 --order=similarity --max-similarity-size={}",
|
||||
max_sim_size)))
|
||||
<< t.err();
|
||||
|
||||
auto fs = t.fs_from_stdout();
|
||||
|
||||
auto ordered_sizes = get_sizes_in_offset_order(fs);
|
||||
|
||||
if (max_sim_size == 0) {
|
||||
EXPECT_EQ(sim_ordered_sizes, ordered_sizes) << max_sim_size;
|
||||
} else {
|
||||
auto partitioned = partitioned_sizes(sim_ordered_sizes, max_sim_size);
|
||||
EXPECT_EQ(partitioned, ordered_sizes) << max_sim_size;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
auto t = make_tester();
|
||||
EXPECT_EQ(0,
|
||||
t.run(fmt::format(
|
||||
"-i / -o - -l0 --order=nilsimsa --max-similarity-size={}",
|
||||
max_sim_size)))
|
||||
<< t.err();
|
||||
|
||||
auto fs = t.fs_from_stdout();
|
||||
|
||||
auto ordered_sizes = get_sizes_in_offset_order(fs);
|
||||
|
||||
nilsimsa_results.insert(folly::join(",", ordered_sizes));
|
||||
|
||||
if (max_sim_size == 0) {
|
||||
EXPECT_EQ(nilsimsa_ordered_sizes, ordered_sizes) << max_sim_size;
|
||||
} else {
|
||||
std::vector<size_t> expected;
|
||||
std::copy_if(sizes.begin(), sizes.end(), std::back_inserter(expected),
|
||||
[=](auto size) { return size > max_sim_size; });
|
||||
std::sort(expected.begin(), expected.end(), std::greater<size_t>());
|
||||
ordered_sizes.resize(expected.size());
|
||||
EXPECT_EQ(expected, ordered_sizes) << max_sim_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
EXPECT_GE(nilsimsa_results.size(), 3);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user