feat: add support for explicit inode ordering

This commit is contained in:
Marcus Holland-Moritz 2025-04-04 15:52:26 +02:00
parent c061a5253d
commit 1c7cbec1ee
8 changed files with 116 additions and 7 deletions

View File

@ -23,14 +23,22 @@
#pragma once
#include <filesystem>
#include <iosfwd>
#include <string>
#include <unordered_map>
namespace dwarfs::writer {
// TODO: rename? -> inode_order_mode / fragment_order_mode
enum class fragment_order_mode { NONE, PATH, REVPATH, SIMILARITY, NILSIMSA };
// Selects the strategy used to order inodes/fragments. The enumerators keep
// their implicit, consecutive values starting at zero.
enum class fragment_order_mode {
  NONE,       // default mode of fragment_order_options
  PATH,
  REVPATH,
  SIMILARITY,
  NILSIMSA,   // tunable through the nilsimsa_* members of the options struct
  EXPLICIT,   // order taken from fragment_order_options::explicit_order
};
// TODO: rename? -> inode_order_options / fragment_order_options
// Options selecting the ordering mode together with its per-mode settings.
struct fragment_order_options {
// Defaults used for the two nilsimsa settings below.
static constexpr int const kDefaultNilsimsaMaxChildren{16384};
static constexpr int const kDefaultNilsimsaMaxClusterSize{16384};
@ -38,6 +46,8 @@ struct fragment_order_options {
// Selected ordering mode; NONE unless set otherwise.
fragment_order_mode mode{fragment_order_mode::NONE};
// Settings reported for the nilsimsa mode ("max_children" and
// "max_cluster_size" in the option string).
int nilsimsa_max_children{kDefaultNilsimsaMaxChildren};
int nilsimsa_max_cluster_size{kDefaultNilsimsaMaxClusterSize};
// Name of the file the explicit order was read from (EXPLICIT mode only).
std::string explicit_order_file{};
// Maps a relative path to its position in the explicit order file; consulted
// when ordering inodes in EXPLICIT mode.
std::unordered_map<std::filesystem::path, size_t> explicit_order{};
};
std::ostream& operator<<(std::ostream& os, fragment_order_mode mode);

View File

@ -24,6 +24,7 @@
#pragma once
#include <cstddef>
#include <filesystem>
#include <functional>
#include <iosfwd>
#include <memory>
@ -77,7 +78,9 @@ class inode_manager {
size_t total_size{0};
};
inode_manager(logger& lgr, progress& prog, inode_options const& opts);
// Constructs the manager. `root_path` is kept by the implementation and later
// handed to by_explicit_order() when the EXPLICIT order mode is selected;
// `opts` carries the inode options.
inode_manager(logger& lgr, progress& prog,
std::filesystem::path const& root_path,
inode_options const& opts);
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }

View File

@ -23,7 +23,9 @@
#pragma once
#include <filesystem>
#include <memory>
#include <unordered_map>
#include <dwarfs/writer/internal/inode.h>
@ -40,6 +42,7 @@ class worker_group;
namespace writer {
struct inode_options;
struct fragment_order_options;
namespace internal {
@ -71,6 +74,12 @@ class inode_ordering {
impl_->by_nilsimsa(wg, opts, sp, cat);
}
// Orders the inodes referenced by `sp` according to an explicit order by
// forwarding to the implementation object.
//   sp        - span whose index is reordered
//   root_path - base path handed to the implementation
//   opts      - order options, including the explicit order map
void by_explicit_order(sortable_inode_span& sp,
std::filesystem::path const& root_path,
fragment_order_options const& opts) const {
impl_->by_explicit_order(sp, root_path, opts);
}
class impl {
public:
virtual ~impl() = default;
@ -84,6 +93,10 @@ class inode_ordering {
by_nilsimsa(dwarfs::internal::worker_group& wg,
similarity_ordering_options const& opts,
sortable_inode_span& sp, fragment_category cat) const = 0;
// Implementation hook for explicit ordering; receives the span to reorder,
// the root path and the order options unchanged from the public wrapper.
virtual void
by_explicit_order(sortable_inode_span& sp,
std::filesystem::path const& root_path,
fragment_order_options const& opts) const = 0;
};
private:

View File

@ -50,6 +50,9 @@ std::ostream& operator<<(std::ostream& os, fragment_order_mode mode) {
case fragment_order_mode::NILSIMSA:
modestr = "nilsimsa";
break;
case fragment_order_mode::EXPLICIT:
modestr = "explicit";
break;
}
return os << modestr;

View File

@ -22,6 +22,7 @@
*/
#include <array>
#include <fstream>
#include <stdexcept>
#include <vector>
@ -47,6 +48,7 @@ constexpr std::array order_choices{
std::pair{"revpath"sv, fragment_order_mode::REVPATH},
std::pair{"similarity"sv, fragment_order_mode::SIMILARITY},
std::pair{"nilsimsa"sv, fragment_order_mode::NILSIMSA},
std::pair{"explicit"sv, fragment_order_mode::EXPLICIT},
};
} // namespace
@ -95,6 +97,21 @@ fragment_order_parser::parse(std::string_view arg) const {
}
break;
// EXPLICIT mode: the "file" option names a text file that is read line by
// line; each line is treated as one path.
case fragment_order_mode::EXPLICIT: {
auto file = om.get<std::string>("file");
std::ifstream ifs{file};
if (!ifs) {
throw std::runtime_error(
fmt::format("failed to open explicit order file '{}'", file));
}
std::string line;
while (std::getline(ifs, line)) {
// relative_path() strips any root name/root directory from the entry.
auto const path = std::filesystem::path{line}.relative_path();
// With C++17 (or later) sequencing, size() on the right-hand side is
// evaluated before operator[] inserts the key, so a new path receives the
// next zero-based position. A path that appears again is overwritten with
// the current map size.
rv.explicit_order[path] = rv.explicit_order.size();
}
// Remember the file name so to_string() can reproduce the option string.
rv.explicit_order_file = std::move(file);
} break;
default:
throw std::runtime_error(
fmt::format("inode order mode '{}' does not support options", algo));
@ -125,6 +142,9 @@ fragment_order_parser::to_string(fragment_order_options const& opts) const {
return fmt::format("nilsimsa:max_children={}:max_cluster_size={}",
opts.nilsimsa_max_children,
opts.nilsimsa_max_cluster_size);
case fragment_order_mode::EXPLICIT:
return fmt::format("explicit:file={}", opts.explicit_order_file);
}
return "<unknown>";
}

View File

@ -68,6 +68,7 @@
namespace dwarfs::writer::internal {
using namespace dwarfs::internal;
namespace fs = std::filesystem;
namespace {
@ -430,6 +431,7 @@ class inode_ : public inode {
case fragment_order_mode::NONE:
case fragment_order_mode::PATH:
case fragment_order_mode::REVPATH:
case fragment_order_mode::EXPLICIT:
break;
case fragment_order_mode::SIMILARITY:
sc.try_emplace(cat);
@ -493,6 +495,7 @@ class inode_ : public inode {
case fragment_order_mode::NONE:
case fragment_order_mode::PATH:
case fragment_order_mode::REVPATH:
case fragment_order_mode::EXPLICIT:
break;
case fragment_order_mode::SIMILARITY: {
@ -545,9 +548,11 @@ class inode_ : public inode {
template <typename LoggerPolicy>
class inode_manager_ final : public inode_manager::impl {
public:
inode_manager_(logger& lgr, progress& prog, inode_options const& opts)
// Initializes the logging proxy, keeps a reference to `prog`, and stores
// copies of `root_path` and `opts`. inodes_need_scanning_ is derived from the
// stored opts_ copy, which is declared (and thus initialized) before it.
inode_manager_(logger& lgr, progress& prog, fs::path const& root_path,
inode_options const& opts)
: LOG_PROXY_INIT(lgr)
, prog_(prog)
, root_path_{root_path}
, opts_{opts}
, inodes_need_scanning_{inodes_need_scanning(opts_)} {}
@ -657,6 +662,7 @@ class inode_manager_ final : public inode_manager::impl {
LOG_PROXY_DECL(LoggerPolicy);
std::vector<std::shared_ptr<inode>> inodes_;
progress& prog_;
fs::path const root_path_;
inode_options opts_;
bool const inodes_need_scanning_;
std::atomic<size_t> mutable num_invalid_inodes_{0};
@ -816,14 +822,25 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
tv << prefix << span.size() << " inodes ordered";
break;
}
// EXPLICIT mode: reorder the span using the explicit order carried in `opts`.
case fragment_order_mode::EXPLICIT: {
LOG_VERBOSE << prefix << "ordering " << span.size()
<< " inodes by explicit order...";
auto tv = LOG_CPU_TIMED_VERBOSE;
// root_path_ is the base against which inode paths are made relative before
// they are looked up in the explicit order map.
order.by_explicit_order(span, root_path_, opts);
tv << prefix << span.size() << " inodes ordered";
break;
}
}
return span;
}
inode_manager::inode_manager(logger& lgr, progress& prog,
fs::path const& root_path,
inode_options const& opts)
: impl_(make_unique_logging_object<impl, internal::inode_manager_,
logger_policies>(lgr, prog, opts)) {}
logger_policies>(lgr, prog, root_path,
opts)) {}
} // namespace dwarfs::writer::internal

View File

@ -36,6 +36,7 @@
namespace dwarfs::writer::internal {
using namespace dwarfs::internal;
namespace fs = std::filesystem;
namespace {
@ -63,6 +64,8 @@ class inode_ordering_ final : public inode_ordering::impl {
void
by_nilsimsa(worker_group& wg, similarity_ordering_options const& opts,
sortable_inode_span& sp, fragment_category cat) const override;
// Reorders the index of `sp` using the explicit order map in `opts`; inode
// paths are made relative to `root_path` before the lookup.
void by_explicit_order(sortable_inode_span& sp, fs::path const& root_path,
fragment_order_options const& opts) const override;
private:
void
@ -207,6 +210,46 @@ void inode_ordering_<LoggerPolicy>::by_nilsimsa_impl(
future.get().swap(index);
}
// Reorders the index of `sp` according to the explicit order in `opts`.
//
// Every inode's path is made relative to `root_path` and looked up in
// `opts.explicit_order`. The resulting order is:
//   1. inodes found in the map, by ascending position from the map;
//   2. all other inodes, by ascending inode number.
// Inodes that share the same map position are tie-broken by inode number so
// the result is deterministic.
//
// A warning is logged if the order map is empty; paths missing from the map
// and the final order are logged at debug level.
template <typename LoggerPolicy>
void inode_ordering_<LoggerPolicy>::by_explicit_order(
    sortable_inode_span& sp, fs::path const& root_path,
    fragment_order_options const& opts) const {
  auto raw = sp.raw();
  auto& index = sp.index();
  auto const& order = opts.explicit_order;

  if (order.empty()) {
    LOG_WARN << "empty explicit order file set";
  }

  // Both vectors are addressed by the values stored in `index` (positions in
  // `raw`), not by the position within `index` itself.
  std::vector<std::filesystem::path> paths(raw.size());
  std::vector<std::optional<size_t>> path_order(raw.size());

  for (auto i : index) {
    paths[i] = fs::relative(raw[i]->any()->fs_path(), root_path);

    if (auto it = order.find(paths[i]); it != order.end()) {
      path_order[i] = it->second;
    } else {
      LOG_DEBUG << "explicit order: " << paths[i]
                << " not found in explicit order file";
    }
  }

  // The comparator must be a strict weak ordering for std::sort. Comparing
  // "by position if both are listed, otherwise by inode number" is not
  // transitive, so a consistent three-level key is used instead:
  // listed-before-unlisted, then explicit position, then inode number.
  std::sort(index.begin(), index.end(), [&](auto a, auto b) {
    auto const& ai = path_order[a];
    auto const& bi = path_order[b];

    if (ai.has_value() != bi.has_value()) {
      return ai.has_value();
    }

    if (ai.has_value() && *ai != *bi) {
      return *ai < *bi;
    }

    return raw[a]->num() < raw[b]->num();
  });

  for (auto i : index) {
    LOG_DEBUG << "explicit order: " << paths[i];
  }
}
inode_ordering::inode_ordering(logger& lgr, progress& prog,
inode_options const& opts)
: impl_(make_unique_logging_object<impl, internal::inode_ordering_,

View File

@ -723,7 +723,7 @@ void scanner_<LoggerPolicy>::scan(
prog.set_status_function(status_string);
inode_manager im(LOG_GET_LOGGER, prog, options_.inode);
inode_manager im(LOG_GET_LOGGER, prog, path, options_.inode);
file_scanner fs(LOG_GET_LOGGER, wg_, os_, im, prog,
{.hash_algo = options_.file_hash_algorithm,
.debug_inode_create = os_.getenv(kEnvVarDumpFilesRaw) ||