mirror of
https://github.com/mhx/dwarfs.git
synced 2025-08-03 17:56:12 -04:00
feat: add support for explicit inode ordering
This commit is contained in:
parent
c061a5253d
commit
1c7cbec1ee
@ -23,14 +23,22 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
namespace dwarfs::writer {
|
namespace dwarfs::writer {
|
||||||
|
|
||||||
// TODO: rename? -> inode_order_mode / fragment_order_mode
|
/// Strategy used to order inodes/fragments when writing a file system image.
///
/// The enumerator order is significant only in that new modes are appended;
/// `EXPLICIT` is the mode added for user-supplied orderings.
enum class fragment_order_mode {
  NONE,       ///< printed as "none"-style default; no reordering is requested
  PATH,       ///< order by path
  REVPATH,    ///< order by reversed path
  SIMILARITY, ///< order by similarity hash
  NILSIMSA,   ///< order by nilsimsa clustering
  EXPLICIT    ///< order taken from an explicit, user-provided path list
};
|
||||||
|
|
||||||
// TODO: rename? -> inode_order_options / fragment_order_options
|
|
||||||
struct fragment_order_options {
|
struct fragment_order_options {
|
||||||
static constexpr int const kDefaultNilsimsaMaxChildren{16384};
|
static constexpr int const kDefaultNilsimsaMaxChildren{16384};
|
||||||
static constexpr int const kDefaultNilsimsaMaxClusterSize{16384};
|
static constexpr int const kDefaultNilsimsaMaxClusterSize{16384};
|
||||||
@ -38,6 +46,8 @@ struct fragment_order_options {
|
|||||||
fragment_order_mode mode{fragment_order_mode::NONE};
|
fragment_order_mode mode{fragment_order_mode::NONE};
|
||||||
int nilsimsa_max_children{kDefaultNilsimsaMaxChildren};
|
int nilsimsa_max_children{kDefaultNilsimsaMaxChildren};
|
||||||
int nilsimsa_max_cluster_size{kDefaultNilsimsaMaxClusterSize};
|
int nilsimsa_max_cluster_size{kDefaultNilsimsaMaxClusterSize};
|
||||||
|
std::string explicit_order_file{};
|
||||||
|
std::unordered_map<std::filesystem::path, size_t> explicit_order{};
|
||||||
};
|
};
|
||||||
|
|
||||||
std::ostream& operator<<(std::ostream& os, fragment_order_mode mode);
|
std::ostream& operator<<(std::ostream& os, fragment_order_mode mode);
|
||||||
|
@ -24,6 +24,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
|
#include <filesystem>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@ -77,7 +78,9 @@ class inode_manager {
|
|||||||
size_t total_size{0};
|
size_t total_size{0};
|
||||||
};
|
};
|
||||||
|
|
||||||
inode_manager(logger& lgr, progress& prog, inode_options const& opts);
|
inode_manager(logger& lgr, progress& prog,
|
||||||
|
std::filesystem::path const& root_path,
|
||||||
|
inode_options const& opts);
|
||||||
|
|
||||||
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
|
std::shared_ptr<inode> create_inode() { return impl_->create_inode(); }
|
||||||
|
|
||||||
|
@ -23,7 +23,9 @@
|
|||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
#include <dwarfs/writer/internal/inode.h>
|
#include <dwarfs/writer/internal/inode.h>
|
||||||
|
|
||||||
@ -40,6 +42,7 @@ class worker_group;
|
|||||||
namespace writer {
|
namespace writer {
|
||||||
|
|
||||||
struct inode_options;
|
struct inode_options;
|
||||||
|
struct fragment_order_options;
|
||||||
|
|
||||||
namespace internal {
|
namespace internal {
|
||||||
|
|
||||||
@ -71,6 +74,12 @@ class inode_ordering {
|
|||||||
impl_->by_nilsimsa(wg, opts, sp, cat);
|
impl_->by_nilsimsa(wg, opts, sp, cat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reorders the inodes in `sp` according to the explicit ordering carried in
/// `opts`.
///
/// This is a thin forwarding wrapper: all three arguments are passed through
/// unchanged to the implementation object.
///
/// @param sp         span whose index is to be reordered
/// @param root_path  root path handed to the implementation together with
///                   the inode paths
/// @param opts       fragment order options holding the explicit order
void by_explicit_order(sortable_inode_span& sp,
                       std::filesystem::path const& root_path,
                       fragment_order_options const& opts) const {
  impl_->by_explicit_order(sp, root_path, opts);
}
|
||||||
|
|
||||||
class impl {
|
class impl {
|
||||||
public:
|
public:
|
||||||
virtual ~impl() = default;
|
virtual ~impl() = default;
|
||||||
@ -84,6 +93,10 @@ class inode_ordering {
|
|||||||
by_nilsimsa(dwarfs::internal::worker_group& wg,
|
by_nilsimsa(dwarfs::internal::worker_group& wg,
|
||||||
similarity_ordering_options const& opts,
|
similarity_ordering_options const& opts,
|
||||||
sortable_inode_span& sp, fragment_category cat) const = 0;
|
sortable_inode_span& sp, fragment_category cat) const = 0;
|
||||||
|
virtual void
|
||||||
|
by_explicit_order(sortable_inode_span& sp,
|
||||||
|
std::filesystem::path const& root_path,
|
||||||
|
fragment_order_options const& opts) const = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -50,6 +50,9 @@ std::ostream& operator<<(std::ostream& os, fragment_order_mode mode) {
|
|||||||
case fragment_order_mode::NILSIMSA:
|
case fragment_order_mode::NILSIMSA:
|
||||||
modestr = "nilsimsa";
|
modestr = "nilsimsa";
|
||||||
break;
|
break;
|
||||||
|
case fragment_order_mode::EXPLICIT:
|
||||||
|
modestr = "explicit";
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
return os << modestr;
|
return os << modestr;
|
||||||
|
@ -22,6 +22,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <array>
|
#include <array>
|
||||||
|
#include <fstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@ -47,6 +48,7 @@ constexpr std::array order_choices{
|
|||||||
std::pair{"revpath"sv, fragment_order_mode::REVPATH},
|
std::pair{"revpath"sv, fragment_order_mode::REVPATH},
|
||||||
std::pair{"similarity"sv, fragment_order_mode::SIMILARITY},
|
std::pair{"similarity"sv, fragment_order_mode::SIMILARITY},
|
||||||
std::pair{"nilsimsa"sv, fragment_order_mode::NILSIMSA},
|
std::pair{"nilsimsa"sv, fragment_order_mode::NILSIMSA},
|
||||||
|
std::pair{"explicit"sv, fragment_order_mode::EXPLICIT},
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
@ -95,6 +97,21 @@ fragment_order_parser::parse(std::string_view arg) const {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case fragment_order_mode::EXPLICIT: {
|
||||||
|
auto file = om.get<std::string>("file");
|
||||||
|
std::ifstream ifs{file};
|
||||||
|
if (!ifs) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
fmt::format("failed to open explicit order file '{}'", file));
|
||||||
|
}
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(ifs, line)) {
|
||||||
|
auto const path = std::filesystem::path{line}.relative_path();
|
||||||
|
rv.explicit_order[path] = rv.explicit_order.size();
|
||||||
|
}
|
||||||
|
rv.explicit_order_file = std::move(file);
|
||||||
|
} break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
fmt::format("inode order mode '{}' does not support options", algo));
|
fmt::format("inode order mode '{}' does not support options", algo));
|
||||||
@ -125,6 +142,9 @@ fragment_order_parser::to_string(fragment_order_options const& opts) const {
|
|||||||
return fmt::format("nilsimsa:max_children={}:max_cluster_size={}",
|
return fmt::format("nilsimsa:max_children={}:max_cluster_size={}",
|
||||||
opts.nilsimsa_max_children,
|
opts.nilsimsa_max_children,
|
||||||
opts.nilsimsa_max_cluster_size);
|
opts.nilsimsa_max_cluster_size);
|
||||||
|
|
||||||
|
case fragment_order_mode::EXPLICIT:
|
||||||
|
return fmt::format("explicit:file={}", opts.explicit_order_file);
|
||||||
}
|
}
|
||||||
return "<unknown>";
|
return "<unknown>";
|
||||||
}
|
}
|
||||||
|
@ -68,6 +68,7 @@
|
|||||||
namespace dwarfs::writer::internal {
|
namespace dwarfs::writer::internal {
|
||||||
|
|
||||||
using namespace dwarfs::internal;
|
using namespace dwarfs::internal;
|
||||||
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@ -430,6 +431,7 @@ class inode_ : public inode {
|
|||||||
case fragment_order_mode::NONE:
|
case fragment_order_mode::NONE:
|
||||||
case fragment_order_mode::PATH:
|
case fragment_order_mode::PATH:
|
||||||
case fragment_order_mode::REVPATH:
|
case fragment_order_mode::REVPATH:
|
||||||
|
case fragment_order_mode::EXPLICIT:
|
||||||
break;
|
break;
|
||||||
case fragment_order_mode::SIMILARITY:
|
case fragment_order_mode::SIMILARITY:
|
||||||
sc.try_emplace(cat);
|
sc.try_emplace(cat);
|
||||||
@ -493,6 +495,7 @@ class inode_ : public inode {
|
|||||||
case fragment_order_mode::NONE:
|
case fragment_order_mode::NONE:
|
||||||
case fragment_order_mode::PATH:
|
case fragment_order_mode::PATH:
|
||||||
case fragment_order_mode::REVPATH:
|
case fragment_order_mode::REVPATH:
|
||||||
|
case fragment_order_mode::EXPLICIT:
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case fragment_order_mode::SIMILARITY: {
|
case fragment_order_mode::SIMILARITY: {
|
||||||
@ -545,9 +548,11 @@ class inode_ : public inode {
|
|||||||
template <typename LoggerPolicy>
|
template <typename LoggerPolicy>
|
||||||
class inode_manager_ final : public inode_manager::impl {
|
class inode_manager_ final : public inode_manager::impl {
|
||||||
public:
|
public:
|
||||||
inode_manager_(logger& lgr, progress& prog, inode_options const& opts)
|
inode_manager_(logger& lgr, progress& prog, fs::path const& root_path,
|
||||||
|
inode_options const& opts)
|
||||||
: LOG_PROXY_INIT(lgr)
|
: LOG_PROXY_INIT(lgr)
|
||||||
, prog_(prog)
|
, prog_(prog)
|
||||||
|
, root_path_{root_path}
|
||||||
, opts_{opts}
|
, opts_{opts}
|
||||||
, inodes_need_scanning_{inodes_need_scanning(opts_)} {}
|
, inodes_need_scanning_{inodes_need_scanning(opts_)} {}
|
||||||
|
|
||||||
@ -657,6 +662,7 @@ class inode_manager_ final : public inode_manager::impl {
|
|||||||
LOG_PROXY_DECL(LoggerPolicy);
|
LOG_PROXY_DECL(LoggerPolicy);
|
||||||
std::vector<std::shared_ptr<inode>> inodes_;
|
std::vector<std::shared_ptr<inode>> inodes_;
|
||||||
progress& prog_;
|
progress& prog_;
|
||||||
|
fs::path const root_path_;
|
||||||
inode_options opts_;
|
inode_options opts_;
|
||||||
bool const inodes_need_scanning_;
|
bool const inodes_need_scanning_;
|
||||||
std::atomic<size_t> mutable num_invalid_inodes_{0};
|
std::atomic<size_t> mutable num_invalid_inodes_{0};
|
||||||
@ -816,14 +822,25 @@ auto inode_manager_<LoggerPolicy>::ordered_span(fragment_category cat,
|
|||||||
tv << prefix << span.size() << " inodes ordered";
|
tv << prefix << span.size() << " inodes ordered";
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case fragment_order_mode::EXPLICIT: {
|
||||||
|
LOG_VERBOSE << prefix << "ordering " << span.size()
|
||||||
|
<< " inodes by explicit order...";
|
||||||
|
auto tv = LOG_CPU_TIMED_VERBOSE;
|
||||||
|
order.by_explicit_order(span, root_path_, opts);
|
||||||
|
tv << prefix << span.size() << " inodes ordered";
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return span;
|
return span;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates the logger-policy specific implementation object and forwards
/// the logger, progress object, root path and inode options to it.
inode_manager::inode_manager(logger& lgr, progress& prog,
                             fs::path const& root_path,
                             inode_options const& opts)
    : impl_(make_unique_logging_object<impl, internal::inode_manager_,
                                       logger_policies>(lgr, prog, root_path,
                                                        opts)) {}
|
||||||
|
|
||||||
} // namespace dwarfs::writer::internal
|
} // namespace dwarfs::writer::internal
|
||||||
|
@ -36,6 +36,7 @@
|
|||||||
namespace dwarfs::writer::internal {
|
namespace dwarfs::writer::internal {
|
||||||
|
|
||||||
using namespace dwarfs::internal;
|
using namespace dwarfs::internal;
|
||||||
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@ -63,6 +64,8 @@ class inode_ordering_ final : public inode_ordering::impl {
|
|||||||
void
|
void
|
||||||
by_nilsimsa(worker_group& wg, similarity_ordering_options const& opts,
|
by_nilsimsa(worker_group& wg, similarity_ordering_options const& opts,
|
||||||
sortable_inode_span& sp, fragment_category cat) const override;
|
sortable_inode_span& sp, fragment_category cat) const override;
|
||||||
|
void by_explicit_order(sortable_inode_span& sp, fs::path const& root_path,
|
||||||
|
fragment_order_options const& opts) const override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void
|
void
|
||||||
@ -207,6 +210,46 @@ void inode_ordering_<LoggerPolicy>::by_nilsimsa_impl(
|
|||||||
future.get().swap(index);
|
future.get().swap(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename LoggerPolicy>
|
||||||
|
void inode_ordering_<LoggerPolicy>::by_explicit_order(
|
||||||
|
sortable_inode_span& sp, fs::path const& root_path,
|
||||||
|
fragment_order_options const& opts) const {
|
||||||
|
auto raw = sp.raw();
|
||||||
|
auto& index = sp.index();
|
||||||
|
auto const& order = opts.explicit_order;
|
||||||
|
|
||||||
|
if (order.empty()) {
|
||||||
|
LOG_WARN << "empty explicit order file set";
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::filesystem::path> paths;
|
||||||
|
std::vector<std::optional<size_t>> path_order;
|
||||||
|
paths.resize(raw.size());
|
||||||
|
path_order.resize(raw.size());
|
||||||
|
|
||||||
|
for (auto i : index) {
|
||||||
|
paths[i] = fs::relative(raw[i]->any()->fs_path(), root_path);
|
||||||
|
|
||||||
|
if (auto it = order.find(paths[i]); it != order.end()) {
|
||||||
|
path_order[i] = it->second;
|
||||||
|
} else {
|
||||||
|
LOG_DEBUG << "explicit order: " << paths[i]
|
||||||
|
<< " not found in explicit order file";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::sort(index.begin(), index.end(), [&](auto a, auto b) {
|
||||||
|
auto const& ai = path_order[a];
|
||||||
|
auto const& bi = path_order[b];
|
||||||
|
return ai.has_value() && bi.has_value() ? *ai < *bi
|
||||||
|
: raw[a]->num() < raw[b]->num();
|
||||||
|
});
|
||||||
|
|
||||||
|
for (auto i : index) {
|
||||||
|
LOG_DEBUG << "explicit order: " << paths[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
inode_ordering::inode_ordering(logger& lgr, progress& prog,
|
inode_ordering::inode_ordering(logger& lgr, progress& prog,
|
||||||
inode_options const& opts)
|
inode_options const& opts)
|
||||||
: impl_(make_unique_logging_object<impl, internal::inode_ordering_,
|
: impl_(make_unique_logging_object<impl, internal::inode_ordering_,
|
||||||
|
@ -723,7 +723,7 @@ void scanner_<LoggerPolicy>::scan(
|
|||||||
|
|
||||||
prog.set_status_function(status_string);
|
prog.set_status_function(status_string);
|
||||||
|
|
||||||
inode_manager im(LOG_GET_LOGGER, prog, options_.inode);
|
inode_manager im(LOG_GET_LOGGER, prog, path, options_.inode);
|
||||||
file_scanner fs(LOG_GET_LOGGER, wg_, os_, im, prog,
|
file_scanner fs(LOG_GET_LOGGER, wg_, os_, im, prog,
|
||||||
{.hash_algo = options_.file_hash_algorithm,
|
{.hash_algo = options_.file_hash_algorithm,
|
||||||
.debug_inode_create = os_.getenv(kEnvVarDumpFilesRaw) ||
|
.debug_inode_create = os_.getenv(kEnvVarDumpFilesRaw) ||
|
||||||
|
Loading…
x
Reference in New Issue
Block a user