refactor(categorizer): allow access to root and relative paths

This commit is contained in:
Marcus Holland-Moritz 2025-04-04 13:17:11 +02:00
parent db2d20dfce
commit d84d7535c7
9 changed files with 62 additions and 31 deletions

View File

@ -65,10 +65,26 @@ class categorizer {
subcategory_less(fragment_category a, fragment_category b) const = 0; subcategory_less(fragment_category a, fragment_category b) const = 0;
}; };
/// Path information passed to categorizers: a root path plus the full path
/// of the file being categorized.
///
/// Both paths are stored by value. Holding them as reference members would
/// leave the object dangling whenever it is built from a temporary, which
/// happens silently when a string literal or `std::string` is implicitly
/// converted to `std::filesystem::path` at the call site. Owning the paths
/// also keeps the type copy-assignable.
class file_path_info {
 public:
  /// Copies both paths into the object.
  ///
  /// @param root_path  root path that `relative_path()` is computed against
  /// @param full_path  full path of the file
  file_path_info(std::filesystem::path const& root_path,
                 std::filesystem::path const& full_path)
      : root_path_{root_path}
      , full_path_{full_path} {}

  /// @return the stored root path; valid for the lifetime of this object
  std::filesystem::path const& root_path() const { return root_path_; }

  /// @return the stored full path; valid for the lifetime of this object
  std::filesystem::path const& full_path() const { return full_path_; }

  /// @return the full path expressed relative to the root path
  ///         (defined out of line)
  std::filesystem::path relative_path() const;

 private:
  std::filesystem::path root_path_;
  std::filesystem::path full_path_;
};
class random_access_categorizer : public categorizer { class random_access_categorizer : public categorizer {
public: public:
virtual inode_fragments virtual inode_fragments
categorize(std::filesystem::path const& path, std::span<uint8_t const> data, categorize(file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const = 0; category_mapper const& mapper) const = 0;
}; };
@ -85,7 +101,7 @@ class sequential_categorizer_job {
class sequential_categorizer : public categorizer { class sequential_categorizer : public categorizer {
public: public:
virtual std::unique_ptr<sequential_categorizer_job> virtual std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size, job(file_path_info const& path, size_t total_size,
category_mapper const& mapper) const = 0; category_mapper const& mapper) const = 0;
}; };
@ -129,7 +145,7 @@ class categorizer_job {
class categorizer_manager : public category_resolver { class categorizer_manager : public category_resolver {
public: public:
categorizer_manager(logger& lgr); categorizer_manager(logger& lgr, std::filesystem::path root);
static fragment_category default_category(); static fragment_category default_category();

View File

@ -45,6 +45,8 @@
namespace dwarfs::writer { namespace dwarfs::writer {
namespace fs = std::filesystem;
namespace internal { namespace internal {
using namespace std::placeholders; using namespace std::placeholders;
@ -61,9 +63,10 @@ template <typename LoggerPolicy>
class categorizer_job_ final : public categorizer_job::impl { class categorizer_job_ final : public categorizer_job::impl {
public: public:
categorizer_job_(logger& lgr, categorizer_manager_private const& mgr, categorizer_job_(logger& lgr, categorizer_manager_private const& mgr,
std::filesystem::path const& path) fs::path const& root_path, fs::path const& path)
: LOG_PROXY_INIT(lgr) : LOG_PROXY_INIT(lgr)
, mgr_{mgr} , mgr_{mgr}
, root_path_{root_path}
, path_{path} , path_{path}
, cat_mapper_{// NOLINTNEXTLINE(modernize-avoid-bind) , cat_mapper_{// NOLINTNEXTLINE(modernize-avoid-bind)
std::bind(&categorizer_manager_private::category, std::bind(&categorizer_manager_private::category,
@ -85,7 +88,8 @@ class categorizer_job_ final : public categorizer_job::impl {
size_t total_size_{0}; size_t total_size_{0};
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>> std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
seq_jobs_; seq_jobs_;
std::filesystem::path const path_; fs::path const& root_path_;
fs::path const path_;
category_mapper cat_mapper_; category_mapper cat_mapper_;
}; };
@ -104,9 +108,11 @@ void categorizer_job_<LoggerPolicy>::categorize_random_access(
bool global_best = true; bool global_best = true;
file_path_info path_info{root_path_, path_};
for (auto&& [index, cat] : ranges::views::enumerate(mgr_.categorizers())) { for (auto&& [index, cat] : ranges::views::enumerate(mgr_.categorizers())) {
if (auto p = dynamic_cast<random_access_categorizer*>(cat.get())) { if (auto p = dynamic_cast<random_access_categorizer*>(cat.get())) {
if (auto c = p->categorize(path_, data, cat_mapper_)) { if (auto c = p->categorize(path_info, data, cat_mapper_)) {
best_ = c; best_ = c;
index_ = index; index_ = index;
is_global_best_ = global_best; is_global_best_ = global_best;
@ -126,13 +132,15 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
} }
if (seq_jobs_.empty()) [[unlikely]] { if (seq_jobs_.empty()) [[unlikely]] {
file_path_info path_info{root_path_, path_};
for (auto&& [index, cat] : ranges::views::enumerate(mgr_.categorizers())) { for (auto&& [index, cat] : ranges::views::enumerate(mgr_.categorizers())) {
if (index_ >= 0 && std::cmp_greater_equal(index, index_)) { if (index_ >= 0 && std::cmp_greater_equal(index, index_)) {
break; break;
} }
if (auto p = dynamic_cast<sequential_categorizer*>(cat.get())) { if (auto p = dynamic_cast<sequential_categorizer*>(cat.get())) {
if (auto job = p->job(path_, total_size_, cat_mapper_)) { if (auto job = p->job(path_info, total_size_, cat_mapper_)) {
seq_jobs_.emplace_back(index, std::move(job)); seq_jobs_.emplace_back(index, std::move(job));
} }
} }
@ -174,15 +182,16 @@ bool categorizer_job_<LoggerPolicy>::best_result_found() const {
template <typename LoggerPolicy> template <typename LoggerPolicy>
class categorizer_manager_ final : public categorizer_manager_private { class categorizer_manager_ final : public categorizer_manager_private {
public: public:
explicit categorizer_manager_(logger& lgr) explicit categorizer_manager_(logger& lgr, fs::path root)
: lgr_{lgr} : lgr_{lgr}
, LOG_PROXY_INIT(lgr) { , LOG_PROXY_INIT(lgr)
, root_path_{std::move(root)} {
add_category(categorizer::DEFAULT_CATEGORY, add_category(categorizer::DEFAULT_CATEGORY,
std::numeric_limits<size_t>::max()); std::numeric_limits<size_t>::max());
} }
void add(std::shared_ptr<categorizer> c) override; void add(std::shared_ptr<categorizer> c) override;
categorizer_job job(std::filesystem::path const& path) const override; categorizer_job job(fs::path const& path) const override;
std::string_view std::string_view
category_name(fragment_category::value_type c) const override; category_name(fragment_category::value_type c) const override;
@ -229,6 +238,7 @@ class categorizer_manager_ final : public categorizer_manager_private {
// TODO: category descriptions? // TODO: category descriptions?
std::vector<std::pair<std::string_view, size_t>> categories_; std::vector<std::pair<std::string_view, size_t>> categories_;
std::unordered_map<std::string_view, fragment_category::value_type> catmap_; std::unordered_map<std::string_view, fragment_category::value_type> catmap_;
fs::path root_path_;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -241,11 +251,12 @@ void categorizer_manager_<LoggerPolicy>::add(std::shared_ptr<categorizer> c) {
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
categorizer_job categorizer_manager_<LoggerPolicy>::job( categorizer_job
std::filesystem::path const& path) const { categorizer_manager_<LoggerPolicy>::job(fs::path const& path) const {
return categorizer_job( return categorizer_job(
make_unique_logging_object<categorizer_job::impl, categorizer_job_, make_unique_logging_object<categorizer_job::impl, categorizer_job_,
logger_policies>(lgr_, *this, path)); logger_policies>(lgr_, *this, root_path_,
path));
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -307,6 +318,10 @@ bool categorizer_manager_<LoggerPolicy>::deterministic_less(
namespace po = boost::program_options; namespace po = boost::program_options;
// Returns the full path expressed relative to the root path.
//
// The result comes from `lexically_relative`, which works on the textual
// form of the two paths only: the filesystem is not consulted and symlinks
// are not resolved.
fs::path file_path_info::relative_path() const {
  auto const& base = root_path_;
  return full_path_.lexically_relative(base);
}
std::string category_prefix(std::shared_ptr<categorizer_manager> const& mgr, std::string category_prefix(std::shared_ptr<categorizer_manager> const& mgr,
fragment_category cat) { fragment_category cat) {
return category_prefix(mgr.get(), cat); return category_prefix(mgr.get(), cat);
@ -361,9 +376,10 @@ categorizer_job::categorizer_job() = default;
categorizer_job::categorizer_job(std::unique_ptr<impl> impl) categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
: impl_{std::move(impl)} {} : impl_{std::move(impl)} {}
categorizer_manager::categorizer_manager(logger& lgr) categorizer_manager::categorizer_manager(logger& lgr, fs::path root)
: impl_(make_unique_logging_object<impl, internal::categorizer_manager_, : impl_(make_unique_logging_object<impl, internal::categorizer_manager_,
logger_policies>(lgr)) {} logger_policies>(lgr, std::move(root))) {
}
fragment_category categorizer_manager::default_category() { fragment_category categorizer_manager::default_category() {
return fragment_category(0); return fragment_category(0);

View File

@ -336,7 +336,7 @@ class fits_categorizer_ final : public fits_categorizer_base {
} }
inode_fragments inode_fragments
categorize(fs::path const& path, std::span<uint8_t const> data, categorize(file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const override; category_mapper const& mapper) const override;
std::string category_metadata(std::string_view category_name, std::string category_metadata(std::string_view category_name,
@ -392,7 +392,7 @@ bool fits_categorizer_<LoggerPolicy>::check_metadata(
template <typename LoggerPolicy> template <typename LoggerPolicy>
inode_fragments fits_categorizer_<LoggerPolicy>::categorize( inode_fragments fits_categorizer_<LoggerPolicy>::categorize(
fs::path const& path, std::span<uint8_t const> data, file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const { category_mapper const& mapper) const {
inode_fragments fragments; inode_fragments fragments;
@ -406,7 +406,7 @@ inode_fragments fits_categorizer_<LoggerPolicy>::categorize(
meta.unused_lsb_count = fi->unused_lsb_count; meta.unused_lsb_count = fi->unused_lsb_count;
meta.component_count = fi->component_count; meta.component_count = fi->component_count;
if (check_metadata(meta, path)) { if (check_metadata(meta, path.full_path())) {
auto subcategory = meta_.wlock()->add(meta); auto subcategory = meta_.wlock()->add(meta);
fragments.emplace_back(fragment_category(mapper(METADATA_CATEGORY)), fragments.emplace_back(fragment_category(mapper(METADATA_CATEGORY)),
fi->header.size()); fi->header.size());

View File

@ -208,7 +208,7 @@ class incompressible_categorizer_ final : public sequential_categorizer {
std::span<std::string_view const> categories() const override; std::span<std::string_view const> categories() const override;
std::unique_ptr<sequential_categorizer_job> std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size, job(file_path_info const& path, size_t total_size,
category_mapper const& mapper) const override; category_mapper const& mapper) const override;
bool bool
@ -235,8 +235,7 @@ incompressible_categorizer_::categories() const {
} }
std::unique_ptr<sequential_categorizer_job> std::unique_ptr<sequential_categorizer_job>
incompressible_categorizer_::job(std::filesystem::path const& path, incompressible_categorizer_::job(file_path_info const& path, size_t total_size,
size_t total_size,
category_mapper const& mapper) const { category_mapper const& mapper) const {
if (total_size < config_.min_input_size) { if (total_size < config_.min_input_size) {
return nullptr; return nullptr;
@ -244,8 +243,8 @@ incompressible_categorizer_::job(std::filesystem::path const& path,
return make_unique_logging_object<sequential_categorizer_job, return make_unique_logging_object<sequential_categorizer_job,
incompressible_categorizer_job_, incompressible_categorizer_job_,
logger_policies>(lgr_, config_, ctxmgr_, logger_policies>(
path, total_size, mapper); lgr_, config_, ctxmgr_, path.full_path(), total_size, mapper);
} }
bool incompressible_categorizer_::subcategory_less(fragment_category, bool incompressible_categorizer_::subcategory_less(fragment_category,

View File

@ -528,7 +528,7 @@ class pcmaudio_categorizer_ final : public pcmaudio_categorizer_base {
} }
inode_fragments inode_fragments
categorize(fs::path const& path, std::span<uint8_t const> data, categorize(file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const override; category_mapper const& mapper) const override;
std::string category_metadata(std::string_view category_name, std::string category_metadata(std::string_view category_name,
@ -1114,7 +1114,7 @@ void pcmaudio_categorizer_<LoggerPolicy>::add_fragments(
template <typename LoggerPolicy> template <typename LoggerPolicy>
inode_fragments pcmaudio_categorizer_<LoggerPolicy>::categorize( inode_fragments pcmaudio_categorizer_<LoggerPolicy>::categorize(
fs::path const& path, std::span<uint8_t const> data, file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const { category_mapper const& mapper) const {
inode_fragments fragments; inode_fragments fragments;
@ -1127,7 +1127,7 @@ inode_fragments pcmaudio_categorizer_<LoggerPolicy>::categorize(
&pcmaudio_categorizer_::check_wav64, &pcmaudio_categorizer_::check_wav64,
// clang-format on // clang-format on
}) { }) {
if ((this->*f)(fragments, path, data, mapper)) { if ((this->*f)(fragments, path.full_path(), data, mapper)) {
break; break;
} }

View File

@ -68,7 +68,7 @@ class fits_categorizer_fixture : public Base {
po::store(parsed, vm); po::store(parsed, vm);
po::notify(vm); po::notify(vm);
catmgr = std::make_shared<writer::categorizer_manager>(lgr); catmgr = std::make_shared<writer::categorizer_manager>(lgr, "/");
catmgr->add(catreg.create(lgr, "fits", vm)); catmgr->add(catreg.create(lgr, "fits", vm));
} }

View File

@ -92,7 +92,7 @@ class incompressible_categorizer_fixture : public Base {
po::store(parsed, vm); po::store(parsed, vm);
po::notify(vm); po::notify(vm);
catmgr = std::make_shared<writer::categorizer_manager>(lgr); catmgr = std::make_shared<writer::categorizer_manager>(lgr, "/");
catmgr->add(catreg.create(lgr, "incompressible", vm)); catmgr->add(catreg.create(lgr, "incompressible", vm));
} }

View File

@ -215,7 +215,7 @@ TEST(pcmaudio_categorizer, requirements) {
test::test_logger logger(logger::INFO); test::test_logger logger(logger::INFO);
boost::program_options::variables_map vm; boost::program_options::variables_map vm;
auto& catreg = writer::categorizer_registry::instance(); auto& catreg = writer::categorizer_registry::instance();
auto catmgr = writer::categorizer_manager(logger); auto catmgr = writer::categorizer_manager(logger, "/");
catmgr.add(catreg.create(logger, "pcmaudio", vm)); catmgr.add(catreg.create(logger, "pcmaudio", vm));
@ -294,7 +294,7 @@ TEST(pcmaudio_categorizer, requirements) {
class pcmaudio_error_test : public testing::Test { class pcmaudio_error_test : public testing::Test {
public: public:
test::test_logger logger{logger::VERBOSE}; test::test_logger logger{logger::VERBOSE};
writer::categorizer_manager catmgr{logger}; writer::categorizer_manager catmgr{logger, "/"};
auto categorize(pcmfile_builder const& builder) { auto categorize(pcmfile_builder const& builder) {
// std::cout << folly::hexDump(builder.data.data(), builder.data.size()); // std::cout << folly::hexDump(builder.data.data(), builder.data.size());

View File

@ -1184,7 +1184,7 @@ int mkdwarfs_main(int argc, sys_char** argv, iolayer const& iol) {
split_to<std::vector<std::string>>(categorizer_list.value(), ','); split_to<std::vector<std::string>>(categorizer_list.value(), ',');
options.inode.categorizer_mgr = options.inode.categorizer_mgr =
std::make_shared<writer::categorizer_manager>(lgr); std::make_shared<writer::categorizer_manager>(lgr, path);
for (auto const& name : categorizers) { for (auto const& name : categorizers) {
options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm)); options.inode.categorizer_mgr->add(catreg.create(lgr, name, vm));