diff --git a/cmake/libdwarfs.cmake b/cmake/libdwarfs.cmake index 9e6e7704..fc2a2913 100644 --- a/cmake/libdwarfs.cmake +++ b/cmake/libdwarfs.cmake @@ -146,6 +146,7 @@ add_library( # src/writer/categorizer/binary_categorizer.cpp src/writer/categorizer/fits_categorizer.cpp + src/writer/categorizer/hotness_categorizer.cpp src/writer/categorizer/incompressible_categorizer.cpp src/writer/categorizer/pcmaudio_categorizer.cpp diff --git a/doc/mkdwarfs.md b/doc/mkdwarfs.md index 2e5665ff..57244cdf 100644 --- a/doc/mkdwarfs.md +++ b/doc/mkdwarfs.md @@ -401,6 +401,11 @@ Most other options are concerned with compression tuning: you can switch to `ascii`, which is like `unicode`, but looks less fancy. +- `--hotness-list=`*file*: + A file containing the paths of all "hot" files for the "hotness" + categorizer. The paths must be relative to the `--input` path, but + may start with a leading `/`. + - `--incompressible-min-input-size=`*value*: The minimum size of a file to be checked for incompressibility when the `incompressible` categorizer is active. 
diff --git a/include/dwarfs/writer/categorizer.h b/include/dwarfs/writer/categorizer.h index c86512a7..a3927ea6 100644 --- a/include/dwarfs/writer/categorizer.h +++ b/include/dwarfs/writer/categorizer.h @@ -253,6 +253,7 @@ namespace detail { void binary_categorizer_factory_registrar(categorizer_registry&); void fits_categorizer_factory_registrar(categorizer_registry&); +void hotness_categorizer_factory_registrar(categorizer_registry&); void incompressible_categorizer_factory_registrar(categorizer_registry&); void libmagic_categorizer_factory_registrar(categorizer_registry&); void pcmaudio_categorizer_factory_registrar(categorizer_registry&); diff --git a/src/writer/categorizer.cpp b/src/writer/categorizer.cpp index 2dc65f6b..8352bb85 100644 --- a/src/writer/categorizer.cpp +++ b/src/writer/categorizer.cpp @@ -429,6 +429,7 @@ categorizer_registry::categorizer_registry() { // binary_categorizer_factory_registrar(*this); fits_categorizer_factory_registrar(*this); + hotness_categorizer_factory_registrar(*this); incompressible_categorizer_factory_registrar(*this); // libmagic_categorizer_factory_registrar(*this); pcmaudio_categorizer_factory_registrar(*this); diff --git a/src/writer/categorizer/hotness_categorizer.cpp b/src/writer/categorizer/hotness_categorizer.cpp new file mode 100644 index 00000000..d0817773 --- /dev/null +++ b/src/writer/categorizer/hotness_categorizer.cpp @@ -0,0 +1,180 @@ +/* vim:set ts=2 sw=2 sts=2 et: */ +/** + * \author Marcus Holland-Moritz (github@mhxnet.de) + * \copyright Copyright (c) Marcus Holland-Moritz + * + * This file is part of dwarfs. + * + * dwarfs is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * dwarfs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with dwarfs.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// NOTE(review): the include targets and template argument lists were not
+// legible in the reviewed patch and are reconstructed from the names this
+// file uses -- confirm against the project headers before merging.
+#include <array>
+#include <atomic>
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <memory>
+#include <span>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+
+#include <boost/program_options.hpp>
+
+#include <fmt/format.h>
+
+#include <dwarfs/error.h>
+#include <dwarfs/logger.h>
+#include <dwarfs/writer/categorizer.h>
+
+namespace dwarfs::writer {
+
+namespace po = boost::program_options;
+
+namespace {
+
+// Name of the only category this categorizer ever assigns.
+constexpr std::string_view const HOTNESS_CATEGORY{"hotness"};
+
+// Settings for the hotness categorizer; populated from the command line by
+// hotness_categorizer_factory.
+struct hotness_categorizer_config {
+  // Path of the file that lists the "hot" paths; empty if none was given.
+  std::string hotness_list;
+};
+
+/**
+ * Categorizer that assigns the "hotness" category to every file whose
+ * relative path appears in a user-supplied list.
+ *
+ * The list file is read once by the constructor; categorize() afterwards
+ * only performs lookups in the resulting set.
+ */
+template <typename LoggerPolicy>
+class hotness_categorizer_ final : public random_access_categorizer {
+ public:
+  /**
+   * Load the hotness list named by `cfg.hotness_list`, if any.
+   *
+   * Throws (via DWARFS_THROW) a runtime_error if the file cannot be opened
+   * or if two entries refer to the same relative path.
+   */
+  hotness_categorizer_(logger& lgr, hotness_categorizer_config const& cfg);
+
+  // Returns the category names this categorizer can produce ("hotness").
+  std::span<std::string_view const> categories() const override;
+
+  /**
+   * Returns a single fragment of `data.size()` bytes in the "hotness"
+   * category if `path`'s relative path is in the list, and an empty
+   * fragment list otherwise.
+   */
+  inode_fragments
+  categorize(file_path_info const& path, std::span<uint8_t const> data,
+             category_mapper const& mapper) const override;
+
+  // Orders two fragment categories by their subcategory value.
+  bool
+  subcategory_less(fragment_category a, fragment_category b) const override;
+
+ private:
+  LOG_PROXY_DECL(LoggerPolicy);
+  // Relative paths (no leading root) of all files considered "hot".
+  std::unordered_set<std::string> hotness_set_;
+  // Set once the "no hotness list provided" warning has been emitted.
+  // Mutable and atomic because it is written from the const categorize().
+  std::atomic<bool> mutable warned_no_list_{false};
+  hotness_categorizer_config const cfg_;
+};
+
+template <typename LoggerPolicy>
+hotness_categorizer_<LoggerPolicy>::hotness_categorizer_(
+    logger& lgr, hotness_categorizer_config const& cfg)
+    : LOG_PROXY_INIT(lgr)
+    , cfg_{cfg} {
+  auto const& file = cfg_.hotness_list;
+
+  if (file.empty()) {
+    // Nothing to load; categorize() reports the missing list on first use.
+    return;
+  }
+
+  std::ifstream ifs{file};
+  if (!ifs) {
+    DWARFS_THROW(runtime_error,
+                 fmt::format("failed to open file '{}'", file));
+  }
+
+  std::string line;
+  while (std::getline(ifs, line)) {
+    // A list written with CRLF line endings leaves a trailing '\r' on each
+    // line when read on a platform that does not translate them; keeping it
+    // would make the entry impossible to match.
+    if (!line.empty() && line.back() == '\r') {
+      line.pop_back();
+    }
+
+    // Drop the root so that "/foo/bar" and "foo/bar" name the same entry.
+    auto const path = std::filesystem::path{line}.relative_path();
+
+    // Blank lines (and a bare root) normalise to the empty path. Skip them:
+    // inserting them would make a second blank line fail the duplicate
+    // check below with a confusing error.
+    if (path.empty()) {
+      continue;
+    }
+
+    LOG_DEBUG << "hotness categorizer: adding path '" << path << "'";
+    if (!hotness_set_.emplace(path.string()).second) {
+      DWARFS_THROW(runtime_error,
+                   fmt::format("duplicate path in hotness list: '{}'", line));
+    }
+  }
+
+  if (hotness_set_.empty()) {
+    LOG_WARN << "hotness categorizer: empty hotness list";
+  }
+}
+
+template <typename LoggerPolicy>
+std::span<std::string_view const>
+hotness_categorizer_<LoggerPolicy>::categories() const {
+  static constexpr std::array const s_categories{
+      HOTNESS_CATEGORY,
+  };
+  return s_categories;
+}
+
+template <typename LoggerPolicy>
+inode_fragments hotness_categorizer_<LoggerPolicy>::categorize(
+    file_path_info const& path, std::span<uint8_t const> data,
+    category_mapper const& mapper) const {
+  inode_fragments fragments;
+
+  if (hotness_set_.empty()) {
+    // Warn only when no list was configured at all; an empty list file has
+    // already been reported by the constructor. exchange() tests and sets
+    // the flag in one atomic step, so the warning is emitted at most once
+    // even if several callers get here at the same time.
+    if (cfg_.hotness_list.empty() && !warned_no_list_.exchange(true)) {
+      LOG_WARN << "hotness categorizer: no hotness list provided";
+    }
+    return fragments;
+  }
+
+  auto const rel_path = path.relative_path();
+
+  LOG_DEBUG << "hotness categorizer: checking path '" << rel_path << "'";
+
+  if (hotness_set_.contains(rel_path.string())) {
+    fragments.emplace_back(fragment_category(mapper(HOTNESS_CATEGORY)),
+                           data.size());
+  }
+
+  return fragments;
+}
+
+template <typename LoggerPolicy>
+bool hotness_categorizer_<LoggerPolicy>::subcategory_less(
+    fragment_category a, fragment_category b) const {
+  return a.subcategory() < b.subcategory();
+}
+
+/**
+ * Factory for the "hotness" categorizer.
+ *
+ * Declares the `--hotness-list` option and stores its value in `cfg_`,
+ * which is handed to every categorizer instance created by create().
+ */
+class hotness_categorizer_factory : public categorizer_factory {
+ public:
+  hotness_categorizer_factory()
+      : opts_{std::make_shared<po::options_description>(
+            "Hotness categorizer options")} {
+    // clang-format off
+    opts_->add_options()
+      ("hotness-list",
+          po::value<std::string>(&cfg_.hotness_list)
+            ->value_name("file"),
+          "file with list of hot file paths")
+      ;
+    // clang-format on
+  }
+
+  std::string_view name() const override { return "hotness"; }
+
+  std::shared_ptr<po::options_description const> options() const override {
+    return opts_;
+  }
+
+  // The parsed option values are not read from `vm`; the option above is
+  // bound directly to cfg_.hotness_list.
+  std::unique_ptr<categorizer>
+  create(logger& lgr, po::variables_map const& /*vm*/) const override {
+    return make_unique_logging_object<categorizer, hotness_categorizer_,
+                                      logger_policies>(lgr, cfg_);
+  }
+
+ private:
+  hotness_categorizer_config cfg_;
+  std::shared_ptr<po::options_description> opts_;
+};
+
+} // namespace
+
+REGISTER_CATEGORIZER_FACTORY(hotness_categorizer_factory)
+
+} // namespace dwarfs::writer