feat: add hotness categorizer

This commit is contained in:
Marcus Holland-Moritz 2025-04-04 15:52:42 +02:00
parent 1c7cbec1ee
commit 09068bfada
5 changed files with 188 additions and 0 deletions

View File

@ -146,6 +146,7 @@ add_library(
# src/writer/categorizer/binary_categorizer.cpp
src/writer/categorizer/fits_categorizer.cpp
src/writer/categorizer/hotness_categorizer.cpp
src/writer/categorizer/incompressible_categorizer.cpp
src/writer/categorizer/pcmaudio_categorizer.cpp

View File

@ -401,6 +401,11 @@ Most other options are concerned with compression tuning:
you can switch to `ascii`, which is like `unicode`, but looks less
fancy.
- `--hotness-list=`*file*:
A file containing the paths of all "hot" files for the `hotness`
categorizer, one path per line. The paths must be relative to the
`--input` path, but may start with a leading `/`. Listing the same
path more than once is an error.
- `--incompressible-min-input-size=`*value*:
The minimum size of a file to be checked for incompressibility when
the `incompressible` categorizer is active.

View File

@ -253,6 +253,7 @@ namespace detail {
void binary_categorizer_factory_registrar(categorizer_registry&);
void fits_categorizer_factory_registrar(categorizer_registry&);
void hotness_categorizer_factory_registrar(categorizer_registry&);
void incompressible_categorizer_factory_registrar(categorizer_registry&);
void libmagic_categorizer_factory_registrar(categorizer_registry&);
void pcmaudio_categorizer_factory_registrar(categorizer_registry&);

View File

@ -429,6 +429,7 @@ categorizer_registry::categorizer_registry() {
// binary_categorizer_factory_registrar(*this);
fits_categorizer_factory_registrar(*this);
hotness_categorizer_factory_registrar(*this);
incompressible_categorizer_factory_registrar(*this);
// libmagic_categorizer_factory_registrar(*this);
pcmaudio_categorizer_factory_registrar(*this);

View File

@ -0,0 +1,180 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <array>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <filesystem>
#include <fstream>
#include <memory>
#include <numeric>
#include <span>
#include <string>
#include <string_view>
#include <unordered_set>
#include <vector>

#include <boost/program_options.hpp>

#include <fmt/format.h>

#include <dwarfs/error.h>
#include <dwarfs/logger.h>
#include <dwarfs/util.h>
#include <dwarfs/writer/categorizer.h>
namespace dwarfs::writer {
namespace po = boost::program_options;
namespace {
/// Name of the one category the hotness categorizer assigns.
constexpr std::string_view HOTNESS_CATEGORY{"hotness"};
/// Settings of the hotness categorizer.
struct hotness_categorizer_config {
  /// Name of the file listing the hot paths; empty when none was set.
  std::string hotness_list{};
};
/**
 * Categorizer that assigns the "hotness" category to a file when the
 * file's relative path is contained in the configured hotness list.
 *
 * The list is loaded by the constructor; categorize() only looks paths
 * up in it.
 */
template <typename LoggerPolicy>
class hotness_categorizer_ final : public random_access_categorizer {
public:
// Loads the list named by `cfg.hotness_list` if that name is non-empty.
hotness_categorizer_(logger& lgr, hotness_categorizer_config const& cfg);
// Names of the categories this categorizer can assign.
std::span<std::string_view const> categories() const override;
// Returns a single "hotness" fragment for listed files, nothing otherwise.
inode_fragments
categorize(file_path_info const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const override;
// Orders two fragment categories by their subcategory value.
bool
subcategory_less(fragment_category a, fragment_category b) const override;
private:
LOG_PROXY_DECL(LoggerPolicy);
// Relative paths read from the hotness list file.
std::unordered_set<std::string> hotness_set_;
// Set by categorize() the first time it runs with an empty hotness_set_;
// `mutable` because categorize() is a const member function.
std::atomic<bool> mutable warned_no_list_{false};
// Copy of the configuration passed to the constructor.
hotness_categorizer_config const cfg_;
};
/**
 * Construct the categorizer and, if `cfg.hotness_list` names a file, load
 * the set of hot paths from that file.
 *
 * The file is read line by line, one path per line. A trailing carriage
 * return (left over from a CRLF line ending) is stripped and blank lines
 * are skipped. Every remaining line is reduced to its `relative_path()`,
 * so a leading `/` is ignored.
 *
 * A warning is logged if the file does not yield any path.
 *
 * \param lgr  Logger used for debug and warning output.
 * \param cfg  Categorizer configuration; a copy is kept in `cfg_`.
 *
 * An error is raised through DWARFS_THROW(runtime_error, ...) if the file
 * cannot be opened, if reading it fails, or if the same path is listed
 * more than once.
 */
template <typename LoggerPolicy>
hotness_categorizer_<LoggerPolicy>::hotness_categorizer_(
    logger& lgr, hotness_categorizer_config const& cfg)
    : LOG_PROXY_INIT(lgr)
    , cfg_{cfg} {
  auto const& file = cfg_.hotness_list;

  if (file.empty()) {
    // No list configured; categorize() warns about this on first use.
    return;
  }

  std::ifstream ifs{file};

  if (!ifs) {
    DWARFS_THROW(runtime_error,
                 fmt::format("failed to open file '{}'", file));
  }

  std::string line;

  while (std::getline(ifs, line)) {
    // A list written with CRLF line endings would otherwise keep the '\r'
    // as part of the path and never match any file.
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }

    // Blank lines carry no path. Without this check the first one would
    // add an empty entry and the second one would be rejected as a
    // duplicate.
    if (line.empty()) {
      continue;
    }

    auto const path = std::filesystem::path{line}.relative_path();

    LOG_DEBUG << "hotness categorizer: adding path '" << path << "'";

    if (!hotness_set_.emplace(path.string()).second) {
      DWARFS_THROW(runtime_error,
                   fmt::format("duplicate path in hotness list: '{}'", line));
    }
  }

  // getline() stops on a hard I/O error as well as on end-of-file; do not
  // mistake a partially read list for a complete one.
  if (ifs.bad()) {
    DWARFS_THROW(runtime_error,
                 fmt::format("failed to read file '{}'", file));
  }

  if (hotness_set_.empty()) {
    LOG_WARN << "hotness categorizer: empty hotness list";
  }
}
/**
 * List the categories this categorizer can assign.
 *
 * \returns A view of a static one-element array holding HOTNESS_CATEGORY.
 */
template <typename LoggerPolicy>
std::span<std::string_view const>
hotness_categorizer_<LoggerPolicy>::categories() const {
  // Static storage, so the returned span stays valid after the call.
  static constexpr std::array<std::string_view, 1> const category_names{
      HOTNESS_CATEGORY};

  return category_names;
}
/**
 * Categorize a single file.
 *
 * \param path    Path information of the file; its relative path is looked
 *                up in the hotness list.
 * \param data    Contents of the file; only the size is used.
 * \param mapper  Translates a category name into the value a
 *                fragment_category is constructed from.
 *
 * \returns One "hotness" fragment covering `data.size()` bytes if the
 *          file's relative path is in the hotness list, otherwise an empty
 *          inode_fragments.
 *
 * If no path was loaded, nothing is categorized. In that case a warning is
 * logged once per categorizer instance, provided no hotness list file was
 * configured at all.
 */
template <typename LoggerPolicy>
inode_fragments hotness_categorizer_<LoggerPolicy>::categorize(
    file_path_info const& path, std::span<uint8_t const> data,
    category_mapper const& mapper) const {
  inode_fragments fragments;

  if (hotness_set_.empty()) {
    // The plain read keeps the steady state free of writes to the shared
    // flag. exchange() then makes "test and set" one atomic step, so of
    // several callers racing here exactly one gets to emit the warning.
    if (!warned_no_list_ && !warned_no_list_.exchange(true)) {
      if (cfg_.hotness_list.empty()) {
        LOG_WARN << "hotness categorizer: no hotness list provided";
      }
    }

    return fragments;
  }

  auto const rel_path = path.relative_path();

  LOG_DEBUG << "hotness categorizer: checking path '" << rel_path << "'";

  if (hotness_set_.contains(rel_path.string())) {
    fragments.emplace_back(fragment_category(mapper(HOTNESS_CATEGORY)),
                           data.size());
  }

  return fragments;
}
/**
 * Ordering predicate for two fragment categories.
 *
 * \returns true if the subcategory of `a` compares less than that of `b`.
 */
template <typename LoggerPolicy>
bool hotness_categorizer_<LoggerPolicy>::subcategory_less(
    fragment_category a, fragment_category b) const {
  auto const lhs = a.subcategory();
  auto const rhs = b.subcategory();

  return lhs < rhs;
}
/**
 * Factory for the "hotness" categorizer.
 *
 * Declares the `--hotness-list` option and hands the configuration that
 * option writes into to every categorizer it creates.
 */
class hotness_categorizer_factory : public categorizer_factory {
 public:
  hotness_categorizer_factory()
      : opts_{std::make_shared<po::options_description>(
            "Hotness categorizer options")} {
    // The option value is bound to cfg_.hotness_list, so create() does not
    // need to consult the variables map.
    auto* const list_value = po::value<std::string>(&cfg_.hotness_list);
    list_value->value_name("file");

    opts_->add_options()("hotness-list", list_value,
                         "file with list of hot file paths");
  }

  /// Name of this categorizer.
  std::string_view name() const override { return "hotness"; }

  /// Command line options understood by this categorizer.
  std::shared_ptr<po::options_description const> options() const override {
    return opts_;
  }

  /// Create a categorizer from the current contents of cfg_.
  std::unique_ptr<categorizer>
  create(logger& lgr, po::variables_map const& /*vm*/) const override {
    return make_unique_logging_object<categorizer, hotness_categorizer_,
                                      logger_policies>(lgr, cfg_);
  }

 private:
  hotness_categorizer_config cfg_;
  std::shared_ptr<po::options_description> opts_;
};
} // namespace
// Hooks the factory into the categorizer registry.
// NOTE(review): presumably this expands to the definition of
// hotness_categorizer_factory_registrar() - confirm against the macro.
REGISTER_CATEGORIZER_FACTORY(hotness_categorizer_factory)
} // namespace dwarfs::writer