mirror of
https://github.com/mhx/dwarfs.git
synced 2025-08-04 02:06:22 -04:00
feat: add hotness categorizer
This commit is contained in:
parent
1c7cbec1ee
commit
09068bfada
@ -146,6 +146,7 @@ add_library(
|
||||
|
||||
# src/writer/categorizer/binary_categorizer.cpp
|
||||
src/writer/categorizer/fits_categorizer.cpp
|
||||
src/writer/categorizer/hotness_categorizer.cpp
|
||||
src/writer/categorizer/incompressible_categorizer.cpp
|
||||
src/writer/categorizer/pcmaudio_categorizer.cpp
|
||||
|
||||
|
@ -401,6 +401,11 @@ Most other options are concerned with compression tuning:
|
||||
you can switch to `ascii`, which is like `unicode`, but looks less
|
||||
fancy.
|
||||
|
||||
- `--hotness-list=`*file*:
|
||||
A file containing the paths of all "hot" files for the "hotness"
categorizer, one path per line. The paths must be relative to the
`--input` path, but may start with a leading `/`.
|
||||
|
||||
- `--incompressible-min-input-size=`*value*:
|
||||
The minimum size of a file to be checked for incompressibility when
|
||||
the `incompressible` categorizer is active.
|
||||
|
@ -253,6 +253,7 @@ namespace detail {
|
||||
|
||||
void binary_categorizer_factory_registrar(categorizer_registry&);
|
||||
void fits_categorizer_factory_registrar(categorizer_registry&);
|
||||
void hotness_categorizer_factory_registrar(categorizer_registry&);
|
||||
void incompressible_categorizer_factory_registrar(categorizer_registry&);
|
||||
void libmagic_categorizer_factory_registrar(categorizer_registry&);
|
||||
void pcmaudio_categorizer_factory_registrar(categorizer_registry&);
|
||||
|
@ -429,6 +429,7 @@ categorizer_registry::categorizer_registry() {
|
||||
|
||||
// binary_categorizer_factory_registrar(*this);
|
||||
fits_categorizer_factory_registrar(*this);
|
||||
hotness_categorizer_factory_registrar(*this);
|
||||
incompressible_categorizer_factory_registrar(*this);
|
||||
// libmagic_categorizer_factory_registrar(*this);
|
||||
pcmaudio_categorizer_factory_registrar(*this);
|
||||
|
180
src/writer/categorizer/hotness_categorizer.cpp
Normal file
180
src/writer/categorizer/hotness_categorizer.cpp
Normal file
@ -0,0 +1,180 @@
|
||||
/* vim:set ts=2 sw=2 sts=2 et: */
|
||||
/**
|
||||
* \author Marcus Holland-Moritz (github@mhxnet.de)
|
||||
* \copyright Copyright (c) Marcus Holland-Moritz
|
||||
*
|
||||
* This file is part of dwarfs.
|
||||
*
|
||||
* dwarfs is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* dwarfs is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <numeric>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <fmt/format.h>
|
||||
|
||||
#include <dwarfs/error.h>
|
||||
#include <dwarfs/logger.h>
|
||||
#include <dwarfs/util.h>
|
||||
#include <dwarfs/writer/categorizer.h>
|
||||
|
||||
namespace dwarfs::writer {
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
namespace {
|
||||
|
||||
// Name of the single category emitted by the hotness categorizer.
constexpr std::string_view HOTNESS_CATEGORY{"hotness"};
|
||||
|
||||
/**
 * Settings for the hotness categorizer.
 */
struct hotness_categorizer_config {
  // Path of the file listing the "hot" files; empty when no list was given.
  std::string hotness_list{};
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
class hotness_categorizer_ final : public random_access_categorizer {
|
||||
public:
|
||||
hotness_categorizer_(logger& lgr, hotness_categorizer_config const& cfg);
|
||||
|
||||
std::span<std::string_view const> categories() const override;
|
||||
|
||||
inode_fragments
|
||||
categorize(file_path_info const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& mapper) const override;
|
||||
|
||||
bool
|
||||
subcategory_less(fragment_category a, fragment_category b) const override;
|
||||
|
||||
private:
|
||||
LOG_PROXY_DECL(LoggerPolicy);
|
||||
std::unordered_set<std::string> hotness_set_;
|
||||
std::atomic<bool> mutable warned_no_list_{false};
|
||||
hotness_categorizer_config const cfg_;
|
||||
};
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
hotness_categorizer_<LoggerPolicy>::hotness_categorizer_(
|
||||
logger& lgr, hotness_categorizer_config const& cfg)
|
||||
: LOG_PROXY_INIT(lgr)
|
||||
, cfg_{cfg} {
|
||||
auto const& file = cfg_.hotness_list;
|
||||
|
||||
if (!file.empty()) {
|
||||
std::ifstream ifs{file};
|
||||
if (!ifs) {
|
||||
DWARFS_THROW(runtime_error,
|
||||
fmt::format("failed to open file '{}'", file));
|
||||
}
|
||||
|
||||
std::string line;
|
||||
while (std::getline(ifs, line)) {
|
||||
auto const path = std::filesystem::path{line}.relative_path();
|
||||
LOG_DEBUG << "hotness categorizer: adding path '" << path << "'";
|
||||
if (!hotness_set_.emplace(path.string()).second) {
|
||||
DWARFS_THROW(runtime_error,
|
||||
fmt::format("duplicate path in hotness list: '{}'", line));
|
||||
}
|
||||
}
|
||||
|
||||
if (hotness_set_.empty()) {
|
||||
LOG_WARN << "hotness categorizer: empty hotness list";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
std::span<std::string_view const>
|
||||
hotness_categorizer_<LoggerPolicy>::categories() const {
|
||||
static constexpr std::array const s_categories{
|
||||
HOTNESS_CATEGORY,
|
||||
};
|
||||
return s_categories;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
inode_fragments hotness_categorizer_<LoggerPolicy>::categorize(
|
||||
file_path_info const& path, std::span<uint8_t const> data,
|
||||
category_mapper const& mapper) const {
|
||||
inode_fragments fragments;
|
||||
|
||||
if (!hotness_set_.empty()) {
|
||||
auto const rel_path = path.relative_path();
|
||||
|
||||
LOG_DEBUG << "hotness categorizer: checking path '" << rel_path << "'";
|
||||
|
||||
if (auto it = hotness_set_.find(rel_path.string());
|
||||
it != hotness_set_.end()) {
|
||||
fragments.emplace_back(fragment_category(mapper(HOTNESS_CATEGORY)),
|
||||
data.size());
|
||||
}
|
||||
} else if (!warned_no_list_) {
|
||||
if (cfg_.hotness_list.empty()) {
|
||||
LOG_WARN << "hotness categorizer: no hotness list provided";
|
||||
}
|
||||
warned_no_list_ = true;
|
||||
}
|
||||
|
||||
return fragments;
|
||||
}
|
||||
|
||||
template <typename LoggerPolicy>
|
||||
bool hotness_categorizer_<LoggerPolicy>::subcategory_less(
|
||||
fragment_category a, fragment_category b) const {
|
||||
return a.subcategory() < b.subcategory();
|
||||
}
|
||||
|
||||
class hotness_categorizer_factory : public categorizer_factory {
|
||||
public:
|
||||
hotness_categorizer_factory()
|
||||
: opts_{std::make_shared<po::options_description>(
|
||||
"Hotness categorizer options")} {
|
||||
// clang-format off
|
||||
opts_->add_options()
|
||||
("hotness-list",
|
||||
po::value<std::string>(&cfg_.hotness_list)
|
||||
->value_name("file"),
|
||||
"file with list of hot file paths")
|
||||
;
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
std::string_view name() const override { return "hotness"; }
|
||||
|
||||
std::shared_ptr<po::options_description const> options() const override {
|
||||
return opts_;
|
||||
}
|
||||
|
||||
std::unique_ptr<categorizer>
|
||||
create(logger& lgr, po::variables_map const& /*vm*/) const override {
|
||||
return make_unique_logging_object<categorizer, hotness_categorizer_,
|
||||
logger_policies>(lgr, cfg_);
|
||||
}
|
||||
|
||||
private:
|
||||
hotness_categorizer_config cfg_;
|
||||
std::shared_ptr<po::options_description> opts_;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
// Hooks hotness_categorizer_factory into the categorizer registry.
// NOTE(review): the macro is defined elsewhere; presumably it provides the
// `hotness_categorizer_factory_registrar` function that the
// `categorizer_registry` constructor calls - confirm against the macro.
REGISTER_CATEGORIZER_FACTORY(hotness_categorizer_factory)
|
||||
|
||||
} // namespace dwarfs::writer
|
Loading…
x
Reference in New Issue
Block a user