Switch categorizers to returning inode fragments

This commit is contained in:
Marcus Holland-Moritz 2023-07-18 12:29:13 +02:00
parent 1ac36bb6fa
commit 33b2dfb95f
12 changed files with 312 additions and 86 deletions

View File

@ -373,6 +373,7 @@ list(
src/dwarfs/fstypes.cpp src/dwarfs/fstypes.cpp
src/dwarfs/fs_section.cpp src/dwarfs/fs_section.cpp
src/dwarfs/global_entry_data.cpp src/dwarfs/global_entry_data.cpp
src/dwarfs/inode_fragments.cpp
src/dwarfs/inode_manager.cpp src/dwarfs/inode_manager.cpp
src/dwarfs/inode_reader_v2.cpp src/dwarfs/inode_reader_v2.cpp
src/dwarfs/logger.cpp src/dwarfs/logger.cpp

View File

@ -23,14 +23,14 @@
#include <cstdint> #include <cstdint>
#include <filesystem> #include <filesystem>
#include <functional>
#include <limits> #include <limits>
#include <map> #include <map>
#include <memory> #include <memory>
#include <optional>
#include <span> #include <span>
#include <string_view> #include <string_view>
#include "dwarfs/file_category.h" #include "dwarfs/inode_fragments.h"
namespace boost::program_options { namespace boost::program_options {
class options_description; class options_description;
@ -41,6 +41,9 @@ namespace dwarfs {
class logger; class logger;
using category_mapper =
std::function<fragment_category::value_type(std::string_view)>;
class categorizer { class categorizer {
public: public:
virtual ~categorizer() = default; virtual ~categorizer() = default;
@ -50,23 +53,26 @@ class categorizer {
class random_access_categorizer : public categorizer { class random_access_categorizer : public categorizer {
public: public:
virtual std::optional<std::string_view> virtual inode_fragments
categorize(std::filesystem::path const& path, categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
std::span<uint8_t const> data) const = 0; category_mapper const& mapper) const = 0;
}; };
// TODO: add a call to check whether a categorizer can return multiple
// fragments; if it *can*, we must run it before we start similarity hashing
class sequential_categorizer_job { class sequential_categorizer_job {
public: public:
virtual ~sequential_categorizer_job() = default; virtual ~sequential_categorizer_job() = default;
virtual void add(std::span<uint8_t const> data) = 0; virtual void add(std::span<uint8_t const> data) = 0;
virtual std::optional<std::string_view> result() = 0; virtual inode_fragments result() = 0;
}; };
class sequential_categorizer : public categorizer { class sequential_categorizer : public categorizer {
public: public:
virtual std::unique_ptr<sequential_categorizer_job> virtual std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const = 0; job(std::filesystem::path const& path, size_t total_size,
category_mapper const& mapper) const = 0;
}; };
class categorizer_job { class categorizer_job {
@ -76,6 +82,10 @@ class categorizer_job {
categorizer_job(); categorizer_job();
categorizer_job(std::unique_ptr<impl> impl); categorizer_job(std::unique_ptr<impl> impl);
// Hands the total input size to the job implementation. impl_ is
// dereferenced without a null check here; operator bool() on this class
// reports whether an implementation is attached.
void set_total_size(size_t total_size) {
  impl_->set_total_size(total_size);
}
void categorize_random_access(std::span<uint8_t const> data) { void categorize_random_access(std::span<uint8_t const> data) {
return impl_->categorize_random_access(data); return impl_->categorize_random_access(data);
} }
@ -84,7 +94,7 @@ class categorizer_job {
return impl_->categorize_sequential(data); return impl_->categorize_sequential(data);
} }
file_category result() { return impl_->result(); } inode_fragments result() { return impl_->result(); }
explicit operator bool() const { return impl_ != nullptr; } explicit operator bool() const { return impl_ != nullptr; }
@ -92,9 +102,10 @@ class categorizer_job {
public: public:
virtual ~impl() = default; virtual ~impl() = default;
virtual void set_total_size(size_t total_size) = 0;
virtual void categorize_random_access(std::span<uint8_t const> data) = 0; virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
virtual void categorize_sequential(std::span<uint8_t const> data) = 0; virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
virtual file_category result() = 0; virtual inode_fragments result() = 0;
}; };
private: private:
@ -111,7 +122,7 @@ class categorizer_manager {
return impl_->job(path); return impl_->job(path);
} }
std::string_view category_name(file_category c) const { std::string_view category_name(fragment_category::value_type c) const {
return impl_->category_name(c); return impl_->category_name(c);
} }
@ -121,7 +132,8 @@ class categorizer_manager {
virtual void add(std::shared_ptr<categorizer const> c) = 0; virtual void add(std::shared_ptr<categorizer const> c) = 0;
virtual categorizer_job job(std::filesystem::path const& path) const = 0; virtual categorizer_job job(std::filesystem::path const& path) const = 0;
virtual std::string_view category_name(file_category c) const = 0; virtual std::string_view
category_name(fragment_category::value_type c) const = 0;
}; };
private: private:

View File

@ -21,12 +21,13 @@
#pragma once #pragma once
#include <cassert>
#include <cstdint> #include <cstdint>
#include <limits> #include <limits>
namespace dwarfs { namespace dwarfs {
class file_category { class fragment_category {
public: public:
using value_type = uint32_t; using value_type = uint32_t;
@ -36,37 +37,60 @@ class file_category {
static constexpr value_type const max{std::numeric_limits<value_type>::max() - static constexpr value_type const max{std::numeric_limits<value_type>::max() -
1}; 1};
file_category() fragment_category() = default;
: value_{uninitialized} {}
file_category(value_type v) explicit fragment_category(value_type v)
: value_{v} {} : value_{v} {}
file_category(file_category const&) = default; fragment_category(value_type v, value_type subcategory)
file_category(file_category&&) = default; : value_{v}
, subcategory_{subcategory} {}
file_category& operator=(file_category const&) = default; fragment_category(fragment_category const&) = default;
file_category& operator=(file_category&&) = default; fragment_category(fragment_category&&) = default;
file_category& operator=(value_type v) { fragment_category& operator=(fragment_category const&) = default;
fragment_category& operator=(fragment_category&&) = default;
fragment_category& operator=(value_type v) {
assert(v != uninitialized);
value_ = v; value_ = v;
return *this; return *this;
} }
value_type value() const { value_type value() const {
if (empty()) { assert(!empty());
throw std::range_error("file_category is uninitialized");
}
return value_; return value_;
} }
void clear() { value_ = uninitialized; } void clear() {
value_ = uninitialized;
subcategory_ = uninitialized;
}
bool empty() const { return value_ == uninitialized; } bool empty() const { return value_ == uninitialized; }
explicit operator bool() const { return !empty(); } explicit operator bool() const { return !empty(); }
// Assigns a subcategory to this category.
//
// Preconditions are checked with assert() only: the category itself must
// not be empty, and `subcategory` must not be the `uninitialized` sentinel.
void set_subcategory(value_type subcategory) {
assert(!empty());
assert(subcategory != uninitialized);
subcategory_ = subcategory;
}
// Reports whether a subcategory is available: the category must be set
// and the stored subcategory must differ from the `uninitialized` sentinel.
bool has_subcategory() const {
  if (empty()) {
    return false;
  }
  return subcategory_ != uninitialized;
}
// Returns the stored subcategory.
//
// Both conditions are checked with assert() only: the category must not be
// empty and a subcategory must have been assigned. Use has_subcategory()
// to test for this beforehand.
value_type subcategory() const {
assert(!empty());
assert(subcategory_ != uninitialized);
return subcategory_;
}
private: private:
value_type value_; value_type value_{uninitialized};
value_type subcategory_{uninitialized};
}; };
} // namespace dwarfs } // namespace dwarfs

View File

@ -27,7 +27,7 @@
#include <folly/small_vector.h> #include <folly/small_vector.h>
#include "dwarfs/file_category.h" #include "dwarfs/inode_fragments.h"
#include "dwarfs/nilsimsa.h" #include "dwarfs/nilsimsa.h"
#include "dwarfs/object.h" #include "dwarfs/object.h"
@ -59,7 +59,7 @@ class inode : public object {
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0; virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
virtual void virtual void
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0; append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
virtual file_category category() const = 0; virtual inode_fragments const& fragments() const = 0;
}; };
} // namespace dwarfs } // namespace dwarfs

View File

@ -0,0 +1,83 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <functional>
#include <iosfwd>
#include <span>
#include <string>
#include <folly/small_vector.h>
#include "dwarfs/fragment_category.h"
#include "dwarfs/types.h"
namespace dwarfs {
// Pairs a fragment category with a length.
//
// NOTE(review): the length is presumably a byte count within the inode's
// data; the unit is not stated in this header - confirm against callers.
class single_inode_fragment {
 public:
  // Stores `category` and `length` as given; no validation is performed.
  single_inode_fragment(fragment_category category, file_off_t length)
      : category_{category}
      , length_{length} {}

  // Category that was assigned to this fragment.
  fragment_category category() const { return category_; }

  // Length that was passed to the constructor.
  file_off_t length() const { return length_; }

  // Same value as length(), offered under a container-like name.
  file_off_t size() const { return length(); }

 private:
  fragment_category category_;
  file_off_t length_;
};
// Sequence of single_inode_fragment entries describing one inode.
//
// An instance with no entries is "empty" and converts to false in a boolean
// context. Entries are kept in the order in which they were appended.
class inode_fragments {
 public:
  // Optional translation from a numeric category value to a printable name,
  // used by to_stream() / to_string().
  using mapper_function_type =
      std::function<std::string(fragment_category::value_type)>;

  inode_fragments() = default;

  // Appends a new fragment built from `category` and `length` and returns
  // a reference to the stored element.
  single_inode_fragment&
  emplace_back(fragment_category category, file_off_t length) {
    return fragments_.emplace_back(category, length);
  }

  // Read-only view over all stored fragments.
  std::span<single_inode_fragment const> span() const {
    return std::span<single_inode_fragment const>{fragments_.data(),
                                                  fragments_.size()};
  }

  // True when no fragment has been added.
  bool empty() const { return fragments_.empty(); }

  // True when at least one fragment is present.
  explicit operator bool() const { return !fragments_.empty(); }

  // Writes a textual representation to `os` and returns `os`. When `mapper`
  // holds a callable, category values are printed through it; otherwise the
  // numeric value is printed.
  std::ostream&
  to_stream(std::ostream& os,
            mapper_function_type const& mapper = mapper_function_type{}) const;

  // Same representation as to_stream(), returned as a string.
  std::string
  to_string(mapper_function_type const& mapper = mapper_function_type{}) const;

 private:
  folly::small_vector<single_inode_fragment, 1> fragments_;
};
// Stream insertion for inode_fragments. No category mapper is supplied, so
// categories are printed as their numeric values.
inline std::ostream& operator<<(std::ostream& os, inode_fragments const& frag) {
  return frag.to_stream(os, inode_fragments::mapper_function_type{});
}
} // namespace dwarfs

View File

@ -28,7 +28,7 @@
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "dwarfs/file_category.h" #include "dwarfs/fragment_category.h"
namespace dwarfs { namespace dwarfs {
@ -59,7 +59,8 @@ class inode_manager {
impl_->for_each_inode_in_order(fn); impl_->for_each_inode_in_order(fn);
} }
std::vector<std::pair<file_category, size_t>> category_counts() const { std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const {
return impl_->category_counts(); return impl_->category_counts();
} }
@ -74,7 +75,7 @@ class inode_manager {
file_order_options const& file_order, order_cb const& fn) = 0; file_order_options const& file_order, order_cb const& fn) = 0;
virtual void for_each_inode_in_order( virtual void for_each_inode_in_order(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0; std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
virtual std::vector<std::pair<file_category, size_t>> virtual std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const = 0; category_counts() const = 0;
}; };

View File

@ -35,19 +35,20 @@
namespace dwarfs { namespace dwarfs {
using namespace std::placeholders;
namespace po = boost::program_options; namespace po = boost::program_options;
namespace { namespace {
constexpr std::string_view const DEFAULT_CATEGORY{"<default>"}; constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};
} }
class categorizer_manager_private { class categorizer_manager_private : public categorizer_manager::impl {
public: public:
virtual ~categorizer_manager_private() = default;
virtual std::vector<std::shared_ptr<categorizer const>> const& virtual std::vector<std::shared_ptr<categorizer const>> const&
categorizers() const = 0; categorizers() const = 0;
virtual file_category category(std::string_view cat) const = 0; virtual fragment_category::value_type
category(std::string_view cat) const = 0;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -57,39 +58,48 @@ class categorizer_job_ final : public categorizer_job::impl {
std::filesystem::path const& path) std::filesystem::path const& path)
: LOG_PROXY_INIT(lgr) : LOG_PROXY_INIT(lgr)
, mgr_{mgr} , mgr_{mgr}
, path_{path} {} , path_{path}
, cat_mapper_{std::bind(&categorizer_manager_private::category,
std::cref(mgr_), _1)} {}
void set_total_size(size_t total_size) override;
void categorize_random_access(std::span<uint8_t const> data) override; void categorize_random_access(std::span<uint8_t const> data) override;
void categorize_sequential(std::span<uint8_t const> data) override; void categorize_sequential(std::span<uint8_t const> data) override;
file_category result() override; inode_fragments result() override;
private: private:
LOG_PROXY_DECL(LoggerPolicy); LOG_PROXY_DECL(LoggerPolicy);
categorizer_manager_private const& mgr_; categorizer_manager_private const& mgr_;
std::string_view best_{DEFAULT_CATEGORY}; inode_fragments best_;
int index_{-1}; int index_{-1};
bool is_global_best_{false}; bool is_global_best_{false};
size_t total_size_hint_{0}; size_t total_size_{0};
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>> std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
seq_jobs_; seq_jobs_;
std::filesystem::path const path_; std::filesystem::path const path_;
category_mapper cat_mapper_;
}; };
// Remembers the total input size for this job. The stored value is what
// categorize_sequential() later passes to sequential_categorizer::job().
template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::set_total_size(size_t total_size) {
  this->total_size_ = total_size;
}
template <typename LoggerPolicy> template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::categorize_random_access( void categorizer_job_<LoggerPolicy>::categorize_random_access(
std::span<uint8_t const> data) { std::span<uint8_t const> data) {
DWARFS_CHECK(index_ < 0, DWARFS_CHECK(index_ < 0,
"internal error: index already set in categorize_random_access"); "internal error: index already set in categorize_random_access");
total_size_hint_ = data.size(); total_size_ = data.size();
bool global_best = true; bool global_best = true;
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) { for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) { if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
if (auto c = p->categorize(path_, data)) { if (auto c = p->categorize(path_, data, cat_mapper_)) {
best_ = *c; best_ = c;
index_ = index; index_ = index;
is_global_best_ = global_best; is_global_best_ = global_best;
break; break;
@ -114,7 +124,7 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
} }
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) { if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
if (auto job = p->job(path_, total_size_hint_)) { if (auto job = p->job(path_, total_size_, cat_mapper_)) {
seq_jobs_.emplace_back(index, std::move(job)); seq_jobs_.emplace_back(index, std::move(job));
} }
} }
@ -127,12 +137,12 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
file_category categorizer_job_<LoggerPolicy>::result() { inode_fragments categorizer_job_<LoggerPolicy>::result() {
if (!seq_jobs_.empty()) { if (!seq_jobs_.empty()) {
for (auto&& [index, job] : seq_jobs_) { for (auto&& [index, job] : seq_jobs_) {
if (auto c = job->result()) { if (auto c = job->result()) {
assert(index_ < 0 || index < index_); assert(index_ < 0 || index < index_);
best_ = *c; best_ = c;
break; break;
} }
} }
@ -140,9 +150,12 @@ file_category categorizer_job_<LoggerPolicy>::result() {
seq_jobs_.clear(); seq_jobs_.clear();
} }
LOG_TRACE << path_ << " -> " << best_; LOG_TRACE << path_ << " -> "
<< best_.to_string([this](fragment_category::value_type c) {
return std::string(mgr_.category_name(c));
});
return mgr_.category(best_); return best_;
} }
categorizer_job::categorizer_job() = default; categorizer_job::categorizer_job() = default;
@ -151,8 +164,7 @@ categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
: impl_{std::move(impl)} {} : impl_{std::move(impl)} {}
template <typename LoggerPolicy> template <typename LoggerPolicy>
class categorizer_manager_ final : public categorizer_manager::impl, class categorizer_manager_ final : public categorizer_manager_private {
public categorizer_manager_private {
public: public:
categorizer_manager_(logger& lgr) categorizer_manager_(logger& lgr)
: lgr_{lgr} : lgr_{lgr}
@ -162,14 +174,15 @@ class categorizer_manager_ final : public categorizer_manager::impl,
void add(std::shared_ptr<categorizer const> c) override; void add(std::shared_ptr<categorizer const> c) override;
categorizer_job job(std::filesystem::path const& path) const override; categorizer_job job(std::filesystem::path const& path) const override;
std::string_view category_name(file_category c) const override; std::string_view
category_name(fragment_category::value_type c) const override;
std::vector<std::shared_ptr<categorizer const>> const& std::vector<std::shared_ptr<categorizer const>> const&
categorizers() const override { categorizers() const override {
return categorizers_; return categorizers_;
} }
file_category category(std::string_view cat) const override { fragment_category::value_type category(std::string_view cat) const override {
auto it = catmap_.find(cat); auto it = catmap_.find(cat);
DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat)); DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat));
return it->second; return it->second;
@ -188,7 +201,7 @@ class categorizer_manager_ final : public categorizer_manager::impl,
LOG_PROXY_DECL(LoggerPolicy); LOG_PROXY_DECL(LoggerPolicy);
std::vector<std::shared_ptr<categorizer const>> categorizers_; std::vector<std::shared_ptr<categorizer const>> categorizers_;
std::vector<std::string_view> categories_; std::vector<std::string_view> categories_;
std::unordered_map<std::string_view, file_category> catmap_; std::unordered_map<std::string_view, fragment_category::value_type> catmap_;
}; };
template <typename LoggerPolicy> template <typename LoggerPolicy>
@ -210,9 +223,9 @@ categorizer_job categorizer_manager_<LoggerPolicy>::job(
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
std::string_view std::string_view categorizer_manager_<LoggerPolicy>::category_name(
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const { fragment_category::value_type c) const {
return DWARFS_NOTHROW(categories_.at(c.value())); return DWARFS_NOTHROW(categories_.at(c));
} }
categorizer_manager::categorizer_manager(logger& lgr) categorizer_manager::categorizer_manager(logger& lgr)

View File

@ -58,9 +58,9 @@ class binary_categorizer_ final : public binary_categorizer_base {
binary_categorizer_(logger& lgr) binary_categorizer_(logger& lgr)
: LOG_PROXY_INIT(lgr) {} : LOG_PROXY_INIT(lgr) {}
std::optional<std::string_view> inode_fragments
categorize(std::filesystem::path const& path, categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
std::span<uint8_t const> data) const override; category_mapper const& mapper) const override;
private: private:
LOG_PROXY_DECL(LoggerPolicy); LOG_PROXY_DECL(LoggerPolicy);
@ -74,10 +74,12 @@ std::span<std::string_view const> binary_categorizer_base::categories() const {
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
std::optional<std::string_view> inode_fragments binary_categorizer_<LoggerPolicy>::categorize(
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&, std::filesystem::path const&,
std::span<uint8_t const> data std::span<uint8_t const> data [[maybe_unused]],
[[maybe_unused]]) const { category_mapper const& /*mapper*/) const {
inode_fragments fragments;
#ifndef _WIN32 #ifndef _WIN32
auto p = data.data(); auto p = data.data();
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) { if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
@ -101,7 +103,7 @@ binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
} }
#endif #endif
return std::nullopt; return fragments;
} }
class binary_categorizer_factory : public categorizer_factory { class binary_categorizer_factory : public categorizer_factory {

View File

@ -57,10 +57,12 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
incompressible_categorizer_job_(logger& lgr, incompressible_categorizer_job_(logger& lgr,
incompressible_categorizer_config const& cfg, incompressible_categorizer_config const& cfg,
std::filesystem::path const& path, std::filesystem::path const& path,
size_t total_size) size_t total_size,
category_mapper const& mapper)
: LOG_PROXY_INIT(lgr) : LOG_PROXY_INIT(lgr)
, cfg_{cfg} , cfg_{cfg}
, path_{path} { , path_{path}
, mapper_{mapper} {
input_.reserve(total_size < block_size ? total_size : block_size); input_.reserve(total_size < block_size ? total_size : block_size);
state_ = ::malloc(LZ4_sizeofState()); state_ = ::malloc(LZ4_sizeofState());
} }
@ -77,7 +79,8 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
} }
} }
std::optional<std::string_view> result() override { inode_fragments result() override {
inode_fragments fragments;
if (!input_.empty()) { if (!input_.empty()) {
compress(); compress();
} }
@ -88,9 +91,11 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
if (total_blocks_ > 0 && if (total_blocks_ > 0 &&
(total_output_size_ >= cfg_.max_ratio_size * total_input_size_ || (total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) { incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
return INCOMPRESSIBLE_CATEGORY; fragments.emplace_back(
fragment_category(mapper_(INCOMPRESSIBLE_CATEGORY)),
total_input_size_);
} }
return std::nullopt; return fragments;
} }
private: private:
@ -139,6 +144,7 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
size_t incompressible_blocks_{0}; size_t incompressible_blocks_{0};
incompressible_categorizer_config const& cfg_; incompressible_categorizer_config const& cfg_;
std::filesystem::path const& path_; std::filesystem::path const& path_;
category_mapper const& mapper_;
}; };
class incompressible_categorizer_ final : public sequential_categorizer { class incompressible_categorizer_ final : public sequential_categorizer {
@ -148,7 +154,8 @@ class incompressible_categorizer_ final : public sequential_categorizer {
std::span<std::string_view const> categories() const override; std::span<std::string_view const> categories() const override;
std::unique_ptr<sequential_categorizer_job> std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const override; job(std::filesystem::path const& path, size_t total_size,
category_mapper const& mapper) const override;
private: private:
logger& lgr_; logger& lgr_;
@ -170,7 +177,8 @@ incompressible_categorizer_::categories() const {
std::unique_ptr<sequential_categorizer_job> std::unique_ptr<sequential_categorizer_job>
incompressible_categorizer_::job(std::filesystem::path const& path, incompressible_categorizer_::job(std::filesystem::path const& path,
size_t total_size) const { size_t total_size,
category_mapper const& mapper) const {
if (total_size < config_.min_input_size) { if (total_size < config_.min_input_size) {
return nullptr; return nullptr;
} }
@ -178,7 +186,7 @@ incompressible_categorizer_::job(std::filesystem::path const& path,
return make_unique_logging_object<sequential_categorizer_job, return make_unique_logging_object<sequential_categorizer_job,
incompressible_categorizer_job_, incompressible_categorizer_job_,
logger_policies>(lgr_, config_, path, logger_policies>(lgr_, config_, path,
total_size); total_size, mapper);
} }
class incompressible_categorizer_factory : public categorizer_factory { class incompressible_categorizer_factory : public categorizer_factory {

View File

@ -143,9 +143,9 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base {
} }
} }
std::optional<std::string_view> inode_fragments
categorize(std::filesystem::path const& path, categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
std::span<uint8_t const> data) const override; category_mapper const& mapper) const override;
private: private:
LOG_PROXY_DECL(LoggerPolicy); LOG_PROXY_DECL(LoggerPolicy);
@ -162,15 +162,17 @@ libmagic_categorizer_base::categories() const {
} }
template <typename LoggerPolicy> template <typename LoggerPolicy>
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize( inode_fragments libmagic_categorizer_<LoggerPolicy>::categorize(
std::filesystem::path const& path, std::span<uint8_t const> data) const { std::filesystem::path const& path, std::span<uint8_t const> data,
category_mapper const& /*mapper*/) const {
inode_fragments fragments; // TODO: actually fill this :-)
auto id = m_.identify(data); auto id = m_.identify(data);
LOG_DEBUG << path << " -> (magic) " << id; LOG_DEBUG << path << " -> (magic) " << id;
{ {
auto wlock = mimetypes_.wlock(); auto wlock = mimetypes_.wlock();
++(*wlock)[id]; ++(*wlock)[id];
} }
return std::nullopt; return fragments;
} }
class libmagic_categorizer_factory : public categorizer_factory { class libmagic_categorizer_factory : public categorizer_factory {

View File

@ -0,0 +1,74 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <ostream>
#include <sstream>
#include "dwarfs/inode_fragments.h"
namespace dwarfs {
// Writes a human-readable description of the fragment list to `os`.
//
// Output is "(empty)" when there are no fragments. Otherwise it is a
// bracketed, comma-separated list with one "(category, size)" entry per
// fragment; "/<subcategory>" is appended to the category when one is set.
// A non-empty `mapper` is used to print the category; without it the raw
// numeric value is printed. Returns `os`.
std::ostream&
inode_fragments::to_stream(std::ostream& os,
                           mapper_function_type const& mapper) const {
  if (empty()) {
    return os << "(empty)";
  }

  auto const all = span();

  os << "[";

  for (std::size_t i = 0; i < all.size(); ++i) {
    // Separator goes before every entry except the first one.
    if (i > 0) {
      os << ", ";
    }

    auto const& fragment = all[i];
    auto const category = fragment.category();

    os << "(";

    if (mapper) {
      os << mapper(category.value());
    } else {
      os << category.value();
    }

    if (category.has_subcategory()) {
      os << "/" << category.subcategory();
    }

    os << ", " << fragment.size() << ")";
  }

  os << "]";

  return os;
}
// Renders the fragment list through to_stream() into a temporary string
// stream and returns the resulting text.
std::string
inode_fragments::to_string(mapper_function_type const& mapper) const {
  std::ostringstream buffer;
  to_stream(buffer, mapper);
  return buffer.str();
}
} // namespace dwarfs

View File

@ -155,6 +155,7 @@ class inode_ : public inode {
if (mm) { if (mm) {
if (catjob) { if (catjob) {
catjob.set_total_size(mm->size());
catjob.categorize_random_access(mm->span()); catjob.categorize_random_access(mm->span());
} }
@ -201,7 +202,7 @@ class inode_ : public inode {
} }
if (catjob) { if (catjob) {
category_ = catjob.result(); fragments_ = catjob.result();
} }
} }
@ -228,12 +229,14 @@ class inode_ : public inode {
vec.insert(vec.end(), chunks_.begin(), chunks_.end()); vec.insert(vec.end(), chunks_.begin(), chunks_.end());
} }
file_category category() const override { return category_; } inode_fragments const& fragments() const override { return fragments_; }
private: private:
// TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of
// here?
std::optional<uint32_t> num_; std::optional<uint32_t> num_;
uint32_t similarity_hash_{0}; uint32_t similarity_hash_{0};
file_category category_; inode_fragments fragments_;
files_vector files_; files_vector files_;
std::vector<chunk_type> chunks_; std::vector<chunk_type> chunks_;
nilsimsa::hash_type nilsimsa_similarity_hash_; nilsimsa::hash_type nilsimsa_similarity_hash_;
@ -278,23 +281,26 @@ class inode_manager_ final : public inode_manager::impl {
} }
} }
std::vector<std::pair<file_category, size_t>> std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const override { category_counts() const override {
std::unordered_map<file_category::value_type, size_t> tmp; std::unordered_map<fragment_category::value_type, size_t> tmp;
for (auto const& i : inodes_) { for (auto const& i : inodes_) {
++tmp[i->category().value()]; if (auto const& fragments = i->fragments(); !fragments.empty()) {
for (auto const& frag : fragments.span()) {
++tmp[frag.category().value()];
}
}
} }
std::vector<std::pair<file_category, size_t>> rv; std::vector<std::pair<fragment_category::value_type, size_t>> rv;
for (auto const& [k, v] : tmp) { for (auto const& [k, v] : tmp) {
rv.emplace_back(k, v); rv.emplace_back(k, v);
} }
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) { std::sort(rv.begin(), rv.end(),
return a.first.value() < b.first.value(); [](auto const& a, auto const& b) { return a.first < b.first; });
});
return rv; return rv;
} }