Switch categorizers to returning inode fragments

This commit is contained in:
Marcus Holland-Moritz 2023-07-18 12:29:13 +02:00
parent 1ac36bb6fa
commit 33b2dfb95f
12 changed files with 312 additions and 86 deletions

View File

@ -373,6 +373,7 @@ list(
src/dwarfs/fstypes.cpp
src/dwarfs/fs_section.cpp
src/dwarfs/global_entry_data.cpp
src/dwarfs/inode_fragments.cpp
src/dwarfs/inode_manager.cpp
src/dwarfs/inode_reader_v2.cpp
src/dwarfs/logger.cpp

View File

@ -23,14 +23,14 @@
#include <cstdint>
#include <filesystem>
#include <functional>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <span>
#include <string_view>
#include "dwarfs/file_category.h"
#include "dwarfs/inode_fragments.h"
namespace boost::program_options {
class options_description;
@ -41,6 +41,9 @@ namespace dwarfs {
class logger;
using category_mapper =
std::function<fragment_category::value_type(std::string_view)>;
class categorizer {
public:
virtual ~categorizer() = default;
@ -50,23 +53,26 @@ class categorizer {
class random_access_categorizer : public categorizer {
public:
virtual std::optional<std::string_view>
categorize(std::filesystem::path const& path,
std::span<uint8_t const> data) const = 0;
virtual inode_fragments
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const = 0;
};
// TODO: add call to check if categorizer can return multiple fragments
// if it *can* we must run it before we start similarity hashing
class sequential_categorizer_job {
public:
virtual ~sequential_categorizer_job() = default;
virtual void add(std::span<uint8_t const> data) = 0;
virtual std::optional<std::string_view> result() = 0;
virtual inode_fragments result() = 0;
};
class sequential_categorizer : public categorizer {
public:
virtual std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const = 0;
job(std::filesystem::path const& path, size_t total_size,
category_mapper const& mapper) const = 0;
};
class categorizer_job {
@ -76,6 +82,10 @@ class categorizer_job {
categorizer_job();
categorizer_job(std::unique_ptr<impl> impl);
void set_total_size(size_t total_size) {
return impl_->set_total_size(total_size);
}
void categorize_random_access(std::span<uint8_t const> data) {
return impl_->categorize_random_access(data);
}
@ -84,7 +94,7 @@ class categorizer_job {
return impl_->categorize_sequential(data);
}
file_category result() { return impl_->result(); }
inode_fragments result() { return impl_->result(); }
explicit operator bool() const { return impl_ != nullptr; }
@ -92,9 +102,10 @@ class categorizer_job {
public:
virtual ~impl() = default;
virtual void set_total_size(size_t total_size) = 0;
virtual void categorize_random_access(std::span<uint8_t const> data) = 0;
virtual void categorize_sequential(std::span<uint8_t const> data) = 0;
virtual file_category result() = 0;
virtual inode_fragments result() = 0;
};
private:
@ -111,7 +122,7 @@ class categorizer_manager {
return impl_->job(path);
}
std::string_view category_name(file_category c) const {
std::string_view category_name(fragment_category::value_type c) const {
return impl_->category_name(c);
}
@ -121,7 +132,8 @@ class categorizer_manager {
virtual void add(std::shared_ptr<categorizer const> c) = 0;
virtual categorizer_job job(std::filesystem::path const& path) const = 0;
virtual std::string_view category_name(file_category c) const = 0;
virtual std::string_view
category_name(fragment_category::value_type c) const = 0;
};
private:

View File

@ -21,12 +21,13 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <limits>
namespace dwarfs {
class file_category {
class fragment_category {
public:
using value_type = uint32_t;
@ -36,37 +37,60 @@ class file_category {
static constexpr value_type const max{std::numeric_limits<value_type>::max() -
1};
file_category()
: value_{uninitialized} {}
file_category(value_type v)
fragment_category() = default;
explicit fragment_category(value_type v)
: value_{v} {}
file_category(file_category const&) = default;
file_category(file_category&&) = default;
fragment_category(value_type v, value_type subcategory)
: value_{v}
, subcategory_{subcategory} {}
file_category& operator=(file_category const&) = default;
file_category& operator=(file_category&&) = default;
fragment_category(fragment_category const&) = default;
fragment_category(fragment_category&&) = default;
file_category& operator=(value_type v) {
fragment_category& operator=(fragment_category const&) = default;
fragment_category& operator=(fragment_category&&) = default;
fragment_category& operator=(value_type v) {
assert(v != uninitialized);
value_ = v;
return *this;
}
value_type value() const {
if (empty()) {
throw std::range_error("file_category is uninitialized");
}
assert(!empty());
return value_;
}
void clear() { value_ = uninitialized; }
void clear() {
value_ = uninitialized;
subcategory_ = uninitialized;
}
bool empty() const { return value_ == uninitialized; }
explicit operator bool() const { return !empty(); }
void set_subcategory(value_type subcategory) {
assert(!empty());
assert(subcategory != uninitialized);
subcategory_ = subcategory;
}
bool has_subcategory() const {
return !empty() && subcategory_ != uninitialized;
}
value_type subcategory() const {
assert(!empty());
assert(subcategory_ != uninitialized);
return subcategory_;
}
private:
value_type value_;
value_type value_{uninitialized};
value_type subcategory_{uninitialized};
};
} // namespace dwarfs

View File

@ -27,7 +27,7 @@
#include <folly/small_vector.h>
#include "dwarfs/file_category.h"
#include "dwarfs/inode_fragments.h"
#include "dwarfs/nilsimsa.h"
#include "dwarfs/object.h"
@ -59,7 +59,7 @@ class inode : public object {
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
virtual void
append_chunks_to(std::vector<thrift::metadata::chunk>& vec) const = 0;
virtual file_category category() const = 0;
virtual inode_fragments const& fragments() const = 0;
};
} // namespace dwarfs

View File

@ -0,0 +1,83 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <functional>
#include <iosfwd>
#include <span>
#include <string>
#include <folly/small_vector.h>
#include "dwarfs/fragment_category.h"
#include "dwarfs/types.h"
namespace dwarfs {
/**
 * Value type pairing a fragment_category with a length.
 *
 * Both values are fixed at construction time; the class only offers
 * read access to them.
 */
class single_inode_fragment {
 public:
  single_inode_fragment(fragment_category category, file_off_t length)
      : category_{category}
      , length_{length} {}

  /// Category this fragment was constructed with.
  fragment_category category() const { return category_; }

  /// Length this fragment was constructed with.
  file_off_t length() const { return length_; }

  /// Container-style alias; yields the same value as length().
  file_off_t size() const { return length(); }

 private:
  fragment_category category_;
  file_off_t length_;
};
/**
 * Ordered list of single_inode_fragment objects.
 *
 * The list converts to `false` when it holds no fragments and to `true`
 * otherwise. It can be rendered as text via to_stream() / to_string().
 */
class inode_fragments {
 public:
  /// Callable translating a numeric category value into a printable name.
  using mapper_function_type =
      std::function<std::string(fragment_category::value_type)>;

  inode_fragments() = default;

  /// Appends a new fragment built from `category` and `length` and returns
  /// a reference to the stored element.
  single_inode_fragment&
  emplace_back(fragment_category category, file_off_t length) {
    auto& added = fragments_.emplace_back(category, length);
    return added;
  }

  /// Read-only view of all stored fragments, in insertion order.
  std::span<single_inode_fragment const> span() const {
    return std::span<single_inode_fragment const>(fragments_.data(),
                                                  fragments_.size());
  }

  /// True if no fragment has been added.
  bool empty() const { return fragments_.empty(); }

  /// True if at least one fragment has been added.
  explicit operator bool() const { return !fragments_.empty(); }

  /// Writes a textual representation to `os`. If `mapper` is callable it is
  /// used to print category values; otherwise the numeric value is printed.
  std::ostream&
  to_stream(std::ostream& os,
            mapper_function_type const& mapper = mapper_function_type()) const;

  /// Same representation as to_stream(), returned as a string.
  std::string
  to_string(mapper_function_type const& mapper = mapper_function_type()) const;

 private:
  folly::small_vector<single_inode_fragment, 1> fragments_;
};
/// Stream insertion for inode_fragments. No name mapper is supplied, so
/// categories are printed by their numeric value.
inline std::ostream& operator<<(std::ostream& os, inode_fragments const& frag) {
  return frag.to_stream(os, inode_fragments::mapper_function_type());
}
} // namespace dwarfs

View File

@ -28,7 +28,7 @@
#include <utility>
#include <vector>
#include "dwarfs/file_category.h"
#include "dwarfs/fragment_category.h"
namespace dwarfs {
@ -59,7 +59,8 @@ class inode_manager {
impl_->for_each_inode_in_order(fn);
}
std::vector<std::pair<file_category, size_t>> category_counts() const {
std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const {
return impl_->category_counts();
}
@ -74,7 +75,7 @@ class inode_manager {
file_order_options const& file_order, order_cb const& fn) = 0;
virtual void for_each_inode_in_order(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;
virtual std::vector<std::pair<file_category, size_t>>
virtual std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const = 0;
};

View File

@ -35,19 +35,20 @@
namespace dwarfs {
using namespace std::placeholders;
namespace po = boost::program_options;
namespace {
constexpr std::string_view const DEFAULT_CATEGORY{"<default>"};
}
class categorizer_manager_private {
class categorizer_manager_private : public categorizer_manager::impl {
public:
virtual ~categorizer_manager_private() = default;
virtual std::vector<std::shared_ptr<categorizer const>> const&
categorizers() const = 0;
virtual file_category category(std::string_view cat) const = 0;
virtual fragment_category::value_type
category(std::string_view cat) const = 0;
};
template <typename LoggerPolicy>
@ -57,39 +58,48 @@ class categorizer_job_ final : public categorizer_job::impl {
std::filesystem::path const& path)
: LOG_PROXY_INIT(lgr)
, mgr_{mgr}
, path_{path} {}
, path_{path}
, cat_mapper_{std::bind(&categorizer_manager_private::category,
std::cref(mgr_), _1)} {}
void set_total_size(size_t total_size) override;
void categorize_random_access(std::span<uint8_t const> data) override;
void categorize_sequential(std::span<uint8_t const> data) override;
file_category result() override;
inode_fragments result() override;
private:
LOG_PROXY_DECL(LoggerPolicy);
categorizer_manager_private const& mgr_;
std::string_view best_{DEFAULT_CATEGORY};
inode_fragments best_;
int index_{-1};
bool is_global_best_{false};
size_t total_size_hint_{0};
size_t total_size_{0};
std::vector<std::pair<int, std::unique_ptr<sequential_categorizer_job>>>
seq_jobs_;
std::filesystem::path const path_;
category_mapper cat_mapper_;
};
template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::set_total_size(size_t total_size) {
total_size_ = total_size;
}
template <typename LoggerPolicy>
void categorizer_job_<LoggerPolicy>::categorize_random_access(
std::span<uint8_t const> data) {
DWARFS_CHECK(index_ < 0,
"internal error: index already set in categorize_random_access");
total_size_hint_ = data.size();
total_size_ = data.size();
bool global_best = true;
for (auto&& [index, cat] : folly::enumerate(mgr_.categorizers())) {
if (auto p = dynamic_cast<random_access_categorizer const*>(cat.get())) {
if (auto c = p->categorize(path_, data)) {
best_ = *c;
if (auto c = p->categorize(path_, data, cat_mapper_)) {
best_ = c;
index_ = index;
is_global_best_ = global_best;
break;
@ -114,7 +124,7 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
}
if (auto p = dynamic_cast<sequential_categorizer const*>(cat.get())) {
if (auto job = p->job(path_, total_size_hint_)) {
if (auto job = p->job(path_, total_size_, cat_mapper_)) {
seq_jobs_.emplace_back(index, std::move(job));
}
}
@ -127,12 +137,12 @@ void categorizer_job_<LoggerPolicy>::categorize_sequential(
}
template <typename LoggerPolicy>
file_category categorizer_job_<LoggerPolicy>::result() {
inode_fragments categorizer_job_<LoggerPolicy>::result() {
if (!seq_jobs_.empty()) {
for (auto&& [index, job] : seq_jobs_) {
if (auto c = job->result()) {
assert(index_ < 0 || index < index_);
best_ = *c;
best_ = c;
break;
}
}
@ -140,9 +150,12 @@ file_category categorizer_job_<LoggerPolicy>::result() {
seq_jobs_.clear();
}
LOG_TRACE << path_ << " -> " << best_;
LOG_TRACE << path_ << " -> "
<< best_.to_string([this](fragment_category::value_type c) {
return std::string(mgr_.category_name(c));
});
return mgr_.category(best_);
return best_;
}
categorizer_job::categorizer_job() = default;
@ -151,8 +164,7 @@ categorizer_job::categorizer_job(std::unique_ptr<impl> impl)
: impl_{std::move(impl)} {}
template <typename LoggerPolicy>
class categorizer_manager_ final : public categorizer_manager::impl,
public categorizer_manager_private {
class categorizer_manager_ final : public categorizer_manager_private {
public:
categorizer_manager_(logger& lgr)
: lgr_{lgr}
@ -162,14 +174,15 @@ class categorizer_manager_ final : public categorizer_manager::impl,
void add(std::shared_ptr<categorizer const> c) override;
categorizer_job job(std::filesystem::path const& path) const override;
std::string_view category_name(file_category c) const override;
std::string_view
category_name(fragment_category::value_type c) const override;
std::vector<std::shared_ptr<categorizer const>> const&
categorizers() const override {
return categorizers_;
}
file_category category(std::string_view cat) const override {
fragment_category::value_type category(std::string_view cat) const override {
auto it = catmap_.find(cat);
DWARFS_CHECK(it != catmap_.end(), fmt::format("unknown category: {}", cat));
return it->second;
@ -188,7 +201,7 @@ class categorizer_manager_ final : public categorizer_manager::impl,
LOG_PROXY_DECL(LoggerPolicy);
std::vector<std::shared_ptr<categorizer const>> categorizers_;
std::vector<std::string_view> categories_;
std::unordered_map<std::string_view, file_category> catmap_;
std::unordered_map<std::string_view, fragment_category::value_type> catmap_;
};
template <typename LoggerPolicy>
@ -210,9 +223,9 @@ categorizer_job categorizer_manager_<LoggerPolicy>::job(
}
template <typename LoggerPolicy>
std::string_view
categorizer_manager_<LoggerPolicy>::category_name(file_category c) const {
return DWARFS_NOTHROW(categories_.at(c.value()));
std::string_view categorizer_manager_<LoggerPolicy>::category_name(
fragment_category::value_type c) const {
return DWARFS_NOTHROW(categories_.at(c));
}
categorizer_manager::categorizer_manager(logger& lgr)

View File

@ -58,9 +58,9 @@ class binary_categorizer_ final : public binary_categorizer_base {
binary_categorizer_(logger& lgr)
: LOG_PROXY_INIT(lgr) {}
std::optional<std::string_view>
categorize(std::filesystem::path const& path,
std::span<uint8_t const> data) const override;
inode_fragments
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const override;
private:
LOG_PROXY_DECL(LoggerPolicy);
@ -74,10 +74,12 @@ std::span<std::string_view const> binary_categorizer_base::categories() const {
}
template <typename LoggerPolicy>
std::optional<std::string_view>
binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
std::span<uint8_t const> data
[[maybe_unused]]) const {
inode_fragments binary_categorizer_<LoggerPolicy>::categorize(
std::filesystem::path const&,
std::span<uint8_t const> data [[maybe_unused]],
category_mapper const& /*mapper*/) const {
inode_fragments fragments;
#ifndef _WIN32
auto p = data.data();
if (data.size() >= EI_NIDENT && ::memcmp(p, ELFMAG, 4) == 0) {
@ -101,7 +103,7 @@ binary_categorizer_<LoggerPolicy>::categorize(std::filesystem::path const&,
}
#endif
return std::nullopt;
return fragments;
}
class binary_categorizer_factory : public categorizer_factory {

View File

@ -57,10 +57,12 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
incompressible_categorizer_job_(logger& lgr,
incompressible_categorizer_config const& cfg,
std::filesystem::path const& path,
size_t total_size)
size_t total_size,
category_mapper const& mapper)
: LOG_PROXY_INIT(lgr)
, cfg_{cfg}
, path_{path} {
, path_{path}
, mapper_{mapper} {
input_.reserve(total_size < block_size ? total_size : block_size);
state_ = ::malloc(LZ4_sizeofState());
}
@ -77,7 +79,8 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
}
}
std::optional<std::string_view> result() override {
inode_fragments result() override {
inode_fragments fragments;
if (!input_.empty()) {
compress();
}
@ -88,9 +91,11 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
if (total_blocks_ > 0 &&
(total_output_size_ >= cfg_.max_ratio_size * total_input_size_ ||
incompressible_blocks_ >= cfg_.max_ratio_blocks * total_blocks_)) {
return INCOMPRESSIBLE_CATEGORY;
fragments.emplace_back(
fragment_category(mapper_(INCOMPRESSIBLE_CATEGORY)),
total_input_size_);
}
return std::nullopt;
return fragments;
}
private:
@ -139,6 +144,7 @@ class incompressible_categorizer_job_ : public sequential_categorizer_job {
size_t incompressible_blocks_{0};
incompressible_categorizer_config const& cfg_;
std::filesystem::path const& path_;
category_mapper const& mapper_;
};
class incompressible_categorizer_ final : public sequential_categorizer {
@ -148,7 +154,8 @@ class incompressible_categorizer_ final : public sequential_categorizer {
std::span<std::string_view const> categories() const override;
std::unique_ptr<sequential_categorizer_job>
job(std::filesystem::path const& path, size_t total_size) const override;
job(std::filesystem::path const& path, size_t total_size,
category_mapper const& mapper) const override;
private:
logger& lgr_;
@ -170,7 +177,8 @@ incompressible_categorizer_::categories() const {
std::unique_ptr<sequential_categorizer_job>
incompressible_categorizer_::job(std::filesystem::path const& path,
size_t total_size) const {
size_t total_size,
category_mapper const& mapper) const {
if (total_size < config_.min_input_size) {
return nullptr;
}
@ -178,7 +186,7 @@ incompressible_categorizer_::job(std::filesystem::path const& path,
return make_unique_logging_object<sequential_categorizer_job,
incompressible_categorizer_job_,
logger_policies>(lgr_, config_, path,
total_size);
total_size, mapper);
}
class incompressible_categorizer_factory : public categorizer_factory {

View File

@ -143,9 +143,9 @@ class libmagic_categorizer_ final : public libmagic_categorizer_base {
}
}
std::optional<std::string_view>
categorize(std::filesystem::path const& path,
std::span<uint8_t const> data) const override;
inode_fragments
categorize(std::filesystem::path const& path, std::span<uint8_t const> data,
category_mapper const& mapper) const override;
private:
LOG_PROXY_DECL(LoggerPolicy);
@ -162,15 +162,17 @@ libmagic_categorizer_base::categories() const {
}
template <typename LoggerPolicy>
std::optional<std::string_view> libmagic_categorizer_<LoggerPolicy>::categorize(
std::filesystem::path const& path, std::span<uint8_t const> data) const {
inode_fragments libmagic_categorizer_<LoggerPolicy>::categorize(
std::filesystem::path const& path, std::span<uint8_t const> data,
category_mapper const& /*mapper*/) const {
inode_fragments fragments; // TODO: actually fill this :-)
auto id = m_.identify(data);
LOG_DEBUG << path << " -> (magic) " << id;
{
auto wlock = mimetypes_.wlock();
++(*wlock)[id];
}
return std::nullopt;
return fragments;
}
class libmagic_categorizer_factory : public categorizer_factory {

View File

@ -0,0 +1,74 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/
#include <ostream>
#include <sstream>
#include "dwarfs/inode_fragments.h"
namespace dwarfs {
/**
 * Writes a textual representation of the fragment list to `os`.
 *
 * An empty list is written as "(empty)". Otherwise the output is a
 * bracketed, comma-separated list with one "(category, size)" entry per
 * fragment. The category is printed via `mapper` if `mapper` is callable,
 * and as its numeric value otherwise. For categories that have a
 * subcategory, "/<subcategory>" is appended to the category.
 *
 * \param os      stream to write to
 * \param mapper  optional translation from category value to name
 * \returns       `os`
 */
std::ostream&
inode_fragments::to_stream(std::ostream& os,
                           mapper_function_type const& mapper) const {
  if (empty()) {
    os << "(empty)";
    return os;
  }

  os << "[";

  // Becomes true once the first entry is written, so that every later
  // entry is preceded by a separator.
  bool need_separator = false;

  for (auto const& frag : span()) {
    if (need_separator) {
      os << ", ";
    }
    need_separator = true;

    os << "(";

    auto const category = frag.category();

    if (mapper) {
      os << mapper(category.value());
    } else {
      os << category.value();
    }

    if (category.has_subcategory()) {
      os << "/" << category.subcategory();
    }

    os << ", " << frag.size() << ")";
  }

  os << "]";

  return os;
}
/**
 * Returns the text that to_stream() produces for this fragment list.
 *
 * \param mapper  optional translation from category value to name,
 *                forwarded unchanged to to_stream()
 */
std::string
inode_fragments::to_string(mapper_function_type const& mapper) const {
  std::ostringstream buffer;
  this->to_stream(buffer, mapper);
  return buffer.str();
}
} // namespace dwarfs

View File

@ -155,6 +155,7 @@ class inode_ : public inode {
if (mm) {
if (catjob) {
catjob.set_total_size(mm->size());
catjob.categorize_random_access(mm->span());
}
@ -201,7 +202,7 @@ class inode_ : public inode {
}
if (catjob) {
category_ = catjob.result();
fragments_ = catjob.result();
}
}
@ -228,12 +229,14 @@ class inode_ : public inode {
vec.insert(vec.end(), chunks_.begin(), chunks_.end());
}
file_category category() const override { return category_; }
inode_fragments const& fragments() const override { return fragments_; }
private:
// TODO: can we move optional stuff (e.g. nilsimsa_similarity_hash_) out of
// here?
std::optional<uint32_t> num_;
uint32_t similarity_hash_{0};
file_category category_;
inode_fragments fragments_;
files_vector files_;
std::vector<chunk_type> chunks_;
nilsimsa::hash_type nilsimsa_similarity_hash_;
@ -278,23 +281,26 @@ class inode_manager_ final : public inode_manager::impl {
}
}
std::vector<std::pair<file_category, size_t>>
std::vector<std::pair<fragment_category::value_type, size_t>>
category_counts() const override {
std::unordered_map<file_category::value_type, size_t> tmp;
std::unordered_map<fragment_category::value_type, size_t> tmp;
for (auto const& i : inodes_) {
++tmp[i->category().value()];
if (auto const& fragments = i->fragments(); !fragments.empty()) {
for (auto const& frag : fragments.span()) {
++tmp[frag.category().value()];
}
}
}
std::vector<std::pair<file_category, size_t>> rv;
std::vector<std::pair<fragment_category::value_type, size_t>> rv;
for (auto const& [k, v] : tmp) {
rv.emplace_back(k, v);
}
std::sort(rv.begin(), rv.end(), [](auto const& a, auto const& b) {
return a.first.value() < b.first.value();
});
std::sort(rv.begin(), rv.end(),
[](auto const& a, auto const& b) { return a.first < b.first; });
return rv;
}